diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { - "best_metric": 1.1752163171768188, - "best_model_checkpoint": "./bert_mlm_finetuned/checkpoint-1600", - "epoch": 0.2048, + "best_metric": 0.9677584767341614, + "best_model_checkpoint": "./bert_mlm_finetuned/checkpoint-7400", + "epoch": 1.151936, "eval_steps": 100, - "global_step": 1600, + "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -2375,6 +2375,10958 @@ "eval_samples_per_second": 150.056, "eval_steps_per_second": 18.757, "step": 1600 + }, + { + "epoch": 0.20544, + "grad_norm": 3.658411741256714, + "learning_rate": 9.756341158226578e-06, + "loss": 4.8328, + "step": 1605 + }, + { + "epoch": 0.20608, + "grad_norm": 3.2841243743896484, + "learning_rate": 9.754783653689595e-06, + "loss": 4.7692, + "step": 1610 + }, + { + "epoch": 0.20672, + "grad_norm": 3.305380344390869, + "learning_rate": 9.75322131221888e-06, + "loss": 4.7308, + "step": 1615 + }, + { + "epoch": 0.20736, + "grad_norm": 3.596205472946167, + "learning_rate": 9.751654135403764e-06, + "loss": 4.7954, + "step": 1620 + }, + { + "epoch": 0.208, + "grad_norm": 3.223118782043457, + "learning_rate": 9.750082124838505e-06, + "loss": 4.7433, + "step": 1625 + }, + { + "epoch": 0.20864, + "grad_norm": 3.376711845397949, + "learning_rate": 9.748505282122269e-06, + "loss": 4.8109, + "step": 1630 + }, + { + "epoch": 0.20928, + "grad_norm": 3.416400194168091, + "learning_rate": 9.746923608859147e-06, + "loss": 4.8006, + "step": 1635 + }, + { + "epoch": 0.20992, + "grad_norm": 3.3802921772003174, + "learning_rate": 9.745337106658139e-06, + "loss": 4.7256, + "step": 1640 + }, + { + "epoch": 0.21056, + "grad_norm": 3.3929057121276855, + "learning_rate": 9.743745777133153e-06, + "loss": 4.6883, + "step": 1645 + }, + { + "epoch": 0.2112, + "grad_norm": 3.730523109436035, + "learning_rate": 9.742149621903018e-06, + "loss": 4.9453, + "step": 1650 + }, + { + "epoch": 0.21184, + "grad_norm": 3.5694522857666016, + "learning_rate": 9.740548642591463e-06, + "loss": 4.9234, + "step": 1655 + }, + { + "epoch": 0.21248, + "grad_norm": 3.386958122253418, + "learning_rate": 9.73894284082713e-06, + "loss": 4.6964, + "step": 1660 + }, + { + "epoch": 0.21312, + "grad_norm": 3.6847736835479736, + "learning_rate": 9.737332218243565e-06, + "loss": 4.8865, + "step": 1665 + }, + { + "epoch": 0.21376, + "grad_norm": 3.856374979019165, + "learning_rate": 9.735716776479215e-06, + "loss": 4.7383, + "step": 1670 + }, + { + "epoch": 0.2144, + "grad_norm": 3.566790819168091, + "learning_rate": 9.734096517177436e-06, + "loss": 4.7605, + "step": 1675 + }, + { + "epoch": 0.21504, + "grad_norm": 4.066038131713867, + "learning_rate": 9.732471441986479e-06, + "loss": 4.6265, + "step": 1680 + }, + { + "epoch": 0.21568, + "grad_norm": 3.6826140880584717, + "learning_rate": 9.730841552559496e-06, + "loss": 4.8244, + "step": 1685 + }, + { + "epoch": 0.21632, + "grad_norm": 3.5930440425872803, + "learning_rate": 9.729206850554537e-06, + "loss": 4.8389, + "step": 1690 + }, + { + "epoch": 0.21696, + "grad_norm": 3.6579551696777344, + "learning_rate": 9.727567337634547e-06, + "loss": 4.7213, + "step": 1695 + }, + { + "epoch": 0.2176, + "grad_norm": 3.4516382217407227, + "learning_rate": 9.725923015467368e-06, + "loss": 4.7244, + "step": 1700 + }, + { + "epoch": 0.2176, + "eval_loss": 1.1895771026611328, + "eval_runtime": 6.8327, + "eval_samples_per_second": 146.355, + "eval_steps_per_second": 18.294, + "step": 1700 + }, + { + "epoch": 0.21824, + "grad_norm": 3.3916966915130615, + "learning_rate": 9.724273885725728e-06, + "loss": 4.7104, + "step": 1705 + }, + { + "epoch": 0.21888, + "grad_norm": 3.560164451599121, + "learning_rate": 9.72261995008725e-06, + "loss": 4.7226, + "step": 1710 + }, + { + "epoch": 0.21952, + "grad_norm": 3.647092342376709, + "learning_rate": 9.72096121023445e-06, + "loss": 4.8889, + "step": 1715 + }, + { + "epoch": 0.22016, + "grad_norm": 3.1930177211761475, + "learning_rate": 9.719297667854718e-06, + "loss": 4.6211, + "step": 1720 + }, + { + "epoch": 0.2208, + "grad_norm": 3.2158684730529785, + "learning_rate": 9.717629324640347e-06, + "loss": 4.684, + "step": 1725 + }, + { + "epoch": 0.22144, + "grad_norm": 3.610088348388672, + "learning_rate": 9.715956182288498e-06, + "loss": 4.858, + "step": 1730 + }, + { + "epoch": 0.22208, + "grad_norm": 3.80896258354187, + "learning_rate": 9.714278242501222e-06, + "loss": 4.6532, + "step": 1735 + }, + { + "epoch": 0.22272, + "grad_norm": 3.3281309604644775, + "learning_rate": 9.712595506985452e-06, + "loss": 4.7788, + "step": 1740 + }, + { + "epoch": 0.22336, + "grad_norm": 3.551541328430176, + "learning_rate": 9.710907977452995e-06, + "loss": 4.46, + "step": 1745 + }, + { + "epoch": 0.224, + "grad_norm": 3.282968044281006, + "learning_rate": 9.709215655620538e-06, + "loss": 4.6924, + "step": 1750 + }, + { + "epoch": 0.22464, + "grad_norm": 3.20308780670166, + "learning_rate": 9.707518543209638e-06, + "loss": 4.7473, + "step": 1755 + }, + { + "epoch": 0.22528, + "grad_norm": 3.8178443908691406, + "learning_rate": 9.705816641946733e-06, + "loss": 4.6526, + "step": 1760 + }, + { + "epoch": 0.22592, + "grad_norm": 3.6582953929901123, + "learning_rate": 9.704109953563126e-06, + "loss": 4.5572, + "step": 1765 + }, + { + "epoch": 0.22656, + "grad_norm": 3.1742563247680664, + "learning_rate": 9.702398479794994e-06, + "loss": 4.6242, + "step": 1770 + }, + { + "epoch": 0.2272, + "grad_norm": 3.8248085975646973, + "learning_rate": 9.70068222238338e-06, + "loss": 4.6163, + "step": 1775 + }, + { + "epoch": 0.22784, + "grad_norm": 3.4912667274475098, + "learning_rate": 9.698961183074194e-06, + "loss": 4.6932, + "step": 1780 + }, + { + "epoch": 0.22848, + "grad_norm": 3.227597236633301, + "learning_rate": 9.69723536361821e-06, + "loss": 4.7776, + "step": 1785 + }, + { + "epoch": 0.22912, + "grad_norm": 3.329366445541382, + "learning_rate": 9.695504765771066e-06, + "loss": 4.6441, + "step": 1790 + }, + { + "epoch": 0.22976, + "grad_norm": 3.5038723945617676, + "learning_rate": 9.693769391293257e-06, + "loss": 4.6334, + "step": 1795 + }, + { + "epoch": 0.2304, + "grad_norm": 3.5325582027435303, + "learning_rate": 9.692029241950144e-06, + "loss": 4.5945, + "step": 1800 + }, + { + "epoch": 0.2304, + "eval_loss": 1.1731183528900146, + "eval_runtime": 6.8034, + "eval_samples_per_second": 146.984, + "eval_steps_per_second": 18.373, + "step": 1800 + }, + { + "epoch": 0.23104, + "grad_norm": 3.3038785457611084, + "learning_rate": 9.69028431951194e-06, + "loss": 4.6645, + "step": 1805 + }, + { + "epoch": 0.23168, + "grad_norm": 5.602443695068359, + "learning_rate": 9.688534625753713e-06, + "loss": 4.7724, + "step": 1810 + }, + { + "epoch": 0.23232, + "grad_norm": 3.3495841026306152, + "learning_rate": 9.686780162455389e-06, + "loss": 4.6676, + "step": 1815 + }, + { + "epoch": 0.23296, + "grad_norm": 3.6556856632232666, + "learning_rate": 9.685020931401745e-06, + "loss": 4.6832, + "step": 1820 + }, + { + "epoch": 0.2336, + "grad_norm": 3.3571536540985107, + "learning_rate": 9.683256934382406e-06, + "loss": 4.693, + "step": 1825 + }, + { + "epoch": 0.23424, + "grad_norm": 3.542635917663574, + "learning_rate": 9.681488173191843e-06, + "loss": 4.6774, + "step": 1830 + }, + { + "epoch": 0.23488, + "grad_norm": 3.4921188354492188, + "learning_rate": 9.679714649629381e-06, + "loss": 4.6432, + "step": 1835 + }, + { + "epoch": 0.23552, + "grad_norm": 3.424345016479492, + "learning_rate": 9.677936365499183e-06, + "loss": 4.7415, + "step": 1840 + }, + { + "epoch": 0.23616, + "grad_norm": 3.3347842693328857, + "learning_rate": 9.676153322610259e-06, + "loss": 4.61, + "step": 1845 + }, + { + "epoch": 0.2368, + "grad_norm": 3.375143051147461, + "learning_rate": 9.674365522776456e-06, + "loss": 4.6775, + "step": 1850 + }, + { + "epoch": 0.23744, + "grad_norm": 3.3928396701812744, + "learning_rate": 9.672572967816464e-06, + "loss": 4.7787, + "step": 1855 + }, + { + "epoch": 0.23808, + "grad_norm": 3.4334990978240967, + "learning_rate": 9.670775659553808e-06, + "loss": 4.7256, + "step": 1860 + }, + { + "epoch": 0.23872, + "grad_norm": 3.456284761428833, + "learning_rate": 9.668973599816847e-06, + "loss": 4.5238, + "step": 1865 + }, + { + "epoch": 0.23936, + "grad_norm": 3.2843079566955566, + "learning_rate": 9.66716679043878e-06, + "loss": 4.6657, + "step": 1870 + }, + { + "epoch": 0.24, + "grad_norm": 3.8443307876586914, + "learning_rate": 9.66535523325763e-06, + "loss": 4.7962, + "step": 1875 + }, + { + "epoch": 0.24064, + "grad_norm": 3.5036277770996094, + "learning_rate": 9.663538930116251e-06, + "loss": 4.6989, + "step": 1880 + }, + { + "epoch": 0.24128, + "grad_norm": 3.1674866676330566, + "learning_rate": 9.661717882862333e-06, + "loss": 4.6334, + "step": 1885 + }, + { + "epoch": 0.24192, + "grad_norm": 3.5648207664489746, + "learning_rate": 9.659892093348383e-06, + "loss": 4.7952, + "step": 1890 + }, + { + "epoch": 0.24256, + "grad_norm": 3.5110340118408203, + "learning_rate": 9.658061563431734e-06, + "loss": 4.5461, + "step": 1895 + }, + { + "epoch": 0.2432, + "grad_norm": 3.374955415725708, + "learning_rate": 9.656226294974545e-06, + "loss": 4.5967, + "step": 1900 + }, + { + "epoch": 0.2432, + "eval_loss": 1.1671475172042847, + "eval_runtime": 6.824, + "eval_samples_per_second": 146.542, + "eval_steps_per_second": 18.318, + "step": 1900 + }, + { + "epoch": 0.24384, + "grad_norm": 3.341566324234009, + "learning_rate": 9.65438628984379e-06, + "loss": 4.7055, + "step": 1905 + }, + { + "epoch": 0.24448, + "grad_norm": 3.5377919673919678, + "learning_rate": 9.652541549911267e-06, + "loss": 4.729, + "step": 1910 + }, + { + "epoch": 0.24512, + "grad_norm": 3.4675512313842773, + "learning_rate": 9.65069207705359e-06, + "loss": 4.5535, + "step": 1915 + }, + { + "epoch": 0.24576, + "grad_norm": 3.2654201984405518, + "learning_rate": 9.648837873152182e-06, + "loss": 4.7749, + "step": 1920 + }, + { + "epoch": 0.2464, + "grad_norm": 3.5051968097686768, + "learning_rate": 9.646978940093283e-06, + "loss": 4.774, + "step": 1925 + }, + { + "epoch": 0.24704, + "grad_norm": 3.352511405944824, + "learning_rate": 9.645115279767947e-06, + "loss": 4.6353, + "step": 1930 + }, + { + "epoch": 0.24768, + "grad_norm": 3.4990932941436768, + "learning_rate": 9.64324689407203e-06, + "loss": 4.5353, + "step": 1935 + }, + { + "epoch": 0.24832, + "grad_norm": 3.782811164855957, + "learning_rate": 9.641373784906198e-06, + "loss": 4.7274, + "step": 1940 + }, + { + "epoch": 0.24896, + "grad_norm": 4.125209808349609, + "learning_rate": 9.639495954175926e-06, + "loss": 4.6549, + "step": 1945 + }, + { + "epoch": 0.2496, + "grad_norm": 3.609104633331299, + "learning_rate": 9.637613403791487e-06, + "loss": 4.7379, + "step": 1950 + }, + { + "epoch": 0.25024, + "grad_norm": 3.548325538635254, + "learning_rate": 9.635726135667955e-06, + "loss": 4.6096, + "step": 1955 + }, + { + "epoch": 0.25088, + "grad_norm": 3.4659600257873535, + "learning_rate": 9.63383415172521e-06, + "loss": 4.6373, + "step": 1960 + }, + { + "epoch": 0.25152, + "grad_norm": 3.2835114002227783, + "learning_rate": 9.631937453887917e-06, + "loss": 4.7666, + "step": 1965 + }, + { + "epoch": 0.25216, + "grad_norm": 3.5387489795684814, + "learning_rate": 9.63003604408555e-06, + "loss": 4.6144, + "step": 1970 + }, + { + "epoch": 0.2528, + "grad_norm": 3.3846347332000732, + "learning_rate": 9.628129924252368e-06, + "loss": 4.6594, + "step": 1975 + }, + { + "epoch": 0.25344, + "grad_norm": 3.6264524459838867, + "learning_rate": 9.626219096327424e-06, + "loss": 4.5614, + "step": 1980 + }, + { + "epoch": 0.25408, + "grad_norm": 3.254225492477417, + "learning_rate": 9.62430356225456e-06, + "loss": 4.6334, + "step": 1985 + }, + { + "epoch": 0.25472, + "grad_norm": 3.4200801849365234, + "learning_rate": 9.622383323982404e-06, + "loss": 4.6949, + "step": 1990 + }, + { + "epoch": 0.25536, + "grad_norm": 3.5615053176879883, + "learning_rate": 9.620458383464372e-06, + "loss": 4.6219, + "step": 1995 + }, + { + "epoch": 0.256, + "grad_norm": 3.5107123851776123, + "learning_rate": 9.618528742658662e-06, + "loss": 4.7314, + "step": 2000 + }, + { + "epoch": 0.256, + "eval_loss": 1.1562089920043945, + "eval_runtime": 6.9874, + "eval_samples_per_second": 143.114, + "eval_steps_per_second": 17.889, + "step": 2000 + }, + { + "epoch": 0.25664, + "grad_norm": 3.184966802597046, + "learning_rate": 9.616594403528255e-06, + "loss": 4.6189, + "step": 2005 + }, + { + "epoch": 0.25728, + "grad_norm": 3.4554250240325928, + "learning_rate": 9.61465536804091e-06, + "loss": 4.568, + "step": 2010 + }, + { + "epoch": 0.25792, + "grad_norm": 3.6082921028137207, + "learning_rate": 9.612711638169163e-06, + "loss": 4.6917, + "step": 2015 + }, + { + "epoch": 0.25856, + "grad_norm": 3.2755281925201416, + "learning_rate": 9.610763215890326e-06, + "loss": 4.5872, + "step": 2020 + }, + { + "epoch": 0.2592, + "grad_norm": 3.3548803329467773, + "learning_rate": 9.608810103186488e-06, + "loss": 4.5392, + "step": 2025 + }, + { + "epoch": 0.25984, + "grad_norm": 4.0107293128967285, + "learning_rate": 9.606852302044502e-06, + "loss": 4.6321, + "step": 2030 + }, + { + "epoch": 0.26048, + "grad_norm": 3.364811420440674, + "learning_rate": 9.604889814455997e-06, + "loss": 4.653, + "step": 2035 + }, + { + "epoch": 0.26112, + "grad_norm": 3.319972038269043, + "learning_rate": 9.602922642417368e-06, + "loss": 4.8393, + "step": 2040 + }, + { + "epoch": 0.26176, + "grad_norm": 3.2800612449645996, + "learning_rate": 9.600950787929773e-06, + "loss": 4.694, + "step": 2045 + }, + { + "epoch": 0.2624, + "grad_norm": 3.339520215988159, + "learning_rate": 9.598974252999136e-06, + "loss": 4.5774, + "step": 2050 + }, + { + "epoch": 0.26304, + "grad_norm": 3.2331771850585938, + "learning_rate": 9.59699303963614e-06, + "loss": 4.547, + "step": 2055 + }, + { + "epoch": 0.26368, + "grad_norm": 3.783783197402954, + "learning_rate": 9.595007149856228e-06, + "loss": 4.659, + "step": 2060 + }, + { + "epoch": 0.26432, + "grad_norm": 3.4253666400909424, + "learning_rate": 9.593016585679605e-06, + "loss": 4.6534, + "step": 2065 + }, + { + "epoch": 0.26496, + "grad_norm": 4.119887828826904, + "learning_rate": 9.591021349131222e-06, + "loss": 4.6676, + "step": 2070 + }, + { + "epoch": 0.2656, + "grad_norm": 3.2150423526763916, + "learning_rate": 9.589021442240789e-06, + "loss": 4.6495, + "step": 2075 + }, + { + "epoch": 0.26624, + "grad_norm": 3.347801923751831, + "learning_rate": 9.58701686704277e-06, + "loss": 4.5401, + "step": 2080 + }, + { + "epoch": 0.26688, + "grad_norm": 3.378760814666748, + "learning_rate": 9.585007625576368e-06, + "loss": 4.6443, + "step": 2085 + }, + { + "epoch": 0.26752, + "grad_norm": 3.516089677810669, + "learning_rate": 9.58299371988554e-06, + "loss": 4.6488, + "step": 2090 + }, + { + "epoch": 0.26816, + "grad_norm": 3.7253007888793945, + "learning_rate": 9.58097515201899e-06, + "loss": 4.5439, + "step": 2095 + }, + { + "epoch": 0.2688, + "grad_norm": 3.3970401287078857, + "learning_rate": 9.57895192403016e-06, + "loss": 4.5624, + "step": 2100 + }, + { + "epoch": 0.2688, + "eval_loss": 1.1436244249343872, + "eval_runtime": 6.885, + "eval_samples_per_second": 145.242, + "eval_steps_per_second": 18.155, + "step": 2100 + }, + { + "epoch": 0.26944, + "grad_norm": 3.578413486480713, + "learning_rate": 9.576924037977233e-06, + "loss": 4.6812, + "step": 2105 + }, + { + "epoch": 0.27008, + "grad_norm": 3.292541027069092, + "learning_rate": 9.574891495923133e-06, + "loss": 4.6228, + "step": 2110 + }, + { + "epoch": 0.27072, + "grad_norm": 3.393113851547241, + "learning_rate": 9.572854299935517e-06, + "loss": 4.6127, + "step": 2115 + }, + { + "epoch": 0.27136, + "grad_norm": 3.20076322555542, + "learning_rate": 9.570812452086779e-06, + "loss": 4.6345, + "step": 2120 + }, + { + "epoch": 0.272, + "grad_norm": 3.2253365516662598, + "learning_rate": 9.568765954454047e-06, + "loss": 4.6655, + "step": 2125 + }, + { + "epoch": 0.27264, + "grad_norm": 3.3711695671081543, + "learning_rate": 9.566714809119173e-06, + "loss": 4.519, + "step": 2130 + }, + { + "epoch": 0.27328, + "grad_norm": 3.2909939289093018, + "learning_rate": 9.564659018168743e-06, + "loss": 4.669, + "step": 2135 + }, + { + "epoch": 0.27392, + "grad_norm": 3.535353899002075, + "learning_rate": 9.562598583694067e-06, + "loss": 4.7123, + "step": 2140 + }, + { + "epoch": 0.27456, + "grad_norm": 3.3832180500030518, + "learning_rate": 9.560533507791174e-06, + "loss": 4.6207, + "step": 2145 + }, + { + "epoch": 0.2752, + "grad_norm": 3.3504884243011475, + "learning_rate": 9.558463792560826e-06, + "loss": 4.5949, + "step": 2150 + }, + { + "epoch": 0.27584, + "grad_norm": 3.9149844646453857, + "learning_rate": 9.556389440108493e-06, + "loss": 4.6805, + "step": 2155 + }, + { + "epoch": 0.27648, + "grad_norm": 3.232273817062378, + "learning_rate": 9.554310452544366e-06, + "loss": 4.6718, + "step": 2160 + }, + { + "epoch": 0.27712, + "grad_norm": 3.4460339546203613, + "learning_rate": 9.552226831983353e-06, + "loss": 4.4873, + "step": 2165 + }, + { + "epoch": 0.27776, + "grad_norm": 3.3323307037353516, + "learning_rate": 9.550138580545077e-06, + "loss": 4.6764, + "step": 2170 + }, + { + "epoch": 0.2784, + "grad_norm": 3.325422525405884, + "learning_rate": 9.548045700353865e-06, + "loss": 4.6288, + "step": 2175 + }, + { + "epoch": 0.27904, + "grad_norm": 3.620631694793701, + "learning_rate": 9.545948193538759e-06, + "loss": 4.397, + "step": 2180 + }, + { + "epoch": 0.27968, + "grad_norm": 3.4073946475982666, + "learning_rate": 9.543846062233502e-06, + "loss": 4.6072, + "step": 2185 + }, + { + "epoch": 0.28032, + "grad_norm": 3.269423246383667, + "learning_rate": 9.54173930857655e-06, + "loss": 4.6166, + "step": 2190 + }, + { + "epoch": 0.28096, + "grad_norm": 3.4265708923339844, + "learning_rate": 9.539627934711049e-06, + "loss": 4.7151, + "step": 2195 + }, + { + "epoch": 0.2816, + "grad_norm": 3.5224685668945312, + "learning_rate": 9.537511942784857e-06, + "loss": 4.6422, + "step": 2200 + }, + { + "epoch": 0.2816, + "eval_loss": 1.1447081565856934, + "eval_runtime": 6.5029, + "eval_samples_per_second": 153.778, + "eval_steps_per_second": 19.222, + "step": 2200 + }, + { + "epoch": 0.28224, + "grad_norm": 3.5605506896972656, + "learning_rate": 9.535391334950523e-06, + "loss": 4.5522, + "step": 2205 + }, + { + "epoch": 0.28288, + "grad_norm": 3.5018362998962402, + "learning_rate": 9.533266113365293e-06, + "loss": 4.5282, + "step": 2210 + }, + { + "epoch": 0.28352, + "grad_norm": 3.2997825145721436, + "learning_rate": 9.531136280191107e-06, + "loss": 4.5023, + "step": 2215 + }, + { + "epoch": 0.28416, + "grad_norm": 3.419358015060425, + "learning_rate": 9.529001837594599e-06, + "loss": 4.5891, + "step": 2220 + }, + { + "epoch": 0.2848, + "grad_norm": 3.175732374191284, + "learning_rate": 9.526862787747081e-06, + "loss": 4.6561, + "step": 2225 + }, + { + "epoch": 0.28544, + "grad_norm": 4.888601779937744, + "learning_rate": 9.524719132824569e-06, + "loss": 4.3645, + "step": 2230 + }, + { + "epoch": 0.28608, + "grad_norm": 3.487359046936035, + "learning_rate": 9.52257087500775e-06, + "loss": 4.4451, + "step": 2235 + }, + { + "epoch": 0.28672, + "grad_norm": 3.3482818603515625, + "learning_rate": 9.520418016482001e-06, + "loss": 4.7056, + "step": 2240 + }, + { + "epoch": 0.28736, + "grad_norm": 3.410296678543091, + "learning_rate": 9.518260559437371e-06, + "loss": 4.5788, + "step": 2245 + }, + { + "epoch": 0.288, + "grad_norm": 3.3393115997314453, + "learning_rate": 9.516098506068596e-06, + "loss": 4.5909, + "step": 2250 + }, + { + "epoch": 0.28864, + "grad_norm": 3.641216993331909, + "learning_rate": 9.513931858575084e-06, + "loss": 4.4144, + "step": 2255 + }, + { + "epoch": 0.28928, + "grad_norm": 3.332921266555786, + "learning_rate": 9.511760619160915e-06, + "loss": 4.5648, + "step": 2260 + }, + { + "epoch": 0.28992, + "grad_norm": 3.7218687534332275, + "learning_rate": 9.509584790034842e-06, + "loss": 4.5362, + "step": 2265 + }, + { + "epoch": 0.29056, + "grad_norm": 3.2724854946136475, + "learning_rate": 9.50740437341029e-06, + "loss": 4.601, + "step": 2270 + }, + { + "epoch": 0.2912, + "grad_norm": 3.7653846740722656, + "learning_rate": 9.50521937150534e-06, + "loss": 4.5099, + "step": 2275 + }, + { + "epoch": 0.29184, + "grad_norm": 3.398200035095215, + "learning_rate": 9.503029786542753e-06, + "loss": 4.6251, + "step": 2280 + }, + { + "epoch": 0.29248, + "grad_norm": 3.4145519733428955, + "learning_rate": 9.50083562074994e-06, + "loss": 4.517, + "step": 2285 + }, + { + "epoch": 0.29312, + "grad_norm": 3.541825294494629, + "learning_rate": 9.498636876358975e-06, + "loss": 4.6727, + "step": 2290 + }, + { + "epoch": 0.29376, + "grad_norm": 3.482581615447998, + "learning_rate": 9.496433555606594e-06, + "loss": 4.6123, + "step": 2295 + }, + { + "epoch": 0.2944, + "grad_norm": 3.4173848628997803, + "learning_rate": 9.494225660734186e-06, + "loss": 4.5458, + "step": 2300 + }, + { + "epoch": 0.2944, + "eval_loss": 1.1363893747329712, + "eval_runtime": 7.4169, + "eval_samples_per_second": 134.828, + "eval_steps_per_second": 16.853, + "step": 2300 + }, + { + "epoch": 0.29504, + "grad_norm": 3.2871265411376953, + "learning_rate": 9.492013193987788e-06, + "loss": 4.4816, + "step": 2305 + }, + { + "epoch": 0.29568, + "grad_norm": 3.55437970161438, + "learning_rate": 9.489796157618094e-06, + "loss": 4.5505, + "step": 2310 + }, + { + "epoch": 0.29632, + "grad_norm": 3.7020423412323, + "learning_rate": 9.487574553880447e-06, + "loss": 4.4503, + "step": 2315 + }, + { + "epoch": 0.29696, + "grad_norm": 3.1921563148498535, + "learning_rate": 9.485348385034834e-06, + "loss": 4.4986, + "step": 2320 + }, + { + "epoch": 0.2976, + "grad_norm": 3.4034197330474854, + "learning_rate": 9.483117653345883e-06, + "loss": 4.4827, + "step": 2325 + }, + { + "epoch": 0.29824, + "grad_norm": 3.3942666053771973, + "learning_rate": 9.480882361082871e-06, + "loss": 4.6199, + "step": 2330 + }, + { + "epoch": 0.29888, + "grad_norm": 3.1674904823303223, + "learning_rate": 9.478642510519706e-06, + "loss": 4.5244, + "step": 2335 + }, + { + "epoch": 0.29952, + "grad_norm": 3.4777538776397705, + "learning_rate": 9.476398103934941e-06, + "loss": 4.5746, + "step": 2340 + }, + { + "epoch": 0.30016, + "grad_norm": 3.476076602935791, + "learning_rate": 9.474149143611757e-06, + "loss": 4.5492, + "step": 2345 + }, + { + "epoch": 0.3008, + "grad_norm": 3.8204102516174316, + "learning_rate": 9.471895631837972e-06, + "loss": 4.6468, + "step": 2350 + }, + { + "epoch": 0.30144, + "grad_norm": 3.413212299346924, + "learning_rate": 9.469637570906032e-06, + "loss": 4.4807, + "step": 2355 + }, + { + "epoch": 0.30208, + "grad_norm": 3.604353904724121, + "learning_rate": 9.467374963113011e-06, + "loss": 4.5727, + "step": 2360 + }, + { + "epoch": 0.30272, + "grad_norm": 3.6752219200134277, + "learning_rate": 9.46510781076061e-06, + "loss": 4.6183, + "step": 2365 + }, + { + "epoch": 0.30336, + "grad_norm": 3.5153515338897705, + "learning_rate": 9.462836116155151e-06, + "loss": 4.4947, + "step": 2370 + }, + { + "epoch": 0.304, + "grad_norm": 3.4051671028137207, + "learning_rate": 9.460559881607579e-06, + "loss": 4.5685, + "step": 2375 + }, + { + "epoch": 0.30464, + "grad_norm": 3.418074131011963, + "learning_rate": 9.45827910943345e-06, + "loss": 4.7324, + "step": 2380 + }, + { + "epoch": 0.30528, + "grad_norm": 3.331264019012451, + "learning_rate": 9.455993801952949e-06, + "loss": 4.6685, + "step": 2385 + }, + { + "epoch": 0.30592, + "grad_norm": 3.6609222888946533, + "learning_rate": 9.453703961490863e-06, + "loss": 4.6168, + "step": 2390 + }, + { + "epoch": 0.30656, + "grad_norm": 3.3909974098205566, + "learning_rate": 9.451409590376598e-06, + "loss": 4.6097, + "step": 2395 + }, + { + "epoch": 0.3072, + "grad_norm": 3.4507110118865967, + "learning_rate": 9.449110690944163e-06, + "loss": 4.5849, + "step": 2400 + }, + { + "epoch": 0.3072, + "eval_loss": 1.1387531757354736, + "eval_runtime": 6.7091, + "eval_samples_per_second": 149.052, + "eval_steps_per_second": 18.631, + "step": 2400 + }, + { + "epoch": 0.30784, + "grad_norm": 3.8138394355773926, + "learning_rate": 9.44680726553218e-06, + "loss": 4.5321, + "step": 2405 + }, + { + "epoch": 0.30848, + "grad_norm": 3.298640727996826, + "learning_rate": 9.444499316483865e-06, + "loss": 4.5425, + "step": 2410 + }, + { + "epoch": 0.30912, + "grad_norm": 3.421692371368408, + "learning_rate": 9.442186846147048e-06, + "loss": 4.517, + "step": 2415 + }, + { + "epoch": 0.30976, + "grad_norm": 3.3749470710754395, + "learning_rate": 9.439869856874153e-06, + "loss": 4.527, + "step": 2420 + }, + { + "epoch": 0.3104, + "grad_norm": 3.163632869720459, + "learning_rate": 9.437548351022197e-06, + "loss": 4.639, + "step": 2425 + }, + { + "epoch": 0.31104, + "grad_norm": 3.22723126411438, + "learning_rate": 9.435222330952799e-06, + "loss": 4.5637, + "step": 2430 + }, + { + "epoch": 0.31168, + "grad_norm": 3.6903302669525146, + "learning_rate": 9.432891799032162e-06, + "loss": 4.5799, + "step": 2435 + }, + { + "epoch": 0.31232, + "grad_norm": 3.5754201412200928, + "learning_rate": 9.430556757631087e-06, + "loss": 4.5296, + "step": 2440 + }, + { + "epoch": 0.31296, + "grad_norm": 3.152308702468872, + "learning_rate": 9.428217209124958e-06, + "loss": 4.5233, + "step": 2445 + }, + { + "epoch": 0.3136, + "grad_norm": 3.4750640392303467, + "learning_rate": 9.425873155893744e-06, + "loss": 4.3894, + "step": 2450 + }, + { + "epoch": 0.31424, + "grad_norm": 3.1524953842163086, + "learning_rate": 9.423524600321999e-06, + "loss": 4.3978, + "step": 2455 + }, + { + "epoch": 0.31488, + "grad_norm": 3.344024658203125, + "learning_rate": 9.421171544798854e-06, + "loss": 4.5563, + "step": 2460 + }, + { + "epoch": 0.31552, + "grad_norm": 3.460477352142334, + "learning_rate": 9.418813991718017e-06, + "loss": 4.5687, + "step": 2465 + }, + { + "epoch": 0.31616, + "grad_norm": 3.2936015129089355, + "learning_rate": 9.416451943477778e-06, + "loss": 4.6714, + "step": 2470 + }, + { + "epoch": 0.3168, + "grad_norm": 3.373152732849121, + "learning_rate": 9.41408540248099e-06, + "loss": 4.5469, + "step": 2475 + }, + { + "epoch": 0.31744, + "grad_norm": 3.7370991706848145, + "learning_rate": 9.411714371135087e-06, + "loss": 4.5638, + "step": 2480 + }, + { + "epoch": 0.31808, + "grad_norm": 3.427574872970581, + "learning_rate": 9.40933885185206e-06, + "loss": 4.4045, + "step": 2485 + }, + { + "epoch": 0.31872, + "grad_norm": 3.459003448486328, + "learning_rate": 9.406958847048477e-06, + "loss": 4.5258, + "step": 2490 + }, + { + "epoch": 0.31936, + "grad_norm": 3.361318588256836, + "learning_rate": 9.40457435914546e-06, + "loss": 4.5269, + "step": 2495 + }, + { + "epoch": 0.32, + "grad_norm": 3.2740869522094727, + "learning_rate": 9.402185390568693e-06, + "loss": 4.6282, + "step": 2500 + }, + { + "epoch": 0.32, + "eval_loss": 1.1232563257217407, + "eval_runtime": 6.9879, + "eval_samples_per_second": 143.105, + "eval_steps_per_second": 17.888, + "step": 2500 + }, + { + "epoch": 0.32064, + "grad_norm": 3.3837263584136963, + "learning_rate": 9.399791943748419e-06, + "loss": 4.5378, + "step": 2505 + }, + { + "epoch": 0.32128, + "grad_norm": 3.248398542404175, + "learning_rate": 9.397394021119441e-06, + "loss": 4.4764, + "step": 2510 + }, + { + "epoch": 0.32192, + "grad_norm": 3.6042652130126953, + "learning_rate": 9.39499162512111e-06, + "loss": 4.5116, + "step": 2515 + }, + { + "epoch": 0.32256, + "grad_norm": 3.6905040740966797, + "learning_rate": 9.39258475819733e-06, + "loss": 4.3691, + "step": 2520 + }, + { + "epoch": 0.3232, + "grad_norm": 3.299175500869751, + "learning_rate": 9.390173422796548e-06, + "loss": 4.4966, + "step": 2525 + }, + { + "epoch": 0.32384, + "grad_norm": 3.312781810760498, + "learning_rate": 9.387757621371765e-06, + "loss": 4.6154, + "step": 2530 + }, + { + "epoch": 0.32448, + "grad_norm": 3.447141408920288, + "learning_rate": 9.38533735638052e-06, + "loss": 4.4373, + "step": 2535 + }, + { + "epoch": 0.32512, + "grad_norm": 3.611647605895996, + "learning_rate": 9.382912630284893e-06, + "loss": 4.5573, + "step": 2540 + }, + { + "epoch": 0.32576, + "grad_norm": 3.256063461303711, + "learning_rate": 9.380483445551503e-06, + "loss": 4.3667, + "step": 2545 + }, + { + "epoch": 0.3264, + "grad_norm": 3.6989307403564453, + "learning_rate": 9.378049804651506e-06, + "loss": 4.5231, + "step": 2550 + }, + { + "epoch": 0.32704, + "grad_norm": 3.1487154960632324, + "learning_rate": 9.37561171006059e-06, + "loss": 4.5691, + "step": 2555 + }, + { + "epoch": 0.32768, + "grad_norm": 3.951996326446533, + "learning_rate": 9.373169164258971e-06, + "loss": 4.6015, + "step": 2560 + }, + { + "epoch": 0.32832, + "grad_norm": 3.4129538536071777, + "learning_rate": 9.370722169731396e-06, + "loss": 4.499, + "step": 2565 + }, + { + "epoch": 0.32896, + "grad_norm": 3.4212002754211426, + "learning_rate": 9.36827072896714e-06, + "loss": 4.531, + "step": 2570 + }, + { + "epoch": 0.3296, + "grad_norm": 3.6318917274475098, + "learning_rate": 9.365814844459994e-06, + "loss": 4.5956, + "step": 2575 + }, + { + "epoch": 0.33024, + "grad_norm": 3.5327017307281494, + "learning_rate": 9.363354518708277e-06, + "loss": 4.506, + "step": 2580 + }, + { + "epoch": 0.33088, + "grad_norm": 3.4287726879119873, + "learning_rate": 9.360889754214823e-06, + "loss": 4.4378, + "step": 2585 + }, + { + "epoch": 0.33152, + "grad_norm": 3.153348207473755, + "learning_rate": 9.358420553486977e-06, + "loss": 4.5132, + "step": 2590 + }, + { + "epoch": 0.33216, + "grad_norm": 3.2982351779937744, + "learning_rate": 9.355946919036605e-06, + "loss": 4.5825, + "step": 2595 + }, + { + "epoch": 0.3328, + "grad_norm": 3.3084805011749268, + "learning_rate": 9.353468853380079e-06, + "loss": 4.4806, + "step": 2600 + }, + { + "epoch": 0.3328, + "eval_loss": 1.1227222681045532, + "eval_runtime": 7.0684, + "eval_samples_per_second": 141.475, + "eval_steps_per_second": 17.684, + "step": 2600 + }, + { + "epoch": 0.33344, + "grad_norm": 3.333228588104248, + "learning_rate": 9.350986359038277e-06, + "loss": 4.5084, + "step": 2605 + }, + { + "epoch": 0.33408, + "grad_norm": 3.350033760070801, + "learning_rate": 9.348499438536585e-06, + "loss": 4.4618, + "step": 2610 + }, + { + "epoch": 0.33472, + "grad_norm": 3.4025425910949707, + "learning_rate": 9.34600809440489e-06, + "loss": 4.531, + "step": 2615 + }, + { + "epoch": 0.33536, + "grad_norm": 3.4806454181671143, + "learning_rate": 9.343512329177582e-06, + "loss": 4.4493, + "step": 2620 + }, + { + "epoch": 0.336, + "grad_norm": 3.3315722942352295, + "learning_rate": 9.341012145393546e-06, + "loss": 4.3422, + "step": 2625 + }, + { + "epoch": 0.33664, + "grad_norm": 3.1910319328308105, + "learning_rate": 9.338507545596162e-06, + "loss": 4.3205, + "step": 2630 + }, + { + "epoch": 0.33728, + "grad_norm": 3.3633666038513184, + "learning_rate": 9.335998532333303e-06, + "loss": 4.3457, + "step": 2635 + }, + { + "epoch": 0.33792, + "grad_norm": 3.2793116569519043, + "learning_rate": 9.333485108157329e-06, + "loss": 4.424, + "step": 2640 + }, + { + "epoch": 0.33856, + "grad_norm": 3.2517988681793213, + "learning_rate": 9.330967275625094e-06, + "loss": 4.4829, + "step": 2645 + }, + { + "epoch": 0.3392, + "grad_norm": 3.5355207920074463, + "learning_rate": 9.328445037297929e-06, + "loss": 4.516, + "step": 2650 + }, + { + "epoch": 0.33984, + "grad_norm": 3.376213312149048, + "learning_rate": 9.32591839574165e-06, + "loss": 4.355, + "step": 2655 + }, + { + "epoch": 0.34048, + "grad_norm": 3.498028516769409, + "learning_rate": 9.323387353526552e-06, + "loss": 4.4594, + "step": 2660 + }, + { + "epoch": 0.34112, + "grad_norm": 3.289701461791992, + "learning_rate": 9.320851913227407e-06, + "loss": 4.3759, + "step": 2665 + }, + { + "epoch": 0.34176, + "grad_norm": 3.2121317386627197, + "learning_rate": 9.318312077423463e-06, + "loss": 4.5469, + "step": 2670 + }, + { + "epoch": 0.3424, + "grad_norm": 3.5415709018707275, + "learning_rate": 9.315767848698435e-06, + "loss": 4.5208, + "step": 2675 + }, + { + "epoch": 0.34304, + "grad_norm": 3.387648820877075, + "learning_rate": 9.313219229640511e-06, + "loss": 4.3834, + "step": 2680 + }, + { + "epoch": 0.34368, + "grad_norm": 3.4357047080993652, + "learning_rate": 9.310666222842343e-06, + "loss": 4.4647, + "step": 2685 + }, + { + "epoch": 0.34432, + "grad_norm": 3.5037624835968018, + "learning_rate": 9.308108830901046e-06, + "loss": 4.4008, + "step": 2690 + }, + { + "epoch": 0.34496, + "grad_norm": 3.3989768028259277, + "learning_rate": 9.305547056418198e-06, + "loss": 4.4072, + "step": 2695 + }, + { + "epoch": 0.3456, + "grad_norm": 3.28924298286438, + "learning_rate": 9.302980901999833e-06, + "loss": 4.4879, + "step": 2700 + }, + { + "epoch": 0.3456, + "eval_loss": 1.1052284240722656, + "eval_runtime": 6.8594, + "eval_samples_per_second": 145.785, + "eval_steps_per_second": 18.223, + "step": 2700 + }, + { + "epoch": 0.34624, + "grad_norm": 3.323194980621338, + "learning_rate": 9.300410370256444e-06, + "loss": 4.39, + "step": 2705 + }, + { + "epoch": 0.34688, + "grad_norm": 3.3750932216644287, + "learning_rate": 9.297835463802972e-06, + "loss": 4.4385, + "step": 2710 + }, + { + "epoch": 0.34752, + "grad_norm": 3.1815054416656494, + "learning_rate": 9.295256185258811e-06, + "loss": 4.6009, + "step": 2715 + }, + { + "epoch": 0.34816, + "grad_norm": 3.5953733921051025, + "learning_rate": 9.292672537247808e-06, + "loss": 4.5078, + "step": 2720 + }, + { + "epoch": 0.3488, + "grad_norm": 3.0168521404266357, + "learning_rate": 9.290084522398243e-06, + "loss": 4.4222, + "step": 2725 + }, + { + "epoch": 0.34944, + "grad_norm": 3.386521816253662, + "learning_rate": 9.287492143342847e-06, + "loss": 4.4376, + "step": 2730 + }, + { + "epoch": 0.35008, + "grad_norm": 3.3437111377716064, + "learning_rate": 9.28489540271879e-06, + "loss": 4.4659, + "step": 2735 + }, + { + "epoch": 0.35072, + "grad_norm": 3.2060422897338867, + "learning_rate": 9.282294303167677e-06, + "loss": 4.5246, + "step": 2740 + }, + { + "epoch": 0.35136, + "grad_norm": 3.335606813430786, + "learning_rate": 9.279688847335545e-06, + "loss": 4.4502, + "step": 2745 + }, + { + "epoch": 0.352, + "grad_norm": 3.2676444053649902, + "learning_rate": 9.27707903787287e-06, + "loss": 4.4734, + "step": 2750 + }, + { + "epoch": 0.35264, + "grad_norm": 3.4844555854797363, + "learning_rate": 9.274464877434548e-06, + "loss": 4.4142, + "step": 2755 + }, + { + "epoch": 0.35328, + "grad_norm": 3.7898640632629395, + "learning_rate": 9.271846368679907e-06, + "loss": 4.3501, + "step": 2760 + }, + { + "epoch": 0.35392, + "grad_norm": 3.182154893875122, + "learning_rate": 9.269223514272697e-06, + "loss": 4.3514, + "step": 2765 + }, + { + "epoch": 0.35456, + "grad_norm": 3.4136388301849365, + "learning_rate": 9.266596316881085e-06, + "loss": 4.5064, + "step": 2770 + }, + { + "epoch": 0.3552, + "grad_norm": 3.0961410999298096, + "learning_rate": 9.263964779177663e-06, + "loss": 4.3653, + "step": 2775 + }, + { + "epoch": 0.35584, + "grad_norm": 3.2388274669647217, + "learning_rate": 9.261328903839434e-06, + "loss": 4.4265, + "step": 2780 + }, + { + "epoch": 0.35648, + "grad_norm": 3.3582630157470703, + "learning_rate": 9.258688693547815e-06, + "loss": 4.4644, + "step": 2785 + }, + { + "epoch": 0.35712, + "grad_norm": 3.393575429916382, + "learning_rate": 9.25604415098863e-06, + "loss": 4.4185, + "step": 2790 + }, + { + "epoch": 0.35776, + "grad_norm": 3.30346941947937, + "learning_rate": 9.253395278852115e-06, + "loss": 4.497, + "step": 2795 + }, + { + "epoch": 0.3584, + "grad_norm": 3.3295273780822754, + "learning_rate": 9.250742079832905e-06, + "loss": 4.3593, + "step": 2800 + }, + { + "epoch": 0.3584, + "eval_loss": 1.1154146194458008, + "eval_runtime": 6.98, + "eval_samples_per_second": 143.267, + "eval_steps_per_second": 17.908, + "step": 2800 + }, + { + "epoch": 0.35904, + "grad_norm": 3.4841437339782715, + "learning_rate": 9.248084556630039e-06, + "loss": 4.5087, + "step": 2805 + }, + { + "epoch": 0.35968, + "grad_norm": 3.5100257396698, + "learning_rate": 9.245422711946959e-06, + "loss": 4.4858, + "step": 2810 + }, + { + "epoch": 0.36032, + "grad_norm": 3.8446364402770996, + "learning_rate": 9.242756548491496e-06, + "loss": 4.3412, + "step": 2815 + }, + { + "epoch": 0.36096, + "grad_norm": 3.121630907058716, + "learning_rate": 9.240086068975878e-06, + "loss": 4.3509, + "step": 2820 + }, + { + "epoch": 0.3616, + "grad_norm": 3.465914726257324, + "learning_rate": 9.237411276116724e-06, + "loss": 4.428, + "step": 2825 + }, + { + "epoch": 0.36224, + "grad_norm": 3.416682243347168, + "learning_rate": 9.234732172635041e-06, + "loss": 4.5354, + "step": 2830 + }, + { + "epoch": 0.36288, + "grad_norm": 3.1812517642974854, + "learning_rate": 9.232048761256218e-06, + "loss": 4.5089, + "step": 2835 + }, + { + "epoch": 0.36352, + "grad_norm": 3.454968214035034, + "learning_rate": 9.22936104471003e-06, + "loss": 4.2911, + "step": 2840 + }, + { + "epoch": 0.36416, + "grad_norm": 3.487147569656372, + "learning_rate": 9.226669025730633e-06, + "loss": 4.3171, + "step": 2845 + }, + { + "epoch": 0.3648, + "grad_norm": 3.4108667373657227, + "learning_rate": 9.22397270705655e-06, + "loss": 4.3087, + "step": 2850 + }, + { + "epoch": 0.36544, + "grad_norm": 3.26461124420166, + "learning_rate": 9.22127209143069e-06, + "loss": 4.4842, + "step": 2855 + }, + { + "epoch": 0.36608, + "grad_norm": 3.8364768028259277, + "learning_rate": 9.21856718160033e-06, + "loss": 4.538, + "step": 2860 + }, + { + "epoch": 0.36672, + "grad_norm": 3.4763057231903076, + "learning_rate": 9.215857980317109e-06, + "loss": 4.4181, + "step": 2865 + }, + { + "epoch": 0.36736, + "grad_norm": 3.5775294303894043, + "learning_rate": 9.213144490337036e-06, + "loss": 4.4466, + "step": 2870 + }, + { + "epoch": 0.368, + "grad_norm": 3.724712610244751, + "learning_rate": 9.210426714420487e-06, + "loss": 4.3985, + "step": 2875 + }, + { + "epoch": 0.36864, + "grad_norm": 3.675708532333374, + "learning_rate": 9.20770465533219e-06, + "loss": 4.4455, + "step": 2880 + }, + { + "epoch": 0.36928, + "grad_norm": 3.481708526611328, + "learning_rate": 9.204978315841238e-06, + "loss": 4.408, + "step": 2885 + }, + { + "epoch": 0.36992, + "grad_norm": 3.3906009197235107, + "learning_rate": 9.20224769872107e-06, + "loss": 4.4277, + "step": 2890 + }, + { + "epoch": 0.37056, + "grad_norm": 3.425987720489502, + "learning_rate": 9.199512806749485e-06, + "loss": 4.3105, + "step": 2895 + }, + { + "epoch": 0.3712, + "grad_norm": 3.183659791946411, + "learning_rate": 9.196773642708623e-06, + "loss": 4.449, + "step": 2900 + }, + { + "epoch": 0.3712, + "eval_loss": 1.1105434894561768, + "eval_runtime": 9.6182, + "eval_samples_per_second": 103.97, + "eval_steps_per_second": 12.996, + "step": 2900 + }, + { + "epoch": 0.37184, + "grad_norm": 3.461367130279541, + "learning_rate": 9.194030209384975e-06, + "loss": 4.5999, + "step": 2905 + }, + { + "epoch": 0.37248, + "grad_norm": 3.140835762023926, + "learning_rate": 9.191282509569375e-06, + "loss": 4.4779, + "step": 2910 + }, + { + "epoch": 0.37312, + "grad_norm": 3.2149133682250977, + "learning_rate": 9.188530546056993e-06, + "loss": 4.4018, + "step": 2915 + }, + { + "epoch": 0.37376, + "grad_norm": 3.6455352306365967, + "learning_rate": 9.185774321647343e-06, + "loss": 4.5308, + "step": 2920 + }, + { + "epoch": 0.3744, + "grad_norm": 3.063474178314209, + "learning_rate": 9.183013839144266e-06, + "loss": 4.4842, + "step": 2925 + }, + { + "epoch": 0.37504, + "grad_norm": 3.5478434562683105, + "learning_rate": 9.18024910135594e-06, + "loss": 4.3344, + "step": 2930 + }, + { + "epoch": 0.37568, + "grad_norm": 3.481499433517456, + "learning_rate": 9.177480111094871e-06, + "loss": 4.4559, + "step": 2935 + }, + { + "epoch": 0.37632, + "grad_norm": 3.8366904258728027, + "learning_rate": 9.174706871177888e-06, + "loss": 4.3211, + "step": 2940 + }, + { + "epoch": 0.37696, + "grad_norm": 3.365673303604126, + "learning_rate": 9.171929384426146e-06, + "loss": 4.2437, + "step": 2945 + }, + { + "epoch": 0.3776, + "grad_norm": 3.6502323150634766, + "learning_rate": 9.16914765366512e-06, + "loss": 4.3133, + "step": 2950 + }, + { + "epoch": 0.37824, + "grad_norm": 4.653753757476807, + "learning_rate": 9.166361681724602e-06, + "loss": 4.3793, + "step": 2955 + }, + { + "epoch": 0.37888, + "grad_norm": 3.373015880584717, + "learning_rate": 9.163571471438696e-06, + "loss": 4.3469, + "step": 2960 + }, + { + "epoch": 0.37952, + "grad_norm": 3.6513311862945557, + "learning_rate": 9.160777025645822e-06, + "loss": 4.2295, + "step": 2965 + }, + { + "epoch": 0.38016, + "grad_norm": 3.158674955368042, + "learning_rate": 9.157978347188706e-06, + "loss": 4.4705, + "step": 2970 + }, + { + "epoch": 0.3808, + "grad_norm": 3.2553975582122803, + "learning_rate": 9.15517543891438e-06, + "loss": 4.3731, + "step": 2975 + }, + { + "epoch": 0.38144, + "grad_norm": 3.6190109252929688, + "learning_rate": 9.152368303674178e-06, + "loss": 4.482, + "step": 2980 + }, + { + "epoch": 0.38208, + "grad_norm": 3.3591349124908447, + "learning_rate": 9.149556944323737e-06, + "loss": 4.4614, + "step": 2985 + }, + { + "epoch": 0.38272, + "grad_norm": 3.4077179431915283, + "learning_rate": 9.146741363722987e-06, + "loss": 4.4401, + "step": 2990 + }, + { + "epoch": 0.38336, + "grad_norm": 3.372915267944336, + "learning_rate": 9.143921564736156e-06, + "loss": 4.4644, + "step": 2995 + }, + { + "epoch": 0.384, + "grad_norm": 3.4412505626678467, + "learning_rate": 9.141097550231762e-06, + "loss": 4.1504, + "step": 3000 + }, + { + "epoch": 0.384, + "eval_loss": 1.101137399673462, + "eval_runtime": 7.1886, + "eval_samples_per_second": 139.11, + "eval_steps_per_second": 17.389, + "step": 3000 + }, + { + "epoch": 0.38464, + "grad_norm": 3.461183786392212, + "learning_rate": 9.13826932308261e-06, + "loss": 4.373, + "step": 3005 + }, + { + "epoch": 0.38528, + "grad_norm": 3.3834474086761475, + "learning_rate": 9.136003710192325e-06, + "loss": 4.3295, + "step": 3010 + }, + { + "epoch": 0.38592, + "grad_norm": 3.351191520690918, + "learning_rate": 9.133167907535756e-06, + "loss": 4.4578, + "step": 3015 + }, + { + "epoch": 0.38656, + "grad_norm": 3.236766815185547, + "learning_rate": 9.130327900301077e-06, + "loss": 4.4021, + "step": 3020 + }, + { + "epoch": 0.3872, + "grad_norm": 3.217759370803833, + "learning_rate": 9.12748369137736e-06, + "loss": 4.3417, + "step": 3025 + }, + { + "epoch": 0.38784, + "grad_norm": 3.5346148014068604, + "learning_rate": 9.124635283657956e-06, + "loss": 4.5156, + "step": 3030 + }, + { + "epoch": 0.38848, + "grad_norm": 3.3899123668670654, + "learning_rate": 9.121782680040487e-06, + "loss": 4.3682, + "step": 3035 + }, + { + "epoch": 0.38912, + "grad_norm": 3.3264894485473633, + "learning_rate": 9.11892588342684e-06, + "loss": 4.5256, + "step": 3040 + }, + { + "epoch": 0.38976, + "grad_norm": 3.329000473022461, + "learning_rate": 9.116064896723167e-06, + "loss": 4.4699, + "step": 3045 + }, + { + "epoch": 0.3904, + "grad_norm": 3.4215424060821533, + "learning_rate": 9.113199722839889e-06, + "loss": 4.4322, + "step": 3050 + }, + { + "epoch": 0.39104, + "grad_norm": 3.363974094390869, + "learning_rate": 9.110330364691682e-06, + "loss": 4.4147, + "step": 3055 + }, + { + "epoch": 0.39168, + "grad_norm": 3.2653472423553467, + "learning_rate": 9.10745682519748e-06, + "loss": 4.4322, + "step": 3060 + }, + { + "epoch": 0.39232, + "grad_norm": 3.5401949882507324, + "learning_rate": 9.104579107280465e-06, + "loss": 4.3729, + "step": 3065 + }, + { + "epoch": 0.39296, + "grad_norm": 3.2343528270721436, + "learning_rate": 9.101697213868079e-06, + "loss": 4.4719, + "step": 3070 + }, + { + "epoch": 0.3936, + "grad_norm": 3.2686214447021484, + "learning_rate": 9.098811147892004e-06, + "loss": 4.4189, + "step": 3075 + }, + { + "epoch": 0.39424, + "grad_norm": 3.325529098510742, + "learning_rate": 9.095920912288173e-06, + "loss": 4.2017, + "step": 3080 + }, + { + "epoch": 0.39488, + "grad_norm": 3.393070936203003, + "learning_rate": 9.093026509996752e-06, + "loss": 4.3442, + "step": 3085 + }, + { + "epoch": 0.39552, + "grad_norm": 3.3179240226745605, + "learning_rate": 9.090127943962156e-06, + "loss": 4.3873, + "step": 3090 + }, + { + "epoch": 0.39616, + "grad_norm": 3.1389029026031494, + "learning_rate": 9.087225217133029e-06, + "loss": 4.3073, + "step": 3095 + }, + { + "epoch": 0.3968, + "grad_norm": 3.4911952018737793, + "learning_rate": 9.084318332462247e-06, + "loss": 4.4417, + "step": 3100 + }, + { + "epoch": 0.3968, + "eval_loss": 1.0903624296188354, + "eval_runtime": 6.757, + "eval_samples_per_second": 147.995, + "eval_steps_per_second": 18.499, + "step": 3100 + }, + { + "epoch": 0.39744, + "grad_norm": 3.211641550064087, + "learning_rate": 9.08140729290692e-06, + "loss": 4.2307, + "step": 3105 + }, + { + "epoch": 0.39808, + "grad_norm": 3.5543551445007324, + "learning_rate": 9.078492101428381e-06, + "loss": 4.4694, + "step": 3110 + }, + { + "epoch": 0.39872, + "grad_norm": 4.135831832885742, + "learning_rate": 9.075572760992193e-06, + "loss": 4.4585, + "step": 3115 + }, + { + "epoch": 0.39936, + "grad_norm": 3.209195613861084, + "learning_rate": 9.07264927456813e-06, + "loss": 4.4306, + "step": 3120 + }, + { + "epoch": 0.4, + "grad_norm": 3.220716714859009, + "learning_rate": 9.06972164513019e-06, + "loss": 4.5434, + "step": 3125 + }, + { + "epoch": 0.40064, + "grad_norm": 3.3828017711639404, + "learning_rate": 9.066789875656583e-06, + "loss": 4.2933, + "step": 3130 + }, + { + "epoch": 0.40128, + "grad_norm": 3.2086288928985596, + "learning_rate": 9.063853969129734e-06, + "loss": 4.2783, + "step": 3135 + }, + { + "epoch": 0.40192, + "grad_norm": 3.4894447326660156, + "learning_rate": 9.060913928536272e-06, + "loss": 4.4412, + "step": 3140 + }, + { + "epoch": 0.40256, + "grad_norm": 3.0965099334716797, + "learning_rate": 9.057969756867036e-06, + "loss": 4.2927, + "step": 3145 + }, + { + "epoch": 0.4032, + "grad_norm": 3.0679054260253906, + "learning_rate": 9.055021457117064e-06, + "loss": 4.3709, + "step": 3150 + }, + { + "epoch": 0.40384, + "grad_norm": 3.4988932609558105, + "learning_rate": 9.052069032285594e-06, + "loss": 4.2851, + "step": 3155 + }, + { + "epoch": 0.40448, + "grad_norm": 3.3882834911346436, + "learning_rate": 9.04911248537606e-06, + "loss": 4.4369, + "step": 3160 + }, + { + "epoch": 0.40512, + "grad_norm": 3.63289737701416, + "learning_rate": 9.046151819396094e-06, + "loss": 4.4555, + "step": 3165 + }, + { + "epoch": 0.40576, + "grad_norm": 3.321216106414795, + "learning_rate": 9.04318703735751e-06, + "loss": 4.4082, + "step": 3170 + }, + { + "epoch": 0.4064, + "grad_norm": 3.799292802810669, + "learning_rate": 9.040218142276318e-06, + "loss": 4.3891, + "step": 3175 + }, + { + "epoch": 0.40704, + "grad_norm": 3.191375255584717, + "learning_rate": 9.037245137172703e-06, + "loss": 4.2423, + "step": 3180 + }, + { + "epoch": 0.40768, + "grad_norm": 3.5997314453125, + "learning_rate": 9.03426802507104e-06, + "loss": 4.3708, + "step": 3185 + }, + { + "epoch": 0.40832, + "grad_norm": 3.4215621948242188, + "learning_rate": 9.031286808999875e-06, + "loss": 4.3509, + "step": 3190 + }, + { + "epoch": 0.40896, + "grad_norm": 3.418410539627075, + "learning_rate": 9.028301491991932e-06, + "loss": 4.3573, + "step": 3195 + }, + { + "epoch": 0.4096, + "grad_norm": 3.293621778488159, + "learning_rate": 9.025312077084109e-06, + "loss": 4.2765, + "step": 3200 + }, + { + "epoch": 0.4096, + "eval_loss": 1.0897722244262695, + "eval_runtime": 7.2579, + "eval_samples_per_second": 137.781, + "eval_steps_per_second": 17.223, + "step": 3200 + }, + { + "epoch": 0.41024, + "grad_norm": 3.231015205383301, + "learning_rate": 9.022318567317468e-06, + "loss": 4.4425, + "step": 3205 + }, + { + "epoch": 0.41088, + "grad_norm": 3.2523105144500732, + "learning_rate": 9.019320965737237e-06, + "loss": 4.3704, + "step": 3210 + }, + { + "epoch": 0.41152, + "grad_norm": 3.635181188583374, + "learning_rate": 9.01631927539281e-06, + "loss": 4.2446, + "step": 3215 + }, + { + "epoch": 0.41216, + "grad_norm": 5.132925033569336, + "learning_rate": 9.01331349933774e-06, + "loss": 4.2925, + "step": 3220 + }, + { + "epoch": 0.4128, + "grad_norm": 3.6050596237182617, + "learning_rate": 9.010303640629733e-06, + "loss": 4.3073, + "step": 3225 + }, + { + "epoch": 0.41344, + "grad_norm": 3.5942418575286865, + "learning_rate": 9.007289702330649e-06, + "loss": 4.2469, + "step": 3230 + }, + { + "epoch": 0.41408, + "grad_norm": 3.1797220706939697, + "learning_rate": 9.004271687506503e-06, + "loss": 4.2212, + "step": 3235 + }, + { + "epoch": 0.41472, + "grad_norm": 3.2658779621124268, + "learning_rate": 9.001249599227448e-06, + "loss": 4.2773, + "step": 3240 + }, + { + "epoch": 0.41536, + "grad_norm": 3.512411117553711, + "learning_rate": 8.998223440567792e-06, + "loss": 4.376, + "step": 3245 + }, + { + "epoch": 0.416, + "grad_norm": 3.5214426517486572, + "learning_rate": 8.995193214605972e-06, + "loss": 4.3563, + "step": 3250 + }, + { + "epoch": 0.41664, + "grad_norm": 3.3954243659973145, + "learning_rate": 8.992158924424572e-06, + "loss": 4.4415, + "step": 3255 + }, + { + "epoch": 0.41728, + "grad_norm": 3.4646153450012207, + "learning_rate": 8.989120573110307e-06, + "loss": 4.3228, + "step": 3260 + }, + { + "epoch": 0.41792, + "grad_norm": 3.5404181480407715, + "learning_rate": 8.986078163754017e-06, + "loss": 4.3483, + "step": 3265 + }, + { + "epoch": 0.41856, + "grad_norm": 3.178579092025757, + "learning_rate": 8.983031699450683e-06, + "loss": 4.3122, + "step": 3270 + }, + { + "epoch": 0.4192, + "grad_norm": 3.556442975997925, + "learning_rate": 8.979981183299402e-06, + "loss": 4.3761, + "step": 3275 + }, + { + "epoch": 0.41984, + "grad_norm": 12.68402099609375, + "learning_rate": 8.976926618403395e-06, + "loss": 4.2578, + "step": 3280 + }, + { + "epoch": 0.42048, + "grad_norm": 3.3316643238067627, + "learning_rate": 8.973868007870001e-06, + "loss": 4.3723, + "step": 3285 + }, + { + "epoch": 0.42112, + "grad_norm": 3.6438074111938477, + "learning_rate": 8.970805354810676e-06, + "loss": 4.3699, + "step": 3290 + }, + { + "epoch": 0.42176, + "grad_norm": 2.941915988922119, + "learning_rate": 8.967738662340985e-06, + "loss": 4.2281, + "step": 3295 + }, + { + "epoch": 0.4224, + "grad_norm": 3.2409346103668213, + "learning_rate": 8.96466793358061e-06, + "loss": 4.3198, + "step": 3300 + }, + { + "epoch": 0.4224, + "eval_loss": 1.0934110879898071, + "eval_runtime": 7.852, + "eval_samples_per_second": 127.356, + "eval_steps_per_second": 15.92, + "step": 3300 + }, + { + "epoch": 0.42304, + "grad_norm": 3.6601648330688477, + "learning_rate": 8.961593171653329e-06, + "loss": 4.4013, + "step": 3305 + }, + { + "epoch": 0.42368, + "grad_norm": 3.4598097801208496, + "learning_rate": 8.95851437968703e-06, + "loss": 4.3119, + "step": 3310 + }, + { + "epoch": 0.42432, + "grad_norm": 6.440554141998291, + "learning_rate": 8.955431560813698e-06, + "loss": 4.2568, + "step": 3315 + }, + { + "epoch": 0.42496, + "grad_norm": 3.2681210041046143, + "learning_rate": 8.952344718169415e-06, + "loss": 4.4502, + "step": 3320 + }, + { + "epoch": 0.4256, + "grad_norm": 3.3809800148010254, + "learning_rate": 8.949253854894356e-06, + "loss": 4.3411, + "step": 3325 + }, + { + "epoch": 0.42624, + "grad_norm": 3.58083438873291, + "learning_rate": 8.946158974132783e-06, + "loss": 4.2869, + "step": 3330 + }, + { + "epoch": 0.42688, + "grad_norm": 3.4199907779693604, + "learning_rate": 8.943060079033054e-06, + "loss": 4.3011, + "step": 3335 + }, + { + "epoch": 0.42752, + "grad_norm": 3.229564905166626, + "learning_rate": 8.939957172747602e-06, + "loss": 4.3878, + "step": 3340 + }, + { + "epoch": 0.42816, + "grad_norm": 3.4797444343566895, + "learning_rate": 8.936850258432943e-06, + "loss": 4.3084, + "step": 3345 + }, + { + "epoch": 0.4288, + "grad_norm": 3.4006948471069336, + "learning_rate": 8.933739339249669e-06, + "loss": 4.3248, + "step": 3350 + }, + { + "epoch": 0.42944, + "grad_norm": 3.5028109550476074, + "learning_rate": 8.930624418362452e-06, + "loss": 4.1799, + "step": 3355 + }, + { + "epoch": 0.43008, + "grad_norm": 3.072166681289673, + "learning_rate": 8.927505498940027e-06, + "loss": 4.3052, + "step": 3360 + }, + { + "epoch": 0.43072, + "grad_norm": 3.346359968185425, + "learning_rate": 8.9243825841552e-06, + "loss": 4.4108, + "step": 3365 + }, + { + "epoch": 0.43136, + "grad_norm": 3.1402132511138916, + "learning_rate": 8.921255677184844e-06, + "loss": 4.4025, + "step": 3370 + }, + { + "epoch": 0.432, + "grad_norm": 3.7172560691833496, + "learning_rate": 8.918124781209889e-06, + "loss": 4.426, + "step": 3375 + }, + { + "epoch": 0.43264, + "grad_norm": 3.6983797550201416, + "learning_rate": 8.914989899415323e-06, + "loss": 4.4391, + "step": 3380 + }, + { + "epoch": 0.43328, + "grad_norm": 3.75880765914917, + "learning_rate": 8.911851034990194e-06, + "loss": 4.4118, + "step": 3385 + }, + { + "epoch": 0.43392, + "grad_norm": 3.346653699874878, + "learning_rate": 8.908708191127596e-06, + "loss": 4.3974, + "step": 3390 + }, + { + "epoch": 0.43456, + "grad_norm": 3.2262954711914062, + "learning_rate": 8.90556137102467e-06, + "loss": 4.3007, + "step": 3395 + }, + { + "epoch": 0.4352, + "grad_norm": 3.443249225616455, + "learning_rate": 8.90241057788261e-06, + "loss": 4.3532, + "step": 3400 + }, + { + "epoch": 0.4352, + "eval_loss": 1.0824164152145386, + "eval_runtime": 8.7963, + "eval_samples_per_second": 113.684, + "eval_steps_per_second": 14.21, + "step": 3400 + }, + { + "epoch": 0.43584, + "grad_norm": 3.880614757537842, + "learning_rate": 8.899255814906643e-06, + "loss": 4.3109, + "step": 3405 + }, + { + "epoch": 0.43648, + "grad_norm": 3.4008679389953613, + "learning_rate": 8.896097085306036e-06, + "loss": 4.3569, + "step": 3410 + }, + { + "epoch": 0.43712, + "grad_norm": 3.386129856109619, + "learning_rate": 8.8929343922941e-06, + "loss": 4.4246, + "step": 3415 + }, + { + "epoch": 0.43776, + "grad_norm": 3.3957948684692383, + "learning_rate": 8.889767739088165e-06, + "loss": 4.3674, + "step": 3420 + }, + { + "epoch": 0.4384, + "grad_norm": 3.2077996730804443, + "learning_rate": 8.886597128909598e-06, + "loss": 4.1689, + "step": 3425 + }, + { + "epoch": 0.43904, + "grad_norm": 2.9480648040771484, + "learning_rate": 8.883422564983789e-06, + "loss": 4.3577, + "step": 3430 + }, + { + "epoch": 0.43968, + "grad_norm": 3.6000452041625977, + "learning_rate": 8.880244050540147e-06, + "loss": 4.2518, + "step": 3435 + }, + { + "epoch": 0.44032, + "grad_norm": 3.707974672317505, + "learning_rate": 8.877061588812107e-06, + "loss": 4.2916, + "step": 3440 + }, + { + "epoch": 0.44096, + "grad_norm": 3.2383322715759277, + "learning_rate": 8.873875183037115e-06, + "loss": 4.2048, + "step": 3445 + }, + { + "epoch": 0.4416, + "grad_norm": 3.4959189891815186, + "learning_rate": 8.870684836456625e-06, + "loss": 4.2896, + "step": 3450 + }, + { + "epoch": 0.44224, + "grad_norm": 3.520314931869507, + "learning_rate": 8.867490552316109e-06, + "loss": 4.3005, + "step": 3455 + }, + { + "epoch": 0.44288, + "grad_norm": 3.2967355251312256, + "learning_rate": 8.864292333865037e-06, + "loss": 4.2496, + "step": 3460 + }, + { + "epoch": 0.44352, + "grad_norm": 3.222911834716797, + "learning_rate": 8.861090184356887e-06, + "loss": 4.2884, + "step": 3465 + }, + { + "epoch": 0.44416, + "grad_norm": 3.0785837173461914, + "learning_rate": 8.857884107049128e-06, + "loss": 4.2479, + "step": 3470 + }, + { + "epoch": 0.4448, + "grad_norm": 3.3931188583374023, + "learning_rate": 8.854674105203236e-06, + "loss": 4.2806, + "step": 3475 + }, + { + "epoch": 0.44544, + "grad_norm": 3.117079257965088, + "learning_rate": 8.85146018208467e-06, + "loss": 4.309, + "step": 3480 + }, + { + "epoch": 0.44608, + "grad_norm": 3.419799566268921, + "learning_rate": 8.848242340962882e-06, + "loss": 4.2736, + "step": 3485 + }, + { + "epoch": 0.44672, + "grad_norm": 3.147315740585327, + "learning_rate": 8.845020585111307e-06, + "loss": 4.4178, + "step": 3490 + }, + { + "epoch": 0.44736, + "grad_norm": 3.2093615531921387, + "learning_rate": 8.841794917807369e-06, + "loss": 4.2141, + "step": 3495 + }, + { + "epoch": 0.448, + "grad_norm": 3.126490354537964, + "learning_rate": 8.838565342332462e-06, + "loss": 4.3183, + "step": 3500 + }, + { + "epoch": 0.448, + "eval_loss": 1.0745737552642822, + "eval_runtime": 6.765, + "eval_samples_per_second": 147.82, + "eval_steps_per_second": 18.478, + "step": 3500 + }, + { + "epoch": 0.44864, + "grad_norm": 3.4378812313079834, + "learning_rate": 8.83533186197196e-06, + "loss": 4.3091, + "step": 3505 + }, + { + "epoch": 0.44928, + "grad_norm": 3.225221633911133, + "learning_rate": 8.832094480015211e-06, + "loss": 4.2099, + "step": 3510 + }, + { + "epoch": 0.44992, + "grad_norm": 3.491705894470215, + "learning_rate": 8.82885319975553e-06, + "loss": 4.3552, + "step": 3515 + }, + { + "epoch": 0.45056, + "grad_norm": 3.602252960205078, + "learning_rate": 8.825608024490198e-06, + "loss": 4.3082, + "step": 3520 + }, + { + "epoch": 0.4512, + "grad_norm": 3.412418842315674, + "learning_rate": 8.822358957520459e-06, + "loss": 4.1762, + "step": 3525 + }, + { + "epoch": 0.45184, + "grad_norm": 3.254765272140503, + "learning_rate": 8.819106002151513e-06, + "loss": 4.334, + "step": 3530 + }, + { + "epoch": 0.45248, + "grad_norm": 3.0931169986724854, + "learning_rate": 8.81584916169252e-06, + "loss": 4.1329, + "step": 3535 + }, + { + "epoch": 0.45312, + "grad_norm": 3.564138889312744, + "learning_rate": 8.812588439456588e-06, + "loss": 4.157, + "step": 3540 + }, + { + "epoch": 0.45376, + "grad_norm": 3.654914140701294, + "learning_rate": 8.809323838760778e-06, + "loss": 4.3768, + "step": 3545 + }, + { + "epoch": 0.4544, + "grad_norm": 3.495839834213257, + "learning_rate": 8.806055362926093e-06, + "loss": 4.224, + "step": 3550 + }, + { + "epoch": 0.45504, + "grad_norm": 3.4077813625335693, + "learning_rate": 8.802783015277483e-06, + "loss": 4.3817, + "step": 3555 + }, + { + "epoch": 0.45568, + "grad_norm": 3.136080741882324, + "learning_rate": 8.799506799143826e-06, + "loss": 4.2902, + "step": 3560 + }, + { + "epoch": 0.45632, + "grad_norm": 3.2735695838928223, + "learning_rate": 8.79622671785795e-06, + "loss": 4.1645, + "step": 3565 + }, + { + "epoch": 0.45696, + "grad_norm": 3.584965944290161, + "learning_rate": 8.792942774756602e-06, + "loss": 4.4454, + "step": 3570 + }, + { + "epoch": 0.4576, + "grad_norm": 3.2631680965423584, + "learning_rate": 8.789654973180465e-06, + "loss": 4.349, + "step": 3575 + }, + { + "epoch": 0.45824, + "grad_norm": 3.2688302993774414, + "learning_rate": 8.786363316474147e-06, + "loss": 4.3671, + "step": 3580 + }, + { + "epoch": 0.45888, + "grad_norm": 4.0966572761535645, + "learning_rate": 8.783067807986172e-06, + "loss": 4.3114, + "step": 3585 + }, + { + "epoch": 0.45952, + "grad_norm": 3.381613254547119, + "learning_rate": 8.779768451068988e-06, + "loss": 4.2705, + "step": 3590 + }, + { + "epoch": 0.46016, + "grad_norm": 3.2401769161224365, + "learning_rate": 8.776465249078958e-06, + "loss": 4.3132, + "step": 3595 + }, + { + "epoch": 0.4608, + "grad_norm": 3.099407911300659, + "learning_rate": 8.773158205376351e-06, + "loss": 4.1963, + "step": 3600 + }, + { + "epoch": 0.4608, + "eval_loss": 1.0701223611831665, + "eval_runtime": 6.7989, + "eval_samples_per_second": 147.084, + "eval_steps_per_second": 18.385, + "step": 3600 + }, + { + "epoch": 0.46144, + "grad_norm": 3.2464311122894287, + "learning_rate": 8.76984732332535e-06, + "loss": 4.309, + "step": 3605 + }, + { + "epoch": 0.46208, + "grad_norm": 3.6688852310180664, + "learning_rate": 8.76653260629404e-06, + "loss": 4.2009, + "step": 3610 + }, + { + "epoch": 0.46272, + "grad_norm": 3.16396164894104, + "learning_rate": 8.763214057654405e-06, + "loss": 4.3162, + "step": 3615 + }, + { + "epoch": 0.46336, + "grad_norm": 3.2529261112213135, + "learning_rate": 8.759891680782336e-06, + "loss": 4.3083, + "step": 3620 + }, + { + "epoch": 0.464, + "grad_norm": 3.212313175201416, + "learning_rate": 8.756565479057604e-06, + "loss": 4.3336, + "step": 3625 + }, + { + "epoch": 0.46464, + "grad_norm": 3.034917116165161, + "learning_rate": 8.753235455863883e-06, + "loss": 4.3935, + "step": 3630 + }, + { + "epoch": 0.46528, + "grad_norm": 3.362323045730591, + "learning_rate": 8.749901614588728e-06, + "loss": 4.1964, + "step": 3635 + }, + { + "epoch": 0.46592, + "grad_norm": 3.1814639568328857, + "learning_rate": 8.746563958623584e-06, + "loss": 4.3555, + "step": 3640 + }, + { + "epoch": 0.46656, + "grad_norm": 3.348071336746216, + "learning_rate": 8.743222491363767e-06, + "loss": 4.3297, + "step": 3645 + }, + { + "epoch": 0.4672, + "grad_norm": 3.4134304523468018, + "learning_rate": 8.739877216208483e-06, + "loss": 4.1711, + "step": 3650 + }, + { + "epoch": 0.46784, + "grad_norm": 3.3224220275878906, + "learning_rate": 8.736528136560798e-06, + "loss": 4.4583, + "step": 3655 + }, + { + "epoch": 0.46848, + "grad_norm": 3.0759694576263428, + "learning_rate": 8.73317525582766e-06, + "loss": 4.1875, + "step": 3660 + }, + { + "epoch": 0.46912, + "grad_norm": 3.429316997528076, + "learning_rate": 8.729818577419875e-06, + "loss": 4.2585, + "step": 3665 + }, + { + "epoch": 0.46976, + "grad_norm": 3.197808265686035, + "learning_rate": 8.72645810475212e-06, + "loss": 4.3153, + "step": 3670 + }, + { + "epoch": 0.4704, + "grad_norm": 3.236347198486328, + "learning_rate": 8.723093841242922e-06, + "loss": 4.2445, + "step": 3675 + }, + { + "epoch": 0.47104, + "grad_norm": 3.3201181888580322, + "learning_rate": 8.719725790314675e-06, + "loss": 4.2265, + "step": 3680 + }, + { + "epoch": 0.47168, + "grad_norm": 3.4637041091918945, + "learning_rate": 8.716353955393618e-06, + "loss": 4.1416, + "step": 3685 + }, + { + "epoch": 0.47232, + "grad_norm": 3.4432976245880127, + "learning_rate": 8.712978339909845e-06, + "loss": 4.2733, + "step": 3690 + }, + { + "epoch": 0.47296, + "grad_norm": 3.350703239440918, + "learning_rate": 8.709598947297291e-06, + "loss": 4.2555, + "step": 3695 + }, + { + "epoch": 0.4736, + "grad_norm": 3.470686197280884, + "learning_rate": 8.706215780993735e-06, + "loss": 4.2987, + "step": 3700 + }, + { + "epoch": 0.4736, + "eval_loss": 1.0674490928649902, + "eval_runtime": 6.691, + "eval_samples_per_second": 149.454, + "eval_steps_per_second": 18.682, + "step": 3700 + }, + { + "epoch": 0.47424, + "grad_norm": 3.153869390487671, + "learning_rate": 8.702828844440798e-06, + "loss": 4.1894, + "step": 3705 + }, + { + "epoch": 0.47488, + "grad_norm": 3.3305816650390625, + "learning_rate": 8.699438141083933e-06, + "loss": 4.3086, + "step": 3710 + }, + { + "epoch": 0.47552, + "grad_norm": 3.200063705444336, + "learning_rate": 8.696043674372424e-06, + "loss": 4.1944, + "step": 3715 + }, + { + "epoch": 0.47616, + "grad_norm": 3.437494993209839, + "learning_rate": 8.692645447759387e-06, + "loss": 4.219, + "step": 3720 + }, + { + "epoch": 0.4768, + "grad_norm": 3.2587502002716064, + "learning_rate": 8.68924346470176e-06, + "loss": 4.1808, + "step": 3725 + }, + { + "epoch": 0.47744, + "grad_norm": 3.0455007553100586, + "learning_rate": 8.685837728660305e-06, + "loss": 4.142, + "step": 3730 + }, + { + "epoch": 0.47808, + "grad_norm": 3.3263583183288574, + "learning_rate": 8.6824282430996e-06, + "loss": 4.2156, + "step": 3735 + }, + { + "epoch": 0.47872, + "grad_norm": 3.3989768028259277, + "learning_rate": 8.679015011488032e-06, + "loss": 4.2058, + "step": 3740 + }, + { + "epoch": 0.47936, + "grad_norm": 3.58839750289917, + "learning_rate": 8.675598037297812e-06, + "loss": 4.2954, + "step": 3745 + }, + { + "epoch": 0.48, + "grad_norm": 3.5100045204162598, + "learning_rate": 8.672177324004946e-06, + "loss": 4.3007, + "step": 3750 + }, + { + "epoch": 0.48064, + "grad_norm": 3.6057488918304443, + "learning_rate": 8.668752875089248e-06, + "loss": 4.4091, + "step": 3755 + }, + { + "epoch": 0.48128, + "grad_norm": 3.4816482067108154, + "learning_rate": 8.665324694034335e-06, + "loss": 4.2736, + "step": 3760 + }, + { + "epoch": 0.48192, + "grad_norm": 3.406355857849121, + "learning_rate": 8.661892784327616e-06, + "loss": 4.3212, + "step": 3765 + }, + { + "epoch": 0.48256, + "grad_norm": 3.6660380363464355, + "learning_rate": 8.658457149460296e-06, + "loss": 4.1134, + "step": 3770 + }, + { + "epoch": 0.4832, + "grad_norm": 3.302281618118286, + "learning_rate": 8.655017792927367e-06, + "loss": 4.2658, + "step": 3775 + }, + { + "epoch": 0.48384, + "grad_norm": 3.288099765777588, + "learning_rate": 8.65157471822761e-06, + "loss": 4.298, + "step": 3780 + }, + { + "epoch": 0.48448, + "grad_norm": 4.572479248046875, + "learning_rate": 8.648127928863586e-06, + "loss": 4.2769, + "step": 3785 + }, + { + "epoch": 0.48512, + "grad_norm": 3.6005682945251465, + "learning_rate": 8.644677428341637e-06, + "loss": 4.2542, + "step": 3790 + }, + { + "epoch": 0.48576, + "grad_norm": 3.3716723918914795, + "learning_rate": 8.641223220171877e-06, + "loss": 4.2514, + "step": 3795 + }, + { + "epoch": 0.4864, + "grad_norm": 3.3423805236816406, + "learning_rate": 8.637765307868197e-06, + "loss": 4.1449, + "step": 3800 + }, + { + "epoch": 0.4864, + "eval_loss": 1.071823239326477, + "eval_runtime": 7.6566, + "eval_samples_per_second": 130.606, + "eval_steps_per_second": 16.326, + "step": 3800 + }, + { + "epoch": 0.48704, + "grad_norm": 3.4172847270965576, + "learning_rate": 8.634303694948249e-06, + "loss": 4.1549, + "step": 3805 + }, + { + "epoch": 0.48768, + "grad_norm": 3.3541908264160156, + "learning_rate": 8.630838384933456e-06, + "loss": 4.5025, + "step": 3810 + }, + { + "epoch": 0.48832, + "grad_norm": 3.3319921493530273, + "learning_rate": 8.627369381349e-06, + "loss": 4.2343, + "step": 3815 + }, + { + "epoch": 0.48896, + "grad_norm": 3.2183337211608887, + "learning_rate": 8.623896687723817e-06, + "loss": 4.209, + "step": 3820 + }, + { + "epoch": 0.4896, + "grad_norm": 3.541289806365967, + "learning_rate": 8.6204203075906e-06, + "loss": 4.2734, + "step": 3825 + }, + { + "epoch": 0.49024, + "grad_norm": 3.3665432929992676, + "learning_rate": 8.616940244485794e-06, + "loss": 4.3104, + "step": 3830 + }, + { + "epoch": 0.49088, + "grad_norm": 3.4722721576690674, + "learning_rate": 8.61345650194959e-06, + "loss": 4.2407, + "step": 3835 + }, + { + "epoch": 0.49152, + "grad_norm": 3.581308126449585, + "learning_rate": 8.609969083525913e-06, + "loss": 4.2074, + "step": 3840 + }, + { + "epoch": 0.49216, + "grad_norm": 4.030796051025391, + "learning_rate": 8.606477992762442e-06, + "loss": 4.2008, + "step": 3845 + }, + { + "epoch": 0.4928, + "grad_norm": 3.2765376567840576, + "learning_rate": 8.602983233210582e-06, + "loss": 4.2747, + "step": 3850 + }, + { + "epoch": 0.49344, + "grad_norm": 3.3987598419189453, + "learning_rate": 8.599484808425471e-06, + "loss": 4.3367, + "step": 3855 + }, + { + "epoch": 0.49408, + "grad_norm": 3.40262508392334, + "learning_rate": 8.59598272196598e-06, + "loss": 4.2551, + "step": 3860 + }, + { + "epoch": 0.49472, + "grad_norm": 3.381237268447876, + "learning_rate": 8.592476977394703e-06, + "loss": 4.2735, + "step": 3865 + }, + { + "epoch": 0.49536, + "grad_norm": 3.3735103607177734, + "learning_rate": 8.588967578277952e-06, + "loss": 4.2059, + "step": 3870 + }, + { + "epoch": 0.496, + "grad_norm": 3.5227203369140625, + "learning_rate": 8.585454528185758e-06, + "loss": 4.2452, + "step": 3875 + }, + { + "epoch": 0.49664, + "grad_norm": 3.357236623764038, + "learning_rate": 8.58193783069187e-06, + "loss": 4.4393, + "step": 3880 + }, + { + "epoch": 0.49728, + "grad_norm": 3.4981918334960938, + "learning_rate": 8.578417489373747e-06, + "loss": 4.3561, + "step": 3885 + }, + { + "epoch": 0.49792, + "grad_norm": 3.3115687370300293, + "learning_rate": 8.574893507812548e-06, + "loss": 4.2126, + "step": 3890 + }, + { + "epoch": 0.49856, + "grad_norm": 3.4311251640319824, + "learning_rate": 8.571365889593139e-06, + "loss": 4.2719, + "step": 3895 + }, + { + "epoch": 0.4992, + "grad_norm": 3.6359755992889404, + "learning_rate": 8.56783463830409e-06, + "loss": 4.2107, + "step": 3900 + }, + { + "epoch": 0.4992, + "eval_loss": 1.0585970878601074, + "eval_runtime": 6.76, + "eval_samples_per_second": 147.928, + "eval_steps_per_second": 18.491, + "step": 3900 + }, + { + "epoch": 0.49984, + "grad_norm": 3.3010311126708984, + "learning_rate": 8.564299757537663e-06, + "loss": 4.1933, + "step": 3905 + }, + { + "epoch": 0.50048, + "grad_norm": 3.1731021404266357, + "learning_rate": 8.560761250889808e-06, + "loss": 4.2303, + "step": 3910 + }, + { + "epoch": 0.50112, + "grad_norm": 3.087322950363159, + "learning_rate": 8.557219121960173e-06, + "loss": 4.1189, + "step": 3915 + }, + { + "epoch": 0.50176, + "grad_norm": 3.6227645874023438, + "learning_rate": 8.553673374352081e-06, + "loss": 4.438, + "step": 3920 + }, + { + "epoch": 0.5024, + "grad_norm": 3.2071166038513184, + "learning_rate": 8.550124011672543e-06, + "loss": 4.2284, + "step": 3925 + }, + { + "epoch": 0.50304, + "grad_norm": 3.6163535118103027, + "learning_rate": 8.546571037532244e-06, + "loss": 4.2896, + "step": 3930 + }, + { + "epoch": 0.50368, + "grad_norm": 3.202073335647583, + "learning_rate": 8.543014455545545e-06, + "loss": 4.1561, + "step": 3935 + }, + { + "epoch": 0.50432, + "grad_norm": 3.4429636001586914, + "learning_rate": 8.539454269330476e-06, + "loss": 4.1965, + "step": 3940 + }, + { + "epoch": 0.50496, + "grad_norm": 3.5354838371276855, + "learning_rate": 8.535890482508735e-06, + "loss": 4.2258, + "step": 3945 + }, + { + "epoch": 0.5056, + "grad_norm": 4.535614967346191, + "learning_rate": 8.532323098705679e-06, + "loss": 4.2261, + "step": 3950 + }, + { + "epoch": 0.50624, + "grad_norm": 3.529849052429199, + "learning_rate": 8.52875212155033e-06, + "loss": 4.2718, + "step": 3955 + }, + { + "epoch": 0.50688, + "grad_norm": 3.2982187271118164, + "learning_rate": 8.525177554675361e-06, + "loss": 4.0328, + "step": 3960 + }, + { + "epoch": 0.50752, + "grad_norm": 3.3150134086608887, + "learning_rate": 8.521599401717095e-06, + "loss": 4.1423, + "step": 3965 + }, + { + "epoch": 0.50816, + "grad_norm": 3.494948387145996, + "learning_rate": 8.51801766631551e-06, + "loss": 4.2847, + "step": 3970 + }, + { + "epoch": 0.5088, + "grad_norm": 3.396127700805664, + "learning_rate": 8.514432352114224e-06, + "loss": 4.4652, + "step": 3975 + }, + { + "epoch": 0.50944, + "grad_norm": 3.2317397594451904, + "learning_rate": 8.510843462760494e-06, + "loss": 4.2053, + "step": 3980 + }, + { + "epoch": 0.51008, + "grad_norm": 3.7028400897979736, + "learning_rate": 8.507251001905216e-06, + "loss": 4.2045, + "step": 3985 + }, + { + "epoch": 0.51072, + "grad_norm": 3.373598337173462, + "learning_rate": 8.50365497320292e-06, + "loss": 4.0433, + "step": 3990 + }, + { + "epoch": 0.51136, + "grad_norm": 3.230725049972534, + "learning_rate": 8.500055380311763e-06, + "loss": 4.2078, + "step": 3995 + }, + { + "epoch": 0.512, + "grad_norm": 3.5658986568450928, + "learning_rate": 8.496452226893533e-06, + "loss": 4.281, + "step": 4000 + }, + { + "epoch": 0.512, + "eval_loss": 1.0626306533813477, + "eval_runtime": 7.4047, + "eval_samples_per_second": 135.049, + "eval_steps_per_second": 16.881, + "step": 4000 + }, + { + "epoch": 0.51264, + "grad_norm": 3.789924144744873, + "learning_rate": 8.492845516613632e-06, + "loss": 4.2659, + "step": 4005 + }, + { + "epoch": 0.51328, + "grad_norm": 3.218669891357422, + "learning_rate": 8.489235253141088e-06, + "loss": 4.1622, + "step": 4010 + }, + { + "epoch": 0.51392, + "grad_norm": 3.1417505741119385, + "learning_rate": 8.485621440148538e-06, + "loss": 4.3878, + "step": 4015 + }, + { + "epoch": 0.51456, + "grad_norm": 3.402475118637085, + "learning_rate": 8.482004081312234e-06, + "loss": 4.2486, + "step": 4020 + }, + { + "epoch": 0.5152, + "grad_norm": 3.214193820953369, + "learning_rate": 8.47838318031203e-06, + "loss": 4.2596, + "step": 4025 + }, + { + "epoch": 0.51584, + "grad_norm": 3.7518293857574463, + "learning_rate": 8.47475874083139e-06, + "loss": 4.1408, + "step": 4030 + }, + { + "epoch": 0.51648, + "grad_norm": 3.5070385932922363, + "learning_rate": 8.471130766557373e-06, + "loss": 4.3088, + "step": 4035 + }, + { + "epoch": 0.51712, + "grad_norm": 3.1660029888153076, + "learning_rate": 8.467499261180636e-06, + "loss": 4.237, + "step": 4040 + }, + { + "epoch": 0.51776, + "grad_norm": 3.3773226737976074, + "learning_rate": 8.463864228395426e-06, + "loss": 4.2198, + "step": 4045 + }, + { + "epoch": 0.5184, + "grad_norm": 3.256511926651001, + "learning_rate": 8.46022567189958e-06, + "loss": 4.21, + "step": 4050 + }, + { + "epoch": 0.51904, + "grad_norm": 3.1459224224090576, + "learning_rate": 8.456583595394519e-06, + "loss": 4.3253, + "step": 4055 + }, + { + "epoch": 0.51968, + "grad_norm": 3.153252363204956, + "learning_rate": 8.452938002585243e-06, + "loss": 4.2464, + "step": 4060 + }, + { + "epoch": 0.52032, + "grad_norm": 3.502012014389038, + "learning_rate": 8.449288897180335e-06, + "loss": 4.1066, + "step": 4065 + }, + { + "epoch": 0.52096, + "grad_norm": 3.99371075630188, + "learning_rate": 8.445636282891945e-06, + "loss": 4.1363, + "step": 4070 + }, + { + "epoch": 0.5216, + "grad_norm": 3.0267117023468018, + "learning_rate": 8.441980163435793e-06, + "loss": 4.3352, + "step": 4075 + }, + { + "epoch": 0.52224, + "grad_norm": 3.478861093521118, + "learning_rate": 8.43832054253117e-06, + "loss": 4.2754, + "step": 4080 + }, + { + "epoch": 0.52288, + "grad_norm": 3.4219508171081543, + "learning_rate": 8.434657423900925e-06, + "loss": 4.2462, + "step": 4085 + }, + { + "epoch": 0.52352, + "grad_norm": 3.4356322288513184, + "learning_rate": 8.430990811271464e-06, + "loss": 4.2702, + "step": 4090 + }, + { + "epoch": 0.52416, + "grad_norm": 3.3565707206726074, + "learning_rate": 8.427320708372749e-06, + "loss": 4.3038, + "step": 4095 + }, + { + "epoch": 0.5248, + "grad_norm": 3.221428871154785, + "learning_rate": 8.423647118938293e-06, + "loss": 4.1941, + "step": 4100 + }, + { + "epoch": 0.5248, + "eval_loss": 1.0621204376220703, + "eval_runtime": 6.8853, + "eval_samples_per_second": 145.237, + "eval_steps_per_second": 18.155, + "step": 4100 + }, + { + "epoch": 0.52544, + "grad_norm": 3.1570796966552734, + "learning_rate": 8.419970046705155e-06, + "loss": 4.2666, + "step": 4105 + }, + { + "epoch": 0.52608, + "grad_norm": 3.1513359546661377, + "learning_rate": 8.416289495413939e-06, + "loss": 4.2907, + "step": 4110 + }, + { + "epoch": 0.52672, + "grad_norm": 3.519578456878662, + "learning_rate": 8.412605468808786e-06, + "loss": 4.1469, + "step": 4115 + }, + { + "epoch": 0.52736, + "grad_norm": 3.4287312030792236, + "learning_rate": 8.408917970637372e-06, + "loss": 4.3648, + "step": 4120 + }, + { + "epoch": 0.528, + "grad_norm": 3.1880617141723633, + "learning_rate": 8.405227004650903e-06, + "loss": 4.265, + "step": 4125 + }, + { + "epoch": 0.52864, + "grad_norm": 3.3588428497314453, + "learning_rate": 8.40153257460412e-06, + "loss": 4.1882, + "step": 4130 + }, + { + "epoch": 0.52928, + "grad_norm": 3.5101263523101807, + "learning_rate": 8.397834684255279e-06, + "loss": 4.2479, + "step": 4135 + }, + { + "epoch": 0.52992, + "grad_norm": 3.492445945739746, + "learning_rate": 8.394133337366164e-06, + "loss": 4.0681, + "step": 4140 + }, + { + "epoch": 0.53056, + "grad_norm": 3.169119358062744, + "learning_rate": 8.390428537702066e-06, + "loss": 4.2595, + "step": 4145 + }, + { + "epoch": 0.5312, + "grad_norm": 3.325134038925171, + "learning_rate": 8.3867202890318e-06, + "loss": 4.1674, + "step": 4150 + }, + { + "epoch": 0.53184, + "grad_norm": 3.290656805038452, + "learning_rate": 8.38300859512768e-06, + "loss": 4.3233, + "step": 4155 + }, + { + "epoch": 0.53248, + "grad_norm": 3.161649465560913, + "learning_rate": 8.379293459765527e-06, + "loss": 4.2074, + "step": 4160 + }, + { + "epoch": 0.53312, + "grad_norm": 3.6237411499023438, + "learning_rate": 8.375574886724666e-06, + "loss": 4.1539, + "step": 4165 + }, + { + "epoch": 0.53376, + "grad_norm": 3.308765172958374, + "learning_rate": 8.371852879787917e-06, + "loss": 4.2598, + "step": 4170 + }, + { + "epoch": 0.5344, + "grad_norm": 3.173496723175049, + "learning_rate": 8.368127442741592e-06, + "loss": 4.2533, + "step": 4175 + }, + { + "epoch": 0.53504, + "grad_norm": 3.7781951427459717, + "learning_rate": 8.364398579375496e-06, + "loss": 4.253, + "step": 4180 + }, + { + "epoch": 0.53568, + "grad_norm": 3.1767258644104004, + "learning_rate": 8.360666293482915e-06, + "loss": 4.2968, + "step": 4185 + }, + { + "epoch": 0.53632, + "grad_norm": 3.150240659713745, + "learning_rate": 8.356930588860622e-06, + "loss": 4.1578, + "step": 4190 + }, + { + "epoch": 0.53696, + "grad_norm": 3.2313241958618164, + "learning_rate": 8.35319146930886e-06, + "loss": 4.33, + "step": 4195 + }, + { + "epoch": 0.5376, + "grad_norm": 3.2731451988220215, + "learning_rate": 8.349448938631354e-06, + "loss": 4.3148, + "step": 4200 + }, + { + "epoch": 0.5376, + "eval_loss": 1.0485703945159912, + "eval_runtime": 6.9069, + "eval_samples_per_second": 144.782, + "eval_steps_per_second": 18.098, + "step": 4200 + }, + { + "epoch": 0.53824, + "grad_norm": 3.3490631580352783, + "learning_rate": 8.345703000635297e-06, + "loss": 4.1731, + "step": 4205 + }, + { + "epoch": 0.53888, + "grad_norm": 3.5214695930480957, + "learning_rate": 8.341953659131343e-06, + "loss": 4.2613, + "step": 4210 + }, + { + "epoch": 0.53952, + "grad_norm": 4.7053632736206055, + "learning_rate": 8.338200917933616e-06, + "loss": 4.2745, + "step": 4215 + }, + { + "epoch": 0.54016, + "grad_norm": 3.4698004722595215, + "learning_rate": 8.334444780859689e-06, + "loss": 4.1812, + "step": 4220 + }, + { + "epoch": 0.5408, + "grad_norm": 3.492169141769409, + "learning_rate": 8.330685251730603e-06, + "loss": 4.3112, + "step": 4225 + }, + { + "epoch": 0.54144, + "grad_norm": 3.470062017440796, + "learning_rate": 8.326922334370835e-06, + "loss": 4.2367, + "step": 4230 + }, + { + "epoch": 0.54208, + "grad_norm": 3.188481330871582, + "learning_rate": 8.32315603260832e-06, + "loss": 4.2524, + "step": 4235 + }, + { + "epoch": 0.54272, + "grad_norm": 3.286050319671631, + "learning_rate": 8.31938635027443e-06, + "loss": 4.1992, + "step": 4240 + }, + { + "epoch": 0.54336, + "grad_norm": 3.456850051879883, + "learning_rate": 8.315613291203977e-06, + "loss": 4.267, + "step": 4245 + }, + { + "epoch": 0.544, + "grad_norm": 3.5051589012145996, + "learning_rate": 8.311836859235208e-06, + "loss": 4.1083, + "step": 4250 + }, + { + "epoch": 0.54464, + "grad_norm": 3.4664196968078613, + "learning_rate": 8.308057058209803e-06, + "loss": 4.2551, + "step": 4255 + }, + { + "epoch": 0.54528, + "grad_norm": 3.6029019355773926, + "learning_rate": 8.304273891972869e-06, + "loss": 4.2654, + "step": 4260 + }, + { + "epoch": 0.54592, + "grad_norm": 3.2132017612457275, + "learning_rate": 8.300487364372934e-06, + "loss": 4.1052, + "step": 4265 + }, + { + "epoch": 0.54656, + "grad_norm": 3.590670585632324, + "learning_rate": 8.296697479261944e-06, + "loss": 4.2485, + "step": 4270 + }, + { + "epoch": 0.5472, + "grad_norm": 3.327327013015747, + "learning_rate": 8.292904240495267e-06, + "loss": 4.1854, + "step": 4275 + }, + { + "epoch": 0.54784, + "grad_norm": 3.534122943878174, + "learning_rate": 8.28910765193168e-06, + "loss": 4.192, + "step": 4280 + }, + { + "epoch": 0.54848, + "grad_norm": 3.2001917362213135, + "learning_rate": 8.285307717433363e-06, + "loss": 4.2273, + "step": 4285 + }, + { + "epoch": 0.54912, + "grad_norm": 3.3666787147521973, + "learning_rate": 8.281504440865905e-06, + "loss": 4.2344, + "step": 4290 + }, + { + "epoch": 0.54976, + "grad_norm": 3.4703445434570312, + "learning_rate": 8.277697826098291e-06, + "loss": 4.114, + "step": 4295 + }, + { + "epoch": 0.5504, + "grad_norm": 3.424564838409424, + "learning_rate": 8.27388787700291e-06, + "loss": 4.195, + "step": 4300 + }, + { + "epoch": 0.5504, + "eval_loss": 1.0426100492477417, + "eval_runtime": 6.7447, + "eval_samples_per_second": 148.265, + "eval_steps_per_second": 18.533, + "step": 4300 + }, + { + "epoch": 0.55104, + "grad_norm": 3.4007487297058105, + "learning_rate": 8.27007459745553e-06, + "loss": 4.2053, + "step": 4305 + }, + { + "epoch": 0.55168, + "grad_norm": 3.2374963760375977, + "learning_rate": 8.266257991335316e-06, + "loss": 4.1603, + "step": 4310 + }, + { + "epoch": 0.55232, + "grad_norm": 3.1317055225372314, + "learning_rate": 8.262438062524817e-06, + "loss": 4.3015, + "step": 4315 + }, + { + "epoch": 0.55296, + "grad_norm": 3.174532890319824, + "learning_rate": 8.25861481490996e-06, + "loss": 4.2574, + "step": 4320 + }, + { + "epoch": 0.5536, + "grad_norm": 3.3424572944641113, + "learning_rate": 8.254788252380046e-06, + "loss": 4.0812, + "step": 4325 + }, + { + "epoch": 0.55424, + "grad_norm": 3.1913397312164307, + "learning_rate": 8.250958378827752e-06, + "loss": 4.2152, + "step": 4330 + }, + { + "epoch": 0.55488, + "grad_norm": 3.294013500213623, + "learning_rate": 8.24712519814912e-06, + "loss": 4.1572, + "step": 4335 + }, + { + "epoch": 0.55552, + "grad_norm": 3.1539976596832275, + "learning_rate": 8.24328871424356e-06, + "loss": 4.2349, + "step": 4340 + }, + { + "epoch": 0.55616, + "grad_norm": 3.6340556144714355, + "learning_rate": 8.239448931013839e-06, + "loss": 4.262, + "step": 4345 + }, + { + "epoch": 0.5568, + "grad_norm": 3.097135305404663, + "learning_rate": 8.235605852366082e-06, + "loss": 4.2263, + "step": 4350 + }, + { + "epoch": 0.55744, + "grad_norm": 3.1234707832336426, + "learning_rate": 8.231759482209764e-06, + "loss": 4.1102, + "step": 4355 + }, + { + "epoch": 0.55808, + "grad_norm": 3.3812243938446045, + "learning_rate": 8.227909824457714e-06, + "loss": 4.1727, + "step": 4360 + }, + { + "epoch": 0.55872, + "grad_norm": 3.526226043701172, + "learning_rate": 8.224056883026097e-06, + "loss": 4.0455, + "step": 4365 + }, + { + "epoch": 0.55936, + "grad_norm": 3.2296152114868164, + "learning_rate": 8.220200661834428e-06, + "loss": 4.2122, + "step": 4370 + }, + { + "epoch": 0.56, + "grad_norm": 3.571004629135132, + "learning_rate": 8.216341164805547e-06, + "loss": 4.2038, + "step": 4375 + }, + { + "epoch": 0.56064, + "grad_norm": 4.520260334014893, + "learning_rate": 8.212478395865642e-06, + "loss": 4.2543, + "step": 4380 + }, + { + "epoch": 0.56128, + "grad_norm": 3.549973249435425, + "learning_rate": 8.208612358944212e-06, + "loss": 4.3407, + "step": 4385 + }, + { + "epoch": 0.56192, + "grad_norm": 3.5902371406555176, + "learning_rate": 8.204743057974093e-06, + "loss": 4.1843, + "step": 4390 + }, + { + "epoch": 0.56256, + "grad_norm": 3.5541188716888428, + "learning_rate": 8.200870496891437e-06, + "loss": 4.1567, + "step": 4395 + }, + { + "epoch": 0.5632, + "grad_norm": 3.159623146057129, + "learning_rate": 8.196994679635713e-06, + "loss": 4.2037, + "step": 4400 + }, + { + "epoch": 0.5632, + "eval_loss": 1.0485684871673584, + "eval_runtime": 6.7526, + "eval_samples_per_second": 148.091, + "eval_steps_per_second": 18.511, + "step": 4400 + }, + { + "epoch": 0.56384, + "grad_norm": 3.462564468383789, + "learning_rate": 8.1931156101497e-06, + "loss": 4.2367, + "step": 4405 + }, + { + "epoch": 0.56448, + "grad_norm": 3.467090606689453, + "learning_rate": 8.189233292379488e-06, + "loss": 4.1135, + "step": 4410 + }, + { + "epoch": 0.56512, + "grad_norm": 3.3077733516693115, + "learning_rate": 8.185347730274471e-06, + "loss": 4.1801, + "step": 4415 + }, + { + "epoch": 0.56576, + "grad_norm": 3.233238697052002, + "learning_rate": 8.181458927787347e-06, + "loss": 4.1566, + "step": 4420 + }, + { + "epoch": 0.5664, + "grad_norm": 3.4241669178009033, + "learning_rate": 8.177566888874101e-06, + "loss": 4.2094, + "step": 4425 + }, + { + "epoch": 0.56704, + "grad_norm": 3.7361552715301514, + "learning_rate": 8.17367161749402e-06, + "loss": 4.0974, + "step": 4430 + }, + { + "epoch": 0.56768, + "grad_norm": 3.274073362350464, + "learning_rate": 8.169773117609675e-06, + "loss": 4.1142, + "step": 4435 + }, + { + "epoch": 0.56832, + "grad_norm": 3.3000011444091797, + "learning_rate": 8.165871393186919e-06, + "loss": 4.2093, + "step": 4440 + }, + { + "epoch": 0.56896, + "grad_norm": 3.3069565296173096, + "learning_rate": 8.16196644819489e-06, + "loss": 4.2301, + "step": 4445 + }, + { + "epoch": 0.5696, + "grad_norm": 3.4007809162139893, + "learning_rate": 8.158058286606e-06, + "loss": 4.111, + "step": 4450 + }, + { + "epoch": 0.57024, + "grad_norm": 3.8727948665618896, + "learning_rate": 8.154146912395933e-06, + "loss": 4.196, + "step": 4455 + }, + { + "epoch": 0.57088, + "grad_norm": 3.4134340286254883, + "learning_rate": 8.150232329543643e-06, + "loss": 4.1377, + "step": 4460 + }, + { + "epoch": 0.57152, + "grad_norm": 3.355658531188965, + "learning_rate": 8.146314542031343e-06, + "loss": 4.0884, + "step": 4465 + }, + { + "epoch": 0.57216, + "grad_norm": 3.8688745498657227, + "learning_rate": 8.142393553844511e-06, + "loss": 4.1839, + "step": 4470 + }, + { + "epoch": 0.5728, + "grad_norm": 3.614151954650879, + "learning_rate": 8.138469368971882e-06, + "loss": 4.2426, + "step": 4475 + }, + { + "epoch": 0.57344, + "grad_norm": 3.294654607772827, + "learning_rate": 8.134541991405438e-06, + "loss": 4.0466, + "step": 4480 + }, + { + "epoch": 0.57408, + "grad_norm": 3.144278049468994, + "learning_rate": 8.130611425140412e-06, + "loss": 4.0591, + "step": 4485 + }, + { + "epoch": 0.57472, + "grad_norm": 3.2101783752441406, + "learning_rate": 8.126677674175278e-06, + "loss": 4.132, + "step": 4490 + }, + { + "epoch": 0.57536, + "grad_norm": 3.0196874141693115, + "learning_rate": 8.122740742511754e-06, + "loss": 4.1155, + "step": 4495 + }, + { + "epoch": 0.576, + "grad_norm": 3.2182085514068604, + "learning_rate": 8.118800634154792e-06, + "loss": 4.2686, + "step": 4500 + }, + { + "epoch": 0.576, + "eval_loss": 1.0499341487884521, + "eval_runtime": 6.7956, + "eval_samples_per_second": 147.154, + "eval_steps_per_second": 18.394, + "step": 4500 + }, + { + "epoch": 0.57664, + "grad_norm": 3.595808982849121, + "learning_rate": 8.114857353112572e-06, + "loss": 4.1841, + "step": 4505 + }, + { + "epoch": 0.57728, + "grad_norm": 3.4274849891662598, + "learning_rate": 8.110910903396508e-06, + "loss": 4.1231, + "step": 4510 + }, + { + "epoch": 0.57792, + "grad_norm": 3.538372278213501, + "learning_rate": 8.106961289021232e-06, + "loss": 4.1426, + "step": 4515 + }, + { + "epoch": 0.57856, + "grad_norm": 3.3008785247802734, + "learning_rate": 8.103008514004596e-06, + "loss": 4.0978, + "step": 4520 + }, + { + "epoch": 0.5792, + "grad_norm": 3.7046406269073486, + "learning_rate": 8.099052582367671e-06, + "loss": 4.1844, + "step": 4525 + }, + { + "epoch": 0.57984, + "grad_norm": 3.538774013519287, + "learning_rate": 8.095093498134736e-06, + "loss": 4.1799, + "step": 4530 + }, + { + "epoch": 0.58048, + "grad_norm": 3.4396979808807373, + "learning_rate": 8.091131265333277e-06, + "loss": 4.2845, + "step": 4535 + }, + { + "epoch": 0.58112, + "grad_norm": 3.499345302581787, + "learning_rate": 8.087165887993984e-06, + "loss": 4.3056, + "step": 4540 + }, + { + "epoch": 0.58176, + "grad_norm": 3.0737617015838623, + "learning_rate": 8.083197370150748e-06, + "loss": 4.0669, + "step": 4545 + }, + { + "epoch": 0.5824, + "grad_norm": 3.5053136348724365, + "learning_rate": 8.079225715840646e-06, + "loss": 4.1382, + "step": 4550 + }, + { + "epoch": 0.58304, + "grad_norm": 3.3943347930908203, + "learning_rate": 8.075250929103959e-06, + "loss": 4.2206, + "step": 4555 + }, + { + "epoch": 0.58368, + "grad_norm": 3.1711487770080566, + "learning_rate": 8.071273013984144e-06, + "loss": 4.2383, + "step": 4560 + }, + { + "epoch": 0.58432, + "grad_norm": 3.3679018020629883, + "learning_rate": 8.067291974527845e-06, + "loss": 4.1345, + "step": 4565 + }, + { + "epoch": 0.58496, + "grad_norm": 3.348994255065918, + "learning_rate": 8.063307814784882e-06, + "loss": 4.2213, + "step": 4570 + }, + { + "epoch": 0.5856, + "grad_norm": 3.837846517562866, + "learning_rate": 8.059320538808251e-06, + "loss": 4.1051, + "step": 4575 + }, + { + "epoch": 0.58624, + "grad_norm": 3.153294801712036, + "learning_rate": 8.05533015065412e-06, + "loss": 4.0953, + "step": 4580 + }, + { + "epoch": 0.58688, + "grad_norm": 3.3224375247955322, + "learning_rate": 8.051336654381816e-06, + "loss": 4.1112, + "step": 4585 + }, + { + "epoch": 0.58752, + "grad_norm": 3.488499641418457, + "learning_rate": 8.047340054053836e-06, + "loss": 4.0806, + "step": 4590 + }, + { + "epoch": 0.58816, + "grad_norm": 3.5140116214752197, + "learning_rate": 8.043340353735828e-06, + "loss": 4.132, + "step": 4595 + }, + { + "epoch": 0.5888, + "grad_norm": 3.4562926292419434, + "learning_rate": 8.0393375574966e-06, + "loss": 4.0194, + "step": 4600 + }, + { + "epoch": 0.5888, + "eval_loss": 1.0482516288757324, + "eval_runtime": 7.2824, + "eval_samples_per_second": 137.317, + "eval_steps_per_second": 17.165, + "step": 4600 + }, + { + "epoch": 0.58944, + "grad_norm": 3.2118759155273438, + "learning_rate": 8.035331669408104e-06, + "loss": 4.108, + "step": 4605 + }, + { + "epoch": 0.59008, + "grad_norm": 3.044614315032959, + "learning_rate": 8.031322693545438e-06, + "loss": 4.1502, + "step": 4610 + }, + { + "epoch": 0.59072, + "grad_norm": 3.399355888366699, + "learning_rate": 8.027310633986845e-06, + "loss": 4.1071, + "step": 4615 + }, + { + "epoch": 0.59136, + "grad_norm": 3.31364107131958, + "learning_rate": 8.023295494813701e-06, + "loss": 4.205, + "step": 4620 + }, + { + "epoch": 0.592, + "grad_norm": 3.1222352981567383, + "learning_rate": 8.019277280110516e-06, + "loss": 4.1323, + "step": 4625 + }, + { + "epoch": 0.59264, + "grad_norm": 3.3674120903015137, + "learning_rate": 8.01525599396493e-06, + "loss": 4.0599, + "step": 4630 + }, + { + "epoch": 0.59328, + "grad_norm": 3.2672739028930664, + "learning_rate": 8.011231640467705e-06, + "loss": 4.1417, + "step": 4635 + }, + { + "epoch": 0.59392, + "grad_norm": 3.2652289867401123, + "learning_rate": 8.007204223712726e-06, + "loss": 4.223, + "step": 4640 + }, + { + "epoch": 0.59456, + "grad_norm": 4.0131001472473145, + "learning_rate": 8.00317374779699e-06, + "loss": 4.1341, + "step": 4645 + }, + { + "epoch": 0.5952, + "grad_norm": 3.488124132156372, + "learning_rate": 7.999140216820613e-06, + "loss": 4.0751, + "step": 4650 + }, + { + "epoch": 0.59584, + "grad_norm": 3.4057059288024902, + "learning_rate": 7.99510363488681e-06, + "loss": 4.182, + "step": 4655 + }, + { + "epoch": 0.59648, + "grad_norm": 3.633147954940796, + "learning_rate": 7.991064006101909e-06, + "loss": 4.1708, + "step": 4660 + }, + { + "epoch": 0.59712, + "grad_norm": 3.4964585304260254, + "learning_rate": 7.98702133457533e-06, + "loss": 4.123, + "step": 4665 + }, + { + "epoch": 0.59776, + "grad_norm": 3.620976448059082, + "learning_rate": 7.982975624419591e-06, + "loss": 4.3387, + "step": 4670 + }, + { + "epoch": 0.5984, + "grad_norm": 3.191387414932251, + "learning_rate": 7.978926879750303e-06, + "loss": 4.1979, + "step": 4675 + }, + { + "epoch": 0.59904, + "grad_norm": 3.342824697494507, + "learning_rate": 7.974875104686164e-06, + "loss": 4.2519, + "step": 4680 + }, + { + "epoch": 0.59968, + "grad_norm": 3.3812179565429688, + "learning_rate": 7.97082030334895e-06, + "loss": 4.1762, + "step": 4685 + }, + { + "epoch": 0.60032, + "grad_norm": 3.3878660202026367, + "learning_rate": 7.966762479863517e-06, + "loss": 4.1721, + "step": 4690 + }, + { + "epoch": 0.60096, + "grad_norm": 3.517085075378418, + "learning_rate": 7.962701638357799e-06, + "loss": 4.0974, + "step": 4695 + }, + { + "epoch": 0.6016, + "grad_norm": 3.536073923110962, + "learning_rate": 7.9586377829628e-06, + "loss": 4.2323, + "step": 4700 + }, + { + "epoch": 0.6016, + "eval_loss": 1.0386443138122559, + "eval_runtime": 7.8627, + "eval_samples_per_second": 127.183, + "eval_steps_per_second": 15.898, + "step": 4700 + }, + { + "epoch": 0.60224, + "grad_norm": 3.297013759613037, + "learning_rate": 7.954570917812585e-06, + "loss": 4.0663, + "step": 4705 + }, + { + "epoch": 0.60288, + "grad_norm": 3.464735507965088, + "learning_rate": 7.950501047044287e-06, + "loss": 4.2503, + "step": 4710 + }, + { + "epoch": 0.60352, + "grad_norm": 3.2324845790863037, + "learning_rate": 7.946428174798089e-06, + "loss": 4.0498, + "step": 4715 + }, + { + "epoch": 0.60416, + "grad_norm": 3.490805149078369, + "learning_rate": 7.942352305217236e-06, + "loss": 4.2563, + "step": 4720 + }, + { + "epoch": 0.6048, + "grad_norm": 3.37969708442688, + "learning_rate": 7.938273442448015e-06, + "loss": 3.9272, + "step": 4725 + }, + { + "epoch": 0.60544, + "grad_norm": 3.358905076980591, + "learning_rate": 7.934191590639762e-06, + "loss": 4.1162, + "step": 4730 + }, + { + "epoch": 0.60608, + "grad_norm": 3.1589395999908447, + "learning_rate": 7.930106753944853e-06, + "loss": 3.951, + "step": 4735 + }, + { + "epoch": 0.60672, + "grad_norm": 3.8178842067718506, + "learning_rate": 7.926018936518698e-06, + "loss": 4.3294, + "step": 4740 + }, + { + "epoch": 0.60736, + "grad_norm": 3.419395923614502, + "learning_rate": 7.921928142519742e-06, + "loss": 4.2388, + "step": 4745 + }, + { + "epoch": 0.608, + "grad_norm": 3.332667827606201, + "learning_rate": 7.91783437610946e-06, + "loss": 4.2338, + "step": 4750 + }, + { + "epoch": 0.60864, + "grad_norm": 3.7604660987854004, + "learning_rate": 7.913737641452342e-06, + "loss": 4.2115, + "step": 4755 + }, + { + "epoch": 0.60928, + "grad_norm": 3.185386896133423, + "learning_rate": 7.909637942715906e-06, + "loss": 4.091, + "step": 4760 + }, + { + "epoch": 0.60992, + "grad_norm": 3.2540390491485596, + "learning_rate": 7.905535284070685e-06, + "loss": 4.1399, + "step": 4765 + }, + { + "epoch": 0.61056, + "grad_norm": 3.3786585330963135, + "learning_rate": 7.901429669690218e-06, + "loss": 4.1883, + "step": 4770 + }, + { + "epoch": 0.6112, + "grad_norm": 3.554351329803467, + "learning_rate": 7.897321103751054e-06, + "loss": 4.209, + "step": 4775 + }, + { + "epoch": 0.61184, + "grad_norm": 3.373978614807129, + "learning_rate": 7.893209590432744e-06, + "loss": 4.2378, + "step": 4780 + }, + { + "epoch": 0.61248, + "grad_norm": 3.1719655990600586, + "learning_rate": 7.889095133917839e-06, + "loss": 4.1013, + "step": 4785 + }, + { + "epoch": 0.61312, + "grad_norm": 3.1673762798309326, + "learning_rate": 7.884977738391882e-06, + "loss": 4.0557, + "step": 4790 + }, + { + "epoch": 0.61376, + "grad_norm": 3.32163143157959, + "learning_rate": 7.880857408043404e-06, + "loss": 4.2376, + "step": 4795 + }, + { + "epoch": 0.6144, + "grad_norm": 3.029355764389038, + "learning_rate": 7.876734147063927e-06, + "loss": 4.1745, + "step": 4800 + }, + { + "epoch": 0.6144, + "eval_loss": 1.0420140027999878, + "eval_runtime": 6.7065, + "eval_samples_per_second": 149.109, + "eval_steps_per_second": 18.639, + "step": 4800 + }, + { + "epoch": 0.61504, + "grad_norm": 3.270620822906494, + "learning_rate": 7.872607959647947e-06, + "loss": 4.0588, + "step": 4805 + }, + { + "epoch": 0.61568, + "grad_norm": 3.68705153465271, + "learning_rate": 7.868478849992944e-06, + "loss": 4.1343, + "step": 4810 + }, + { + "epoch": 0.61632, + "grad_norm": 3.301039934158325, + "learning_rate": 7.86434682229937e-06, + "loss": 4.1913, + "step": 4815 + }, + { + "epoch": 0.61696, + "grad_norm": 3.441736936569214, + "learning_rate": 7.860211880770637e-06, + "loss": 4.1783, + "step": 4820 + }, + { + "epoch": 0.6176, + "grad_norm": 3.1942312717437744, + "learning_rate": 7.85607402961313e-06, + "loss": 4.1702, + "step": 4825 + }, + { + "epoch": 0.61824, + "grad_norm": 3.3528614044189453, + "learning_rate": 7.851933273036194e-06, + "loss": 4.0512, + "step": 4830 + }, + { + "epoch": 0.61888, + "grad_norm": 3.235706090927124, + "learning_rate": 7.847789615252123e-06, + "loss": 3.9943, + "step": 4835 + }, + { + "epoch": 0.61952, + "grad_norm": 3.4161102771759033, + "learning_rate": 7.84364306047617e-06, + "loss": 4.1663, + "step": 4840 + }, + { + "epoch": 0.62016, + "grad_norm": 3.2843005657196045, + "learning_rate": 7.839493612926528e-06, + "loss": 4.1458, + "step": 4845 + }, + { + "epoch": 0.6208, + "grad_norm": 3.2336485385894775, + "learning_rate": 7.835341276824338e-06, + "loss": 4.1267, + "step": 4850 + }, + { + "epoch": 0.62144, + "grad_norm": 3.2358272075653076, + "learning_rate": 7.831186056393679e-06, + "loss": 4.1099, + "step": 4855 + }, + { + "epoch": 0.62208, + "grad_norm": 3.2333428859710693, + "learning_rate": 7.827027955861557e-06, + "loss": 4.0636, + "step": 4860 + }, + { + "epoch": 0.62272, + "grad_norm": 3.440809726715088, + "learning_rate": 7.822866979457917e-06, + "loss": 4.2263, + "step": 4865 + }, + { + "epoch": 0.62336, + "grad_norm": 3.2690722942352295, + "learning_rate": 7.818703131415627e-06, + "loss": 4.1923, + "step": 4870 + }, + { + "epoch": 0.624, + "grad_norm": 3.135249614715576, + "learning_rate": 7.814536415970475e-06, + "loss": 3.9522, + "step": 4875 + }, + { + "epoch": 0.62464, + "grad_norm": 3.6807265281677246, + "learning_rate": 7.810366837361165e-06, + "loss": 4.1948, + "step": 4880 + }, + { + "epoch": 0.62528, + "grad_norm": 3.511955738067627, + "learning_rate": 7.806194399829314e-06, + "loss": 4.1177, + "step": 4885 + }, + { + "epoch": 0.62592, + "grad_norm": 3.115957260131836, + "learning_rate": 7.802019107619452e-06, + "loss": 4.1148, + "step": 4890 + }, + { + "epoch": 0.62656, + "grad_norm": 3.540649890899658, + "learning_rate": 7.797840964979007e-06, + "loss": 4.1329, + "step": 4895 + }, + { + "epoch": 0.6272, + "grad_norm": 3.370652675628662, + "learning_rate": 7.793659976158306e-06, + "loss": 4.0991, + "step": 4900 + }, + { + "epoch": 0.6272, + "eval_loss": 1.0484163761138916, + "eval_runtime": 6.6185, + "eval_samples_per_second": 151.092, + "eval_steps_per_second": 18.886, + "step": 4900 + }, + { + "epoch": 0.62784, + "grad_norm": 3.38389253616333, + "learning_rate": 7.78947614541058e-06, + "loss": 4.1487, + "step": 4905 + }, + { + "epoch": 0.62848, + "grad_norm": 3.2582650184631348, + "learning_rate": 7.78528947699194e-06, + "loss": 4.0647, + "step": 4910 + }, + { + "epoch": 0.62912, + "grad_norm": 3.4006094932556152, + "learning_rate": 7.781099975161393e-06, + "loss": 4.0973, + "step": 4915 + }, + { + "epoch": 0.62976, + "grad_norm": 3.3823771476745605, + "learning_rate": 7.776907644180822e-06, + "loss": 4.0354, + "step": 4920 + }, + { + "epoch": 0.6304, + "grad_norm": 3.21818470954895, + "learning_rate": 7.772712488314991e-06, + "loss": 4.0933, + "step": 4925 + }, + { + "epoch": 0.63104, + "grad_norm": 3.893780469894409, + "learning_rate": 7.768514511831537e-06, + "loss": 4.1754, + "step": 4930 + }, + { + "epoch": 0.63168, + "grad_norm": 3.3422024250030518, + "learning_rate": 7.764313719000966e-06, + "loss": 4.1792, + "step": 4935 + }, + { + "epoch": 0.63232, + "grad_norm": 3.573408365249634, + "learning_rate": 7.76011011409665e-06, + "loss": 3.9603, + "step": 4940 + }, + { + "epoch": 0.63296, + "grad_norm": 3.3298516273498535, + "learning_rate": 7.755903701394822e-06, + "loss": 4.1212, + "step": 4945 + }, + { + "epoch": 0.6336, + "grad_norm": 3.294663429260254, + "learning_rate": 7.75169448517457e-06, + "loss": 4.0832, + "step": 4950 + }, + { + "epoch": 0.63424, + "grad_norm": 3.372905731201172, + "learning_rate": 7.747482469717832e-06, + "loss": 4.1185, + "step": 4955 + }, + { + "epoch": 0.63488, + "grad_norm": 3.442746639251709, + "learning_rate": 7.743267659309396e-06, + "loss": 4.0458, + "step": 4960 + }, + { + "epoch": 0.63552, + "grad_norm": 3.5581016540527344, + "learning_rate": 7.739050058236898e-06, + "loss": 4.206, + "step": 4965 + }, + { + "epoch": 0.63616, + "grad_norm": 3.5394022464752197, + "learning_rate": 7.734829670790804e-06, + "loss": 4.0824, + "step": 4970 + }, + { + "epoch": 0.6368, + "grad_norm": 3.5203707218170166, + "learning_rate": 7.73060650126442e-06, + "loss": 4.1966, + "step": 4975 + }, + { + "epoch": 0.63744, + "grad_norm": 3.2009241580963135, + "learning_rate": 7.726380553953879e-06, + "loss": 4.1998, + "step": 4980 + }, + { + "epoch": 0.63808, + "grad_norm": 3.217941999435425, + "learning_rate": 7.722151833158142e-06, + "loss": 3.9779, + "step": 4985 + }, + { + "epoch": 0.63872, + "grad_norm": 3.275895118713379, + "learning_rate": 7.717920343178993e-06, + "loss": 4.1102, + "step": 4990 + }, + { + "epoch": 0.63936, + "grad_norm": 3.0897116661071777, + "learning_rate": 7.713686088321029e-06, + "loss": 4.1574, + "step": 4995 + }, + { + "epoch": 0.64, + "grad_norm": 3.437019109725952, + "learning_rate": 7.709449072891661e-06, + "loss": 4.3435, + "step": 5000 + }, + { + "epoch": 0.64, + "eval_loss": 1.0283212661743164, + "eval_runtime": 6.8036, + "eval_samples_per_second": 146.98, + "eval_steps_per_second": 18.373, + "step": 5000 + }, + { + "epoch": 0.64064, + "grad_norm": 3.3842532634735107, + "learning_rate": 7.70520930120111e-06, + "loss": 4.0462, + "step": 5005 + }, + { + "epoch": 0.64128, + "grad_norm": 4.276431083679199, + "learning_rate": 7.700966777562402e-06, + "loss": 4.1034, + "step": 5010 + }, + { + "epoch": 0.64192, + "grad_norm": 3.322986125946045, + "learning_rate": 7.696721506291353e-06, + "loss": 4.0853, + "step": 5015 + }, + { + "epoch": 0.64256, + "grad_norm": 3.593050718307495, + "learning_rate": 7.69247349170659e-06, + "loss": 4.0402, + "step": 5020 + }, + { + "epoch": 0.6432, + "grad_norm": 3.6134753227233887, + "learning_rate": 7.688222738129519e-06, + "loss": 4.0731, + "step": 5025 + }, + { + "epoch": 0.64384, + "grad_norm": 3.3642125129699707, + "learning_rate": 7.683969249884331e-06, + "loss": 4.2386, + "step": 5030 + }, + { + "epoch": 0.64448, + "grad_norm": 3.5435409545898438, + "learning_rate": 7.679713031298009e-06, + "loss": 4.0897, + "step": 5035 + }, + { + "epoch": 0.64512, + "grad_norm": 3.3521769046783447, + "learning_rate": 7.675454086700307e-06, + "loss": 4.0395, + "step": 5040 + }, + { + "epoch": 0.64576, + "grad_norm": 3.2340495586395264, + "learning_rate": 7.671192420423748e-06, + "loss": 4.1929, + "step": 5045 + }, + { + "epoch": 0.6464, + "grad_norm": 3.205925226211548, + "learning_rate": 7.666928036803635e-06, + "loss": 4.122, + "step": 5050 + }, + { + "epoch": 0.64704, + "grad_norm": 3.4584217071533203, + "learning_rate": 7.662660940178024e-06, + "loss": 4.132, + "step": 5055 + }, + { + "epoch": 0.64768, + "grad_norm": 3.254892110824585, + "learning_rate": 7.65839113488774e-06, + "loss": 4.1643, + "step": 5060 + }, + { + "epoch": 0.64832, + "grad_norm": 3.1381659507751465, + "learning_rate": 7.654118625276355e-06, + "loss": 4.0238, + "step": 5065 + }, + { + "epoch": 0.64896, + "grad_norm": 3.4624714851379395, + "learning_rate": 7.649843415690198e-06, + "loss": 4.0247, + "step": 5070 + }, + { + "epoch": 0.6496, + "grad_norm": 3.3247270584106445, + "learning_rate": 7.645565510478344e-06, + "loss": 4.03, + "step": 5075 + }, + { + "epoch": 0.65024, + "grad_norm": 3.419830322265625, + "learning_rate": 7.641284913992608e-06, + "loss": 4.0039, + "step": 5080 + }, + { + "epoch": 0.65088, + "grad_norm": 3.156008243560791, + "learning_rate": 7.637001630587544e-06, + "loss": 4.0387, + "step": 5085 + }, + { + "epoch": 0.65152, + "grad_norm": 3.25286865234375, + "learning_rate": 7.63271566462044e-06, + "loss": 4.1358, + "step": 5090 + }, + { + "epoch": 0.65216, + "grad_norm": 3.1121902465820312, + "learning_rate": 7.62842702045131e-06, + "loss": 4.1283, + "step": 5095 + }, + { + "epoch": 0.6528, + "grad_norm": 3.177816390991211, + "learning_rate": 7.624135702442896e-06, + "loss": 4.0129, + "step": 5100 + }, + { + "epoch": 0.6528, + "eval_loss": 1.0441325902938843, + "eval_runtime": 6.9093, + "eval_samples_per_second": 144.732, + "eval_steps_per_second": 18.091, + "step": 5100 + }, + { + "epoch": 0.65344, + "grad_norm": 3.3002333641052246, + "learning_rate": 7.61984171496066e-06, + "loss": 4.0557, + "step": 5105 + }, + { + "epoch": 0.65408, + "grad_norm": 3.5926401615142822, + "learning_rate": 7.615545062372775e-06, + "loss": 4.1188, + "step": 5110 + }, + { + "epoch": 0.65472, + "grad_norm": 3.4342427253723145, + "learning_rate": 7.611245749050132e-06, + "loss": 4.0734, + "step": 5115 + }, + { + "epoch": 0.65536, + "grad_norm": 3.499502658843994, + "learning_rate": 7.606943779366324e-06, + "loss": 4.1231, + "step": 5120 + }, + { + "epoch": 0.656, + "grad_norm": 3.2994086742401123, + "learning_rate": 7.602639157697645e-06, + "loss": 4.0108, + "step": 5125 + }, + { + "epoch": 0.65664, + "grad_norm": 3.3510470390319824, + "learning_rate": 7.5983318884230915e-06, + "loss": 4.1677, + "step": 5130 + }, + { + "epoch": 0.65728, + "grad_norm": 3.6185731887817383, + "learning_rate": 7.5940219759243495e-06, + "loss": 4.0468, + "step": 5135 + }, + { + "epoch": 0.65792, + "grad_norm": 3.3450589179992676, + "learning_rate": 7.589709424585796e-06, + "loss": 4.2103, + "step": 5140 + }, + { + "epoch": 0.65856, + "grad_norm": 3.277587413787842, + "learning_rate": 7.585394238794492e-06, + "loss": 4.1963, + "step": 5145 + }, + { + "epoch": 0.6592, + "grad_norm": 3.7275032997131348, + "learning_rate": 7.581076422940179e-06, + "loss": 4.1603, + "step": 5150 + }, + { + "epoch": 0.65984, + "grad_norm": 3.197981595993042, + "learning_rate": 7.5767559814152735e-06, + "loss": 4.0356, + "step": 5155 + }, + { + "epoch": 0.66048, + "grad_norm": 3.3903021812438965, + "learning_rate": 7.57243291861486e-06, + "loss": 4.1375, + "step": 5160 + }, + { + "epoch": 0.66112, + "grad_norm": 3.2676641941070557, + "learning_rate": 7.568107238936694e-06, + "loss": 4.0345, + "step": 5165 + }, + { + "epoch": 0.66176, + "grad_norm": 3.254869222640991, + "learning_rate": 7.563778946781193e-06, + "loss": 4.0845, + "step": 5170 + }, + { + "epoch": 0.6624, + "grad_norm": 3.259308338165283, + "learning_rate": 7.559448046551429e-06, + "loss": 4.039, + "step": 5175 + }, + { + "epoch": 0.66304, + "grad_norm": 3.208815813064575, + "learning_rate": 7.555114542653128e-06, + "loss": 4.1614, + "step": 5180 + }, + { + "epoch": 0.66368, + "grad_norm": 3.4502336978912354, + "learning_rate": 7.550778439494668e-06, + "loss": 4.0919, + "step": 5185 + }, + { + "epoch": 0.66432, + "grad_norm": 3.1363143920898438, + "learning_rate": 7.546439741487066e-06, + "loss": 4.0748, + "step": 5190 + }, + { + "epoch": 0.66496, + "grad_norm": 3.3942441940307617, + "learning_rate": 7.5420984530439826e-06, + "loss": 4.1168, + "step": 5195 + }, + { + "epoch": 0.6656, + "grad_norm": 3.31434965133667, + "learning_rate": 7.537754578581711e-06, + "loss": 4.189, + "step": 5200 + }, + { + "epoch": 0.6656, + "eval_loss": 1.0360530614852905, + "eval_runtime": 7.2424, + "eval_samples_per_second": 138.076, + "eval_steps_per_second": 17.26, + "step": 5200 + }, + { + "epoch": 0.66624, + "grad_norm": 3.268174648284912, + "learning_rate": 7.533408122519177e-06, + "loss": 4.1333, + "step": 5205 + }, + { + "epoch": 0.66688, + "grad_norm": 3.358441114425659, + "learning_rate": 7.5290590892779325e-06, + "loss": 4.2067, + "step": 5210 + }, + { + "epoch": 0.66752, + "grad_norm": 3.2438085079193115, + "learning_rate": 7.5247074832821495e-06, + "loss": 4.0523, + "step": 5215 + }, + { + "epoch": 0.66816, + "grad_norm": 3.4631876945495605, + "learning_rate": 7.52035330895862e-06, + "loss": 4.1713, + "step": 5220 + }, + { + "epoch": 0.6688, + "grad_norm": 3.3959803581237793, + "learning_rate": 7.515996570736746e-06, + "loss": 4.1517, + "step": 5225 + }, + { + "epoch": 0.66944, + "grad_norm": 3.5359513759613037, + "learning_rate": 7.511637273048538e-06, + "loss": 4.0533, + "step": 5230 + }, + { + "epoch": 0.67008, + "grad_norm": 3.3612425327301025, + "learning_rate": 7.50727542032861e-06, + "loss": 4.0731, + "step": 5235 + }, + { + "epoch": 0.67072, + "grad_norm": 3.462785243988037, + "learning_rate": 7.502911017014177e-06, + "loss": 4.1512, + "step": 5240 + }, + { + "epoch": 0.67136, + "grad_norm": 3.3726119995117188, + "learning_rate": 7.49854406754505e-06, + "loss": 4.0378, + "step": 5245 + }, + { + "epoch": 0.672, + "grad_norm": 3.513679027557373, + "learning_rate": 7.494174576363623e-06, + "loss": 4.1591, + "step": 5250 + }, + { + "epoch": 0.67264, + "grad_norm": 3.5844063758850098, + "learning_rate": 7.489802547914885e-06, + "loss": 4.1491, + "step": 5255 + }, + { + "epoch": 0.67328, + "grad_norm": 3.381340742111206, + "learning_rate": 7.485427986646399e-06, + "loss": 4.0664, + "step": 5260 + }, + { + "epoch": 0.67392, + "grad_norm": 3.2730493545532227, + "learning_rate": 7.481050897008308e-06, + "loss": 4.1653, + "step": 5265 + }, + { + "epoch": 0.67456, + "grad_norm": 3.4449875354766846, + "learning_rate": 7.476671283453325e-06, + "loss": 4.0288, + "step": 5270 + }, + { + "epoch": 0.6752, + "grad_norm": 3.37095308303833, + "learning_rate": 7.472289150436734e-06, + "loss": 4.1401, + "step": 5275 + }, + { + "epoch": 0.67584, + "grad_norm": 3.202610969543457, + "learning_rate": 7.4679045024163765e-06, + "loss": 4.0416, + "step": 5280 + }, + { + "epoch": 0.67648, + "grad_norm": 3.40617036819458, + "learning_rate": 7.463517343852659e-06, + "loss": 3.9643, + "step": 5285 + }, + { + "epoch": 0.67712, + "grad_norm": 3.3042664527893066, + "learning_rate": 7.459127679208536e-06, + "loss": 4.0707, + "step": 5290 + }, + { + "epoch": 0.67776, + "grad_norm": 3.46657657623291, + "learning_rate": 7.454735512949515e-06, + "loss": 4.0634, + "step": 5295 + }, + { + "epoch": 0.6784, + "grad_norm": 3.1254940032958984, + "learning_rate": 7.450340849543647e-06, + "loss": 4.0323, + "step": 5300 + }, + { + "epoch": 0.6784, + "eval_loss": 1.018658995628357, + "eval_runtime": 7.1603, + "eval_samples_per_second": 139.658, + "eval_steps_per_second": 17.457, + "step": 5300 + }, + { + "epoch": 0.67904, + "grad_norm": 3.234954595565796, + "learning_rate": 7.445943693461524e-06, + "loss": 4.1678, + "step": 5305 + }, + { + "epoch": 0.67968, + "grad_norm": 3.1998000144958496, + "learning_rate": 7.441544049176272e-06, + "loss": 4.0177, + "step": 5310 + }, + { + "epoch": 0.68032, + "grad_norm": 3.3766448497772217, + "learning_rate": 7.437141921163551e-06, + "loss": 4.0408, + "step": 5315 + }, + { + "epoch": 0.68096, + "grad_norm": 3.486940860748291, + "learning_rate": 7.432737313901546e-06, + "loss": 4.0692, + "step": 5320 + }, + { + "epoch": 0.6816, + "grad_norm": 3.4825565814971924, + "learning_rate": 7.428330231870963e-06, + "loss": 4.1543, + "step": 5325 + }, + { + "epoch": 0.68224, + "grad_norm": 3.2892098426818848, + "learning_rate": 7.423920679555029e-06, + "loss": 4.0388, + "step": 5330 + }, + { + "epoch": 0.68288, + "grad_norm": 3.7257015705108643, + "learning_rate": 7.419508661439479e-06, + "loss": 4.1455, + "step": 5335 + }, + { + "epoch": 0.68352, + "grad_norm": 3.36126708984375, + "learning_rate": 7.415094182012561e-06, + "loss": 4.0867, + "step": 5340 + }, + { + "epoch": 0.68416, + "grad_norm": 3.403200387954712, + "learning_rate": 7.410677245765024e-06, + "loss": 3.9311, + "step": 5345 + }, + { + "epoch": 0.6848, + "grad_norm": 3.3477866649627686, + "learning_rate": 7.406257857190118e-06, + "loss": 4.0525, + "step": 5350 + }, + { + "epoch": 0.68544, + "grad_norm": 3.6489646434783936, + "learning_rate": 7.401836020783586e-06, + "loss": 4.1045, + "step": 5355 + }, + { + "epoch": 0.68608, + "grad_norm": 3.322072744369507, + "learning_rate": 7.397411741043663e-06, + "loss": 4.065, + "step": 5360 + }, + { + "epoch": 0.68672, + "grad_norm": 3.329051971435547, + "learning_rate": 7.3929850224710675e-06, + "loss": 4.0974, + "step": 5365 + }, + { + "epoch": 0.68736, + "grad_norm": 2.996154308319092, + "learning_rate": 7.388555869569001e-06, + "loss": 4.127, + "step": 5370 + }, + { + "epoch": 0.688, + "grad_norm": 3.2369041442871094, + "learning_rate": 7.3841242868431395e-06, + "loss": 4.2309, + "step": 5375 + }, + { + "epoch": 0.68864, + "grad_norm": 3.5981876850128174, + "learning_rate": 7.379690278801633e-06, + "loss": 4.0174, + "step": 5380 + }, + { + "epoch": 0.68928, + "grad_norm": 3.3885374069213867, + "learning_rate": 7.375253849955097e-06, + "loss": 4.0164, + "step": 5385 + }, + { + "epoch": 0.68992, + "grad_norm": 3.208528518676758, + "learning_rate": 7.37081500481661e-06, + "loss": 4.2045, + "step": 5390 + }, + { + "epoch": 0.69056, + "grad_norm": 3.3751118183135986, + "learning_rate": 7.366373747901708e-06, + "loss": 3.9949, + "step": 5395 + }, + { + "epoch": 0.6912, + "grad_norm": 3.512998580932617, + "learning_rate": 7.361930083728383e-06, + "loss": 3.9627, + "step": 5400 + }, + { + "epoch": 0.6912, + "eval_loss": 1.028464436531067, + "eval_runtime": 7.0644, + "eval_samples_per_second": 141.555, + "eval_steps_per_second": 17.694, + "step": 5400 + }, + { + "epoch": 0.69184, + "grad_norm": 3.4754066467285156, + "learning_rate": 7.35748401681707e-06, + "loss": 4.1225, + "step": 5405 + }, + { + "epoch": 0.69248, + "grad_norm": 3.389253854751587, + "learning_rate": 7.353035551690657e-06, + "loss": 4.1547, + "step": 5410 + }, + { + "epoch": 0.69312, + "grad_norm": 3.2215898036956787, + "learning_rate": 7.3485846928744635e-06, + "loss": 4.1092, + "step": 5415 + }, + { + "epoch": 0.69376, + "grad_norm": 3.4230103492736816, + "learning_rate": 7.344131444896249e-06, + "loss": 4.0162, + "step": 5420 + }, + { + "epoch": 0.6944, + "grad_norm": 3.33898663520813, + "learning_rate": 7.3396758122862e-06, + "loss": 4.0614, + "step": 5425 + }, + { + "epoch": 0.69504, + "grad_norm": 2.969716787338257, + "learning_rate": 7.335217799576935e-06, + "loss": 3.9111, + "step": 5430 + }, + { + "epoch": 0.69568, + "grad_norm": 3.3126251697540283, + "learning_rate": 7.3307574113034825e-06, + "loss": 4.0983, + "step": 5435 + }, + { + "epoch": 0.69632, + "grad_norm": 3.4935758113861084, + "learning_rate": 7.326294652003301e-06, + "loss": 4.1387, + "step": 5440 + }, + { + "epoch": 0.69696, + "grad_norm": 4.090250492095947, + "learning_rate": 7.3218295262162506e-06, + "loss": 4.2359, + "step": 5445 + }, + { + "epoch": 0.6976, + "grad_norm": 3.3421008586883545, + "learning_rate": 7.317362038484603e-06, + "loss": 3.9774, + "step": 5450 + }, + { + "epoch": 0.69824, + "grad_norm": 3.265035629272461, + "learning_rate": 7.312892193353035e-06, + "loss": 4.0956, + "step": 5455 + }, + { + "epoch": 0.69888, + "grad_norm": 3.2407443523406982, + "learning_rate": 7.308419995368616e-06, + "loss": 4.1528, + "step": 5460 + }, + { + "epoch": 0.69952, + "grad_norm": 3.419646739959717, + "learning_rate": 7.303945449080813e-06, + "loss": 3.9912, + "step": 5465 + }, + { + "epoch": 0.70016, + "grad_norm": 3.481679677963257, + "learning_rate": 7.29946855904148e-06, + "loss": 4.1215, + "step": 5470 + }, + { + "epoch": 0.7008, + "grad_norm": 3.273656129837036, + "learning_rate": 7.294989329804857e-06, + "loss": 3.9999, + "step": 5475 + }, + { + "epoch": 0.70144, + "grad_norm": 3.0722243785858154, + "learning_rate": 7.29050776592756e-06, + "loss": 4.1311, + "step": 5480 + }, + { + "epoch": 0.70208, + "grad_norm": 3.490076780319214, + "learning_rate": 7.286023871968585e-06, + "loss": 4.1776, + "step": 5485 + }, + { + "epoch": 0.70272, + "grad_norm": 3.279628276824951, + "learning_rate": 7.281537652489295e-06, + "loss": 4.0772, + "step": 5490 + }, + { + "epoch": 0.70336, + "grad_norm": 3.328392505645752, + "learning_rate": 7.277049112053418e-06, + "loss": 4.0715, + "step": 5495 + }, + { + "epoch": 0.704, + "grad_norm": 3.36295747756958, + "learning_rate": 7.272558255227047e-06, + "loss": 4.1156, + "step": 5500 + }, + { + "epoch": 0.704, + "eval_loss": 1.024857997894287, + "eval_runtime": 6.7913, + "eval_samples_per_second": 147.247, + "eval_steps_per_second": 18.406, + "step": 5500 + }, + { + "epoch": 0.70464, + "grad_norm": 3.70607328414917, + "learning_rate": 7.268065086578627e-06, + "loss": 4.0388, + "step": 5505 + }, + { + "epoch": 0.70528, + "grad_norm": 3.330237865447998, + "learning_rate": 7.263569610678958e-06, + "loss": 4.2135, + "step": 5510 + }, + { + "epoch": 0.70592, + "grad_norm": 3.2931463718414307, + "learning_rate": 7.259071832101186e-06, + "loss": 4.1573, + "step": 5515 + }, + { + "epoch": 0.70656, + "grad_norm": 3.4031548500061035, + "learning_rate": 7.254571755420796e-06, + "loss": 4.2077, + "step": 5520 + }, + { + "epoch": 0.7072, + "grad_norm": 3.3596532344818115, + "learning_rate": 7.250069385215619e-06, + "loss": 4.0455, + "step": 5525 + }, + { + "epoch": 0.70784, + "grad_norm": 3.28298282623291, + "learning_rate": 7.245564726065811e-06, + "loss": 4.0913, + "step": 5530 + }, + { + "epoch": 0.70848, + "grad_norm": 3.297868251800537, + "learning_rate": 7.241057782553862e-06, + "loss": 4.098, + "step": 5535 + }, + { + "epoch": 0.70912, + "grad_norm": 3.110238552093506, + "learning_rate": 7.2365485592645815e-06, + "loss": 4.0421, + "step": 5540 + }, + { + "epoch": 0.70976, + "grad_norm": 3.42336106300354, + "learning_rate": 7.232037060785102e-06, + "loss": 4.1329, + "step": 5545 + }, + { + "epoch": 0.7104, + "grad_norm": 3.1975576877593994, + "learning_rate": 7.227523291704866e-06, + "loss": 4.1384, + "step": 5550 + }, + { + "epoch": 0.71104, + "grad_norm": 3.52624773979187, + "learning_rate": 7.2230072566156305e-06, + "loss": 4.0816, + "step": 5555 + }, + { + "epoch": 0.71168, + "grad_norm": 3.3925981521606445, + "learning_rate": 7.218488960111455e-06, + "loss": 4.023, + "step": 5560 + }, + { + "epoch": 0.71232, + "grad_norm": 3.7120208740234375, + "learning_rate": 7.213968406788703e-06, + "loss": 4.2338, + "step": 5565 + }, + { + "epoch": 0.71296, + "grad_norm": 3.375377893447876, + "learning_rate": 7.209445601246027e-06, + "loss": 4.1258, + "step": 5570 + }, + { + "epoch": 0.7136, + "grad_norm": 3.4685449600219727, + "learning_rate": 7.204920548084378e-06, + "loss": 4.0949, + "step": 5575 + }, + { + "epoch": 0.71424, + "grad_norm": 3.564234733581543, + "learning_rate": 7.200393251906985e-06, + "loss": 4.0926, + "step": 5580 + }, + { + "epoch": 0.71488, + "grad_norm": 3.3394558429718018, + "learning_rate": 7.19586371731937e-06, + "loss": 4.0769, + "step": 5585 + }, + { + "epoch": 0.71552, + "grad_norm": 3.314234972000122, + "learning_rate": 7.191331948929323e-06, + "loss": 4.0644, + "step": 5590 + }, + { + "epoch": 0.71616, + "grad_norm": 3.2954790592193604, + "learning_rate": 7.18679795134691e-06, + "loss": 4.0076, + "step": 5595 + }, + { + "epoch": 0.7168, + "grad_norm": 3.253471851348877, + "learning_rate": 7.182261729184463e-06, + "loss": 4.013, + "step": 5600 + }, + { + "epoch": 0.7168, + "eval_loss": 1.016201376914978, + "eval_runtime": 6.8775, + "eval_samples_per_second": 145.401, + "eval_steps_per_second": 18.175, + "step": 5600 + }, + { + "epoch": 0.71744, + "grad_norm": 3.3830933570861816, + "learning_rate": 7.17772328705658e-06, + "loss": 4.1317, + "step": 5605 + }, + { + "epoch": 0.71808, + "grad_norm": 2.9672000408172607, + "learning_rate": 7.173182629580113e-06, + "loss": 3.9456, + "step": 5610 + }, + { + "epoch": 0.71872, + "grad_norm": 3.272923469543457, + "learning_rate": 7.168639761374173e-06, + "loss": 4.0034, + "step": 5615 + }, + { + "epoch": 0.71936, + "grad_norm": 3.0897445678710938, + "learning_rate": 7.1640946870601135e-06, + "loss": 3.9826, + "step": 5620 + }, + { + "epoch": 0.72, + "grad_norm": 3.123854398727417, + "learning_rate": 7.159547411261538e-06, + "loss": 3.9751, + "step": 5625 + }, + { + "epoch": 0.72064, + "grad_norm": 3.2446231842041016, + "learning_rate": 7.154997938604287e-06, + "loss": 3.9737, + "step": 5630 + }, + { + "epoch": 0.72128, + "grad_norm": 3.1185810565948486, + "learning_rate": 7.150446273716435e-06, + "loss": 3.9893, + "step": 5635 + }, + { + "epoch": 0.72192, + "grad_norm": 3.1517674922943115, + "learning_rate": 7.145892421228289e-06, + "loss": 4.022, + "step": 5640 + }, + { + "epoch": 0.72256, + "grad_norm": 3.4120028018951416, + "learning_rate": 7.141336385772377e-06, + "loss": 4.119, + "step": 5645 + }, + { + "epoch": 0.7232, + "grad_norm": 3.3022778034210205, + "learning_rate": 7.136778171983456e-06, + "loss": 3.9917, + "step": 5650 + }, + { + "epoch": 0.72384, + "grad_norm": 3.258190155029297, + "learning_rate": 7.1322177844984884e-06, + "loss": 4.0934, + "step": 5655 + }, + { + "epoch": 0.72448, + "grad_norm": 4.144087314605713, + "learning_rate": 7.127655227956656e-06, + "loss": 4.1012, + "step": 5660 + }, + { + "epoch": 0.72512, + "grad_norm": 3.4005286693573, + "learning_rate": 7.123090506999342e-06, + "loss": 4.1081, + "step": 5665 + }, + { + "epoch": 0.72576, + "grad_norm": 3.2259960174560547, + "learning_rate": 7.118523626270137e-06, + "loss": 4.1488, + "step": 5670 + }, + { + "epoch": 0.7264, + "grad_norm": 3.616593599319458, + "learning_rate": 7.113954590414822e-06, + "loss": 4.0788, + "step": 5675 + }, + { + "epoch": 0.72704, + "grad_norm": 3.442378520965576, + "learning_rate": 7.109383404081378e-06, + "loss": 3.9939, + "step": 5680 + }, + { + "epoch": 0.72768, + "grad_norm": 3.6199593544006348, + "learning_rate": 7.104810071919964e-06, + "loss": 3.9184, + "step": 5685 + }, + { + "epoch": 0.72832, + "grad_norm": 3.5864577293395996, + "learning_rate": 7.10023459858293e-06, + "loss": 4.1875, + "step": 5690 + }, + { + "epoch": 0.72896, + "grad_norm": 3.2463033199310303, + "learning_rate": 7.095656988724802e-06, + "loss": 4.0096, + "step": 5695 + }, + { + "epoch": 0.7296, + "grad_norm": 3.480454206466675, + "learning_rate": 7.0910772470022784e-06, + "loss": 4.1788, + "step": 5700 + }, + { + "epoch": 0.7296, + "eval_loss": 1.0143406391143799, + "eval_runtime": 7.072, + "eval_samples_per_second": 141.402, + "eval_steps_per_second": 17.675, + "step": 5700 + }, + { + "epoch": 0.73024, + "grad_norm": 3.228858709335327, + "learning_rate": 7.086495378074225e-06, + "loss": 4.0522, + "step": 5705 + }, + { + "epoch": 0.73088, + "grad_norm": 3.310290813446045, + "learning_rate": 7.081911386601677e-06, + "loss": 4.0268, + "step": 5710 + }, + { + "epoch": 0.73152, + "grad_norm": 3.3378937244415283, + "learning_rate": 7.07732527724782e-06, + "loss": 4.1019, + "step": 5715 + }, + { + "epoch": 0.73216, + "grad_norm": 3.3621530532836914, + "learning_rate": 7.072737054678004e-06, + "loss": 4.1155, + "step": 5720 + }, + { + "epoch": 0.7328, + "grad_norm": 3.041964530944824, + "learning_rate": 7.06814672355972e-06, + "loss": 4.113, + "step": 5725 + }, + { + "epoch": 0.73344, + "grad_norm": 3.450547218322754, + "learning_rate": 7.063554288562611e-06, + "loss": 3.9201, + "step": 5730 + }, + { + "epoch": 0.73408, + "grad_norm": 3.349160671234131, + "learning_rate": 7.058959754358455e-06, + "loss": 4.1675, + "step": 5735 + }, + { + "epoch": 0.73472, + "grad_norm": 3.076353073120117, + "learning_rate": 7.0543631256211705e-06, + "loss": 4.0993, + "step": 5740 + }, + { + "epoch": 0.73536, + "grad_norm": 3.2057204246520996, + "learning_rate": 7.0497644070268e-06, + "loss": 4.0656, + "step": 5745 + }, + { + "epoch": 0.736, + "grad_norm": 3.22353196144104, + "learning_rate": 7.045163603253519e-06, + "loss": 3.9592, + "step": 5750 + }, + { + "epoch": 0.73664, + "grad_norm": 3.1400163173675537, + "learning_rate": 7.040560718981618e-06, + "loss": 4.1322, + "step": 5755 + }, + { + "epoch": 0.73728, + "grad_norm": 3.4224133491516113, + "learning_rate": 7.035955758893509e-06, + "loss": 3.9546, + "step": 5760 + }, + { + "epoch": 0.73792, + "grad_norm": 3.4225850105285645, + "learning_rate": 7.031348727673713e-06, + "loss": 4.1259, + "step": 5765 + }, + { + "epoch": 0.73856, + "grad_norm": 3.331876039505005, + "learning_rate": 7.026739630008861e-06, + "loss": 4.026, + "step": 5770 + }, + { + "epoch": 0.7392, + "grad_norm": 3.3707172870635986, + "learning_rate": 7.022128470587679e-06, + "loss": 4.2298, + "step": 5775 + }, + { + "epoch": 0.73984, + "grad_norm": 3.1047780513763428, + "learning_rate": 7.017515254100998e-06, + "loss": 4.0686, + "step": 5780 + }, + { + "epoch": 0.74048, + "grad_norm": 3.0352683067321777, + "learning_rate": 7.012899985241738e-06, + "loss": 3.9605, + "step": 5785 + }, + { + "epoch": 0.74112, + "grad_norm": 3.595513343811035, + "learning_rate": 7.008282668704907e-06, + "loss": 4.023, + "step": 5790 + }, + { + "epoch": 0.74176, + "grad_norm": 3.1861355304718018, + "learning_rate": 7.0036633091875985e-06, + "loss": 4.1268, + "step": 5795 + }, + { + "epoch": 0.7424, + "grad_norm": 3.247361898422241, + "learning_rate": 6.99904191138898e-06, + "loss": 4.1142, + "step": 5800 + }, + { + "epoch": 0.7424, + "eval_loss": 1.005723476409912, + "eval_runtime": 7.0291, + "eval_samples_per_second": 142.265, + "eval_steps_per_second": 17.783, + "step": 5800 + }, + { + "epoch": 0.74304, + "grad_norm": 3.3942816257476807, + "learning_rate": 6.994418480010297e-06, + "loss": 4.0313, + "step": 5805 + }, + { + "epoch": 0.74368, + "grad_norm": 3.075566291809082, + "learning_rate": 6.989793019754858e-06, + "loss": 3.9531, + "step": 5810 + }, + { + "epoch": 0.74432, + "grad_norm": 3.2327611446380615, + "learning_rate": 6.985165535328042e-06, + "loss": 4.0313, + "step": 5815 + }, + { + "epoch": 0.74496, + "grad_norm": 3.170311689376831, + "learning_rate": 6.980536031437284e-06, + "loss": 4.1406, + "step": 5820 + }, + { + "epoch": 0.7456, + "grad_norm": 3.5235507488250732, + "learning_rate": 6.975904512792073e-06, + "loss": 4.0573, + "step": 5825 + }, + { + "epoch": 0.74624, + "grad_norm": 3.7707526683807373, + "learning_rate": 6.971270984103947e-06, + "loss": 3.9786, + "step": 5830 + }, + { + "epoch": 0.74688, + "grad_norm": 3.1437079906463623, + "learning_rate": 6.966635450086492e-06, + "loss": 4.1191, + "step": 5835 + }, + { + "epoch": 0.74752, + "grad_norm": 3.192091703414917, + "learning_rate": 6.961997915455328e-06, + "loss": 4.0604, + "step": 5840 + }, + { + "epoch": 0.74816, + "grad_norm": 3.214885950088501, + "learning_rate": 6.957358384928119e-06, + "loss": 4.2096, + "step": 5845 + }, + { + "epoch": 0.7488, + "grad_norm": 3.4825382232666016, + "learning_rate": 6.952716863224551e-06, + "loss": 4.1515, + "step": 5850 + }, + { + "epoch": 0.74944, + "grad_norm": 3.2055091857910156, + "learning_rate": 6.948073355066339e-06, + "loss": 3.993, + "step": 5855 + }, + { + "epoch": 0.75008, + "grad_norm": 3.43379282951355, + "learning_rate": 6.9434278651772205e-06, + "loss": 3.9822, + "step": 5860 + }, + { + "epoch": 0.75072, + "grad_norm": 2.9562249183654785, + "learning_rate": 6.938780398282945e-06, + "loss": 3.9083, + "step": 5865 + }, + { + "epoch": 0.75136, + "grad_norm": 3.5767860412597656, + "learning_rate": 6.934130959111276e-06, + "loss": 4.1058, + "step": 5870 + }, + { + "epoch": 0.752, + "grad_norm": 3.2050375938415527, + "learning_rate": 6.929479552391985e-06, + "loss": 4.1939, + "step": 5875 + }, + { + "epoch": 0.75264, + "grad_norm": 3.266775369644165, + "learning_rate": 6.924826182856839e-06, + "loss": 4.011, + "step": 5880 + }, + { + "epoch": 0.75328, + "grad_norm": 3.255232810974121, + "learning_rate": 6.920170855239607e-06, + "loss": 3.9561, + "step": 5885 + }, + { + "epoch": 0.75392, + "grad_norm": 3.48268985748291, + "learning_rate": 6.915513574276049e-06, + "loss": 4.2584, + "step": 5890 + }, + { + "epoch": 0.75456, + "grad_norm": 3.360429525375366, + "learning_rate": 6.910854344703912e-06, + "loss": 4.0792, + "step": 5895 + }, + { + "epoch": 0.7552, + "grad_norm": 3.310023307800293, + "learning_rate": 6.906193171262922e-06, + "loss": 4.1032, + "step": 5900 + }, + { + "epoch": 0.7552, + "eval_loss": 1.0207964181900024, + "eval_runtime": 6.8554, + "eval_samples_per_second": 145.871, + "eval_steps_per_second": 18.234, + "step": 5900 + }, + { + "epoch": 0.75584, + "grad_norm": 3.7510457038879395, + "learning_rate": 6.9015300586947876e-06, + "loss": 4.1674, + "step": 5905 + }, + { + "epoch": 0.75648, + "grad_norm": 3.67040753364563, + "learning_rate": 6.896865011743187e-06, + "loss": 4.0561, + "step": 5910 + }, + { + "epoch": 0.75712, + "grad_norm": 3.0624032020568848, + "learning_rate": 6.892198035153767e-06, + "loss": 3.8555, + "step": 5915 + }, + { + "epoch": 0.75776, + "grad_norm": 3.3737337589263916, + "learning_rate": 6.887529133674137e-06, + "loss": 4.1243, + "step": 5920 + }, + { + "epoch": 0.7584, + "grad_norm": 3.3432435989379883, + "learning_rate": 6.882858312053864e-06, + "loss": 4.0493, + "step": 5925 + }, + { + "epoch": 0.75904, + "grad_norm": 3.475452423095703, + "learning_rate": 6.8781855750444704e-06, + "loss": 4.141, + "step": 5930 + }, + { + "epoch": 0.75968, + "grad_norm": 3.452732801437378, + "learning_rate": 6.873510927399425e-06, + "loss": 4.0296, + "step": 5935 + }, + { + "epoch": 0.76032, + "grad_norm": 3.238232135772705, + "learning_rate": 6.86883437387414e-06, + "loss": 4.045, + "step": 5940 + }, + { + "epoch": 0.76096, + "grad_norm": 3.2131717205047607, + "learning_rate": 6.86415591922597e-06, + "loss": 4.0805, + "step": 5945 + }, + { + "epoch": 0.7616, + "grad_norm": 3.2297801971435547, + "learning_rate": 6.859475568214199e-06, + "loss": 3.993, + "step": 5950 + }, + { + "epoch": 0.76224, + "grad_norm": 3.0972037315368652, + "learning_rate": 6.854793325600042e-06, + "loss": 3.9922, + "step": 5955 + }, + { + "epoch": 0.76288, + "grad_norm": 3.390723943710327, + "learning_rate": 6.850109196146641e-06, + "loss": 4.0131, + "step": 5960 + }, + { + "epoch": 0.76352, + "grad_norm": 4.386783599853516, + "learning_rate": 6.84542318461905e-06, + "loss": 3.9761, + "step": 5965 + }, + { + "epoch": 0.76416, + "grad_norm": 3.243786096572876, + "learning_rate": 6.840735295784245e-06, + "loss": 3.9654, + "step": 5970 + }, + { + "epoch": 0.7648, + "grad_norm": 3.3371636867523193, + "learning_rate": 6.83604553441111e-06, + "loss": 4.0202, + "step": 5975 + }, + { + "epoch": 0.76544, + "grad_norm": 3.3030002117156982, + "learning_rate": 6.831353905270433e-06, + "loss": 4.0206, + "step": 5980 + }, + { + "epoch": 0.76608, + "grad_norm": 3.3030149936676025, + "learning_rate": 6.8266604131349015e-06, + "loss": 4.0615, + "step": 5985 + }, + { + "epoch": 0.76672, + "grad_norm": 3.28432559967041, + "learning_rate": 6.821965062779098e-06, + "loss": 3.9029, + "step": 5990 + }, + { + "epoch": 0.76736, + "grad_norm": 3.092695951461792, + "learning_rate": 6.817267858979497e-06, + "loss": 3.9879, + "step": 5995 + }, + { + "epoch": 0.768, + "grad_norm": 3.263754367828369, + "learning_rate": 6.812568806514457e-06, + "loss": 3.9958, + "step": 6000 + }, + { + "epoch": 0.768, + "eval_loss": 1.0079742670059204, + "eval_runtime": 6.9911, + "eval_samples_per_second": 143.039, + "eval_steps_per_second": 17.88, + "step": 6000 + }, + { + "epoch": 0.76864, + "grad_norm": 3.4394893646240234, + "learning_rate": 6.807867910164216e-06, + "loss": 4.0567, + "step": 6005 + }, + { + "epoch": 0.76928, + "grad_norm": 3.2342753410339355, + "learning_rate": 6.803165174710895e-06, + "loss": 3.9647, + "step": 6010 + }, + { + "epoch": 0.76992, + "grad_norm": 3.2559092044830322, + "learning_rate": 6.798460604938475e-06, + "loss": 4.0027, + "step": 6015 + }, + { + "epoch": 0.77056, + "grad_norm": 3.206169843673706, + "learning_rate": 6.79375420563281e-06, + "loss": 4.0209, + "step": 6020 + }, + { + "epoch": 0.7712, + "grad_norm": 3.1606526374816895, + "learning_rate": 6.789045981581612e-06, + "loss": 3.9584, + "step": 6025 + }, + { + "epoch": 0.77184, + "grad_norm": 3.431877613067627, + "learning_rate": 6.784335937574456e-06, + "loss": 3.9336, + "step": 6030 + }, + { + "epoch": 0.77248, + "grad_norm": 3.3919317722320557, + "learning_rate": 6.779624078402755e-06, + "loss": 4.0337, + "step": 6035 + }, + { + "epoch": 0.77312, + "grad_norm": 3.114204168319702, + "learning_rate": 6.774910408859781e-06, + "loss": 4.0327, + "step": 6040 + }, + { + "epoch": 0.77376, + "grad_norm": 3.5469913482666016, + "learning_rate": 6.770194933740645e-06, + "loss": 4.1115, + "step": 6045 + }, + { + "epoch": 0.7744, + "grad_norm": 3.3311476707458496, + "learning_rate": 6.76547765784229e-06, + "loss": 4.0417, + "step": 6050 + }, + { + "epoch": 0.77504, + "grad_norm": 3.26802921295166, + "learning_rate": 6.760758585963495e-06, + "loss": 4.0431, + "step": 6055 + }, + { + "epoch": 0.77568, + "grad_norm": 3.6536500453948975, + "learning_rate": 6.756037722904867e-06, + "loss": 4.0063, + "step": 6060 + }, + { + "epoch": 0.77632, + "grad_norm": 3.4150497913360596, + "learning_rate": 6.7513150734688285e-06, + "loss": 4.076, + "step": 6065 + }, + { + "epoch": 0.77696, + "grad_norm": 3.2071707248687744, + "learning_rate": 6.746590642459628e-06, + "loss": 4.0914, + "step": 6070 + }, + { + "epoch": 0.7776, + "grad_norm": 3.3766379356384277, + "learning_rate": 6.741864434683319e-06, + "loss": 3.9751, + "step": 6075 + }, + { + "epoch": 0.77824, + "grad_norm": 3.4125139713287354, + "learning_rate": 6.737136454947768e-06, + "loss": 4.3161, + "step": 6080 + }, + { + "epoch": 0.77888, + "grad_norm": 3.257509231567383, + "learning_rate": 6.73240670806264e-06, + "loss": 3.925, + "step": 6085 + }, + { + "epoch": 0.77952, + "grad_norm": 3.6318936347961426, + "learning_rate": 6.727675198839403e-06, + "loss": 4.025, + "step": 6090 + }, + { + "epoch": 0.78016, + "grad_norm": 3.7387313842773438, + "learning_rate": 6.722941932091309e-06, + "loss": 4.2462, + "step": 6095 + }, + { + "epoch": 0.7808, + "grad_norm": 3.1425154209136963, + "learning_rate": 6.718206912633407e-06, + "loss": 4.0087, + "step": 6100 + }, + { + "epoch": 0.7808, + "eval_loss": 1.0091650485992432, + "eval_runtime": 8.1261, + "eval_samples_per_second": 123.06, + "eval_steps_per_second": 15.383, + "step": 6100 + }, + { + "epoch": 0.78144, + "grad_norm": 3.4126670360565186, + "learning_rate": 6.7134701452825225e-06, + "loss": 4.0624, + "step": 6105 + }, + { + "epoch": 0.78208, + "grad_norm": 3.345456600189209, + "learning_rate": 6.7087316348572626e-06, + "loss": 4.0735, + "step": 6110 + }, + { + "epoch": 0.78272, + "grad_norm": 3.3764288425445557, + "learning_rate": 6.703991386178008e-06, + "loss": 3.9901, + "step": 6115 + }, + { + "epoch": 0.78336, + "grad_norm": 3.302940845489502, + "learning_rate": 6.699249404066906e-06, + "loss": 4.0439, + "step": 6120 + }, + { + "epoch": 0.784, + "grad_norm": 3.295030355453491, + "learning_rate": 6.694505693347866e-06, + "loss": 3.9957, + "step": 6125 + }, + { + "epoch": 0.78464, + "grad_norm": 3.1348817348480225, + "learning_rate": 6.689760258846557e-06, + "loss": 3.8965, + "step": 6130 + }, + { + "epoch": 0.78528, + "grad_norm": 3.459271192550659, + "learning_rate": 6.685013105390404e-06, + "loss": 4.0026, + "step": 6135 + }, + { + "epoch": 0.78592, + "grad_norm": 3.3416779041290283, + "learning_rate": 6.680264237808578e-06, + "loss": 3.9384, + "step": 6140 + }, + { + "epoch": 0.78656, + "grad_norm": 3.24820876121521, + "learning_rate": 6.6755136609319945e-06, + "loss": 3.992, + "step": 6145 + }, + { + "epoch": 0.7872, + "grad_norm": 3.351857900619507, + "learning_rate": 6.670761379593308e-06, + "loss": 4.1481, + "step": 6150 + }, + { + "epoch": 0.78784, + "grad_norm": 3.287588119506836, + "learning_rate": 6.666007398626907e-06, + "loss": 3.9051, + "step": 6155 + }, + { + "epoch": 0.78848, + "grad_norm": 3.1063315868377686, + "learning_rate": 6.661251722868907e-06, + "loss": 3.9763, + "step": 6160 + }, + { + "epoch": 0.78912, + "grad_norm": 3.2057151794433594, + "learning_rate": 6.65649435715715e-06, + "loss": 3.9564, + "step": 6165 + }, + { + "epoch": 0.78976, + "grad_norm": 3.377319574356079, + "learning_rate": 6.6517353063311985e-06, + "loss": 4.0139, + "step": 6170 + }, + { + "epoch": 0.7904, + "grad_norm": 3.3640501499176025, + "learning_rate": 6.646974575232326e-06, + "loss": 3.9932, + "step": 6175 + }, + { + "epoch": 0.79104, + "grad_norm": 3.394688367843628, + "learning_rate": 6.642212168703512e-06, + "loss": 3.9131, + "step": 6180 + }, + { + "epoch": 0.79168, + "grad_norm": 3.154578447341919, + "learning_rate": 6.637448091589451e-06, + "loss": 4.0465, + "step": 6185 + }, + { + "epoch": 0.79232, + "grad_norm": 3.3188998699188232, + "learning_rate": 6.632682348736529e-06, + "loss": 3.9791, + "step": 6190 + }, + { + "epoch": 0.79296, + "grad_norm": 3.3017475605010986, + "learning_rate": 6.627914944992827e-06, + "loss": 3.9438, + "step": 6195 + }, + { + "epoch": 0.7936, + "grad_norm": 3.4923770427703857, + "learning_rate": 6.623145885208117e-06, + "loss": 4.0598, + "step": 6200 + }, + { + "epoch": 0.7936, + "eval_loss": 1.0139714479446411, + "eval_runtime": 6.7565, + "eval_samples_per_second": 148.006, + "eval_steps_per_second": 18.501, + "step": 6200 + }, + { + "epoch": 0.79424, + "grad_norm": 3.5045602321624756, + "learning_rate": 6.618375174233857e-06, + "loss": 3.9786, + "step": 6205 + }, + { + "epoch": 0.79488, + "grad_norm": 3.394674301147461, + "learning_rate": 6.613602816923183e-06, + "loss": 3.9143, + "step": 6210 + }, + { + "epoch": 0.79552, + "grad_norm": 3.285093307495117, + "learning_rate": 6.608828818130903e-06, + "loss": 3.9806, + "step": 6215 + }, + { + "epoch": 0.79616, + "grad_norm": 3.3816416263580322, + "learning_rate": 6.604053182713501e-06, + "loss": 3.9389, + "step": 6220 + }, + { + "epoch": 0.7968, + "grad_norm": 3.101391315460205, + "learning_rate": 6.599275915529124e-06, + "loss": 4.0285, + "step": 6225 + }, + { + "epoch": 0.79744, + "grad_norm": 3.581902027130127, + "learning_rate": 6.594497021437573e-06, + "loss": 4.1661, + "step": 6230 + }, + { + "epoch": 0.79808, + "grad_norm": 3.477958917617798, + "learning_rate": 6.5897165053003145e-06, + "loss": 4.0367, + "step": 6235 + }, + { + "epoch": 0.79872, + "grad_norm": 3.292573928833008, + "learning_rate": 6.584934371980452e-06, + "loss": 4.0413, + "step": 6240 + }, + { + "epoch": 0.79936, + "grad_norm": 3.326529026031494, + "learning_rate": 6.58015062634275e-06, + "loss": 4.0497, + "step": 6245 + }, + { + "epoch": 0.8, + "grad_norm": 3.278815269470215, + "learning_rate": 6.575365273253598e-06, + "loss": 4.0507, + "step": 6250 + }, + { + "epoch": 0.80064, + "grad_norm": 3.2232770919799805, + "learning_rate": 6.570578317581029e-06, + "loss": 4.0234, + "step": 6255 + }, + { + "epoch": 0.80128, + "grad_norm": 3.564846992492676, + "learning_rate": 6.5657897641947045e-06, + "loss": 3.8909, + "step": 6260 + }, + { + "epoch": 0.80192, + "grad_norm": 3.3442466259002686, + "learning_rate": 6.560999617965914e-06, + "loss": 4.0105, + "step": 6265 + }, + { + "epoch": 0.80256, + "grad_norm": 3.1271111965179443, + "learning_rate": 6.5562078837675625e-06, + "loss": 3.9133, + "step": 6270 + }, + { + "epoch": 0.8032, + "grad_norm": 3.4506609439849854, + "learning_rate": 6.551414566474173e-06, + "loss": 3.9542, + "step": 6275 + }, + { + "epoch": 0.80384, + "grad_norm": 3.406909942626953, + "learning_rate": 6.546619670961878e-06, + "loss": 4.0518, + "step": 6280 + }, + { + "epoch": 0.80448, + "grad_norm": 3.421560525894165, + "learning_rate": 6.5418232021084175e-06, + "loss": 4.0037, + "step": 6285 + }, + { + "epoch": 0.80512, + "grad_norm": 3.2941651344299316, + "learning_rate": 6.537025164793129e-06, + "loss": 3.8791, + "step": 6290 + }, + { + "epoch": 0.80576, + "grad_norm": 3.4293181896209717, + "learning_rate": 6.532225563896949e-06, + "loss": 3.9889, + "step": 6295 + }, + { + "epoch": 0.8064, + "grad_norm": 3.2118945121765137, + "learning_rate": 6.527424404302403e-06, + "loss": 4.1149, + "step": 6300 + }, + { + "epoch": 0.8064, + "eval_loss": 0.9931904077529907, + "eval_runtime": 6.719, + "eval_samples_per_second": 148.832, + "eval_steps_per_second": 18.604, + "step": 6300 + }, + { + "epoch": 0.80704, + "grad_norm": 3.647317409515381, + "learning_rate": 6.522621690893598e-06, + "loss": 3.9717, + "step": 6305 + }, + { + "epoch": 0.80768, + "grad_norm": 3.0840768814086914, + "learning_rate": 6.517817428556231e-06, + "loss": 3.9465, + "step": 6310 + }, + { + "epoch": 0.80832, + "grad_norm": 3.4289841651916504, + "learning_rate": 6.513011622177565e-06, + "loss": 3.9262, + "step": 6315 + }, + { + "epoch": 0.80896, + "grad_norm": 3.369891881942749, + "learning_rate": 6.508204276646441e-06, + "loss": 4.0869, + "step": 6320 + }, + { + "epoch": 0.8096, + "grad_norm": 3.300008535385132, + "learning_rate": 6.5033953968532604e-06, + "loss": 3.9217, + "step": 6325 + }, + { + "epoch": 0.81024, + "grad_norm": 3.2289252281188965, + "learning_rate": 6.4985849876899894e-06, + "loss": 4.1345, + "step": 6330 + }, + { + "epoch": 0.81088, + "grad_norm": 3.045872688293457, + "learning_rate": 6.493773054050147e-06, + "loss": 3.942, + "step": 6335 + }, + { + "epoch": 0.81152, + "grad_norm": 3.694000482559204, + "learning_rate": 6.4889596008288065e-06, + "loss": 3.9691, + "step": 6340 + }, + { + "epoch": 0.81216, + "grad_norm": 3.224107503890991, + "learning_rate": 6.484144632922582e-06, + "loss": 4.1082, + "step": 6345 + }, + { + "epoch": 0.8128, + "grad_norm": 3.7049143314361572, + "learning_rate": 6.479328155229634e-06, + "loss": 4.1134, + "step": 6350 + }, + { + "epoch": 0.81344, + "grad_norm": 3.076693534851074, + "learning_rate": 6.474510172649653e-06, + "loss": 3.9307, + "step": 6355 + }, + { + "epoch": 0.81408, + "grad_norm": 3.1345927715301514, + "learning_rate": 6.469690690083867e-06, + "loss": 4.0009, + "step": 6360 + }, + { + "epoch": 0.81472, + "grad_norm": 3.370507001876831, + "learning_rate": 6.464869712435024e-06, + "loss": 4.0593, + "step": 6365 + }, + { + "epoch": 0.81536, + "grad_norm": 3.774531126022339, + "learning_rate": 6.460047244607397e-06, + "loss": 3.9653, + "step": 6370 + }, + { + "epoch": 0.816, + "grad_norm": 3.2934482097625732, + "learning_rate": 6.455223291506772e-06, + "loss": 3.9456, + "step": 6375 + }, + { + "epoch": 0.81664, + "grad_norm": 3.284327745437622, + "learning_rate": 6.450397858040449e-06, + "loss": 3.9911, + "step": 6380 + }, + { + "epoch": 0.81728, + "grad_norm": 3.251145124435425, + "learning_rate": 6.4455709491172295e-06, + "loss": 3.9766, + "step": 6385 + }, + { + "epoch": 0.81792, + "grad_norm": 3.2382752895355225, + "learning_rate": 6.44074256964742e-06, + "loss": 4.0249, + "step": 6390 + }, + { + "epoch": 0.81856, + "grad_norm": 3.185112476348877, + "learning_rate": 6.435912724542822e-06, + "loss": 4.0807, + "step": 6395 + }, + { + "epoch": 0.8192, + "grad_norm": 3.3324737548828125, + "learning_rate": 6.431081418716729e-06, + "loss": 3.9421, + "step": 6400 + }, + { + "epoch": 0.8192, + "eval_loss": 1.0041779279708862, + "eval_runtime": 6.7694, + "eval_samples_per_second": 147.723, + "eval_steps_per_second": 18.465, + "step": 6400 + }, + { + "epoch": 0.81984, + "grad_norm": 3.61067271232605, + "learning_rate": 6.426248657083916e-06, + "loss": 4.0269, + "step": 6405 + }, + { + "epoch": 0.82048, + "grad_norm": 3.5463812351226807, + "learning_rate": 6.421414444560643e-06, + "loss": 3.9552, + "step": 6410 + }, + { + "epoch": 0.82112, + "grad_norm": 3.159905433654785, + "learning_rate": 6.416578786064645e-06, + "loss": 3.9453, + "step": 6415 + }, + { + "epoch": 0.82176, + "grad_norm": 3.876019239425659, + "learning_rate": 6.41174168651513e-06, + "loss": 3.8959, + "step": 6420 + }, + { + "epoch": 0.8224, + "grad_norm": 3.502474308013916, + "learning_rate": 6.406903150832766e-06, + "loss": 3.8898, + "step": 6425 + }, + { + "epoch": 0.82304, + "grad_norm": 3.2995212078094482, + "learning_rate": 6.402063183939687e-06, + "loss": 4.0422, + "step": 6430 + }, + { + "epoch": 0.82368, + "grad_norm": 3.3530595302581787, + "learning_rate": 6.397221790759484e-06, + "loss": 4.1236, + "step": 6435 + }, + { + "epoch": 0.82432, + "grad_norm": 3.162374973297119, + "learning_rate": 6.392378976217195e-06, + "loss": 4.0261, + "step": 6440 + }, + { + "epoch": 0.82496, + "grad_norm": 3.0621769428253174, + "learning_rate": 6.387534745239306e-06, + "loss": 3.9925, + "step": 6445 + }, + { + "epoch": 0.8256, + "grad_norm": 3.232025146484375, + "learning_rate": 6.382689102753741e-06, + "loss": 3.8912, + "step": 6450 + }, + { + "epoch": 0.82624, + "grad_norm": 3.550158739089966, + "learning_rate": 6.377842053689865e-06, + "loss": 4.0563, + "step": 6455 + }, + { + "epoch": 0.82688, + "grad_norm": 3.5361146926879883, + "learning_rate": 6.372993602978471e-06, + "loss": 4.1191, + "step": 6460 + }, + { + "epoch": 0.82752, + "grad_norm": 3.995157241821289, + "learning_rate": 6.368143755551779e-06, + "loss": 4.1001, + "step": 6465 + }, + { + "epoch": 0.82816, + "grad_norm": 3.403334856033325, + "learning_rate": 6.363292516343427e-06, + "loss": 4.0052, + "step": 6470 + }, + { + "epoch": 0.8288, + "grad_norm": 3.6193649768829346, + "learning_rate": 6.358439890288471e-06, + "loss": 4.0026, + "step": 6475 + }, + { + "epoch": 0.82944, + "grad_norm": 3.133255958557129, + "learning_rate": 6.353585882323378e-06, + "loss": 4.0432, + "step": 6480 + }, + { + "epoch": 0.83008, + "grad_norm": 3.300370693206787, + "learning_rate": 6.348730497386022e-06, + "loss": 4.0762, + "step": 6485 + }, + { + "epoch": 0.83072, + "grad_norm": 3.2615299224853516, + "learning_rate": 6.3438737404156725e-06, + "loss": 4.0634, + "step": 6490 + }, + { + "epoch": 0.83136, + "grad_norm": 3.20989727973938, + "learning_rate": 6.3390156163530015e-06, + "loss": 4.0828, + "step": 6495 + }, + { + "epoch": 0.832, + "grad_norm": 3.197261333465576, + "learning_rate": 6.334156130140068e-06, + "loss": 3.8688, + "step": 6500 + }, + { + "epoch": 0.832, + "eval_loss": 1.004305362701416, + "eval_runtime": 7.1118, + "eval_samples_per_second": 140.611, + "eval_steps_per_second": 17.576, + "step": 6500 + }, + { + "epoch": 0.83264, + "grad_norm": 3.9616518020629883, + "learning_rate": 6.329295286720316e-06, + "loss": 3.9318, + "step": 6505 + }, + { + "epoch": 0.83328, + "grad_norm": 3.124983787536621, + "learning_rate": 6.324433091038573e-06, + "loss": 4.0023, + "step": 6510 + }, + { + "epoch": 0.83392, + "grad_norm": 3.1082472801208496, + "learning_rate": 6.31956954804104e-06, + "loss": 4.0784, + "step": 6515 + }, + { + "epoch": 0.83456, + "grad_norm": 3.4243500232696533, + "learning_rate": 6.314704662675289e-06, + "loss": 4.1357, + "step": 6520 + }, + { + "epoch": 0.8352, + "grad_norm": 3.482208490371704, + "learning_rate": 6.3098384398902565e-06, + "loss": 4.3399, + "step": 6525 + }, + { + "epoch": 0.83584, + "grad_norm": 3.1997745037078857, + "learning_rate": 6.3049708846362425e-06, + "loss": 4.0126, + "step": 6530 + }, + { + "epoch": 0.83648, + "grad_norm": 3.6432223320007324, + "learning_rate": 6.300102001864902e-06, + "loss": 3.9565, + "step": 6535 + }, + { + "epoch": 0.83712, + "grad_norm": 3.2619552612304688, + "learning_rate": 6.2952317965292355e-06, + "loss": 4.0278, + "step": 6540 + }, + { + "epoch": 0.83776, + "grad_norm": 3.5579400062561035, + "learning_rate": 6.290360273583596e-06, + "loss": 4.067, + "step": 6545 + }, + { + "epoch": 0.8384, + "grad_norm": 4.077335834503174, + "learning_rate": 6.28548743798367e-06, + "loss": 3.9901, + "step": 6550 + }, + { + "epoch": 0.83904, + "grad_norm": 3.147125482559204, + "learning_rate": 6.280613294686486e-06, + "loss": 3.941, + "step": 6555 + }, + { + "epoch": 0.83968, + "grad_norm": 3.3126308917999268, + "learning_rate": 6.275737848650398e-06, + "loss": 4.1567, + "step": 6560 + }, + { + "epoch": 0.84032, + "grad_norm": 3.32991886138916, + "learning_rate": 6.270861104835086e-06, + "loss": 3.8803, + "step": 6565 + }, + { + "epoch": 0.84096, + "grad_norm": 3.420491933822632, + "learning_rate": 6.265983068201553e-06, + "loss": 3.9706, + "step": 6570 + }, + { + "epoch": 0.8416, + "grad_norm": 4.031120300292969, + "learning_rate": 6.261103743712116e-06, + "loss": 3.9856, + "step": 6575 + }, + { + "epoch": 0.84224, + "grad_norm": 3.198617696762085, + "learning_rate": 6.256223136330398e-06, + "loss": 3.9416, + "step": 6580 + }, + { + "epoch": 0.84288, + "grad_norm": 3.415013313293457, + "learning_rate": 6.251341251021334e-06, + "loss": 4.1193, + "step": 6585 + }, + { + "epoch": 0.84352, + "grad_norm": 3.3532445430755615, + "learning_rate": 6.246458092751151e-06, + "loss": 4.1583, + "step": 6590 + }, + { + "epoch": 0.84416, + "grad_norm": 3.5485520362854004, + "learning_rate": 6.241573666487379e-06, + "loss": 3.9059, + "step": 6595 + }, + { + "epoch": 0.8448, + "grad_norm": 3.8386924266815186, + "learning_rate": 6.236687977198832e-06, + "loss": 3.9479, + "step": 6600 + }, + { + "epoch": 0.8448, + "eval_loss": 0.9972337484359741, + "eval_runtime": 6.9957, + "eval_samples_per_second": 142.944, + "eval_steps_per_second": 17.868, + "step": 6600 + }, + { + "epoch": 0.84544, + "grad_norm": 3.2764065265655518, + "learning_rate": 6.231801029855614e-06, + "loss": 4.0085, + "step": 6605 + }, + { + "epoch": 0.84608, + "grad_norm": 3.119434118270874, + "learning_rate": 6.226912829429104e-06, + "loss": 3.9582, + "step": 6610 + }, + { + "epoch": 0.84672, + "grad_norm": 3.6029162406921387, + "learning_rate": 6.222023380891955e-06, + "loss": 3.8984, + "step": 6615 + }, + { + "epoch": 0.84736, + "grad_norm": 3.146862268447876, + "learning_rate": 6.217132689218097e-06, + "loss": 4.0213, + "step": 6620 + }, + { + "epoch": 0.848, + "grad_norm": 3.5199005603790283, + "learning_rate": 6.212240759382717e-06, + "loss": 3.9775, + "step": 6625 + }, + { + "epoch": 0.84864, + "grad_norm": 3.4542529582977295, + "learning_rate": 6.207347596362265e-06, + "loss": 3.9534, + "step": 6630 + }, + { + "epoch": 0.84928, + "grad_norm": 3.4714694023132324, + "learning_rate": 6.202453205134444e-06, + "loss": 3.9529, + "step": 6635 + }, + { + "epoch": 0.84992, + "grad_norm": 3.232454299926758, + "learning_rate": 6.19755759067821e-06, + "loss": 4.0047, + "step": 6640 + }, + { + "epoch": 0.85056, + "grad_norm": 3.1209444999694824, + "learning_rate": 6.192660757973758e-06, + "loss": 3.973, + "step": 6645 + }, + { + "epoch": 0.8512, + "grad_norm": 3.401189088821411, + "learning_rate": 6.187762712002529e-06, + "loss": 3.9679, + "step": 6650 + }, + { + "epoch": 0.85184, + "grad_norm": 3.1131510734558105, + "learning_rate": 6.182863457747188e-06, + "loss": 3.971, + "step": 6655 + }, + { + "epoch": 0.85248, + "grad_norm": 3.463775396347046, + "learning_rate": 6.177963000191642e-06, + "loss": 3.9328, + "step": 6660 + }, + { + "epoch": 0.85312, + "grad_norm": 3.3299026489257812, + "learning_rate": 6.17306134432101e-06, + "loss": 3.9863, + "step": 6665 + }, + { + "epoch": 0.85376, + "grad_norm": 3.549071788787842, + "learning_rate": 6.168158495121637e-06, + "loss": 3.9282, + "step": 6670 + }, + { + "epoch": 0.8544, + "grad_norm": 3.52746844291687, + "learning_rate": 6.163254457581083e-06, + "loss": 3.8922, + "step": 6675 + }, + { + "epoch": 0.85504, + "grad_norm": 3.465867757797241, + "learning_rate": 6.158349236688111e-06, + "loss": 4.0779, + "step": 6680 + }, + { + "epoch": 0.85568, + "grad_norm": 3.3179733753204346, + "learning_rate": 6.153442837432694e-06, + "loss": 4.0155, + "step": 6685 + }, + { + "epoch": 0.85632, + "grad_norm": 3.2581310272216797, + "learning_rate": 6.148535264806001e-06, + "loss": 3.9893, + "step": 6690 + }, + { + "epoch": 0.85696, + "grad_norm": 3.1983416080474854, + "learning_rate": 6.14362652380039e-06, + "loss": 3.945, + "step": 6695 + }, + { + "epoch": 0.8576, + "grad_norm": 3.1799731254577637, + "learning_rate": 6.138716619409416e-06, + "loss": 4.0342, + "step": 6700 + }, + { + "epoch": 0.8576, + "eval_loss": 1.0036591291427612, + "eval_runtime": 8.0911, + "eval_samples_per_second": 123.593, + "eval_steps_per_second": 15.449, + "step": 6700 + }, + { + "epoch": 0.85824, + "grad_norm": 3.089611291885376, + "learning_rate": 6.133805556627813e-06, + "loss": 3.9963, + "step": 6705 + }, + { + "epoch": 0.85888, + "grad_norm": 3.209955930709839, + "learning_rate": 6.128893340451495e-06, + "loss": 4.0122, + "step": 6710 + }, + { + "epoch": 0.85952, + "grad_norm": 3.2523767948150635, + "learning_rate": 6.123979975877546e-06, + "loss": 3.9918, + "step": 6715 + }, + { + "epoch": 0.86016, + "grad_norm": 3.52829909324646, + "learning_rate": 6.11906546790422e-06, + "loss": 4.0027, + "step": 6720 + }, + { + "epoch": 0.8608, + "grad_norm": 3.4737231731414795, + "learning_rate": 6.114149821530938e-06, + "loss": 4.2401, + "step": 6725 + }, + { + "epoch": 0.86144, + "grad_norm": 3.210808277130127, + "learning_rate": 6.109233041758274e-06, + "loss": 3.9913, + "step": 6730 + }, + { + "epoch": 0.86208, + "grad_norm": 3.2163102626800537, + "learning_rate": 6.104315133587955e-06, + "loss": 3.9215, + "step": 6735 + }, + { + "epoch": 0.86272, + "grad_norm": 3.3578877449035645, + "learning_rate": 6.099396102022859e-06, + "loss": 3.9928, + "step": 6740 + }, + { + "epoch": 0.86336, + "grad_norm": 3.540017604827881, + "learning_rate": 6.094475952067006e-06, + "loss": 4.0551, + "step": 6745 + }, + { + "epoch": 0.864, + "grad_norm": 3.1817259788513184, + "learning_rate": 6.089554688725554e-06, + "loss": 3.9157, + "step": 6750 + }, + { + "epoch": 0.86464, + "grad_norm": 3.662182331085205, + "learning_rate": 6.0846323170047895e-06, + "loss": 3.9394, + "step": 6755 + }, + { + "epoch": 0.86528, + "grad_norm": 4.058024883270264, + "learning_rate": 6.079708841912133e-06, + "loss": 4.0893, + "step": 6760 + }, + { + "epoch": 0.86592, + "grad_norm": 3.132685899734497, + "learning_rate": 6.074784268456125e-06, + "loss": 3.8952, + "step": 6765 + }, + { + "epoch": 0.86656, + "grad_norm": 3.383697986602783, + "learning_rate": 6.069858601646416e-06, + "loss": 4.118, + "step": 6770 + }, + { + "epoch": 0.8672, + "grad_norm": 3.267975091934204, + "learning_rate": 6.064931846493782e-06, + "loss": 3.9389, + "step": 6775 + }, + { + "epoch": 0.86784, + "grad_norm": 3.556748151779175, + "learning_rate": 6.060004008010096e-06, + "loss": 3.8576, + "step": 6780 + }, + { + "epoch": 0.86848, + "grad_norm": 3.565812587738037, + "learning_rate": 6.05507509120834e-06, + "loss": 4.0734, + "step": 6785 + }, + { + "epoch": 0.86912, + "grad_norm": 3.3282663822174072, + "learning_rate": 6.050145101102586e-06, + "loss": 4.057, + "step": 6790 + }, + { + "epoch": 0.86976, + "grad_norm": 3.3860867023468018, + "learning_rate": 6.045214042708003e-06, + "loss": 3.949, + "step": 6795 + }, + { + "epoch": 0.8704, + "grad_norm": 3.271362543106079, + "learning_rate": 6.0402819210408435e-06, + "loss": 3.9863, + "step": 6800 + }, + { + "epoch": 0.8704, + "eval_loss": 0.998278021812439, + "eval_runtime": 7.4136, + "eval_samples_per_second": 134.888, + "eval_steps_per_second": 16.861, + "step": 6800 + }, + { + "epoch": 0.87104, + "grad_norm": 3.317915678024292, + "learning_rate": 6.035348741118444e-06, + "loss": 4.0284, + "step": 6805 + }, + { + "epoch": 0.87168, + "grad_norm": 3.094139814376831, + "learning_rate": 6.030414507959217e-06, + "loss": 3.8683, + "step": 6810 + }, + { + "epoch": 0.87232, + "grad_norm": 3.2260992527008057, + "learning_rate": 6.025479226582647e-06, + "loss": 4.1184, + "step": 6815 + }, + { + "epoch": 0.87296, + "grad_norm": 3.1242997646331787, + "learning_rate": 6.020542902009282e-06, + "loss": 3.9198, + "step": 6820 + }, + { + "epoch": 0.8736, + "grad_norm": 4.395246505737305, + "learning_rate": 6.015605539260736e-06, + "loss": 3.8812, + "step": 6825 + }, + { + "epoch": 0.87424, + "grad_norm": 3.3219218254089355, + "learning_rate": 6.010667143359672e-06, + "loss": 3.9868, + "step": 6830 + }, + { + "epoch": 0.87488, + "grad_norm": 3.319570541381836, + "learning_rate": 6.005727719329813e-06, + "loss": 3.9658, + "step": 6835 + }, + { + "epoch": 0.87552, + "grad_norm": 3.454033136367798, + "learning_rate": 6.000787272195919e-06, + "loss": 4.0934, + "step": 6840 + }, + { + "epoch": 0.87616, + "grad_norm": 3.215515613555908, + "learning_rate": 5.995845806983798e-06, + "loss": 3.9793, + "step": 6845 + }, + { + "epoch": 0.8768, + "grad_norm": 3.322244882583618, + "learning_rate": 5.99090332872029e-06, + "loss": 4.0305, + "step": 6850 + }, + { + "epoch": 0.87744, + "grad_norm": 3.3186748027801514, + "learning_rate": 5.9859598424332656e-06, + "loss": 4.034, + "step": 6855 + }, + { + "epoch": 0.87808, + "grad_norm": 3.5926856994628906, + "learning_rate": 5.9810153531516215e-06, + "loss": 3.9167, + "step": 6860 + }, + { + "epoch": 0.87872, + "grad_norm": 3.6226749420166016, + "learning_rate": 5.976069865905276e-06, + "loss": 3.8649, + "step": 6865 + }, + { + "epoch": 0.87936, + "grad_norm": 3.1754322052001953, + "learning_rate": 5.971123385725159e-06, + "loss": 4.06, + "step": 6870 + }, + { + "epoch": 0.88, + "grad_norm": 3.3285293579101562, + "learning_rate": 5.966175917643214e-06, + "loss": 3.8871, + "step": 6875 + }, + { + "epoch": 0.88064, + "grad_norm": 3.416433334350586, + "learning_rate": 5.961227466692388e-06, + "loss": 3.983, + "step": 6880 + }, + { + "epoch": 0.88128, + "grad_norm": 3.308425188064575, + "learning_rate": 5.95627803790663e-06, + "loss": 3.9962, + "step": 6885 + }, + { + "epoch": 0.88192, + "grad_norm": 3.1291959285736084, + "learning_rate": 5.951327636320878e-06, + "loss": 4.047, + "step": 6890 + }, + { + "epoch": 0.88256, + "grad_norm": 3.337362766265869, + "learning_rate": 5.946376266971068e-06, + "loss": 4.0805, + "step": 6895 + }, + { + "epoch": 0.8832, + "grad_norm": 3.3893113136291504, + "learning_rate": 5.94142393489411e-06, + "loss": 4.0791, + "step": 6900 + }, + { + "epoch": 0.8832, + "eval_loss": 0.9899783730506897, + "eval_runtime": 6.6267, + "eval_samples_per_second": 150.904, + "eval_steps_per_second": 18.863, + "step": 6900 + }, + { + "epoch": 0.88384, + "grad_norm": 3.4587414264678955, + "learning_rate": 5.936470645127906e-06, + "loss": 4.0523, + "step": 6905 + }, + { + "epoch": 0.88448, + "grad_norm": 3.2666921615600586, + "learning_rate": 5.93151640271132e-06, + "loss": 4.0537, + "step": 6910 + }, + { + "epoch": 0.88512, + "grad_norm": 3.273695707321167, + "learning_rate": 5.926561212684194e-06, + "loss": 3.9064, + "step": 6915 + }, + { + "epoch": 0.88576, + "grad_norm": 3.240837574005127, + "learning_rate": 5.921605080087328e-06, + "loss": 3.8327, + "step": 6920 + }, + { + "epoch": 0.8864, + "grad_norm": 3.1848044395446777, + "learning_rate": 5.916648009962487e-06, + "loss": 3.8566, + "step": 6925 + }, + { + "epoch": 0.88704, + "grad_norm": 3.440258741378784, + "learning_rate": 5.911690007352384e-06, + "loss": 3.8696, + "step": 6930 + }, + { + "epoch": 0.88768, + "grad_norm": 16.90064239501953, + "learning_rate": 5.906731077300681e-06, + "loss": 4.1432, + "step": 6935 + }, + { + "epoch": 0.88832, + "grad_norm": 13.862409591674805, + "learning_rate": 5.901771224851989e-06, + "loss": 3.9778, + "step": 6940 + }, + { + "epoch": 0.88896, + "grad_norm": 3.237497568130493, + "learning_rate": 5.896810455051849e-06, + "loss": 3.9928, + "step": 6945 + }, + { + "epoch": 0.8896, + "grad_norm": 3.367391347885132, + "learning_rate": 5.891848772946744e-06, + "loss": 3.9045, + "step": 6950 + }, + { + "epoch": 0.89024, + "grad_norm": 3.2093420028686523, + "learning_rate": 5.88688618358408e-06, + "loss": 3.9564, + "step": 6955 + }, + { + "epoch": 0.89088, + "grad_norm": 3.18294358253479, + "learning_rate": 5.8819226920121855e-06, + "loss": 3.8482, + "step": 6960 + }, + { + "epoch": 0.89152, + "grad_norm": 3.097348928451538, + "learning_rate": 5.876958303280308e-06, + "loss": 3.7759, + "step": 6965 + }, + { + "epoch": 0.89216, + "grad_norm": 3.166609764099121, + "learning_rate": 5.871993022438609e-06, + "loss": 3.8114, + "step": 6970 + }, + { + "epoch": 0.8928, + "grad_norm": 3.337101459503174, + "learning_rate": 5.867026854538156e-06, + "loss": 4.0362, + "step": 6975 + }, + { + "epoch": 0.89344, + "grad_norm": 3.179570436477661, + "learning_rate": 5.862059804630917e-06, + "loss": 4.0118, + "step": 6980 + }, + { + "epoch": 0.89408, + "grad_norm": 3.0153615474700928, + "learning_rate": 5.857091877769762e-06, + "loss": 3.9093, + "step": 6985 + }, + { + "epoch": 0.89472, + "grad_norm": 3.310558557510376, + "learning_rate": 5.852123079008451e-06, + "loss": 3.9642, + "step": 6990 + }, + { + "epoch": 0.89536, + "grad_norm": 3.084897518157959, + "learning_rate": 5.8471534134016274e-06, + "loss": 3.9152, + "step": 6995 + }, + { + "epoch": 0.896, + "grad_norm": 3.2825677394866943, + "learning_rate": 5.842182886004823e-06, + "loss": 3.9253, + "step": 7000 + }, + { + "epoch": 0.896, + "eval_loss": 0.981210470199585, + "eval_runtime": 6.7345, + "eval_samples_per_second": 148.489, + "eval_steps_per_second": 18.561, + "step": 7000 + }, + { + "epoch": 0.89664, + "grad_norm": 3.5258679389953613, + "learning_rate": 5.837211501874438e-06, + "loss": 3.9532, + "step": 7005 + }, + { + "epoch": 0.89728, + "grad_norm": 3.6216719150543213, + "learning_rate": 5.832239266067754e-06, + "loss": 4.1073, + "step": 7010 + }, + { + "epoch": 0.89792, + "grad_norm": 3.2402870655059814, + "learning_rate": 5.8272661836429115e-06, + "loss": 3.9726, + "step": 7015 + }, + { + "epoch": 0.89856, + "grad_norm": 3.2960052490234375, + "learning_rate": 5.822292259658914e-06, + "loss": 3.795, + "step": 7020 + }, + { + "epoch": 0.8992, + "grad_norm": 3.0338668823242188, + "learning_rate": 5.817317499175622e-06, + "loss": 3.8567, + "step": 7025 + }, + { + "epoch": 0.89984, + "grad_norm": 3.07011342048645, + "learning_rate": 5.812341907253749e-06, + "loss": 3.8671, + "step": 7030 + }, + { + "epoch": 0.90048, + "grad_norm": 3.0267298221588135, + "learning_rate": 5.807365488954849e-06, + "loss": 3.9413, + "step": 7035 + }, + { + "epoch": 0.90112, + "grad_norm": 3.2175261974334717, + "learning_rate": 5.802388249341322e-06, + "loss": 3.8435, + "step": 7040 + }, + { + "epoch": 0.90176, + "grad_norm": 3.4809112548828125, + "learning_rate": 5.797410193476399e-06, + "loss": 4.1278, + "step": 7045 + }, + { + "epoch": 0.9024, + "grad_norm": 3.0254108905792236, + "learning_rate": 5.792431326424144e-06, + "loss": 4.05, + "step": 7050 + }, + { + "epoch": 0.90304, + "grad_norm": 3.203744888305664, + "learning_rate": 5.787451653249448e-06, + "loss": 3.8707, + "step": 7055 + }, + { + "epoch": 0.90368, + "grad_norm": 3.6568257808685303, + "learning_rate": 5.782471179018016e-06, + "loss": 3.8331, + "step": 7060 + }, + { + "epoch": 0.90432, + "grad_norm": 3.4851605892181396, + "learning_rate": 5.777489908796374e-06, + "loss": 4.0123, + "step": 7065 + }, + { + "epoch": 0.90496, + "grad_norm": 5.221633434295654, + "learning_rate": 5.772507847651857e-06, + "loss": 3.8164, + "step": 7070 + }, + { + "epoch": 0.9056, + "grad_norm": 3.0933420658111572, + "learning_rate": 5.7675250006525985e-06, + "loss": 3.8729, + "step": 7075 + }, + { + "epoch": 0.90624, + "grad_norm": 3.4247007369995117, + "learning_rate": 5.7625413728675405e-06, + "loss": 3.9543, + "step": 7080 + }, + { + "epoch": 0.90688, + "grad_norm": 3.3053174018859863, + "learning_rate": 5.75755696936641e-06, + "loss": 4.0038, + "step": 7085 + }, + { + "epoch": 0.90752, + "grad_norm": 3.358633279800415, + "learning_rate": 5.752571795219732e-06, + "loss": 3.9082, + "step": 7090 + }, + { + "epoch": 0.90816, + "grad_norm": 3.2632157802581787, + "learning_rate": 5.74758585549881e-06, + "loss": 3.9467, + "step": 7095 + }, + { + "epoch": 0.9088, + "grad_norm": 3.311492443084717, + "learning_rate": 5.742599155275726e-06, + "loss": 3.9281, + "step": 7100 + }, + { + "epoch": 0.9088, + "eval_loss": 1.0036808252334595, + "eval_runtime": 6.9664, + "eval_samples_per_second": 143.545, + "eval_steps_per_second": 17.943, + "step": 7100 + }, + { + "epoch": 0.90944, + "grad_norm": 3.206937789916992, + "learning_rate": 5.73761169962334e-06, + "loss": 4.1132, + "step": 7105 + }, + { + "epoch": 0.91008, + "grad_norm": 3.1101412773132324, + "learning_rate": 5.732623493615273e-06, + "loss": 3.9453, + "step": 7110 + }, + { + "epoch": 0.91072, + "grad_norm": 3.2870442867279053, + "learning_rate": 5.72763454232592e-06, + "loss": 3.9204, + "step": 7115 + }, + { + "epoch": 0.91136, + "grad_norm": 3.2983288764953613, + "learning_rate": 5.722644850830423e-06, + "loss": 3.9068, + "step": 7120 + }, + { + "epoch": 0.912, + "grad_norm": 3.8806018829345703, + "learning_rate": 5.717654424204686e-06, + "loss": 3.9637, + "step": 7125 + }, + { + "epoch": 0.91264, + "grad_norm": 3.4543910026550293, + "learning_rate": 5.7126632675253555e-06, + "loss": 3.9966, + "step": 7130 + }, + { + "epoch": 0.91328, + "grad_norm": 3.230792760848999, + "learning_rate": 5.707671385869822e-06, + "loss": 3.9935, + "step": 7135 + }, + { + "epoch": 0.91392, + "grad_norm": 3.474703073501587, + "learning_rate": 5.702678784316213e-06, + "loss": 3.8884, + "step": 7140 + }, + { + "epoch": 0.91456, + "grad_norm": 3.442061185836792, + "learning_rate": 5.697685467943391e-06, + "loss": 3.9302, + "step": 7145 + }, + { + "epoch": 0.9152, + "grad_norm": 3.4667983055114746, + "learning_rate": 5.6926914418309405e-06, + "loss": 4.066, + "step": 7150 + }, + { + "epoch": 0.91584, + "grad_norm": 3.329434394836426, + "learning_rate": 5.687696711059174e-06, + "loss": 3.9571, + "step": 7155 + }, + { + "epoch": 0.91648, + "grad_norm": 3.290762424468994, + "learning_rate": 5.682701280709117e-06, + "loss": 3.9454, + "step": 7160 + }, + { + "epoch": 0.91712, + "grad_norm": 3.295773983001709, + "learning_rate": 5.677705155862508e-06, + "loss": 3.9735, + "step": 7165 + }, + { + "epoch": 0.91776, + "grad_norm": 3.379249334335327, + "learning_rate": 5.672708341601791e-06, + "loss": 4.0056, + "step": 7170 + }, + { + "epoch": 0.9184, + "grad_norm": 3.3451154232025146, + "learning_rate": 5.667710843010113e-06, + "loss": 4.0466, + "step": 7175 + }, + { + "epoch": 0.91904, + "grad_norm": 3.2415895462036133, + "learning_rate": 5.662712665171315e-06, + "loss": 3.8953, + "step": 7180 + }, + { + "epoch": 0.91968, + "grad_norm": 3.1182918548583984, + "learning_rate": 5.657713813169932e-06, + "loss": 3.9718, + "step": 7185 + }, + { + "epoch": 0.92032, + "grad_norm": 3.2211804389953613, + "learning_rate": 5.6527142920911796e-06, + "loss": 3.9707, + "step": 7190 + }, + { + "epoch": 0.92096, + "grad_norm": 3.490556478500366, + "learning_rate": 5.64771410702096e-06, + "loss": 3.9644, + "step": 7195 + }, + { + "epoch": 0.9216, + "grad_norm": 3.5458273887634277, + "learning_rate": 5.642713263045847e-06, + "loss": 4.0083, + "step": 7200 + }, + { + "epoch": 0.9216, + "eval_loss": 0.9912606477737427, + "eval_runtime": 7.2742, + "eval_samples_per_second": 137.471, + "eval_steps_per_second": 17.184, + "step": 7200 + }, + { + "epoch": 0.92224, + "grad_norm": 3.4939253330230713, + "learning_rate": 5.637711765253088e-06, + "loss": 3.9563, + "step": 7205 + }, + { + "epoch": 0.92288, + "grad_norm": 3.1346709728240967, + "learning_rate": 5.63270961873059e-06, + "loss": 3.9072, + "step": 7210 + }, + { + "epoch": 0.92352, + "grad_norm": 3.3341445922851562, + "learning_rate": 5.627706828566928e-06, + "loss": 3.9547, + "step": 7215 + }, + { + "epoch": 0.92416, + "grad_norm": 3.094125986099243, + "learning_rate": 5.622703399851321e-06, + "loss": 3.9011, + "step": 7220 + }, + { + "epoch": 0.9248, + "grad_norm": 3.118218421936035, + "learning_rate": 5.61769933767365e-06, + "loss": 3.7838, + "step": 7225 + }, + { + "epoch": 0.92544, + "grad_norm": 3.289456844329834, + "learning_rate": 5.61269464712443e-06, + "loss": 4.0058, + "step": 7230 + }, + { + "epoch": 0.92608, + "grad_norm": 3.357569694519043, + "learning_rate": 5.6076893332948215e-06, + "loss": 4.03, + "step": 7235 + }, + { + "epoch": 0.92672, + "grad_norm": 3.568295955657959, + "learning_rate": 5.6026834012766155e-06, + "loss": 3.9034, + "step": 7240 + }, + { + "epoch": 0.92736, + "grad_norm": 3.2108635902404785, + "learning_rate": 5.597676856162235e-06, + "loss": 4.0486, + "step": 7245 + }, + { + "epoch": 0.928, + "grad_norm": 3.312995672225952, + "learning_rate": 5.592669703044722e-06, + "loss": 3.9875, + "step": 7250 + }, + { + "epoch": 0.92864, + "grad_norm": 3.5728683471679688, + "learning_rate": 5.587661947017744e-06, + "loss": 4.0232, + "step": 7255 + }, + { + "epoch": 0.92928, + "grad_norm": 3.0373477935791016, + "learning_rate": 5.582653593175574e-06, + "loss": 3.8868, + "step": 7260 + }, + { + "epoch": 0.92992, + "grad_norm": 3.6742284297943115, + "learning_rate": 5.577644646613099e-06, + "loss": 3.8612, + "step": 7265 + }, + { + "epoch": 0.93056, + "grad_norm": 3.126483917236328, + "learning_rate": 5.572635112425806e-06, + "loss": 3.7641, + "step": 7270 + }, + { + "epoch": 0.9312, + "grad_norm": 3.2737932205200195, + "learning_rate": 5.567624995709781e-06, + "loss": 3.9605, + "step": 7275 + }, + { + "epoch": 0.93184, + "grad_norm": 3.3350517749786377, + "learning_rate": 5.562614301561704e-06, + "loss": 4.0245, + "step": 7280 + }, + { + "epoch": 0.93248, + "grad_norm": 3.372002601623535, + "learning_rate": 5.557603035078838e-06, + "loss": 3.999, + "step": 7285 + }, + { + "epoch": 0.93312, + "grad_norm": 3.2564427852630615, + "learning_rate": 5.552591201359031e-06, + "loss": 3.8675, + "step": 7290 + }, + { + "epoch": 0.93376, + "grad_norm": 3.0988609790802, + "learning_rate": 5.547578805500711e-06, + "loss": 3.804, + "step": 7295 + }, + { + "epoch": 0.9344, + "grad_norm": 3.1962246894836426, + "learning_rate": 5.542565852602872e-06, + "loss": 3.986, + "step": 7300 + }, + { + "epoch": 0.9344, + "eval_loss": 0.9871136546134949, + "eval_runtime": 6.8497, + "eval_samples_per_second": 145.991, + "eval_steps_per_second": 18.249, + "step": 7300 + }, + { + "epoch": 0.93504, + "grad_norm": 3.38354229927063, + "learning_rate": 5.537552347765078e-06, + "loss": 3.8876, + "step": 7305 + }, + { + "epoch": 0.93568, + "grad_norm": 3.4597370624542236, + "learning_rate": 5.5325382960874544e-06, + "loss": 3.9455, + "step": 7310 + }, + { + "epoch": 0.93632, + "grad_norm": 3.528806686401367, + "learning_rate": 5.5275237026706805e-06, + "loss": 4.0232, + "step": 7315 + }, + { + "epoch": 0.93696, + "grad_norm": 3.351661205291748, + "learning_rate": 5.522508572615993e-06, + "loss": 3.9921, + "step": 7320 + }, + { + "epoch": 0.9376, + "grad_norm": 3.342860221862793, + "learning_rate": 5.517492911025165e-06, + "loss": 4.051, + "step": 7325 + }, + { + "epoch": 0.93824, + "grad_norm": 3.2882394790649414, + "learning_rate": 5.51247672300052e-06, + "loss": 3.819, + "step": 7330 + }, + { + "epoch": 0.93888, + "grad_norm": 3.3351001739501953, + "learning_rate": 5.507460013644907e-06, + "loss": 3.873, + "step": 7335 + }, + { + "epoch": 0.93952, + "grad_norm": 3.4074318408966064, + "learning_rate": 5.502442788061718e-06, + "loss": 3.7729, + "step": 7340 + }, + { + "epoch": 0.94016, + "grad_norm": 3.293647289276123, + "learning_rate": 5.497425051354856e-06, + "loss": 3.859, + "step": 7345 + }, + { + "epoch": 0.9408, + "grad_norm": 3.289747714996338, + "learning_rate": 5.492406808628757e-06, + "loss": 4.0432, + "step": 7350 + }, + { + "epoch": 0.94144, + "grad_norm": 3.2682368755340576, + "learning_rate": 5.487388064988361e-06, + "loss": 4.0227, + "step": 7355 + }, + { + "epoch": 0.94208, + "grad_norm": 3.1242454051971436, + "learning_rate": 5.482368825539125e-06, + "loss": 3.9899, + "step": 7360 + }, + { + "epoch": 0.94272, + "grad_norm": 5.209671974182129, + "learning_rate": 5.478353080428558e-06, + "loss": 3.8918, + "step": 7365 + }, + { + "epoch": 0.94336, + "grad_norm": 3.0842902660369873, + "learning_rate": 5.4733329613907585e-06, + "loss": 3.8996, + "step": 7370 + }, + { + "epoch": 0.944, + "grad_norm": 3.6524879932403564, + "learning_rate": 5.468312360842056e-06, + "loss": 4.1494, + "step": 7375 + }, + { + "epoch": 0.94464, + "grad_norm": 3.5056962966918945, + "learning_rate": 5.463291283889796e-06, + "loss": 3.766, + "step": 7380 + }, + { + "epoch": 0.94528, + "grad_norm": 3.3486692905426025, + "learning_rate": 5.4582697356418036e-06, + "loss": 3.9352, + "step": 7385 + }, + { + "epoch": 0.94592, + "grad_norm": 3.435131549835205, + "learning_rate": 5.4532477212063876e-06, + "loss": 4.045, + "step": 7390 + }, + { + "epoch": 0.94656, + "grad_norm": 3.6526622772216797, + "learning_rate": 5.448225245692329e-06, + "loss": 4.046, + "step": 7395 + }, + { + "epoch": 0.9472, + "grad_norm": 3.1922192573547363, + "learning_rate": 5.443202314208879e-06, + "loss": 4.0255, + "step": 7400 + }, + { + "epoch": 0.9472, + "eval_loss": 0.9677584767341614, + "eval_runtime": 8.257, + "eval_samples_per_second": 121.109, + "eval_steps_per_second": 15.139, + "step": 7400 + }, + { + "epoch": 0.94784, + "grad_norm": 3.202497959136963, + "learning_rate": 5.4381789318657505e-06, + "loss": 3.9725, + "step": 7405 + }, + { + "epoch": 0.94848, + "grad_norm": 3.406205177307129, + "learning_rate": 5.433155103773118e-06, + "loss": 3.8766, + "step": 7410 + }, + { + "epoch": 0.94912, + "grad_norm": 3.2586803436279297, + "learning_rate": 5.428130835041609e-06, + "loss": 3.8658, + "step": 7415 + }, + { + "epoch": 0.94976, + "grad_norm": 3.390014410018921, + "learning_rate": 5.4231061307822966e-06, + "loss": 4.0069, + "step": 7420 + }, + { + "epoch": 0.9504, + "grad_norm": 3.1910688877105713, + "learning_rate": 5.418080996106698e-06, + "loss": 3.9555, + "step": 7425 + }, + { + "epoch": 0.95104, + "grad_norm": 3.1069347858428955, + "learning_rate": 5.413055436126771e-06, + "loss": 3.8353, + "step": 7430 + }, + { + "epoch": 0.95168, + "grad_norm": 3.4949285984039307, + "learning_rate": 5.408029455954902e-06, + "loss": 4.0338, + "step": 7435 + }, + { + "epoch": 0.95232, + "grad_norm": 3.304924488067627, + "learning_rate": 5.403003060703908e-06, + "loss": 4.0458, + "step": 7440 + }, + { + "epoch": 0.95296, + "grad_norm": 3.4705872535705566, + "learning_rate": 5.397976255487028e-06, + "loss": 4.0438, + "step": 7445 + }, + { + "epoch": 0.9536, + "grad_norm": 3.259751796722412, + "learning_rate": 5.3929490454179155e-06, + "loss": 3.8711, + "step": 7450 + }, + { + "epoch": 0.95424, + "grad_norm": 3.047318458557129, + "learning_rate": 5.387921435610637e-06, + "loss": 3.9242, + "step": 7455 + }, + { + "epoch": 0.95488, + "grad_norm": 3.2884445190429688, + "learning_rate": 5.382893431179668e-06, + "loss": 3.8527, + "step": 7460 + }, + { + "epoch": 0.95552, + "grad_norm": 3.164912462234497, + "learning_rate": 5.377865037239882e-06, + "loss": 3.9622, + "step": 7465 + }, + { + "epoch": 0.95616, + "grad_norm": 3.415105104446411, + "learning_rate": 5.372836258906552e-06, + "loss": 3.8921, + "step": 7470 + }, + { + "epoch": 0.9568, + "grad_norm": 3.607869863510132, + "learning_rate": 5.367807101295337e-06, + "loss": 3.8691, + "step": 7475 + }, + { + "epoch": 0.95744, + "grad_norm": 3.4621803760528564, + "learning_rate": 5.362777569522288e-06, + "loss": 3.9839, + "step": 7480 + }, + { + "epoch": 0.95808, + "grad_norm": 3.2376821041107178, + "learning_rate": 5.357747668703834e-06, + "loss": 3.9305, + "step": 7485 + }, + { + "epoch": 0.95872, + "grad_norm": 3.524980306625366, + "learning_rate": 5.352717403956777e-06, + "loss": 3.9235, + "step": 7490 + }, + { + "epoch": 0.95936, + "grad_norm": 3.1107211112976074, + "learning_rate": 5.347686780398293e-06, + "loss": 3.9071, + "step": 7495 + }, + { + "epoch": 0.96, + "grad_norm": 3.0860273838043213, + "learning_rate": 5.342655803145923e-06, + "loss": 3.9086, + "step": 7500 + }, + { + "epoch": 0.96, + "eval_loss": 0.9887832403182983, + "eval_runtime": 7.0587, + "eval_samples_per_second": 141.668, + "eval_steps_per_second": 17.709, + "step": 7500 + }, + { + "epoch": 0.96064, + "grad_norm": 3.2908318042755127, + "learning_rate": 5.337624477317562e-06, + "loss": 3.8602, + "step": 7505 + }, + { + "epoch": 0.96128, + "grad_norm": 3.1805858612060547, + "learning_rate": 5.332592808031467e-06, + "loss": 3.8726, + "step": 7510 + }, + { + "epoch": 0.96192, + "grad_norm": 3.564436912536621, + "learning_rate": 5.327560800406241e-06, + "loss": 3.9942, + "step": 7515 + }, + { + "epoch": 0.96256, + "grad_norm": 3.2359700202941895, + "learning_rate": 5.322528459560829e-06, + "loss": 4.0791, + "step": 7520 + }, + { + "epoch": 0.9632, + "grad_norm": 3.3315274715423584, + "learning_rate": 5.317495790614522e-06, + "loss": 3.9426, + "step": 7525 + }, + { + "epoch": 0.96384, + "grad_norm": 3.3361990451812744, + "learning_rate": 5.312462798686935e-06, + "loss": 3.9885, + "step": 7530 + }, + { + "epoch": 0.96448, + "grad_norm": 3.4341814517974854, + "learning_rate": 5.30742948889802e-06, + "loss": 4.0134, + "step": 7535 + }, + { + "epoch": 0.96512, + "grad_norm": 3.099053144454956, + "learning_rate": 5.302395866368046e-06, + "loss": 3.9386, + "step": 7540 + }, + { + "epoch": 0.96576, + "grad_norm": 3.2882211208343506, + "learning_rate": 5.2973619362176064e-06, + "loss": 3.9915, + "step": 7545 + }, + { + "epoch": 0.9664, + "grad_norm": 3.2575504779815674, + "learning_rate": 5.292327703567604e-06, + "loss": 3.9657, + "step": 7550 + }, + { + "epoch": 0.96704, + "grad_norm": 3.5143399238586426, + "learning_rate": 5.287293173539248e-06, + "loss": 3.9202, + "step": 7555 + }, + { + "epoch": 0.96768, + "grad_norm": 3.3682284355163574, + "learning_rate": 5.282258351254054e-06, + "loss": 3.9375, + "step": 7560 + }, + { + "epoch": 0.96832, + "grad_norm": 3.3960514068603516, + "learning_rate": 5.277223241833831e-06, + "loss": 3.8977, + "step": 7565 + }, + { + "epoch": 0.96896, + "grad_norm": 3.5102431774139404, + "learning_rate": 5.27218785040068e-06, + "loss": 4.0427, + "step": 7570 + }, + { + "epoch": 0.9696, + "grad_norm": 3.05222487449646, + "learning_rate": 5.267152182076996e-06, + "loss": 3.8364, + "step": 7575 + }, + { + "epoch": 0.97024, + "grad_norm": 3.2691268920898438, + "learning_rate": 5.262116241985446e-06, + "loss": 3.8768, + "step": 7580 + }, + { + "epoch": 0.97088, + "grad_norm": 3.137843132019043, + "learning_rate": 5.257080035248977e-06, + "loss": 3.9938, + "step": 7585 + }, + { + "epoch": 0.97152, + "grad_norm": 3.9772887229919434, + "learning_rate": 5.2520435669908106e-06, + "loss": 3.9055, + "step": 7590 + }, + { + "epoch": 0.97216, + "grad_norm": 3.306682825088501, + "learning_rate": 5.247006842334433e-06, + "loss": 4.0086, + "step": 7595 + }, + { + "epoch": 0.9728, + "grad_norm": 3.2649736404418945, + "learning_rate": 5.241969866403588e-06, + "loss": 3.9806, + "step": 7600 + }, + { + "epoch": 0.9728, + "eval_loss": 0.9813916683197021, + "eval_runtime": 6.8057, + "eval_samples_per_second": 146.935, + "eval_steps_per_second": 18.367, + "step": 7600 + }, + { + "epoch": 0.97344, + "grad_norm": 3.297919988632202, + "learning_rate": 5.236932644322278e-06, + "loss": 3.9641, + "step": 7605 + }, + { + "epoch": 0.97408, + "grad_norm": 3.2999958992004395, + "learning_rate": 5.231895181214753e-06, + "loss": 3.8996, + "step": 7610 + }, + { + "epoch": 0.97472, + "grad_norm": 3.5226521492004395, + "learning_rate": 5.226857482205513e-06, + "loss": 3.9295, + "step": 7615 + }, + { + "epoch": 0.97536, + "grad_norm": 3.5334079265594482, + "learning_rate": 5.221819552419293e-06, + "loss": 3.8712, + "step": 7620 + }, + { + "epoch": 0.976, + "grad_norm": 3.410144805908203, + "learning_rate": 5.216781396981066e-06, + "loss": 3.9702, + "step": 7625 + }, + { + "epoch": 0.97664, + "grad_norm": 3.240433931350708, + "learning_rate": 5.211743021016033e-06, + "loss": 4.043, + "step": 7630 + }, + { + "epoch": 0.97728, + "grad_norm": 3.2035484313964844, + "learning_rate": 5.206704429649621e-06, + "loss": 3.8622, + "step": 7635 + }, + { + "epoch": 0.97792, + "grad_norm": 3.1555721759796143, + "learning_rate": 5.2016656280074725e-06, + "loss": 3.9523, + "step": 7640 + }, + { + "epoch": 0.97856, + "grad_norm": 3.2172229290008545, + "learning_rate": 5.196626621215449e-06, + "loss": 3.9912, + "step": 7645 + }, + { + "epoch": 0.9792, + "grad_norm": 3.324141025543213, + "learning_rate": 5.191587414399615e-06, + "loss": 3.9707, + "step": 7650 + }, + { + "epoch": 0.97984, + "grad_norm": 3.333359479904175, + "learning_rate": 5.1865480126862436e-06, + "loss": 3.8726, + "step": 7655 + }, + { + "epoch": 0.98048, + "grad_norm": 3.3763959407806396, + "learning_rate": 5.181508421201803e-06, + "loss": 4.063, + "step": 7660 + }, + { + "epoch": 0.98112, + "grad_norm": 3.8444693088531494, + "learning_rate": 5.1764686450729575e-06, + "loss": 3.9117, + "step": 7665 + }, + { + "epoch": 0.98176, + "grad_norm": 3.223740577697754, + "learning_rate": 5.171428689426554e-06, + "loss": 3.8024, + "step": 7670 + }, + { + "epoch": 0.9824, + "grad_norm": 3.107224941253662, + "learning_rate": 5.166388559389628e-06, + "loss": 3.9839, + "step": 7675 + }, + { + "epoch": 0.98304, + "grad_norm": 3.4000933170318604, + "learning_rate": 5.161348260089388e-06, + "loss": 4.0005, + "step": 7680 + }, + { + "epoch": 0.98368, + "grad_norm": 3.662874937057495, + "learning_rate": 5.156307796653217e-06, + "loss": 3.896, + "step": 7685 + }, + { + "epoch": 0.98432, + "grad_norm": 3.305297613143921, + "learning_rate": 5.151267174208665e-06, + "loss": 3.9105, + "step": 7690 + }, + { + "epoch": 0.98496, + "grad_norm": 3.186823844909668, + "learning_rate": 5.146226397883442e-06, + "loss": 3.9882, + "step": 7695 + }, + { + "epoch": 0.9856, + "grad_norm": 3.5797548294067383, + "learning_rate": 5.1411854728054155e-06, + "loss": 4.0733, + "step": 7700 + }, + { + "epoch": 0.9856, + "eval_loss": 0.9768617749214172, + "eval_runtime": 7.1654, + "eval_samples_per_second": 139.559, + "eval_steps_per_second": 17.445, + "step": 7700 + }, + { + "epoch": 0.98624, + "grad_norm": 3.3153810501098633, + "learning_rate": 5.136144404102606e-06, + "loss": 4.0024, + "step": 7705 + }, + { + "epoch": 0.98688, + "grad_norm": 3.4547183513641357, + "learning_rate": 5.131103196903175e-06, + "loss": 4.0543, + "step": 7710 + }, + { + "epoch": 0.98752, + "grad_norm": 3.2928266525268555, + "learning_rate": 5.126061856335432e-06, + "loss": 3.9963, + "step": 7715 + }, + { + "epoch": 0.98816, + "grad_norm": 3.229193687438965, + "learning_rate": 5.121020387527818e-06, + "loss": 4.004, + "step": 7720 + }, + { + "epoch": 0.9888, + "grad_norm": 3.1806933879852295, + "learning_rate": 5.115978795608903e-06, + "loss": 3.9922, + "step": 7725 + }, + { + "epoch": 0.98944, + "grad_norm": 3.0850744247436523, + "learning_rate": 5.110937085707388e-06, + "loss": 3.8514, + "step": 7730 + }, + { + "epoch": 0.99008, + "grad_norm": 3.4792258739471436, + "learning_rate": 5.105895262952087e-06, + "loss": 3.9743, + "step": 7735 + }, + { + "epoch": 0.99072, + "grad_norm": 3.4325263500213623, + "learning_rate": 5.100853332471932e-06, + "loss": 3.9103, + "step": 7740 + }, + { + "epoch": 0.99136, + "grad_norm": 3.19690203666687, + "learning_rate": 5.095811299395967e-06, + "loss": 3.9015, + "step": 7745 + }, + { + "epoch": 0.992, + "grad_norm": 3.202651023864746, + "learning_rate": 5.090769168853337e-06, + "loss": 3.8134, + "step": 7750 + }, + { + "epoch": 0.99264, + "grad_norm": 3.395777940750122, + "learning_rate": 5.085726945973285e-06, + "loss": 3.9866, + "step": 7755 + }, + { + "epoch": 0.99328, + "grad_norm": 3.347100019454956, + "learning_rate": 5.080684635885155e-06, + "loss": 3.9207, + "step": 7760 + }, + { + "epoch": 0.99392, + "grad_norm": 3.1593613624572754, + "learning_rate": 5.0756422437183705e-06, + "loss": 3.9379, + "step": 7765 + }, + { + "epoch": 0.99456, + "grad_norm": 3.2734835147857666, + "learning_rate": 5.070599774602445e-06, + "loss": 3.7975, + "step": 7770 + }, + { + "epoch": 0.9952, + "grad_norm": 3.32014536857605, + "learning_rate": 5.065557233666968e-06, + "loss": 4.0122, + "step": 7775 + }, + { + "epoch": 0.99584, + "grad_norm": 3.6023435592651367, + "learning_rate": 5.060514626041602e-06, + "loss": 3.9277, + "step": 7780 + }, + { + "epoch": 0.99648, + "grad_norm": 3.0981125831604004, + "learning_rate": 5.055471956856076e-06, + "loss": 3.8617, + "step": 7785 + }, + { + "epoch": 0.99712, + "grad_norm": 3.361649513244629, + "learning_rate": 5.0504292312401845e-06, + "loss": 3.8414, + "step": 7790 + }, + { + "epoch": 0.99776, + "grad_norm": 3.3960330486297607, + "learning_rate": 5.0453864543237786e-06, + "loss": 3.8511, + "step": 7795 + }, + { + "epoch": 0.9984, + "grad_norm": 3.236677646636963, + "learning_rate": 5.040343631236761e-06, + "loss": 3.8611, + "step": 7800 + }, + { + "epoch": 0.9984, + "eval_loss": 0.991203248500824, + "eval_runtime": 7.463, + "eval_samples_per_second": 133.995, + "eval_steps_per_second": 16.749, + "step": 7800 + }, + { + "epoch": 0.99904, + "grad_norm": 3.3027491569519043, + "learning_rate": 5.035300767109081e-06, + "loss": 3.8784, + "step": 7805 + }, + { + "epoch": 0.99968, + "grad_norm": 3.059692144393921, + "learning_rate": 5.03025786707073e-06, + "loss": 3.856, + "step": 7810 + }, + { + "epoch": 1.000256, + "grad_norm": 3.319530725479126, + "learning_rate": 5.025214936251735e-06, + "loss": 3.515, + "step": 7815 + }, + { + "epoch": 1.000896, + "grad_norm": 3.3164639472961426, + "learning_rate": 5.0201719797821595e-06, + "loss": 3.8596, + "step": 7820 + }, + { + "epoch": 1.001536, + "grad_norm": 3.717644691467285, + "learning_rate": 5.015129002792082e-06, + "loss": 3.9824, + "step": 7825 + }, + { + "epoch": 1.002176, + "grad_norm": 3.3352081775665283, + "learning_rate": 5.0100860104116135e-06, + "loss": 3.9783, + "step": 7830 + }, + { + "epoch": 1.002816, + "grad_norm": 3.228071689605713, + "learning_rate": 5.0050430077708756e-06, + "loss": 4.0029, + "step": 7835 + }, + { + "epoch": 1.003456, + "grad_norm": 3.2079076766967773, + "learning_rate": 5e-06, + "loss": 3.8237, + "step": 7840 + }, + { + "epoch": 1.004096, + "grad_norm": 3.2352001667022705, + "learning_rate": 4.994956992229126e-06, + "loss": 3.9639, + "step": 7845 + }, + { + "epoch": 1.004736, + "grad_norm": 3.2654805183410645, + "learning_rate": 4.989913989588388e-06, + "loss": 3.9077, + "step": 7850 + }, + { + "epoch": 1.005376, + "grad_norm": 3.5538816452026367, + "learning_rate": 4.9848709972079195e-06, + "loss": 3.96, + "step": 7855 + }, + { + "epoch": 1.006016, + "grad_norm": 3.4138593673706055, + "learning_rate": 4.979828020217843e-06, + "loss": 3.9993, + "step": 7860 + }, + { + "epoch": 1.006656, + "grad_norm": 3.351036787033081, + "learning_rate": 4.974785063748266e-06, + "loss": 3.9911, + "step": 7865 + }, + { + "epoch": 1.007296, + "grad_norm": 3.1687943935394287, + "learning_rate": 4.969742132929272e-06, + "loss": 3.9839, + "step": 7870 + }, + { + "epoch": 1.007936, + "grad_norm": 3.325587034225464, + "learning_rate": 4.964699232890919e-06, + "loss": 3.9703, + "step": 7875 + }, + { + "epoch": 1.008576, + "grad_norm": 3.418788194656372, + "learning_rate": 4.95965636876324e-06, + "loss": 3.9424, + "step": 7880 + }, + { + "epoch": 1.009216, + "grad_norm": 3.2540955543518066, + "learning_rate": 4.954613545676223e-06, + "loss": 3.9614, + "step": 7885 + }, + { + "epoch": 1.009856, + "grad_norm": 3.170858383178711, + "learning_rate": 4.949570768759817e-06, + "loss": 4.0629, + "step": 7890 + }, + { + "epoch": 1.010496, + "grad_norm": 3.227459192276001, + "learning_rate": 4.944528043143926e-06, + "loss": 3.9351, + "step": 7895 + }, + { + "epoch": 1.011136, + "grad_norm": 3.218439817428589, + "learning_rate": 4.9394853739584e-06, + "loss": 3.8864, + "step": 7900 + }, + { + "epoch": 1.011136, + "eval_loss": 0.9792375564575195, + "eval_runtime": 6.9749, + "eval_samples_per_second": 143.371, + "eval_steps_per_second": 17.921, + "step": 7900 + }, + { + "epoch": 1.011776, + "grad_norm": 3.564470052719116, + "learning_rate": 4.934442766333034e-06, + "loss": 3.9028, + "step": 7905 + }, + { + "epoch": 1.012416, + "grad_norm": 3.1783535480499268, + "learning_rate": 4.9294002253975575e-06, + "loss": 3.8552, + "step": 7910 + }, + { + "epoch": 1.013056, + "grad_norm": 3.098893165588379, + "learning_rate": 4.92435775628163e-06, + "loss": 3.8879, + "step": 7915 + }, + { + "epoch": 1.013696, + "grad_norm": 3.3577730655670166, + "learning_rate": 4.9193153641148465e-06, + "loss": 3.8814, + "step": 7920 + }, + { + "epoch": 1.014336, + "grad_norm": 3.4143130779266357, + "learning_rate": 4.914273054026717e-06, + "loss": 3.9688, + "step": 7925 + }, + { + "epoch": 1.014976, + "grad_norm": 3.3033559322357178, + "learning_rate": 4.9092308311466655e-06, + "loss": 3.9102, + "step": 7930 + }, + { + "epoch": 1.015616, + "grad_norm": 3.276418924331665, + "learning_rate": 4.904188700604033e-06, + "loss": 4.0419, + "step": 7935 + }, + { + "epoch": 1.016256, + "grad_norm": 3.209242105484009, + "learning_rate": 4.899146667528069e-06, + "loss": 3.8513, + "step": 7940 + }, + { + "epoch": 1.016896, + "grad_norm": 3.382869005203247, + "learning_rate": 4.894104737047916e-06, + "loss": 3.9987, + "step": 7945 + }, + { + "epoch": 1.017536, + "grad_norm": 3.677175760269165, + "learning_rate": 4.889062914292615e-06, + "loss": 3.8717, + "step": 7950 + }, + { + "epoch": 1.018176, + "grad_norm": 3.4745471477508545, + "learning_rate": 4.884021204391097e-06, + "loss": 3.9395, + "step": 7955 + }, + { + "epoch": 1.018816, + "grad_norm": 3.321906805038452, + "learning_rate": 4.878979612472183e-06, + "loss": 3.8124, + "step": 7960 + }, + { + "epoch": 1.019456, + "grad_norm": 3.5144152641296387, + "learning_rate": 4.8739381436645685e-06, + "loss": 3.9267, + "step": 7965 + }, + { + "epoch": 1.020096, + "grad_norm": 3.3462071418762207, + "learning_rate": 4.8688968030968265e-06, + "loss": 3.9816, + "step": 7970 + }, + { + "epoch": 1.020736, + "grad_norm": 3.219229221343994, + "learning_rate": 4.863855595897395e-06, + "loss": 3.8884, + "step": 7975 + }, + { + "epoch": 1.021376, + "grad_norm": 3.2813336849212646, + "learning_rate": 4.858814527194586e-06, + "loss": 4.0285, + "step": 7980 + }, + { + "epoch": 1.022016, + "grad_norm": 3.3408925533294678, + "learning_rate": 4.85377360211656e-06, + "loss": 3.9853, + "step": 7985 + }, + { + "epoch": 1.022656, + "grad_norm": 3.3936686515808105, + "learning_rate": 4.848732825791338e-06, + "loss": 3.8653, + "step": 7990 + }, + { + "epoch": 1.023296, + "grad_norm": 3.290966749191284, + "learning_rate": 4.843692203346783e-06, + "loss": 4.0207, + "step": 7995 + }, + { + "epoch": 1.023936, + "grad_norm": 3.2207400798797607, + "learning_rate": 4.838651739910613e-06, + "loss": 3.8461, + "step": 8000 + }, + { + "epoch": 1.023936, + "eval_loss": 0.9756978154182434, + "eval_runtime": 6.6826, + "eval_samples_per_second": 149.642, + "eval_steps_per_second": 18.705, + "step": 8000 + }, + { + "epoch": 1.024576, + "grad_norm": 3.4297149181365967, + "learning_rate": 4.8336114406103725e-06, + "loss": 3.9118, + "step": 8005 + }, + { + "epoch": 1.025216, + "grad_norm": 3.3404061794281006, + "learning_rate": 4.828571310573447e-06, + "loss": 3.8817, + "step": 8010 + }, + { + "epoch": 1.025856, + "grad_norm": 3.070565700531006, + "learning_rate": 4.823531354927046e-06, + "loss": 3.8727, + "step": 8015 + }, + { + "epoch": 1.026496, + "grad_norm": 3.4748332500457764, + "learning_rate": 4.8184915787981975e-06, + "loss": 4.0154, + "step": 8020 + }, + { + "epoch": 1.027136, + "grad_norm": 3.987419366836548, + "learning_rate": 4.813451987313758e-06, + "loss": 3.8994, + "step": 8025 + }, + { + "epoch": 1.027776, + "grad_norm": 3.248142719268799, + "learning_rate": 4.808412585600387e-06, + "loss": 3.9946, + "step": 8030 + }, + { + "epoch": 1.028416, + "grad_norm": 3.4826786518096924, + "learning_rate": 4.8033733787845535e-06, + "loss": 3.7973, + "step": 8035 + }, + { + "epoch": 1.029056, + "grad_norm": 3.183786630630493, + "learning_rate": 4.7983343719925275e-06, + "loss": 4.1816, + "step": 8040 + }, + { + "epoch": 1.029696, + "grad_norm": 3.292250394821167, + "learning_rate": 4.79329557035038e-06, + "loss": 3.9218, + "step": 8045 + }, + { + "epoch": 1.030336, + "grad_norm": 3.4107160568237305, + "learning_rate": 4.788256978983968e-06, + "loss": 3.9041, + "step": 8050 + }, + { + "epoch": 1.030976, + "grad_norm": 3.4038548469543457, + "learning_rate": 4.783218603018936e-06, + "loss": 3.965, + "step": 8055 + }, + { + "epoch": 1.031616, + "grad_norm": 3.390043258666992, + "learning_rate": 4.778180447580707e-06, + "loss": 3.7506, + "step": 8060 + }, + { + "epoch": 1.032256, + "grad_norm": 3.275566577911377, + "learning_rate": 4.773142517794488e-06, + "loss": 3.9673, + "step": 8065 + }, + { + "epoch": 1.032896, + "grad_norm": 3.6536107063293457, + "learning_rate": 4.768104818785248e-06, + "loss": 3.8182, + "step": 8070 + }, + { + "epoch": 1.033536, + "grad_norm": 3.2812538146972656, + "learning_rate": 4.763067355677724e-06, + "loss": 3.9372, + "step": 8075 + }, + { + "epoch": 1.034176, + "grad_norm": 3.0662550926208496, + "learning_rate": 4.758030133596413e-06, + "loss": 3.8224, + "step": 8080 + }, + { + "epoch": 1.034816, + "grad_norm": 3.2141222953796387, + "learning_rate": 4.752993157665568e-06, + "loss": 3.9668, + "step": 8085 + }, + { + "epoch": 1.035456, + "grad_norm": 3.3035459518432617, + "learning_rate": 4.74795643300919e-06, + "loss": 3.8405, + "step": 8090 + }, + { + "epoch": 1.036096, + "grad_norm": 3.179811477661133, + "learning_rate": 4.742919964751025e-06, + "loss": 3.8759, + "step": 8095 + }, + { + "epoch": 1.036736, + "grad_norm": 3.3282361030578613, + "learning_rate": 4.737883758014557e-06, + "loss": 3.8409, + "step": 8100 + }, + { + "epoch": 1.036736, + "eval_loss": 0.9715144038200378, + "eval_runtime": 6.8358, + "eval_samples_per_second": 146.288, + "eval_steps_per_second": 18.286, + "step": 8100 + }, + { + "epoch": 1.037376, + "grad_norm": 3.3947505950927734, + "learning_rate": 4.732847817923005e-06, + "loss": 3.9228, + "step": 8105 + }, + { + "epoch": 1.038016, + "grad_norm": 3.3737497329711914, + "learning_rate": 4.7278121495993205e-06, + "loss": 3.9072, + "step": 8110 + }, + { + "epoch": 1.038656, + "grad_norm": 3.2646024227142334, + "learning_rate": 4.7227767581661714e-06, + "loss": 3.7719, + "step": 8115 + }, + { + "epoch": 1.039296, + "grad_norm": 3.168193817138672, + "learning_rate": 4.717741648745946e-06, + "loss": 3.8807, + "step": 8120 + }, + { + "epoch": 1.039936, + "grad_norm": 4.204946041107178, + "learning_rate": 4.712706826460753e-06, + "loss": 3.8374, + "step": 8125 + }, + { + "epoch": 1.040576, + "grad_norm": 3.0786685943603516, + "learning_rate": 4.707672296432397e-06, + "loss": 3.8454, + "step": 8130 + }, + { + "epoch": 1.041216, + "grad_norm": 3.35418701171875, + "learning_rate": 4.702638063782394e-06, + "loss": 3.9096, + "step": 8135 + }, + { + "epoch": 1.041856, + "grad_norm": 3.190556526184082, + "learning_rate": 4.6976041336319545e-06, + "loss": 3.8653, + "step": 8140 + }, + { + "epoch": 1.042496, + "grad_norm": 3.4106967449188232, + "learning_rate": 4.692570511101982e-06, + "loss": 3.9108, + "step": 8145 + }, + { + "epoch": 1.043136, + "grad_norm": 3.1676368713378906, + "learning_rate": 4.687537201313067e-06, + "loss": 3.8812, + "step": 8150 + }, + { + "epoch": 1.043776, + "grad_norm": 3.254140615463257, + "learning_rate": 4.682504209385481e-06, + "loss": 3.9211, + "step": 8155 + }, + { + "epoch": 1.044416, + "grad_norm": 3.426715850830078, + "learning_rate": 4.677471540439171e-06, + "loss": 3.763, + "step": 8160 + }, + { + "epoch": 1.045056, + "grad_norm": 3.3520352840423584, + "learning_rate": 4.672439199593761e-06, + "loss": 3.8946, + "step": 8165 + }, + { + "epoch": 1.045696, + "grad_norm": 3.071427583694458, + "learning_rate": 4.667407191968535e-06, + "loss": 3.9504, + "step": 8170 + }, + { + "epoch": 1.046336, + "grad_norm": 3.1410775184631348, + "learning_rate": 4.662375522682439e-06, + "loss": 3.8667, + "step": 8175 + }, + { + "epoch": 1.046976, + "grad_norm": 3.2197885513305664, + "learning_rate": 4.6573441968540795e-06, + "loss": 3.9225, + "step": 8180 + }, + { + "epoch": 1.047616, + "grad_norm": 3.2275218963623047, + "learning_rate": 4.652313219601706e-06, + "loss": 3.9144, + "step": 8185 + }, + { + "epoch": 1.048256, + "grad_norm": 3.601731538772583, + "learning_rate": 4.647282596043224e-06, + "loss": 3.9631, + "step": 8190 + }, + { + "epoch": 1.048896, + "grad_norm": 3.432614803314209, + "learning_rate": 4.642252331296168e-06, + "loss": 3.8468, + "step": 8195 + }, + { + "epoch": 1.049536, + "grad_norm": 3.2367265224456787, + "learning_rate": 4.637222430477713e-06, + "loss": 3.9908, + "step": 8200 + }, + { + "epoch": 1.049536, + "eval_loss": 0.9731603860855103, + "eval_runtime": 6.8061, + "eval_samples_per_second": 146.927, + "eval_steps_per_second": 18.366, + "step": 8200 + }, + { + "epoch": 1.050176, + "grad_norm": 3.158475399017334, + "learning_rate": 4.632192898704664e-06, + "loss": 3.7899, + "step": 8205 + }, + { + "epoch": 1.050816, + "grad_norm": 3.394278049468994, + "learning_rate": 4.62716374109345e-06, + "loss": 4.0507, + "step": 8210 + }, + { + "epoch": 1.051456, + "grad_norm": 3.066319704055786, + "learning_rate": 4.6221349627601195e-06, + "loss": 3.7521, + "step": 8215 + }, + { + "epoch": 1.052096, + "grad_norm": 3.577451229095459, + "learning_rate": 4.617106568820334e-06, + "loss": 3.8836, + "step": 8220 + }, + { + "epoch": 1.052736, + "grad_norm": 3.3249411582946777, + "learning_rate": 4.612078564389363e-06, + "loss": 3.9723, + "step": 8225 + }, + { + "epoch": 1.053376, + "grad_norm": 3.228684902191162, + "learning_rate": 4.607050954582086e-06, + "loss": 3.7932, + "step": 8230 + }, + { + "epoch": 1.054016, + "grad_norm": 3.2836358547210693, + "learning_rate": 4.602023744512974e-06, + "loss": 3.8523, + "step": 8235 + }, + { + "epoch": 1.054656, + "grad_norm": 3.2514820098876953, + "learning_rate": 4.596996939296093e-06, + "loss": 3.8587, + "step": 8240 + }, + { + "epoch": 1.055296, + "grad_norm": 3.3256592750549316, + "learning_rate": 4.591970544045099e-06, + "loss": 3.9497, + "step": 8245 + }, + { + "epoch": 1.055936, + "grad_norm": 3.186540365219116, + "learning_rate": 4.58694456387323e-06, + "loss": 3.7457, + "step": 8250 + }, + { + "epoch": 1.056576, + "grad_norm": 3.2372641563415527, + "learning_rate": 4.5819190038933035e-06, + "loss": 3.7401, + "step": 8255 + }, + { + "epoch": 1.057216, + "grad_norm": 3.463500738143921, + "learning_rate": 4.576893869217707e-06, + "loss": 3.8811, + "step": 8260 + }, + { + "epoch": 1.057856, + "grad_norm": 3.2895185947418213, + "learning_rate": 4.571869164958392e-06, + "loss": 4.0712, + "step": 8265 + }, + { + "epoch": 1.058496, + "grad_norm": 3.387270450592041, + "learning_rate": 4.566844896226883e-06, + "loss": 3.7833, + "step": 8270 + }, + { + "epoch": 1.059136, + "grad_norm": 3.3072781562805176, + "learning_rate": 4.56182106813425e-06, + "loss": 3.9508, + "step": 8275 + }, + { + "epoch": 1.059776, + "grad_norm": 3.262228012084961, + "learning_rate": 4.556797685791123e-06, + "loss": 4.0602, + "step": 8280 + }, + { + "epoch": 1.060416, + "grad_norm": 3.3029568195343018, + "learning_rate": 4.551774754307672e-06, + "loss": 3.8243, + "step": 8285 + }, + { + "epoch": 1.061056, + "grad_norm": 3.6015522480010986, + "learning_rate": 4.546752278793613e-06, + "loss": 3.9187, + "step": 8290 + }, + { + "epoch": 1.061696, + "grad_norm": 3.132108449935913, + "learning_rate": 4.541730264358198e-06, + "loss": 3.9011, + "step": 8295 + }, + { + "epoch": 1.062336, + "grad_norm": 3.4112889766693115, + "learning_rate": 4.536708716110207e-06, + "loss": 3.9141, + "step": 8300 + }, + { + "epoch": 1.062336, + "eval_loss": 0.9754996299743652, + "eval_runtime": 7.1041, + "eval_samples_per_second": 140.763, + "eval_steps_per_second": 17.595, + "step": 8300 + }, + { + "epoch": 1.062976, + "grad_norm": 3.3974194526672363, + "learning_rate": 4.5316876391579444e-06, + "loss": 3.9216, + "step": 8305 + }, + { + "epoch": 1.0636160000000001, + "grad_norm": 3.4178879261016846, + "learning_rate": 4.526667038609244e-06, + "loss": 3.8112, + "step": 8310 + }, + { + "epoch": 1.064256, + "grad_norm": 3.3445000648498535, + "learning_rate": 4.521646919571444e-06, + "loss": 3.9406, + "step": 8315 + }, + { + "epoch": 1.064896, + "grad_norm": 2.9815402030944824, + "learning_rate": 4.516627287151402e-06, + "loss": 3.9258, + "step": 8320 + }, + { + "epoch": 1.065536, + "grad_norm": 3.2848947048187256, + "learning_rate": 4.511608146455471e-06, + "loss": 3.8195, + "step": 8325 + }, + { + "epoch": 1.066176, + "grad_norm": 3.2756311893463135, + "learning_rate": 4.506589502589514e-06, + "loss": 3.844, + "step": 8330 + }, + { + "epoch": 1.066816, + "grad_norm": 3.47074294090271, + "learning_rate": 4.501571360658884e-06, + "loss": 3.986, + "step": 8335 + }, + { + "epoch": 1.067456, + "grad_norm": 3.780287981033325, + "learning_rate": 4.49655372576842e-06, + "loss": 4.0033, + "step": 8340 + }, + { + "epoch": 1.068096, + "grad_norm": 3.6877105236053467, + "learning_rate": 4.491536603022449e-06, + "loss": 3.8884, + "step": 8345 + }, + { + "epoch": 1.068736, + "grad_norm": 3.375788927078247, + "learning_rate": 4.486519997524776e-06, + "loss": 3.9015, + "step": 8350 + }, + { + "epoch": 1.069376, + "grad_norm": 3.4512062072753906, + "learning_rate": 4.481503914378683e-06, + "loss": 3.8001, + "step": 8355 + }, + { + "epoch": 1.070016, + "grad_norm": 3.4298641681671143, + "learning_rate": 4.476488358686916e-06, + "loss": 3.9691, + "step": 8360 + }, + { + "epoch": 1.070656, + "grad_norm": 3.2674472332000732, + "learning_rate": 4.471473335551687e-06, + "loss": 3.758, + "step": 8365 + }, + { + "epoch": 1.071296, + "grad_norm": 3.0024731159210205, + "learning_rate": 4.466458850074661e-06, + "loss": 3.9368, + "step": 8370 + }, + { + "epoch": 1.071936, + "grad_norm": 3.393695116043091, + "learning_rate": 4.461444907356967e-06, + "loss": 3.8741, + "step": 8375 + }, + { + "epoch": 1.072576, + "grad_norm": 3.399658441543579, + "learning_rate": 4.456431512499171e-06, + "loss": 3.8315, + "step": 8380 + }, + { + "epoch": 1.073216, + "grad_norm": 3.303518056869507, + "learning_rate": 4.45141867060129e-06, + "loss": 3.7508, + "step": 8385 + }, + { + "epoch": 1.073856, + "grad_norm": 3.3047292232513428, + "learning_rate": 4.446406386762768e-06, + "loss": 3.6943, + "step": 8390 + }, + { + "epoch": 1.074496, + "grad_norm": 3.38957142829895, + "learning_rate": 4.441394666082496e-06, + "loss": 3.8631, + "step": 8395 + }, + { + "epoch": 1.075136, + "grad_norm": 3.3375697135925293, + "learning_rate": 4.436383513658778e-06, + "loss": 3.7307, + "step": 8400 + }, + { + "epoch": 1.075136, + "eval_loss": 0.9679771661758423, + "eval_runtime": 6.7924, + "eval_samples_per_second": 147.224, + "eval_steps_per_second": 18.403, + "step": 8400 + }, + { + "epoch": 1.075776, + "grad_norm": 2.9178521633148193, + "learning_rate": 4.431372934589349e-06, + "loss": 3.7509, + "step": 8405 + }, + { + "epoch": 1.076416, + "grad_norm": 3.609673023223877, + "learning_rate": 4.426362933971354e-06, + "loss": 3.9966, + "step": 8410 + }, + { + "epoch": 1.077056, + "grad_norm": 3.0642848014831543, + "learning_rate": 4.421353516901358e-06, + "loss": 3.8124, + "step": 8415 + }, + { + "epoch": 1.077696, + "grad_norm": 3.655663013458252, + "learning_rate": 4.416344688475324e-06, + "loss": 3.8068, + "step": 8420 + }, + { + "epoch": 1.078336, + "grad_norm": 3.383075475692749, + "learning_rate": 4.411336453788622e-06, + "loss": 3.96, + "step": 8425 + }, + { + "epoch": 1.078976, + "grad_norm": 3.0383999347686768, + "learning_rate": 4.406328817936012e-06, + "loss": 3.7295, + "step": 8430 + }, + { + "epoch": 1.079616, + "grad_norm": 3.2788867950439453, + "learning_rate": 4.401321786011653e-06, + "loss": 3.9375, + "step": 8435 + }, + { + "epoch": 1.0802559999999999, + "grad_norm": 3.3316409587860107, + "learning_rate": 4.396315363109084e-06, + "loss": 3.9218, + "step": 8440 + }, + { + "epoch": 1.080896, + "grad_norm": 3.043272018432617, + "learning_rate": 4.391309554321224e-06, + "loss": 3.9933, + "step": 8445 + }, + { + "epoch": 1.081536, + "grad_norm": 3.2069058418273926, + "learning_rate": 4.3863043647403695e-06, + "loss": 3.812, + "step": 8450 + }, + { + "epoch": 1.082176, + "grad_norm": 3.4925637245178223, + "learning_rate": 4.381299799458186e-06, + "loss": 3.8843, + "step": 8455 + }, + { + "epoch": 1.082816, + "grad_norm": 3.4238312244415283, + "learning_rate": 4.376295863565708e-06, + "loss": 3.8669, + "step": 8460 + }, + { + "epoch": 1.083456, + "grad_norm": 3.490819215774536, + "learning_rate": 4.371292562153322e-06, + "loss": 3.9678, + "step": 8465 + }, + { + "epoch": 1.084096, + "grad_norm": 3.41055965423584, + "learning_rate": 4.366289900310773e-06, + "loss": 3.8562, + "step": 8470 + }, + { + "epoch": 1.084736, + "grad_norm": 3.2907748222351074, + "learning_rate": 4.36128788312716e-06, + "loss": 3.8839, + "step": 8475 + }, + { + "epoch": 1.085376, + "grad_norm": 3.337765693664551, + "learning_rate": 4.356286515690919e-06, + "loss": 3.9953, + "step": 8480 + }, + { + "epoch": 1.086016, + "grad_norm": 3.3621182441711426, + "learning_rate": 4.351285803089827e-06, + "loss": 3.8961, + "step": 8485 + }, + { + "epoch": 1.086656, + "grad_norm": 3.304739236831665, + "learning_rate": 4.346285750410996e-06, + "loss": 3.8328, + "step": 8490 + }, + { + "epoch": 1.087296, + "grad_norm": 3.2477059364318848, + "learning_rate": 4.341286362740867e-06, + "loss": 3.9207, + "step": 8495 + }, + { + "epoch": 1.087936, + "grad_norm": 3.1985952854156494, + "learning_rate": 4.336287645165205e-06, + "loss": 3.8237, + "step": 8500 + }, + { + "epoch": 1.087936, + "eval_loss": 0.9712408185005188, + "eval_runtime": 7.5304, + "eval_samples_per_second": 132.795, + "eval_steps_per_second": 16.599, + "step": 8500 + }, + { + "epoch": 1.088576, + "grad_norm": 3.250664234161377, + "learning_rate": 4.331289602769091e-06, + "loss": 3.8786, + "step": 8505 + }, + { + "epoch": 1.089216, + "grad_norm": 3.40374493598938, + "learning_rate": 4.32629224063692e-06, + "loss": 3.9622, + "step": 8510 + }, + { + "epoch": 1.089856, + "grad_norm": 3.1335885524749756, + "learning_rate": 4.321295563852394e-06, + "loss": 3.9723, + "step": 8515 + }, + { + "epoch": 1.090496, + "grad_norm": 3.25437331199646, + "learning_rate": 4.316299577498522e-06, + "loss": 3.9138, + "step": 8520 + }, + { + "epoch": 1.091136, + "grad_norm": 3.3222084045410156, + "learning_rate": 4.311304286657608e-06, + "loss": 3.9344, + "step": 8525 + }, + { + "epoch": 1.091776, + "grad_norm": 3.0206170082092285, + "learning_rate": 4.306309696411246e-06, + "loss": 3.7114, + "step": 8530 + }, + { + "epoch": 1.092416, + "grad_norm": 3.313721179962158, + "learning_rate": 4.301315811840319e-06, + "loss": 3.831, + "step": 8535 + }, + { + "epoch": 1.093056, + "grad_norm": 2.9408037662506104, + "learning_rate": 4.296322638024996e-06, + "loss": 3.8785, + "step": 8540 + }, + { + "epoch": 1.093696, + "grad_norm": 3.5085701942443848, + "learning_rate": 4.291330180044717e-06, + "loss": 3.9432, + "step": 8545 + }, + { + "epoch": 1.094336, + "grad_norm": 3.4924538135528564, + "learning_rate": 4.286338442978196e-06, + "loss": 3.9569, + "step": 8550 + }, + { + "epoch": 1.094976, + "grad_norm": 3.2100634574890137, + "learning_rate": 4.281347431903416e-06, + "loss": 3.8973, + "step": 8555 + }, + { + "epoch": 1.095616, + "grad_norm": 3.3668410778045654, + "learning_rate": 4.276357151897619e-06, + "loss": 3.7831, + "step": 8560 + }, + { + "epoch": 1.096256, + "grad_norm": 3.127249240875244, + "learning_rate": 4.271367608037304e-06, + "loss": 3.919, + "step": 8565 + }, + { + "epoch": 1.096896, + "grad_norm": 3.3912453651428223, + "learning_rate": 4.266378805398221e-06, + "loss": 3.9414, + "step": 8570 + }, + { + "epoch": 1.097536, + "grad_norm": 3.356959342956543, + "learning_rate": 4.261390749055363e-06, + "loss": 3.7983, + "step": 8575 + }, + { + "epoch": 1.098176, + "grad_norm": 4.657166957855225, + "learning_rate": 4.256403444082972e-06, + "loss": 3.7763, + "step": 8580 + }, + { + "epoch": 1.098816, + "grad_norm": 3.484509229660034, + "learning_rate": 4.251416895554517e-06, + "loss": 3.9047, + "step": 8585 + }, + { + "epoch": 1.099456, + "grad_norm": 3.3309333324432373, + "learning_rate": 4.246431108542701e-06, + "loss": 3.7863, + "step": 8590 + }, + { + "epoch": 1.100096, + "grad_norm": 3.1959686279296875, + "learning_rate": 4.241446088119452e-06, + "loss": 3.9393, + "step": 8595 + }, + { + "epoch": 1.100736, + "grad_norm": 3.439521074295044, + "learning_rate": 4.236461839355921e-06, + "loss": 3.8784, + "step": 8600 + }, + { + "epoch": 1.100736, + "eval_loss": 0.9694692492485046, + "eval_runtime": 7.2173, + "eval_samples_per_second": 138.556, + "eval_steps_per_second": 17.319, + "step": 8600 + }, + { + "epoch": 1.101376, + "grad_norm": 3.6734774112701416, + "learning_rate": 4.23147836732247e-06, + "loss": 3.8169, + "step": 8605 + }, + { + "epoch": 1.1020159999999999, + "grad_norm": 3.252636432647705, + "learning_rate": 4.226495677088671e-06, + "loss": 3.8374, + "step": 8610 + }, + { + "epoch": 1.102656, + "grad_norm": 3.2438418865203857, + "learning_rate": 4.221513773723301e-06, + "loss": 3.8564, + "step": 8615 + }, + { + "epoch": 1.103296, + "grad_norm": 3.2973501682281494, + "learning_rate": 4.216532662294342e-06, + "loss": 3.9575, + "step": 8620 + }, + { + "epoch": 1.103936, + "grad_norm": 3.2574779987335205, + "learning_rate": 4.211552347868961e-06, + "loss": 3.8727, + "step": 8625 + }, + { + "epoch": 1.104576, + "grad_norm": 3.1038432121276855, + "learning_rate": 4.2065728355135225e-06, + "loss": 3.7745, + "step": 8630 + }, + { + "epoch": 1.105216, + "grad_norm": 3.3146615028381348, + "learning_rate": 4.201594130293568e-06, + "loss": 3.7769, + "step": 8635 + }, + { + "epoch": 1.105856, + "grad_norm": 3.2965011596679688, + "learning_rate": 4.196616237273826e-06, + "loss": 3.8676, + "step": 8640 + }, + { + "epoch": 1.106496, + "grad_norm": 3.0500552654266357, + "learning_rate": 4.191639161518193e-06, + "loss": 4.0099, + "step": 8645 + }, + { + "epoch": 1.107136, + "grad_norm": 3.3446359634399414, + "learning_rate": 4.1866629080897345e-06, + "loss": 3.9098, + "step": 8650 + }, + { + "epoch": 1.107776, + "grad_norm": 3.277162551879883, + "learning_rate": 4.181687482050679e-06, + "loss": 3.8826, + "step": 8655 + }, + { + "epoch": 1.108416, + "grad_norm": 3.272045135498047, + "learning_rate": 4.176712888462417e-06, + "loss": 3.8741, + "step": 8660 + }, + { + "epoch": 1.109056, + "grad_norm": 3.469515323638916, + "learning_rate": 4.171739132385488e-06, + "loss": 3.8808, + "step": 8665 + }, + { + "epoch": 1.109696, + "grad_norm": 3.2473506927490234, + "learning_rate": 4.166766218879586e-06, + "loss": 3.8313, + "step": 8670 + }, + { + "epoch": 1.110336, + "grad_norm": 3.272639274597168, + "learning_rate": 4.161794153003538e-06, + "loss": 3.922, + "step": 8675 + }, + { + "epoch": 1.110976, + "grad_norm": 3.539156198501587, + "learning_rate": 4.156822939815314e-06, + "loss": 3.9184, + "step": 8680 + }, + { + "epoch": 1.111616, + "grad_norm": 3.2175755500793457, + "learning_rate": 4.151852584372021e-06, + "loss": 3.84, + "step": 8685 + }, + { + "epoch": 1.112256, + "grad_norm": 3.154583692550659, + "learning_rate": 4.146883091729887e-06, + "loss": 3.8663, + "step": 8690 + }, + { + "epoch": 1.112896, + "grad_norm": 3.3193087577819824, + "learning_rate": 4.141914466944262e-06, + "loss": 3.9162, + "step": 8695 + }, + { + "epoch": 1.113536, + "grad_norm": 7.432323932647705, + "learning_rate": 4.136946715069617e-06, + "loss": 3.8672, + "step": 8700 + }, + { + "epoch": 1.113536, + "eval_loss": 0.978622317314148, + "eval_runtime": 6.508, + "eval_samples_per_second": 153.658, + "eval_steps_per_second": 19.207, + "step": 8700 + }, + { + "epoch": 1.114176, + "grad_norm": 3.340735912322998, + "learning_rate": 4.1319798411595366e-06, + "loss": 3.8371, + "step": 8705 + }, + { + "epoch": 1.114816, + "grad_norm": 3.2261412143707275, + "learning_rate": 4.127013850266706e-06, + "loss": 3.8953, + "step": 8710 + }, + { + "epoch": 1.115456, + "grad_norm": 3.2747981548309326, + "learning_rate": 4.122048747442915e-06, + "loss": 3.8204, + "step": 8715 + }, + { + "epoch": 1.116096, + "grad_norm": 3.2461555004119873, + "learning_rate": 4.117084537739049e-06, + "loss": 3.9218, + "step": 8720 + }, + { + "epoch": 1.116736, + "grad_norm": 3.4664454460144043, + "learning_rate": 4.112121226205091e-06, + "loss": 3.8422, + "step": 8725 + }, + { + "epoch": 1.117376, + "grad_norm": 3.183678150177002, + "learning_rate": 4.107158817890101e-06, + "loss": 3.8495, + "step": 8730 + }, + { + "epoch": 1.118016, + "grad_norm": 3.3027968406677246, + "learning_rate": 4.102197317842227e-06, + "loss": 3.847, + "step": 8735 + }, + { + "epoch": 1.118656, + "grad_norm": 3.284727096557617, + "learning_rate": 4.097236731108688e-06, + "loss": 3.9299, + "step": 8740 + }, + { + "epoch": 1.119296, + "grad_norm": 3.5089738368988037, + "learning_rate": 4.092277062735779e-06, + "loss": 3.9105, + "step": 8745 + }, + { + "epoch": 1.119936, + "grad_norm": 3.26532244682312, + "learning_rate": 4.0873183177688595e-06, + "loss": 3.8122, + "step": 8750 + }, + { + "epoch": 1.120576, + "grad_norm": 3.258549213409424, + "learning_rate": 4.082360501252345e-06, + "loss": 3.8606, + "step": 8755 + }, + { + "epoch": 1.121216, + "grad_norm": 3.922128677368164, + "learning_rate": 4.077403618229711e-06, + "loss": 3.8658, + "step": 8760 + }, + { + "epoch": 1.121856, + "grad_norm": 3.0818393230438232, + "learning_rate": 4.072447673743484e-06, + "loss": 3.9445, + "step": 8765 + }, + { + "epoch": 1.122496, + "grad_norm": 3.128241777420044, + "learning_rate": 4.067492672835231e-06, + "loss": 3.8835, + "step": 8770 + }, + { + "epoch": 1.123136, + "grad_norm": 3.4902095794677734, + "learning_rate": 4.0625386205455675e-06, + "loss": 3.8119, + "step": 8775 + }, + { + "epoch": 1.1237759999999999, + "grad_norm": 3.1120331287384033, + "learning_rate": 4.057585521914132e-06, + "loss": 3.9596, + "step": 8780 + }, + { + "epoch": 1.124416, + "grad_norm": 3.81187105178833, + "learning_rate": 4.052633381979605e-06, + "loss": 3.8186, + "step": 8785 + }, + { + "epoch": 1.125056, + "grad_norm": 3.3931918144226074, + "learning_rate": 4.047682205779684e-06, + "loss": 3.877, + "step": 8790 + }, + { + "epoch": 1.125696, + "grad_norm": 3.126008987426758, + "learning_rate": 4.042731998351088e-06, + "loss": 3.844, + "step": 8795 + }, + { + "epoch": 1.126336, + "grad_norm": 3.5310118198394775, + "learning_rate": 4.037782764729552e-06, + "loss": 3.9613, + "step": 8800 + }, + { + "epoch": 1.126336, + "eval_loss": 0.9762705564498901, + "eval_runtime": 6.6433, + "eval_samples_per_second": 150.527, + "eval_steps_per_second": 18.816, + "step": 8800 + }, + { + "epoch": 1.126976, + "grad_norm": 3.327422857284546, + "learning_rate": 4.032834509949818e-06, + "loss": 3.8829, + "step": 8805 + }, + { + "epoch": 1.127616, + "grad_norm": 3.368654727935791, + "learning_rate": 4.027887239045636e-06, + "loss": 3.8485, + "step": 8810 + }, + { + "epoch": 1.128256, + "grad_norm": 3.317504644393921, + "learning_rate": 4.022940957049752e-06, + "loss": 3.8833, + "step": 8815 + }, + { + "epoch": 1.1288960000000001, + "grad_norm": 3.361346483230591, + "learning_rate": 4.017995668993904e-06, + "loss": 3.7834, + "step": 8820 + }, + { + "epoch": 1.129536, + "grad_norm": 3.4562320709228516, + "learning_rate": 4.013051379908822e-06, + "loss": 3.9239, + "step": 8825 + }, + { + "epoch": 1.130176, + "grad_norm": 3.381047487258911, + "learning_rate": 4.008108094824222e-06, + "loss": 3.8583, + "step": 8830 + }, + { + "epoch": 1.130816, + "grad_norm": 3.192483425140381, + "learning_rate": 4.0031658187687946e-06, + "loss": 3.9943, + "step": 8835 + }, + { + "epoch": 1.131456, + "grad_norm": 3.2975564002990723, + "learning_rate": 3.998224556770205e-06, + "loss": 3.8301, + "step": 8840 + }, + { + "epoch": 1.132096, + "grad_norm": 3.210685968399048, + "learning_rate": 3.993284313855086e-06, + "loss": 3.8343, + "step": 8845 + }, + { + "epoch": 1.132736, + "grad_norm": 3.294238328933716, + "learning_rate": 3.988345095049039e-06, + "loss": 3.9252, + "step": 8850 + }, + { + "epoch": 1.133376, + "grad_norm": 3.1468687057495117, + "learning_rate": 3.983406905376615e-06, + "loss": 3.8875, + "step": 8855 + }, + { + "epoch": 1.134016, + "grad_norm": 3.2530410289764404, + "learning_rate": 3.978469749861326e-06, + "loss": 3.7618, + "step": 8860 + }, + { + "epoch": 1.134656, + "grad_norm": 3.5931050777435303, + "learning_rate": 3.973533633525623e-06, + "loss": 3.9476, + "step": 8865 + }, + { + "epoch": 1.135296, + "grad_norm": 3.414227247238159, + "learning_rate": 3.968598561390911e-06, + "loss": 3.8739, + "step": 8870 + }, + { + "epoch": 1.135936, + "grad_norm": 3.238137722015381, + "learning_rate": 3.963664538477527e-06, + "loss": 3.8407, + "step": 8875 + }, + { + "epoch": 1.136576, + "grad_norm": 3.327712297439575, + "learning_rate": 3.958731569804738e-06, + "loss": 3.8728, + "step": 8880 + }, + { + "epoch": 1.137216, + "grad_norm": 3.3951661586761475, + "learning_rate": 3.95379966039074e-06, + "loss": 3.9143, + "step": 8885 + }, + { + "epoch": 1.137856, + "grad_norm": 3.6422483921051025, + "learning_rate": 3.948868815252658e-06, + "loss": 3.9115, + "step": 8890 + }, + { + "epoch": 1.138496, + "grad_norm": 3.4670307636260986, + "learning_rate": 3.9439390394065245e-06, + "loss": 3.9659, + "step": 8895 + }, + { + "epoch": 1.139136, + "grad_norm": 3.2578067779541016, + "learning_rate": 3.93901033786729e-06, + "loss": 3.9247, + "step": 8900 + }, + { + "epoch": 1.139136, + "eval_loss": 0.9684814214706421, + "eval_runtime": 6.8159, + "eval_samples_per_second": 146.715, + "eval_steps_per_second": 18.339, + "step": 8900 + }, + { + "epoch": 1.139776, + "grad_norm": 3.3462729454040527, + "learning_rate": 3.934082715648812e-06, + "loss": 4.0389, + "step": 8905 + }, + { + "epoch": 1.140416, + "grad_norm": 3.5041468143463135, + "learning_rate": 3.9291561777638486e-06, + "loss": 3.8389, + "step": 8910 + }, + { + "epoch": 1.141056, + "grad_norm": 3.22247576713562, + "learning_rate": 3.924230729224056e-06, + "loss": 3.8226, + "step": 8915 + }, + { + "epoch": 1.141696, + "grad_norm": 3.356802463531494, + "learning_rate": 3.91930637503998e-06, + "loss": 3.9254, + "step": 8920 + }, + { + "epoch": 1.142336, + "grad_norm": 3.6070470809936523, + "learning_rate": 3.914383120221053e-06, + "loss": 3.8787, + "step": 8925 + }, + { + "epoch": 1.142976, + "grad_norm": 3.1773595809936523, + "learning_rate": 3.909460969775595e-06, + "loss": 3.7098, + "step": 8930 + }, + { + "epoch": 1.143616, + "grad_norm": 3.2447359561920166, + "learning_rate": 3.904539928710796e-06, + "loss": 3.8433, + "step": 8935 + }, + { + "epoch": 1.144256, + "grad_norm": 3.0843183994293213, + "learning_rate": 3.899620002032718e-06, + "loss": 3.961, + "step": 8940 + }, + { + "epoch": 1.144896, + "grad_norm": 3.4813742637634277, + "learning_rate": 3.894701194746291e-06, + "loss": 3.8173, + "step": 8945 + }, + { + "epoch": 1.1455359999999999, + "grad_norm": 3.433196783065796, + "learning_rate": 3.889783511855311e-06, + "loss": 3.944, + "step": 8950 + }, + { + "epoch": 1.146176, + "grad_norm": 3.2032461166381836, + "learning_rate": 3.884866958362421e-06, + "loss": 3.8223, + "step": 8955 + }, + { + "epoch": 1.146816, + "grad_norm": 3.171764850616455, + "learning_rate": 3.879951539269122e-06, + "loss": 3.8118, + "step": 8960 + }, + { + "epoch": 1.147456, + "grad_norm": 3.438223123550415, + "learning_rate": 3.8750372595757545e-06, + "loss": 3.8781, + "step": 8965 + }, + { + "epoch": 1.148096, + "grad_norm": 3.287339687347412, + "learning_rate": 3.870124124281509e-06, + "loss": 3.8611, + "step": 8970 + }, + { + "epoch": 1.148736, + "grad_norm": 3.185896158218384, + "learning_rate": 3.8652121383844035e-06, + "loss": 3.7097, + "step": 8975 + }, + { + "epoch": 1.149376, + "grad_norm": 3.296564817428589, + "learning_rate": 3.860301306881292e-06, + "loss": 3.8195, + "step": 8980 + }, + { + "epoch": 1.150016, + "grad_norm": 3.3196237087249756, + "learning_rate": 3.85539163476785e-06, + "loss": 3.8855, + "step": 8985 + }, + { + "epoch": 1.1506560000000001, + "grad_norm": 3.337581157684326, + "learning_rate": 3.8504831270385765e-06, + "loss": 3.8602, + "step": 8990 + }, + { + "epoch": 1.151296, + "grad_norm": 3.3622496128082275, + "learning_rate": 3.845575788686787e-06, + "loss": 3.8893, + "step": 8995 + }, + { + "epoch": 1.151936, + "grad_norm": 3.600410223007202, + "learning_rate": 3.840669624704605e-06, + "loss": 3.9255, + "step": 9000 + }, + { + "epoch": 1.151936, + "eval_loss": 0.9813051223754883, + "eval_runtime": 6.8153, + "eval_samples_per_second": 146.729, + "eval_steps_per_second": 18.341, + "step": 9000 } ], "logging_steps": 5, @@ -2394,7 +13346,7 @@ "attributes": {} } }, - "total_flos": 5.585434243497984e+17, + "total_flos": 3.1416322171475067e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null