{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992892679459844, "eval_steps": 500, "global_step": 703, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 8.976618038646277, "learning_rate": 2.8169014084507043e-07, "loss": 1.2046, "step": 1 }, { "epoch": 0.01, "grad_norm": 9.12950183658323, "learning_rate": 1.4084507042253523e-06, "loss": 1.2117, "step": 5 }, { "epoch": 0.01, "grad_norm": 9.470582272109555, "learning_rate": 2.8169014084507046e-06, "loss": 1.1487, "step": 10 }, { "epoch": 0.02, "grad_norm": 3.298228638351113, "learning_rate": 4.225352112676057e-06, "loss": 1.0049, "step": 15 }, { "epoch": 0.03, "grad_norm": 1.6882426355197893, "learning_rate": 5.633802816901409e-06, "loss": 0.9194, "step": 20 }, { "epoch": 0.04, "grad_norm": 1.2713113746044633, "learning_rate": 7.042253521126761e-06, "loss": 0.8716, "step": 25 }, { "epoch": 0.04, "grad_norm": 1.0798985958176581, "learning_rate": 8.450704225352114e-06, "loss": 0.864, "step": 30 }, { "epoch": 0.05, "grad_norm": 0.7403180014309052, "learning_rate": 9.859154929577466e-06, "loss": 0.8359, "step": 35 }, { "epoch": 0.06, "grad_norm": 0.6646311223634714, "learning_rate": 1.1267605633802819e-05, "loss": 0.8614, "step": 40 }, { "epoch": 0.06, "grad_norm": 0.6593031879361225, "learning_rate": 1.2676056338028171e-05, "loss": 0.8375, "step": 45 }, { "epoch": 0.07, "grad_norm": 0.6848734508736534, "learning_rate": 1.4084507042253522e-05, "loss": 0.8305, "step": 50 }, { "epoch": 0.08, "grad_norm": 0.607938821978484, "learning_rate": 1.5492957746478872e-05, "loss": 0.8392, "step": 55 }, { "epoch": 0.09, "grad_norm": 0.5835312438044307, "learning_rate": 1.6901408450704228e-05, "loss": 0.8211, "step": 60 }, { "epoch": 0.09, "grad_norm": 0.5785695089038153, "learning_rate": 1.830985915492958e-05, "loss": 0.8104, "step": 65 }, { "epoch": 0.1, "grad_norm": 0.6041477487355849, "learning_rate": 1.9718309859154933e-05, "loss": 0.7881, "step": 70 }, { "epoch": 0.11, "grad_norm": 0.5827780932059524, "learning_rate": 1.9998023297700656e-05, "loss": 0.8227, "step": 75 }, { "epoch": 0.11, "grad_norm": 0.5965623592450553, "learning_rate": 1.9989994283927287e-05, "loss": 0.7805, "step": 80 }, { "epoch": 0.12, "grad_norm": 0.5862237875540276, "learning_rate": 1.997579437055642e-05, "loss": 0.8011, "step": 85 }, { "epoch": 0.13, "grad_norm": 0.6398990841425803, "learning_rate": 1.9955432328988437e-05, "loss": 0.8092, "step": 90 }, { "epoch": 0.14, "grad_norm": 0.5892745053482765, "learning_rate": 1.9928920737019735e-05, "loss": 0.8111, "step": 95 }, { "epoch": 0.14, "grad_norm": 0.6020877373053659, "learning_rate": 1.9896275971073326e-05, "loss": 0.8034, "step": 100 }, { "epoch": 0.15, "grad_norm": 0.5950163528352244, "learning_rate": 1.9857518196082964e-05, "loss": 0.783, "step": 105 }, { "epoch": 0.16, "grad_norm": 0.6219407257978654, "learning_rate": 1.981267135303714e-05, "loss": 0.7986, "step": 110 }, { "epoch": 0.16, "grad_norm": 0.6390371388931367, "learning_rate": 1.976176314419051e-05, "loss": 0.8172, "step": 115 }, { "epoch": 0.17, "grad_norm": 0.6237373520959375, "learning_rate": 1.9704825015952005e-05, "loss": 0.7981, "step": 120 }, { "epoch": 0.18, "grad_norm": 0.6799237392350457, "learning_rate": 1.9641892139460133e-05, "loss": 0.8019, "step": 125 }, { "epoch": 0.18, "grad_norm": 0.5744917640149093, "learning_rate": 1.9573003388857476e-05, "loss": 0.7802, "step": 130 }, { "epoch": 0.19, "grad_norm": 0.6373020618232809, "learning_rate": 1.949820131727783e-05, "loss": 0.8233, "step": 135 }, { "epoch": 0.2, "grad_norm": 0.5813679472234877, "learning_rate": 1.9417532130560784e-05, "loss": 0.7793, "step": 140 }, { "epoch": 0.21, "grad_norm": 0.5752342288229056, "learning_rate": 1.933104565871001e-05, "loss": 0.7901, "step": 145 }, { "epoch": 0.21, "grad_norm": 0.5794685365516447, "learning_rate": 1.9238795325112867e-05, "loss": 0.7823, "step": 150 }, { "epoch": 0.22, "grad_norm": 0.5914869332169176, "learning_rate": 1.9140838113540347e-05, "loss": 0.7906, "step": 155 }, { "epoch": 0.23, "grad_norm": 0.602426155045894, "learning_rate": 1.9037234532947737e-05, "loss": 0.7858, "step": 160 }, { "epoch": 0.23, "grad_norm": 0.5749535966002742, "learning_rate": 1.8928048580097758e-05, "loss": 0.7902, "step": 165 }, { "epoch": 0.24, "grad_norm": 0.586281192585965, "learning_rate": 1.8813347700029244e-05, "loss": 0.7756, "step": 170 }, { "epoch": 0.25, "grad_norm": 0.6233741174701375, "learning_rate": 1.869320274439583e-05, "loss": 0.7945, "step": 175 }, { "epoch": 0.26, "grad_norm": 0.6098071502533721, "learning_rate": 1.8567687927700255e-05, "loss": 0.8089, "step": 180 }, { "epoch": 0.26, "grad_norm": 0.6064829730589661, "learning_rate": 1.8436880781451545e-05, "loss": 0.7866, "step": 185 }, { "epoch": 0.27, "grad_norm": 0.6415063522870069, "learning_rate": 1.8300862106273113e-05, "loss": 0.7661, "step": 190 }, { "epoch": 0.28, "grad_norm": 0.5976369233036758, "learning_rate": 1.8159715921991612e-05, "loss": 0.7518, "step": 195 }, { "epoch": 0.28, "grad_norm": 0.5808209243272175, "learning_rate": 1.801352941573718e-05, "loss": 0.777, "step": 200 }, { "epoch": 0.29, "grad_norm": 0.6085478615714451, "learning_rate": 1.786239288808727e-05, "loss": 0.7517, "step": 205 }, { "epoch": 0.3, "grad_norm": 0.6110100484028232, "learning_rate": 1.770639969728726e-05, "loss": 0.7582, "step": 210 }, { "epoch": 0.31, "grad_norm": 0.6103064740938433, "learning_rate": 1.7545646201582304e-05, "loss": 0.7739, "step": 215 }, { "epoch": 0.31, "grad_norm": 0.6095643766898001, "learning_rate": 1.738023169969608e-05, "loss": 0.7747, "step": 220 }, { "epoch": 0.32, "grad_norm": 0.5878428144325306, "learning_rate": 1.721025836949317e-05, "loss": 0.7601, "step": 225 }, { "epoch": 0.33, "grad_norm": 0.580193134978998, "learning_rate": 1.703583120486297e-05, "loss": 0.7831, "step": 230 }, { "epoch": 0.33, "grad_norm": 0.6531767240234573, "learning_rate": 1.6857057950864134e-05, "loss": 0.7792, "step": 235 }, { "epoch": 0.34, "grad_norm": 0.6269016285983363, "learning_rate": 1.6674049037169565e-05, "loss": 0.7699, "step": 240 }, { "epoch": 0.35, "grad_norm": 0.5676615892406944, "learning_rate": 1.648691750985314e-05, "loss": 0.7465, "step": 245 }, { "epoch": 0.36, "grad_norm": 0.6103636647559819, "learning_rate": 1.6295778961560242e-05, "loss": 0.7615, "step": 250 }, { "epoch": 0.36, "grad_norm": 0.5602858008551354, "learning_rate": 1.6100751460105244e-05, "loss": 0.7517, "step": 255 }, { "epoch": 0.37, "grad_norm": 0.5652587482524394, "learning_rate": 1.5901955475540087e-05, "loss": 0.7433, "step": 260 }, { "epoch": 0.38, "grad_norm": 0.6293727832821757, "learning_rate": 1.5699513805738942e-05, "loss": 0.7546, "step": 265 }, { "epoch": 0.38, "grad_norm": 0.5899236848402083, "learning_rate": 1.549355150054501e-05, "loss": 0.7586, "step": 270 }, { "epoch": 0.39, "grad_norm": 0.5858156410561468, "learning_rate": 1.5284195784526196e-05, "loss": 0.7435, "step": 275 }, { "epoch": 0.4, "grad_norm": 0.6461462712535574, "learning_rate": 1.5071575978387505e-05, "loss": 0.7557, "step": 280 }, { "epoch": 0.41, "grad_norm": 0.5896422662231582, "learning_rate": 1.4855823419088576e-05, "loss": 0.7523, "step": 285 }, { "epoch": 0.41, "grad_norm": 0.6152912876368505, "learning_rate": 1.4637071378715807e-05, "loss": 0.7466, "step": 290 }, { "epoch": 0.42, "grad_norm": 0.5476828784032306, "learning_rate": 1.4415454982159121e-05, "loss": 0.7575, "step": 295 }, { "epoch": 0.43, "grad_norm": 0.5963972160682588, "learning_rate": 1.419111112364422e-05, "loss": 0.765, "step": 300 }, { "epoch": 0.43, "grad_norm": 0.5892250766623344, "learning_rate": 1.3964178382171942e-05, "loss": 0.7509, "step": 305 }, { "epoch": 0.44, "grad_norm": 0.5821016231570963, "learning_rate": 1.3734796935916888e-05, "loss": 0.7248, "step": 310 }, { "epoch": 0.45, "grad_norm": 0.5652243248281746, "learning_rate": 1.3503108475638244e-05, "loss": 0.7288, "step": 315 }, { "epoch": 0.45, "grad_norm": 0.6281159930375059, "learning_rate": 1.326925611715627e-05, "loss": 0.7559, "step": 320 }, { "epoch": 0.46, "grad_norm": 0.5888656609358633, "learning_rate": 1.3033384312948487e-05, "loss": 0.7448, "step": 325 }, { "epoch": 0.47, "grad_norm": 0.6165686859395609, "learning_rate": 1.2795638762920254e-05, "loss": 0.742, "step": 330 }, { "epoch": 0.48, "grad_norm": 0.6381549925910247, "learning_rate": 1.2556166324404747e-05, "loss": 0.7099, "step": 335 }, { "epoch": 0.48, "grad_norm": 0.5545610243561243, "learning_rate": 1.2315114921448012e-05, "loss": 0.7361, "step": 340 }, { "epoch": 0.49, "grad_norm": 0.5706928669226053, "learning_rate": 1.2072633453435092e-05, "loss": 0.7258, "step": 345 }, { "epoch": 0.5, "grad_norm": 0.6107785989730811, "learning_rate": 1.1828871703113686e-05, "loss": 0.7525, "step": 350 }, { "epoch": 0.5, "grad_norm": 0.6008084503226107, "learning_rate": 1.158398024407215e-05, "loss": 0.7443, "step": 355 }, { "epoch": 0.51, "grad_norm": 0.5731046479253954, "learning_rate": 1.1338110347728973e-05, "loss": 0.7488, "step": 360 }, { "epoch": 0.52, "grad_norm": 0.6378170263505424, "learning_rate": 1.1091413889891211e-05, "loss": 0.7451, "step": 365 }, { "epoch": 0.53, "grad_norm": 0.6479687798024056, "learning_rate": 1.0844043256939585e-05, "loss": 0.7513, "step": 370 }, { "epoch": 0.53, "grad_norm": 0.6131548915096868, "learning_rate": 1.05961512516982e-05, "loss": 0.7547, "step": 375 }, { "epoch": 0.54, "grad_norm": 0.6034495628166195, "learning_rate": 1.0347890999046998e-05, "loss": 0.7292, "step": 380 }, { "epoch": 0.55, "grad_norm": 0.5595982017393247, "learning_rate": 1.00994158513353e-05, "loss": 0.7291, "step": 385 }, { "epoch": 0.55, "grad_norm": 0.5908657324990593, "learning_rate": 9.850879293654829e-06, "loss": 0.7319, "step": 390 }, { "epoch": 0.56, "grad_norm": 0.5686261006528736, "learning_rate": 9.602434849030747e-06, "loss": 0.7522, "step": 395 }, { "epoch": 0.57, "grad_norm": 0.5931803289250058, "learning_rate": 9.354235983589229e-06, "loss": 0.7285, "step": 400 }, { "epoch": 0.58, "grad_norm": 0.576733249565169, "learning_rate": 9.106436011760229e-06, "loss": 0.7102, "step": 405 }, { "epoch": 0.58, "grad_norm": 0.6419706570687358, "learning_rate": 8.859188001573916e-06, "loss": 0.7511, "step": 410 }, { "epoch": 0.59, "grad_norm": 0.5844726337209573, "learning_rate": 8.61264468010932e-06, "loss": 0.7022, "step": 415 }, { "epoch": 0.6, "grad_norm": 0.6120657481180046, "learning_rate": 8.3669583391536e-06, "loss": 0.7274, "step": 420 }, { "epoch": 0.6, "grad_norm": 0.5798319383539561, "learning_rate": 8.122280741130177e-06, "loss": 0.737, "step": 425 }, { "epoch": 0.61, "grad_norm": 0.5911269901152743, "learning_rate": 7.878763025353875e-06, "loss": 0.7456, "step": 430 }, { "epoch": 0.62, "grad_norm": 0.6030008166764901, "learning_rate": 7.636555614670953e-06, "loss": 0.7443, "step": 435 }, { "epoch": 0.63, "grad_norm": 0.584933481934183, "learning_rate": 7.395808122541697e-06, "loss": 0.7457, "step": 440 }, { "epoch": 0.63, "grad_norm": 0.6178817275024883, "learning_rate": 7.156669260622997e-06, "loss": 0.7272, "step": 445 }, { "epoch": 0.64, "grad_norm": 0.576738068462372, "learning_rate": 6.9192867469079625e-06, "loss": 0.7355, "step": 450 }, { "epoch": 0.65, "grad_norm": 0.610153251929595, "learning_rate": 6.683807214479323e-06, "loss": 0.7369, "step": 455 }, { "epoch": 0.65, "grad_norm": 0.5755637011823521, "learning_rate": 6.450376120933008e-06, "loss": 0.725, "step": 460 }, { "epoch": 0.66, "grad_norm": 0.6096210457693504, "learning_rate": 6.219137658527819e-06, "loss": 0.7485, "step": 465 }, { "epoch": 0.67, "grad_norm": 0.5633088212735556, "learning_rate": 5.990234665116713e-06, "loss": 0.7322, "step": 470 }, { "epoch": 0.68, "grad_norm": 0.616469990783489, "learning_rate": 5.7638085359147235e-06, "loss": 0.7234, "step": 475 }, { "epoch": 0.68, "grad_norm": 0.6045801377903139, "learning_rate": 5.539999136157977e-06, "loss": 0.7093, "step": 480 }, { "epoch": 0.69, "grad_norm": 0.5168220414675904, "learning_rate": 5.318944714707861e-06, "loss": 0.7209, "step": 485 }, { "epoch": 0.7, "grad_norm": 0.5819546302447135, "learning_rate": 5.100781818653549e-06, "loss": 0.7088, "step": 490 }, { "epoch": 0.7, "grad_norm": 0.6214693170980583, "learning_rate": 4.885645208965779e-06, "loss": 0.7295, "step": 495 }, { "epoch": 0.71, "grad_norm": 0.5483651139224388, "learning_rate": 4.673667777253944e-06, "loss": 0.7452, "step": 500 }, { "epoch": 0.72, "grad_norm": 0.6043244439563197, "learning_rate": 4.464980463677846e-06, "loss": 0.7359, "step": 505 }, { "epoch": 0.72, "grad_norm": 0.5841849428722694, "learning_rate": 4.25971217606493e-06, "loss": 0.7364, "step": 510 }, { "epoch": 0.73, "grad_norm": 0.5971777780370174, "learning_rate": 4.057989710282897e-06, "loss": 0.7287, "step": 515 }, { "epoch": 0.74, "grad_norm": 0.5376730799298396, "learning_rate": 3.859937671916833e-06, "loss": 0.7383, "step": 520 }, { "epoch": 0.75, "grad_norm": 0.6254203665305461, "learning_rate": 3.6656783992993885e-06, "loss": 0.7264, "step": 525 }, { "epoch": 0.75, "grad_norm": 0.607374971442512, "learning_rate": 3.475331887941388e-06, "loss": 0.7384, "step": 530 }, { "epoch": 0.76, "grad_norm": 0.5562789621862069, "learning_rate": 3.2890157164096315e-06, "loss": 0.7398, "step": 535 }, { "epoch": 0.77, "grad_norm": 0.6391017139819878, "learning_rate": 3.1068449736977015e-06, "loss": 0.7341, "step": 540 }, { "epoch": 0.77, "grad_norm": 0.5658442494663427, "learning_rate": 2.9289321881345257e-06, "loss": 0.7244, "step": 545 }, { "epoch": 0.78, "grad_norm": 0.5722130325678222, "learning_rate": 2.755387257874764e-06, "loss": 0.7228, "step": 550 }, { "epoch": 0.79, "grad_norm": 0.5879060164397627, "learning_rate": 2.5863173830138212e-06, "loss": 0.718, "step": 555 }, { "epoch": 0.8, "grad_norm": 0.5854775512772113, "learning_rate": 2.4218269993694733e-06, "loss": 0.7286, "step": 560 }, { "epoch": 0.8, "grad_norm": 0.543757639434833, "learning_rate": 2.262017713971063e-06, "loss": 0.712, "step": 565 }, { "epoch": 0.81, "grad_norm": 0.5347024748449094, "learning_rate": 2.106988242295981e-06, "loss": 0.7311, "step": 570 }, { "epoch": 0.82, "grad_norm": 0.5422054679063607, "learning_rate": 1.9568343472923524e-06, "loss": 0.7156, "step": 575 }, { "epoch": 0.82, "grad_norm": 0.6108858998011519, "learning_rate": 1.8116487802254868e-06, "loss": 0.7282, "step": 580 }, { "epoch": 0.83, "grad_norm": 0.6037057135293289, "learning_rate": 1.6715212233846656e-06, "loss": 0.7159, "step": 585 }, { "epoch": 0.84, "grad_norm": 0.561518631539835, "learning_rate": 1.5365382346857005e-06, "loss": 0.7349, "step": 590 }, { "epoch": 0.85, "grad_norm": 0.5560148126465202, "learning_rate": 1.4067831942033904e-06, "loss": 0.7106, "step": 595 }, { "epoch": 0.85, "grad_norm": 0.5895011500377241, "learning_rate": 1.2823362526669825e-06, "loss": 0.7074, "step": 600 }, { "epoch": 0.86, "grad_norm": 0.5592384208209128, "learning_rate": 1.1632742819504406e-06, "loss": 0.7243, "step": 605 }, { "epoch": 0.87, "grad_norm": 0.599779305605372, "learning_rate": 1.0496708275880497e-06, "loss": 0.7238, "step": 610 }, { "epoch": 0.87, "grad_norm": 0.5909990735500785, "learning_rate": 9.415960633447674e-07, "loss": 0.7158, "step": 615 }, { "epoch": 0.88, "grad_norm": 0.5695280113058802, "learning_rate": 8.391167478693241e-07, "loss": 0.721, "step": 620 }, { "epoch": 0.89, "grad_norm": 0.5828076535860982, "learning_rate": 7.422961834568565e-07, "loss": 0.7097, "step": 625 }, { "epoch": 0.9, "grad_norm": 0.5642901879945964, "learning_rate": 6.51194176946588e-07, "loss": 0.711, "step": 630 }, { "epoch": 0.9, "grad_norm": 0.5948097987699422, "learning_rate": 5.658670027786561e-07, "loss": 0.713, "step": 635 }, { "epoch": 0.91, "grad_norm": 0.6020151691331499, "learning_rate": 4.863673682329373e-07, "loss": 0.7395, "step": 640 }, { "epoch": 0.92, "grad_norm": 0.5889825924322954, "learning_rate": 4.1274438087135273e-07, "loss": 0.7434, "step": 645 }, { "epoch": 0.92, "grad_norm": 0.5306993928105772, "learning_rate": 3.450435182037104e-07, "loss": 0.6871, "step": 650 }, { "epoch": 0.93, "grad_norm": 0.5664135818341235, "learning_rate": 2.8330659959589944e-07, "loss": 0.727, "step": 655 }, { "epoch": 0.94, "grad_norm": 0.5686227351601807, "learning_rate": 2.275717604377292e-07, "loss": 0.7329, "step": 660 }, { "epoch": 0.95, "grad_norm": 0.5396140909322422, "learning_rate": 1.7787342858638589e-07, "loss": 0.7136, "step": 665 }, { "epoch": 0.95, "grad_norm": 0.5729429274266005, "learning_rate": 1.3424230310007946e-07, "loss": 0.739, "step": 670 }, { "epoch": 0.96, "grad_norm": 0.5839067564756593, "learning_rate": 9.670533527498139e-08, "loss": 0.7209, "step": 675 }, { "epoch": 0.97, "grad_norm": 0.5835981596712281, "learning_rate": 6.528571199719502e-08, "loss": 0.7322, "step": 680 }, { "epoch": 0.97, "grad_norm": 0.5714383368277501, "learning_rate": 4.000284142003264e-08, "loss": 0.7112, "step": 685 }, { "epoch": 0.98, "grad_norm": 0.5810188754832274, "learning_rate": 2.0872340975438555e-08, "loss": 0.7327, "step": 690 }, { "epoch": 0.99, "grad_norm": 0.572576103364852, "learning_rate": 7.906027726981568e-09, "loss": 0.7149, "step": 695 }, { "epoch": 1.0, "grad_norm": 0.5576564322671743, "learning_rate": 1.111911070356131e-09, "loss": 0.7072, "step": 700 }, { "epoch": 1.0, "eval_loss": 0.7502086162567139, "eval_runtime": 9.4869, "eval_samples_per_second": 52.704, "eval_steps_per_second": 1.687, "step": 703 }, { "epoch": 1.0, "step": 703, "total_flos": 101830592102400.0, "train_loss": 0.762717972100205, "train_runtime": 5850.9583, "train_samples_per_second": 15.382, "train_steps_per_second": 0.12 } ], "logging_steps": 5, "max_steps": 703, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 101830592102400.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }