|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9992892679459844, |
|
"eval_steps": 500, |
|
"global_step": 703, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 8.976618038646277, |
|
"learning_rate": 2.8169014084507043e-07, |
|
"loss": 1.2046, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 9.12950183658323, |
|
"learning_rate": 1.4084507042253523e-06, |
|
"loss": 1.2117, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 9.470582272109555, |
|
"learning_rate": 2.8169014084507046e-06, |
|
"loss": 1.1487, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.298228638351113, |
|
"learning_rate": 4.225352112676057e-06, |
|
"loss": 1.0049, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.6882426355197893, |
|
"learning_rate": 5.633802816901409e-06, |
|
"loss": 0.9194, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.2713113746044633, |
|
"learning_rate": 7.042253521126761e-06, |
|
"loss": 0.8716, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.0798985958176581, |
|
"learning_rate": 8.450704225352114e-06, |
|
"loss": 0.864, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.7403180014309052, |
|
"learning_rate": 9.859154929577466e-06, |
|
"loss": 0.8359, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6646311223634714, |
|
"learning_rate": 1.1267605633802819e-05, |
|
"loss": 0.8614, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6593031879361225, |
|
"learning_rate": 1.2676056338028171e-05, |
|
"loss": 0.8375, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6848734508736534, |
|
"learning_rate": 1.4084507042253522e-05, |
|
"loss": 0.8305, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.607938821978484, |
|
"learning_rate": 1.5492957746478872e-05, |
|
"loss": 0.8392, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5835312438044307, |
|
"learning_rate": 1.6901408450704228e-05, |
|
"loss": 0.8211, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5785695089038153, |
|
"learning_rate": 1.830985915492958e-05, |
|
"loss": 0.8104, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6041477487355849, |
|
"learning_rate": 1.9718309859154933e-05, |
|
"loss": 0.7881, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5827780932059524, |
|
"learning_rate": 1.9998023297700656e-05, |
|
"loss": 0.8227, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5965623592450553, |
|
"learning_rate": 1.9989994283927287e-05, |
|
"loss": 0.7805, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5862237875540276, |
|
"learning_rate": 1.997579437055642e-05, |
|
"loss": 0.8011, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.6398990841425803, |
|
"learning_rate": 1.9955432328988437e-05, |
|
"loss": 0.8092, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5892745053482765, |
|
"learning_rate": 1.9928920737019735e-05, |
|
"loss": 0.8111, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.6020877373053659, |
|
"learning_rate": 1.9896275971073326e-05, |
|
"loss": 0.8034, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5950163528352244, |
|
"learning_rate": 1.9857518196082964e-05, |
|
"loss": 0.783, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6219407257978654, |
|
"learning_rate": 1.981267135303714e-05, |
|
"loss": 0.7986, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6390371388931367, |
|
"learning_rate": 1.976176314419051e-05, |
|
"loss": 0.8172, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.6237373520959375, |
|
"learning_rate": 1.9704825015952005e-05, |
|
"loss": 0.7981, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6799237392350457, |
|
"learning_rate": 1.9641892139460133e-05, |
|
"loss": 0.8019, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5744917640149093, |
|
"learning_rate": 1.9573003388857476e-05, |
|
"loss": 0.7802, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.6373020618232809, |
|
"learning_rate": 1.949820131727783e-05, |
|
"loss": 0.8233, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5813679472234877, |
|
"learning_rate": 1.9417532130560784e-05, |
|
"loss": 0.7793, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5752342288229056, |
|
"learning_rate": 1.933104565871001e-05, |
|
"loss": 0.7901, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5794685365516447, |
|
"learning_rate": 1.9238795325112867e-05, |
|
"loss": 0.7823, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5914869332169176, |
|
"learning_rate": 1.9140838113540347e-05, |
|
"loss": 0.7906, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.602426155045894, |
|
"learning_rate": 1.9037234532947737e-05, |
|
"loss": 0.7858, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5749535966002742, |
|
"learning_rate": 1.8928048580097758e-05, |
|
"loss": 0.7902, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.586281192585965, |
|
"learning_rate": 1.8813347700029244e-05, |
|
"loss": 0.7756, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.6233741174701375, |
|
"learning_rate": 1.869320274439583e-05, |
|
"loss": 0.7945, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.6098071502533721, |
|
"learning_rate": 1.8567687927700255e-05, |
|
"loss": 0.8089, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.6064829730589661, |
|
"learning_rate": 1.8436880781451545e-05, |
|
"loss": 0.7866, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.6415063522870069, |
|
"learning_rate": 1.8300862106273113e-05, |
|
"loss": 0.7661, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5976369233036758, |
|
"learning_rate": 1.8159715921991612e-05, |
|
"loss": 0.7518, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5808209243272175, |
|
"learning_rate": 1.801352941573718e-05, |
|
"loss": 0.777, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.6085478615714451, |
|
"learning_rate": 1.786239288808727e-05, |
|
"loss": 0.7517, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.6110100484028232, |
|
"learning_rate": 1.770639969728726e-05, |
|
"loss": 0.7582, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.6103064740938433, |
|
"learning_rate": 1.7545646201582304e-05, |
|
"loss": 0.7739, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.6095643766898001, |
|
"learning_rate": 1.738023169969608e-05, |
|
"loss": 0.7747, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5878428144325306, |
|
"learning_rate": 1.721025836949317e-05, |
|
"loss": 0.7601, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.580193134978998, |
|
"learning_rate": 1.703583120486297e-05, |
|
"loss": 0.7831, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.6531767240234573, |
|
"learning_rate": 1.6857057950864134e-05, |
|
"loss": 0.7792, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6269016285983363, |
|
"learning_rate": 1.6674049037169565e-05, |
|
"loss": 0.7699, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5676615892406944, |
|
"learning_rate": 1.648691750985314e-05, |
|
"loss": 0.7465, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.6103636647559819, |
|
"learning_rate": 1.6295778961560242e-05, |
|
"loss": 0.7615, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5602858008551354, |
|
"learning_rate": 1.6100751460105244e-05, |
|
"loss": 0.7517, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.5652587482524394, |
|
"learning_rate": 1.5901955475540087e-05, |
|
"loss": 0.7433, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.6293727832821757, |
|
"learning_rate": 1.5699513805738942e-05, |
|
"loss": 0.7546, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5899236848402083, |
|
"learning_rate": 1.549355150054501e-05, |
|
"loss": 0.7586, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5858156410561468, |
|
"learning_rate": 1.5284195784526196e-05, |
|
"loss": 0.7435, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.6461462712535574, |
|
"learning_rate": 1.5071575978387505e-05, |
|
"loss": 0.7557, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5896422662231582, |
|
"learning_rate": 1.4855823419088576e-05, |
|
"loss": 0.7523, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6152912876368505, |
|
"learning_rate": 1.4637071378715807e-05, |
|
"loss": 0.7466, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5476828784032306, |
|
"learning_rate": 1.4415454982159121e-05, |
|
"loss": 0.7575, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.5963972160682588, |
|
"learning_rate": 1.419111112364422e-05, |
|
"loss": 0.765, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.5892250766623344, |
|
"learning_rate": 1.3964178382171942e-05, |
|
"loss": 0.7509, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.5821016231570963, |
|
"learning_rate": 1.3734796935916888e-05, |
|
"loss": 0.7248, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.5652243248281746, |
|
"learning_rate": 1.3503108475638244e-05, |
|
"loss": 0.7288, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6281159930375059, |
|
"learning_rate": 1.326925611715627e-05, |
|
"loss": 0.7559, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.5888656609358633, |
|
"learning_rate": 1.3033384312948487e-05, |
|
"loss": 0.7448, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6165686859395609, |
|
"learning_rate": 1.2795638762920254e-05, |
|
"loss": 0.742, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6381549925910247, |
|
"learning_rate": 1.2556166324404747e-05, |
|
"loss": 0.7099, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.5545610243561243, |
|
"learning_rate": 1.2315114921448012e-05, |
|
"loss": 0.7361, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.5706928669226053, |
|
"learning_rate": 1.2072633453435092e-05, |
|
"loss": 0.7258, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6107785989730811, |
|
"learning_rate": 1.1828871703113686e-05, |
|
"loss": 0.7525, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6008084503226107, |
|
"learning_rate": 1.158398024407215e-05, |
|
"loss": 0.7443, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.5731046479253954, |
|
"learning_rate": 1.1338110347728973e-05, |
|
"loss": 0.7488, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.6378170263505424, |
|
"learning_rate": 1.1091413889891211e-05, |
|
"loss": 0.7451, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.6479687798024056, |
|
"learning_rate": 1.0844043256939585e-05, |
|
"loss": 0.7513, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.6131548915096868, |
|
"learning_rate": 1.05961512516982e-05, |
|
"loss": 0.7547, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.6034495628166195, |
|
"learning_rate": 1.0347890999046998e-05, |
|
"loss": 0.7292, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.5595982017393247, |
|
"learning_rate": 1.00994158513353e-05, |
|
"loss": 0.7291, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.5908657324990593, |
|
"learning_rate": 9.850879293654829e-06, |
|
"loss": 0.7319, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5686261006528736, |
|
"learning_rate": 9.602434849030747e-06, |
|
"loss": 0.7522, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.5931803289250058, |
|
"learning_rate": 9.354235983589229e-06, |
|
"loss": 0.7285, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.576733249565169, |
|
"learning_rate": 9.106436011760229e-06, |
|
"loss": 0.7102, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6419706570687358, |
|
"learning_rate": 8.859188001573916e-06, |
|
"loss": 0.7511, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.5844726337209573, |
|
"learning_rate": 8.61264468010932e-06, |
|
"loss": 0.7022, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6120657481180046, |
|
"learning_rate": 8.3669583391536e-06, |
|
"loss": 0.7274, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.5798319383539561, |
|
"learning_rate": 8.122280741130177e-06, |
|
"loss": 0.737, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.5911269901152743, |
|
"learning_rate": 7.878763025353875e-06, |
|
"loss": 0.7456, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.6030008166764901, |
|
"learning_rate": 7.636555614670953e-06, |
|
"loss": 0.7443, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.584933481934183, |
|
"learning_rate": 7.395808122541697e-06, |
|
"loss": 0.7457, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.6178817275024883, |
|
"learning_rate": 7.156669260622997e-06, |
|
"loss": 0.7272, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.576738068462372, |
|
"learning_rate": 6.9192867469079625e-06, |
|
"loss": 0.7355, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.610153251929595, |
|
"learning_rate": 6.683807214479323e-06, |
|
"loss": 0.7369, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5755637011823521, |
|
"learning_rate": 6.450376120933008e-06, |
|
"loss": 0.725, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.6096210457693504, |
|
"learning_rate": 6.219137658527819e-06, |
|
"loss": 0.7485, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.5633088212735556, |
|
"learning_rate": 5.990234665116713e-06, |
|
"loss": 0.7322, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.616469990783489, |
|
"learning_rate": 5.7638085359147235e-06, |
|
"loss": 0.7234, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6045801377903139, |
|
"learning_rate": 5.539999136157977e-06, |
|
"loss": 0.7093, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.5168220414675904, |
|
"learning_rate": 5.318944714707861e-06, |
|
"loss": 0.7209, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.5819546302447135, |
|
"learning_rate": 5.100781818653549e-06, |
|
"loss": 0.7088, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6214693170980583, |
|
"learning_rate": 4.885645208965779e-06, |
|
"loss": 0.7295, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.5483651139224388, |
|
"learning_rate": 4.673667777253944e-06, |
|
"loss": 0.7452, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6043244439563197, |
|
"learning_rate": 4.464980463677846e-06, |
|
"loss": 0.7359, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5841849428722694, |
|
"learning_rate": 4.25971217606493e-06, |
|
"loss": 0.7364, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.5971777780370174, |
|
"learning_rate": 4.057989710282897e-06, |
|
"loss": 0.7287, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.5376730799298396, |
|
"learning_rate": 3.859937671916833e-06, |
|
"loss": 0.7383, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.6254203665305461, |
|
"learning_rate": 3.6656783992993885e-06, |
|
"loss": 0.7264, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.607374971442512, |
|
"learning_rate": 3.475331887941388e-06, |
|
"loss": 0.7384, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.5562789621862069, |
|
"learning_rate": 3.2890157164096315e-06, |
|
"loss": 0.7398, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6391017139819878, |
|
"learning_rate": 3.1068449736977015e-06, |
|
"loss": 0.7341, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.5658442494663427, |
|
"learning_rate": 2.9289321881345257e-06, |
|
"loss": 0.7244, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.5722130325678222, |
|
"learning_rate": 2.755387257874764e-06, |
|
"loss": 0.7228, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.5879060164397627, |
|
"learning_rate": 2.5863173830138212e-06, |
|
"loss": 0.718, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.5854775512772113, |
|
"learning_rate": 2.4218269993694733e-06, |
|
"loss": 0.7286, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.543757639434833, |
|
"learning_rate": 2.262017713971063e-06, |
|
"loss": 0.712, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.5347024748449094, |
|
"learning_rate": 2.106988242295981e-06, |
|
"loss": 0.7311, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.5422054679063607, |
|
"learning_rate": 1.9568343472923524e-06, |
|
"loss": 0.7156, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.6108858998011519, |
|
"learning_rate": 1.8116487802254868e-06, |
|
"loss": 0.7282, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.6037057135293289, |
|
"learning_rate": 1.6715212233846656e-06, |
|
"loss": 0.7159, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.561518631539835, |
|
"learning_rate": 1.5365382346857005e-06, |
|
"loss": 0.7349, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.5560148126465202, |
|
"learning_rate": 1.4067831942033904e-06, |
|
"loss": 0.7106, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.5895011500377241, |
|
"learning_rate": 1.2823362526669825e-06, |
|
"loss": 0.7074, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.5592384208209128, |
|
"learning_rate": 1.1632742819504406e-06, |
|
"loss": 0.7243, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.599779305605372, |
|
"learning_rate": 1.0496708275880497e-06, |
|
"loss": 0.7238, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.5909990735500785, |
|
"learning_rate": 9.415960633447674e-07, |
|
"loss": 0.7158, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5695280113058802, |
|
"learning_rate": 8.391167478693241e-07, |
|
"loss": 0.721, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.5828076535860982, |
|
"learning_rate": 7.422961834568565e-07, |
|
"loss": 0.7097, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.5642901879945964, |
|
"learning_rate": 6.51194176946588e-07, |
|
"loss": 0.711, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.5948097987699422, |
|
"learning_rate": 5.658670027786561e-07, |
|
"loss": 0.713, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.6020151691331499, |
|
"learning_rate": 4.863673682329373e-07, |
|
"loss": 0.7395, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.5889825924322954, |
|
"learning_rate": 4.1274438087135273e-07, |
|
"loss": 0.7434, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.5306993928105772, |
|
"learning_rate": 3.450435182037104e-07, |
|
"loss": 0.6871, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.5664135818341235, |
|
"learning_rate": 2.8330659959589944e-07, |
|
"loss": 0.727, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.5686227351601807, |
|
"learning_rate": 2.275717604377292e-07, |
|
"loss": 0.7329, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.5396140909322422, |
|
"learning_rate": 1.7787342858638589e-07, |
|
"loss": 0.7136, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.5729429274266005, |
|
"learning_rate": 1.3424230310007946e-07, |
|
"loss": 0.739, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.5839067564756593, |
|
"learning_rate": 9.670533527498139e-08, |
|
"loss": 0.7209, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.5835981596712281, |
|
"learning_rate": 6.528571199719502e-08, |
|
"loss": 0.7322, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.5714383368277501, |
|
"learning_rate": 4.000284142003264e-08, |
|
"loss": 0.7112, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.5810188754832274, |
|
"learning_rate": 2.0872340975438555e-08, |
|
"loss": 0.7327, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.572576103364852, |
|
"learning_rate": 7.906027726981568e-09, |
|
"loss": 0.7149, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5576564322671743, |
|
"learning_rate": 1.111911070356131e-09, |
|
"loss": 0.7072, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.7502086162567139, |
|
"eval_runtime": 9.4869, |
|
"eval_samples_per_second": 52.704, |
|
"eval_steps_per_second": 1.687, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 703, |
|
"total_flos": 101830592102400.0, |
|
"train_loss": 0.762717972100205, |
|
"train_runtime": 5850.9583, |
|
"train_samples_per_second": 15.382, |
|
"train_steps_per_second": 0.12 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 703, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"total_flos": 101830592102400.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|