|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9992892679459844, |
|
"eval_steps": 500, |
|
"global_step": 703, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 8.976748246707771, |
|
"learning_rate": 2.8169014084507043e-07, |
|
"loss": 1.2046, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 9.141400774480953, |
|
"learning_rate": 1.4084507042253523e-06, |
|
"loss": 1.212, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 9.49864441857399, |
|
"learning_rate": 2.8169014084507046e-06, |
|
"loss": 1.1487, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.2926935702992415, |
|
"learning_rate": 4.225352112676057e-06, |
|
"loss": 1.0049, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.7117549060658137, |
|
"learning_rate": 5.633802816901409e-06, |
|
"loss": 0.9194, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.2735260163319009, |
|
"learning_rate": 7.042253521126761e-06, |
|
"loss": 0.8716, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.0790301127605728, |
|
"learning_rate": 8.450704225352114e-06, |
|
"loss": 0.864, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.7414273564854603, |
|
"learning_rate": 9.859154929577466e-06, |
|
"loss": 0.8359, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6656956199139998, |
|
"learning_rate": 1.1267605633802819e-05, |
|
"loss": 0.8614, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6590669608248467, |
|
"learning_rate": 1.2676056338028171e-05, |
|
"loss": 0.8375, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.6842437677529626, |
|
"learning_rate": 1.4084507042253522e-05, |
|
"loss": 0.8306, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.6080481338431981, |
|
"learning_rate": 1.5492957746478872e-05, |
|
"loss": 0.8391, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5829909243761902, |
|
"learning_rate": 1.6901408450704228e-05, |
|
"loss": 0.8211, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5774288402112024, |
|
"learning_rate": 1.830985915492958e-05, |
|
"loss": 0.8104, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6037500273154381, |
|
"learning_rate": 1.9718309859154933e-05, |
|
"loss": 0.7882, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5823339744513759, |
|
"learning_rate": 1.9998023297700656e-05, |
|
"loss": 0.8226, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.5970199949943188, |
|
"learning_rate": 1.9989994283927287e-05, |
|
"loss": 0.7805, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5863046182186602, |
|
"learning_rate": 1.997579437055642e-05, |
|
"loss": 0.8011, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.6404658915624578, |
|
"learning_rate": 1.9955432328988437e-05, |
|
"loss": 0.8092, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5889469279145607, |
|
"learning_rate": 1.9928920737019735e-05, |
|
"loss": 0.8111, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.5999505405000287, |
|
"learning_rate": 1.9896275971073326e-05, |
|
"loss": 0.8034, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5958697553643331, |
|
"learning_rate": 1.9857518196082964e-05, |
|
"loss": 0.783, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6235891380868392, |
|
"learning_rate": 1.981267135303714e-05, |
|
"loss": 0.7986, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6386697510622403, |
|
"learning_rate": 1.976176314419051e-05, |
|
"loss": 0.8173, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.6223044203965952, |
|
"learning_rate": 1.9704825015952005e-05, |
|
"loss": 0.7981, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6801656349332326, |
|
"learning_rate": 1.9641892139460133e-05, |
|
"loss": 0.8018, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5736017682468991, |
|
"learning_rate": 1.9573003388857476e-05, |
|
"loss": 0.7801, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.6377284634315731, |
|
"learning_rate": 1.949820131727783e-05, |
|
"loss": 0.8233, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5834471085873422, |
|
"learning_rate": 1.9417532130560784e-05, |
|
"loss": 0.7792, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5750039228232332, |
|
"learning_rate": 1.933104565871001e-05, |
|
"loss": 0.79, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5788157023616078, |
|
"learning_rate": 1.9238795325112867e-05, |
|
"loss": 0.7823, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.5905988672413683, |
|
"learning_rate": 1.9140838113540347e-05, |
|
"loss": 0.7907, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.6036919615732501, |
|
"learning_rate": 1.9037234532947737e-05, |
|
"loss": 0.7859, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.5744116424871404, |
|
"learning_rate": 1.8928048580097758e-05, |
|
"loss": 0.7902, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5858290882445865, |
|
"learning_rate": 1.8813347700029244e-05, |
|
"loss": 0.7756, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.6203103361508707, |
|
"learning_rate": 1.869320274439583e-05, |
|
"loss": 0.7945, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.6094752243332237, |
|
"learning_rate": 1.8567687927700255e-05, |
|
"loss": 0.8089, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.6065177506682852, |
|
"learning_rate": 1.8436880781451545e-05, |
|
"loss": 0.7866, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.6422514480326399, |
|
"learning_rate": 1.8300862106273113e-05, |
|
"loss": 0.7661, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5976731025044114, |
|
"learning_rate": 1.8159715921991612e-05, |
|
"loss": 0.7518, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5813547875268166, |
|
"learning_rate": 1.801352941573718e-05, |
|
"loss": 0.777, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.6079636229603744, |
|
"learning_rate": 1.786239288808727e-05, |
|
"loss": 0.7517, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.6116028297485965, |
|
"learning_rate": 1.770639969728726e-05, |
|
"loss": 0.7582, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.6104535774475512, |
|
"learning_rate": 1.7545646201582304e-05, |
|
"loss": 0.7739, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.6092743685177864, |
|
"learning_rate": 1.738023169969608e-05, |
|
"loss": 0.7747, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5879141856316753, |
|
"learning_rate": 1.721025836949317e-05, |
|
"loss": 0.7601, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5801521678476529, |
|
"learning_rate": 1.703583120486297e-05, |
|
"loss": 0.7831, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.6538515323321731, |
|
"learning_rate": 1.6857057950864134e-05, |
|
"loss": 0.7792, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6283544032704685, |
|
"learning_rate": 1.6674049037169565e-05, |
|
"loss": 0.7699, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5680653535007143, |
|
"learning_rate": 1.648691750985314e-05, |
|
"loss": 0.7465, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.6108554255786504, |
|
"learning_rate": 1.6295778961560242e-05, |
|
"loss": 0.7615, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5595610558009545, |
|
"learning_rate": 1.6100751460105244e-05, |
|
"loss": 0.7517, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.5638534604629722, |
|
"learning_rate": 1.5901955475540087e-05, |
|
"loss": 0.7433, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.6290842137244347, |
|
"learning_rate": 1.5699513805738942e-05, |
|
"loss": 0.7546, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5899201877158341, |
|
"learning_rate": 1.549355150054501e-05, |
|
"loss": 0.7586, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5848454901161203, |
|
"learning_rate": 1.5284195784526196e-05, |
|
"loss": 0.7435, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.6441906832873957, |
|
"learning_rate": 1.5071575978387505e-05, |
|
"loss": 0.7557, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5889076952992223, |
|
"learning_rate": 1.4855823419088576e-05, |
|
"loss": 0.7523, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6138750994803515, |
|
"learning_rate": 1.4637071378715807e-05, |
|
"loss": 0.7466, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5460450262431842, |
|
"learning_rate": 1.4415454982159121e-05, |
|
"loss": 0.7575, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.5958584774428966, |
|
"learning_rate": 1.419111112364422e-05, |
|
"loss": 0.7651, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.5892595546498745, |
|
"learning_rate": 1.3964178382171942e-05, |
|
"loss": 0.7509, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.5824299769787271, |
|
"learning_rate": 1.3734796935916888e-05, |
|
"loss": 0.7248, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.564819905580984, |
|
"learning_rate": 1.3503108475638244e-05, |
|
"loss": 0.7288, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6263083055380363, |
|
"learning_rate": 1.326925611715627e-05, |
|
"loss": 0.7559, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.5885749596082486, |
|
"learning_rate": 1.3033384312948487e-05, |
|
"loss": 0.7448, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6217412195838338, |
|
"learning_rate": 1.2795638762920254e-05, |
|
"loss": 0.742, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6379002045294617, |
|
"learning_rate": 1.2556166324404747e-05, |
|
"loss": 0.7099, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.5539140087654301, |
|
"learning_rate": 1.2315114921448012e-05, |
|
"loss": 0.7362, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.5709158729850722, |
|
"learning_rate": 1.2072633453435092e-05, |
|
"loss": 0.7259, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6103140402089173, |
|
"learning_rate": 1.1828871703113686e-05, |
|
"loss": 0.7525, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6006237395857831, |
|
"learning_rate": 1.158398024407215e-05, |
|
"loss": 0.7443, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.5730036044160878, |
|
"learning_rate": 1.1338110347728973e-05, |
|
"loss": 0.7488, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.6357913417816614, |
|
"learning_rate": 1.1091413889891211e-05, |
|
"loss": 0.7451, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.6484616619442031, |
|
"learning_rate": 1.0844043256939585e-05, |
|
"loss": 0.7513, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.6147553788096094, |
|
"learning_rate": 1.05961512516982e-05, |
|
"loss": 0.7547, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.6026348227570094, |
|
"learning_rate": 1.0347890999046998e-05, |
|
"loss": 0.7291, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.5607221603847458, |
|
"learning_rate": 1.00994158513353e-05, |
|
"loss": 0.729, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.5908644383211163, |
|
"learning_rate": 9.850879293654829e-06, |
|
"loss": 0.7319, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5697397462576617, |
|
"learning_rate": 9.602434849030747e-06, |
|
"loss": 0.7522, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.5939805294573912, |
|
"learning_rate": 9.354235983589229e-06, |
|
"loss": 0.7285, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.5755564582128363, |
|
"learning_rate": 9.106436011760229e-06, |
|
"loss": 0.7102, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6451432716799574, |
|
"learning_rate": 8.859188001573916e-06, |
|
"loss": 0.7511, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.5843168824282191, |
|
"learning_rate": 8.61264468010932e-06, |
|
"loss": 0.7022, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6112413168467156, |
|
"learning_rate": 8.3669583391536e-06, |
|
"loss": 0.7275, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.581022238822059, |
|
"learning_rate": 8.122280741130177e-06, |
|
"loss": 0.737, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.5858683287529846, |
|
"learning_rate": 7.878763025353875e-06, |
|
"loss": 0.7456, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.5983662476814803, |
|
"learning_rate": 7.636555614670953e-06, |
|
"loss": 0.7443, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.5853808275325328, |
|
"learning_rate": 7.395808122541697e-06, |
|
"loss": 0.7456, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.6178373733573391, |
|
"learning_rate": 7.156669260622997e-06, |
|
"loss": 0.7272, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.5767484461478896, |
|
"learning_rate": 6.9192867469079625e-06, |
|
"loss": 0.7355, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.6095308999494778, |
|
"learning_rate": 6.683807214479323e-06, |
|
"loss": 0.737, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5757726348096623, |
|
"learning_rate": 6.450376120933008e-06, |
|
"loss": 0.725, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.6084178997751468, |
|
"learning_rate": 6.219137658527819e-06, |
|
"loss": 0.7484, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.5625041815811782, |
|
"learning_rate": 5.990234665116713e-06, |
|
"loss": 0.7322, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6160006018394228, |
|
"learning_rate": 5.7638085359147235e-06, |
|
"loss": 0.7235, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6035240580848604, |
|
"learning_rate": 5.539999136157977e-06, |
|
"loss": 0.7094, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.5166781636185539, |
|
"learning_rate": 5.318944714707861e-06, |
|
"loss": 0.7209, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.5822772862847312, |
|
"learning_rate": 5.100781818653549e-06, |
|
"loss": 0.7088, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6227541043433473, |
|
"learning_rate": 4.885645208965779e-06, |
|
"loss": 0.7295, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.54846199474894, |
|
"learning_rate": 4.673667777253944e-06, |
|
"loss": 0.7452, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6040144782037625, |
|
"learning_rate": 4.464980463677846e-06, |
|
"loss": 0.736, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5847697359147894, |
|
"learning_rate": 4.25971217606493e-06, |
|
"loss": 0.7364, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.597219775666177, |
|
"learning_rate": 4.057989710282897e-06, |
|
"loss": 0.7288, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.5379233895836751, |
|
"learning_rate": 3.859937671916833e-06, |
|
"loss": 0.7383, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.6251650029425307, |
|
"learning_rate": 3.6656783992993885e-06, |
|
"loss": 0.7264, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.6054608357116987, |
|
"learning_rate": 3.475331887941388e-06, |
|
"loss": 0.7384, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.5551297177270929, |
|
"learning_rate": 3.2890157164096315e-06, |
|
"loss": 0.7398, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6396725383662202, |
|
"learning_rate": 3.1068449736977015e-06, |
|
"loss": 0.7341, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.5648152571609389, |
|
"learning_rate": 2.9289321881345257e-06, |
|
"loss": 0.7244, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.57252018708692, |
|
"learning_rate": 2.755387257874764e-06, |
|
"loss": 0.7228, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.5878135364575673, |
|
"learning_rate": 2.5863173830138212e-06, |
|
"loss": 0.7181, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.5840059710891027, |
|
"learning_rate": 2.4218269993694733e-06, |
|
"loss": 0.7286, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.5438192269829342, |
|
"learning_rate": 2.262017713971063e-06, |
|
"loss": 0.712, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.5350040038892513, |
|
"learning_rate": 2.106988242295981e-06, |
|
"loss": 0.7312, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.5423604435174225, |
|
"learning_rate": 1.9568343472923524e-06, |
|
"loss": 0.7155, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.6113034501153347, |
|
"learning_rate": 1.8116487802254868e-06, |
|
"loss": 0.7282, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.6023686669705883, |
|
"learning_rate": 1.6715212233846656e-06, |
|
"loss": 0.7159, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.5618755550032292, |
|
"learning_rate": 1.5365382346857005e-06, |
|
"loss": 0.7349, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.5579514856251387, |
|
"learning_rate": 1.4067831942033904e-06, |
|
"loss": 0.7106, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.5910581925001689, |
|
"learning_rate": 1.2823362526669825e-06, |
|
"loss": 0.7074, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.5595582019084373, |
|
"learning_rate": 1.1632742819504406e-06, |
|
"loss": 0.7244, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.6034146234727371, |
|
"learning_rate": 1.0496708275880497e-06, |
|
"loss": 0.7237, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.5913679291364223, |
|
"learning_rate": 9.415960633447674e-07, |
|
"loss": 0.7158, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5694936405039339, |
|
"learning_rate": 8.391167478693241e-07, |
|
"loss": 0.7209, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.5805765466263503, |
|
"learning_rate": 7.422961834568565e-07, |
|
"loss": 0.7097, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.5641375110299046, |
|
"learning_rate": 6.51194176946588e-07, |
|
"loss": 0.711, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.5928177799223828, |
|
"learning_rate": 5.658670027786561e-07, |
|
"loss": 0.713, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.6022673346018554, |
|
"learning_rate": 4.863673682329373e-07, |
|
"loss": 0.7395, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.588777849626458, |
|
"learning_rate": 4.1274438087135273e-07, |
|
"loss": 0.7435, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.52867680060555, |
|
"learning_rate": 3.450435182037104e-07, |
|
"loss": 0.6871, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.5676798921047626, |
|
"learning_rate": 2.8330659959589944e-07, |
|
"loss": 0.727, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.5666677388076765, |
|
"learning_rate": 2.275717604377292e-07, |
|
"loss": 0.7329, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.5389907093110754, |
|
"learning_rate": 1.7787342858638589e-07, |
|
"loss": 0.7136, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.571817712811333, |
|
"learning_rate": 1.3424230310007946e-07, |
|
"loss": 0.739, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.5830917030629162, |
|
"learning_rate": 9.670533527498139e-08, |
|
"loss": 0.7209, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.5837803782222483, |
|
"learning_rate": 6.528571199719502e-08, |
|
"loss": 0.7322, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.5711509584660898, |
|
"learning_rate": 4.000284142003264e-08, |
|
"loss": 0.7113, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.5826105062040772, |
|
"learning_rate": 2.0872340975438555e-08, |
|
"loss": 0.7327, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.5724533629251033, |
|
"learning_rate": 7.906027726981568e-09, |
|
"loss": 0.715, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5578776974422313, |
|
"learning_rate": 1.111911070356131e-09, |
|
"loss": 0.7072, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.7502214908599854, |
|
"eval_runtime": 9.4753, |
|
"eval_samples_per_second": 52.769, |
|
"eval_steps_per_second": 1.689, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 703, |
|
"total_flos": 101830592102400.0, |
|
"train_loss": 0.7627222812701425, |
|
"train_runtime": 5862.0034, |
|
"train_samples_per_second": 15.353, |
|
"train_steps_per_second": 0.12 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 703, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"total_flos": 101830592102400.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|