|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9968454258675079, |
|
"eval_steps": 60, |
|
"global_step": 237, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004206098843322818, |
|
"grad_norm": 0.5299676656723022, |
|
"learning_rate": 2e-05, |
|
"loss": 1.77, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004206098843322818, |
|
"eval_loss": 1.9898090362548828, |
|
"eval_runtime": 65.4901, |
|
"eval_samples_per_second": 41.167, |
|
"eval_steps_per_second": 20.583, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.008412197686645636, |
|
"grad_norm": 0.5349143743515015, |
|
"learning_rate": 4e-05, |
|
"loss": 1.745, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.012618296529968454, |
|
"grad_norm": 0.5094612240791321, |
|
"learning_rate": 6e-05, |
|
"loss": 1.7007, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.016824395373291272, |
|
"grad_norm": 0.5268917083740234, |
|
"learning_rate": 8e-05, |
|
"loss": 1.6582, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.02103049421661409, |
|
"grad_norm": 0.5398459434509277, |
|
"learning_rate": 0.0001, |
|
"loss": 1.7391, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.025236593059936908, |
|
"grad_norm": 0.5613242983818054, |
|
"learning_rate": 0.00012, |
|
"loss": 1.6436, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.029442691903259727, |
|
"grad_norm": 0.45200833678245544, |
|
"learning_rate": 0.00014, |
|
"loss": 1.5859, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.033648790746582544, |
|
"grad_norm": 0.3515471816062927, |
|
"learning_rate": 0.00016, |
|
"loss": 1.4385, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.03785488958990536, |
|
"grad_norm": 0.2859397530555725, |
|
"learning_rate": 0.00018, |
|
"loss": 1.427, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.04206098843322818, |
|
"grad_norm": 0.3456244170665741, |
|
"learning_rate": 0.0002, |
|
"loss": 1.408, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.046267087276550996, |
|
"grad_norm": 0.42806366086006165, |
|
"learning_rate": 0.0001999904234053922, |
|
"loss": 1.4541, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.050473186119873815, |
|
"grad_norm": 0.5130056142807007, |
|
"learning_rate": 0.00019996169545579207, |
|
"loss": 1.3664, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.054679284963196635, |
|
"grad_norm": 0.39732199907302856, |
|
"learning_rate": 0.00019991382165351814, |
|
"loss": 1.3276, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.058885383806519455, |
|
"grad_norm": 0.3794059157371521, |
|
"learning_rate": 0.00019984681116793038, |
|
"loss": 1.3153, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.06309148264984227, |
|
"grad_norm": 0.27593305706977844, |
|
"learning_rate": 0.00019976067683367385, |
|
"loss": 1.2554, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06729758149316509, |
|
"grad_norm": 0.28591713309288025, |
|
"learning_rate": 0.00019965543514822062, |
|
"loss": 1.283, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.07150368033648791, |
|
"grad_norm": 0.26724520325660706, |
|
"learning_rate": 0.00019953110626870979, |
|
"loss": 1.1646, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.07570977917981073, |
|
"grad_norm": 0.24611811339855194, |
|
"learning_rate": 0.0001993877140080869, |
|
"loss": 1.1762, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.07991587802313355, |
|
"grad_norm": 0.2281356304883957, |
|
"learning_rate": 0.000199225285830543, |
|
"loss": 1.1467, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.08412197686645637, |
|
"grad_norm": 0.22052225470542908, |
|
"learning_rate": 0.00019904385284625424, |
|
"loss": 1.1377, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08832807570977919, |
|
"grad_norm": 0.23453611135482788, |
|
"learning_rate": 0.00019884344980542338, |
|
"loss": 1.1162, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.09253417455310199, |
|
"grad_norm": 0.22467325627803802, |
|
"learning_rate": 0.00019862411509162406, |
|
"loss": 1.155, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.09674027339642481, |
|
"grad_norm": 0.2170630544424057, |
|
"learning_rate": 0.00019838589071444903, |
|
"loss": 1.1279, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.10094637223974763, |
|
"grad_norm": 0.21346993744373322, |
|
"learning_rate": 0.00019812882230146398, |
|
"loss": 1.0946, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.10515247108307045, |
|
"grad_norm": 0.21408380568027496, |
|
"learning_rate": 0.00019785295908946848, |
|
"loss": 1.0889, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.10935856992639327, |
|
"grad_norm": 0.22000430524349213, |
|
"learning_rate": 0.0001975583539150655, |
|
"loss": 1.0476, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.11356466876971609, |
|
"grad_norm": 0.20778758823871613, |
|
"learning_rate": 0.00019724506320454153, |
|
"loss": 1.0954, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.11777076761303891, |
|
"grad_norm": 0.22037693858146667, |
|
"learning_rate": 0.00019691314696305913, |
|
"loss": 1.055, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.12197686645636173, |
|
"grad_norm": 0.20428280532360077, |
|
"learning_rate": 0.0001965626687631641, |
|
"loss": 1.0159, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.12618296529968454, |
|
"grad_norm": 0.20502522587776184, |
|
"learning_rate": 0.00019619369573260924, |
|
"loss": 1.0254, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.13038906414300735, |
|
"grad_norm": 0.2062043696641922, |
|
"learning_rate": 0.0001958062985414972, |
|
"loss": 0.9779, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.13459516298633017, |
|
"grad_norm": 0.22229152917861938, |
|
"learning_rate": 0.00019540055138874505, |
|
"loss": 1.0201, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.138801261829653, |
|
"grad_norm": 0.21910454332828522, |
|
"learning_rate": 0.00019497653198787264, |
|
"loss": 0.9958, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.14300736067297581, |
|
"grad_norm": 0.22630847990512848, |
|
"learning_rate": 0.0001945343215521182, |
|
"loss": 0.9892, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.14721345951629863, |
|
"grad_norm": 0.21370179951190948, |
|
"learning_rate": 0.00019407400477888315, |
|
"loss": 0.9409, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.15141955835962145, |
|
"grad_norm": 0.22368259727954865, |
|
"learning_rate": 0.00019359566983351013, |
|
"loss": 0.9626, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.15562565720294427, |
|
"grad_norm": 0.24231955409049988, |
|
"learning_rate": 0.00019309940833239626, |
|
"loss": 0.9914, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.1598317560462671, |
|
"grad_norm": 0.24762062728405, |
|
"learning_rate": 0.00019258531532544585, |
|
"loss": 0.9311, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.1640378548895899, |
|
"grad_norm": 0.21248659491539001, |
|
"learning_rate": 0.00019205348927786532, |
|
"loss": 0.9399, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.16824395373291273, |
|
"grad_norm": 0.2374017834663391, |
|
"learning_rate": 0.00019150403205130383, |
|
"loss": 0.9664, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.17245005257623555, |
|
"grad_norm": 0.25241079926490784, |
|
"learning_rate": 0.0001909370488843436, |
|
"loss": 0.9475, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.17665615141955837, |
|
"grad_norm": 0.24083252251148224, |
|
"learning_rate": 0.00019035264837234347, |
|
"loss": 0.9602, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.1808622502628812, |
|
"grad_norm": 0.24024806916713715, |
|
"learning_rate": 0.0001897509424466393, |
|
"loss": 0.9167, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.18506834910620398, |
|
"grad_norm": 0.2538228929042816, |
|
"learning_rate": 0.0001891320463531055, |
|
"loss": 0.904, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.1892744479495268, |
|
"grad_norm": 0.2393723875284195, |
|
"learning_rate": 0.00018849607863008193, |
|
"loss": 0.8927, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.19348054679284962, |
|
"grad_norm": 0.2394389659166336, |
|
"learning_rate": 0.00018784316108566996, |
|
"loss": 0.8675, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.19768664563617244, |
|
"grad_norm": 0.24351197481155396, |
|
"learning_rate": 0.00018717341877440226, |
|
"loss": 0.873, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.20189274447949526, |
|
"grad_norm": 0.2396727055311203, |
|
"learning_rate": 0.000186486979973291, |
|
"loss": 0.8972, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.20609884332281808, |
|
"grad_norm": 0.2674885392189026, |
|
"learning_rate": 0.0001857839761572586, |
|
"loss": 0.8613, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.2103049421661409, |
|
"grad_norm": 0.25012922286987305, |
|
"learning_rate": 0.00018506454197395606, |
|
"loss": 0.8481, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.21451104100946372, |
|
"grad_norm": 0.23941218852996826, |
|
"learning_rate": 0.0001843288152179739, |
|
"loss": 0.8638, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.21871713985278654, |
|
"grad_norm": 0.25679612159729004, |
|
"learning_rate": 0.00018357693680444976, |
|
"loss": 0.8928, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.22292323869610936, |
|
"grad_norm": 0.25766387581825256, |
|
"learning_rate": 0.00018280905074207884, |
|
"loss": 0.8997, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.22712933753943218, |
|
"grad_norm": 0.24009671807289124, |
|
"learning_rate": 0.00018202530410553163, |
|
"loss": 0.8536, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.231335436382755, |
|
"grad_norm": 0.24763701856136322, |
|
"learning_rate": 0.00018122584700728443, |
|
"loss": 0.8581, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.23554153522607782, |
|
"grad_norm": 0.2651236653327942, |
|
"learning_rate": 0.0001804108325688679, |
|
"loss": 0.8164, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.23974763406940064, |
|
"grad_norm": 0.23978441953659058, |
|
"learning_rate": 0.0001795804168915396, |
|
"loss": 0.8321, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.24395373291272346, |
|
"grad_norm": 0.2508217394351959, |
|
"learning_rate": 0.00017873475902638553, |
|
"loss": 0.815, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.24815983175604628, |
|
"grad_norm": 0.2765346169471741, |
|
"learning_rate": 0.00017787402094385666, |
|
"loss": 0.8674, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.25236593059936907, |
|
"grad_norm": 0.27468088269233704, |
|
"learning_rate": 0.00017699836750274662, |
|
"loss": 0.8841, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.25236593059936907, |
|
"eval_loss": 1.075273036956787, |
|
"eval_runtime": 66.5472, |
|
"eval_samples_per_second": 40.513, |
|
"eval_steps_per_second": 20.256, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2565720294426919, |
|
"grad_norm": 0.27056071162223816, |
|
"learning_rate": 0.00017610796641861581, |
|
"loss": 0.8459, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.2607781282860147, |
|
"grad_norm": 0.2631956934928894, |
|
"learning_rate": 0.00017520298823166873, |
|
"loss": 0.8853, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.26498422712933756, |
|
"grad_norm": 0.28352680802345276, |
|
"learning_rate": 0.00017428360627408978, |
|
"loss": 0.8625, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.26919032597266035, |
|
"grad_norm": 0.24897028505802155, |
|
"learning_rate": 0.00017334999663684504, |
|
"loss": 0.8627, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.2733964248159832, |
|
"grad_norm": 0.2620624303817749, |
|
"learning_rate": 0.00017240233813595478, |
|
"loss": 0.8088, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.277602523659306, |
|
"grad_norm": 0.24983716011047363, |
|
"learning_rate": 0.0001714408122782448, |
|
"loss": 0.8318, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.28180862250262884, |
|
"grad_norm": 0.2667708694934845, |
|
"learning_rate": 0.000170465603226582, |
|
"loss": 0.8368, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.28601472134595163, |
|
"grad_norm": 0.2828388214111328, |
|
"learning_rate": 0.0001694768977646013, |
|
"loss": 0.8282, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.2902208201892745, |
|
"grad_norm": 0.2581498324871063, |
|
"learning_rate": 0.0001684748852609306, |
|
"loss": 0.8375, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.29442691903259727, |
|
"grad_norm": 0.27101799845695496, |
|
"learning_rate": 0.0001674597576329207, |
|
"loss": 0.818, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.29863301787592006, |
|
"grad_norm": 0.27231255173683167, |
|
"learning_rate": 0.00016643170930988698, |
|
"loss": 0.843, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.3028391167192429, |
|
"grad_norm": 0.2566690444946289, |
|
"learning_rate": 0.00016539093719586994, |
|
"loss": 0.8348, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.3070452155625657, |
|
"grad_norm": 0.2482360601425171, |
|
"learning_rate": 0.00016433764063192194, |
|
"loss": 0.8122, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.31125131440588855, |
|
"grad_norm": 0.25742995738983154, |
|
"learning_rate": 0.00016327202135792685, |
|
"loss": 0.776, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.31545741324921134, |
|
"grad_norm": 0.25104233622550964, |
|
"learning_rate": 0.00016219428347396053, |
|
"loss": 0.7823, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.3196635120925342, |
|
"grad_norm": 0.2921640872955322, |
|
"learning_rate": 0.00016110463340119913, |
|
"loss": 0.8127, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.323869610935857, |
|
"grad_norm": 0.26554426550865173, |
|
"learning_rate": 0.00016000327984238292, |
|
"loss": 0.7716, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.3280757097791798, |
|
"grad_norm": 0.24784542620182037, |
|
"learning_rate": 0.00015889043374184286, |
|
"loss": 0.7714, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.3322818086225026, |
|
"grad_norm": 0.26592087745666504, |
|
"learning_rate": 0.0001577663082450984, |
|
"loss": 0.7397, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.33648790746582546, |
|
"grad_norm": 0.3072431683540344, |
|
"learning_rate": 0.00015663111865803285, |
|
"loss": 0.7579, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.34069400630914826, |
|
"grad_norm": 0.29445305466651917, |
|
"learning_rate": 0.00015548508240565583, |
|
"loss": 0.7998, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.3449001051524711, |
|
"grad_norm": 0.26053521037101746, |
|
"learning_rate": 0.0001543284189904592, |
|
"loss": 0.7832, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.3491062039957939, |
|
"grad_norm": 0.2956802248954773, |
|
"learning_rate": 0.00015316134995037545, |
|
"loss": 0.8054, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.35331230283911674, |
|
"grad_norm": 0.2673921287059784, |
|
"learning_rate": 0.00015198409881634617, |
|
"loss": 0.8061, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.35751840168243953, |
|
"grad_norm": 0.2793889045715332, |
|
"learning_rate": 0.00015079689106950854, |
|
"loss": 0.7902, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.3617245005257624, |
|
"grad_norm": 0.26718223094940186, |
|
"learning_rate": 0.00014959995409800873, |
|
"loss": 0.7769, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.3659305993690852, |
|
"grad_norm": 0.300536572933197, |
|
"learning_rate": 0.00014839351715344968, |
|
"loss": 0.8245, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.37013669821240797, |
|
"grad_norm": 0.2824515998363495, |
|
"learning_rate": 0.00014717781130698212, |
|
"loss": 0.8122, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.3743427970557308, |
|
"grad_norm": 0.28050506114959717, |
|
"learning_rate": 0.00014595306940504716, |
|
"loss": 0.778, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.3785488958990536, |
|
"grad_norm": 0.2906787395477295, |
|
"learning_rate": 0.00014471952602477866, |
|
"loss": 0.7703, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.38275499474237645, |
|
"grad_norm": 0.298177033662796, |
|
"learning_rate": 0.00014347741742907433, |
|
"loss": 0.7672, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.38696109358569925, |
|
"grad_norm": 0.27583765983581543, |
|
"learning_rate": 0.00014222698152134374, |
|
"loss": 0.7784, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.3911671924290221, |
|
"grad_norm": 0.28834670782089233, |
|
"learning_rate": 0.0001409684577999423, |
|
"loss": 0.8278, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.3953732912723449, |
|
"grad_norm": 0.29721811413764954, |
|
"learning_rate": 0.00013970208731229974, |
|
"loss": 0.7997, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.39957939011566773, |
|
"grad_norm": 0.2688146233558655, |
|
"learning_rate": 0.00013842811260875168, |
|
"loss": 0.7465, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.4037854889589905, |
|
"grad_norm": 0.27095234394073486, |
|
"learning_rate": 0.0001371467776960837, |
|
"loss": 0.757, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.40799158780231337, |
|
"grad_norm": 0.30743858218193054, |
|
"learning_rate": 0.0001358583279907961, |
|
"loss": 0.7882, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.41219768664563616, |
|
"grad_norm": 0.274873822927475, |
|
"learning_rate": 0.00013456301027209882, |
|
"loss": 0.7737, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.416403785488959, |
|
"grad_norm": 0.25485867261886597, |
|
"learning_rate": 0.00013326107263464558, |
|
"loss": 0.7454, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.4206098843322818, |
|
"grad_norm": 0.2994694709777832, |
|
"learning_rate": 0.00013195276444101547, |
|
"loss": 0.8133, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.42481598317560465, |
|
"grad_norm": 0.2943129241466522, |
|
"learning_rate": 0.0001306383362739523, |
|
"loss": 0.7501, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.42902208201892744, |
|
"grad_norm": 0.2888595163822174, |
|
"learning_rate": 0.0001293180398883701, |
|
"loss": 0.7522, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.4332281808622503, |
|
"grad_norm": 0.28455743193626404, |
|
"learning_rate": 0.00012799212816313376, |
|
"loss": 0.7278, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.4374342797055731, |
|
"grad_norm": 0.32477039098739624, |
|
"learning_rate": 0.00012666085505262485, |
|
"loss": 0.819, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.4416403785488959, |
|
"grad_norm": 0.28067031502723694, |
|
"learning_rate": 0.00012532447553810126, |
|
"loss": 0.7979, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.4458464773922187, |
|
"grad_norm": 0.26430413126945496, |
|
"learning_rate": 0.00012398324557885994, |
|
"loss": 0.7497, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.4500525762355415, |
|
"grad_norm": 0.27110588550567627, |
|
"learning_rate": 0.00012263742206321287, |
|
"loss": 0.7937, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.45425867507886436, |
|
"grad_norm": 0.287041574716568, |
|
"learning_rate": 0.0001212872627592845, |
|
"loss": 0.7897, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.45846477392218715, |
|
"grad_norm": 0.28561776876449585, |
|
"learning_rate": 0.00011993302626564102, |
|
"loss": 0.8011, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.46267087276551, |
|
"grad_norm": 0.2852155566215515, |
|
"learning_rate": 0.00011857497196176049, |
|
"loss": 0.7426, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.4668769716088328, |
|
"grad_norm": 0.2712121903896332, |
|
"learning_rate": 0.00011721335995835336, |
|
"loss": 0.7277, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.47108307045215564, |
|
"grad_norm": 0.2779647409915924, |
|
"learning_rate": 0.00011584845104754304, |
|
"loss": 0.7698, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.47528916929547843, |
|
"grad_norm": 0.2774654030799866, |
|
"learning_rate": 0.00011448050665291587, |
|
"loss": 0.7583, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.4794952681388013, |
|
"grad_norm": 0.3046507239341736, |
|
"learning_rate": 0.00011310978877945007, |
|
"loss": 0.7987, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.48370136698212407, |
|
"grad_norm": 0.2816363573074341, |
|
"learning_rate": 0.00011173655996333357, |
|
"loss": 0.7898, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.4879074658254469, |
|
"grad_norm": 0.27383196353912354, |
|
"learning_rate": 0.00011036108322167988, |
|
"loss": 0.7248, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.4921135646687697, |
|
"grad_norm": 0.28104445338249207, |
|
"learning_rate": 0.00010898362200215197, |
|
"loss": 0.7144, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.49631966351209256, |
|
"grad_norm": 0.28643152117729187, |
|
"learning_rate": 0.0001076044401325036, |
|
"loss": 0.7856, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.5005257623554153, |
|
"grad_norm": 0.261483371257782, |
|
"learning_rate": 0.0001062238017700478, |
|
"loss": 0.7429, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.5047318611987381, |
|
"grad_norm": 0.2796306908130646, |
|
"learning_rate": 0.00010484197135106263, |
|
"loss": 0.7772, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5047318611987381, |
|
"eval_loss": 0.9960550665855408, |
|
"eval_runtime": 65.8413, |
|
"eval_samples_per_second": 40.947, |
|
"eval_steps_per_second": 20.473, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.508937960042061, |
|
"grad_norm": 0.3079998791217804, |
|
"learning_rate": 0.00010345921354014279, |
|
"loss": 0.7497, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.5131440588853838, |
|
"grad_norm": 0.3106074929237366, |
|
"learning_rate": 0.00010207579317950827, |
|
"loss": 0.7568, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.5173501577287066, |
|
"grad_norm": 0.27859166264533997, |
|
"learning_rate": 0.00010069197523827833, |
|
"loss": 0.7695, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.5215562565720294, |
|
"grad_norm": 0.2840277850627899, |
|
"learning_rate": 9.930802476172169e-05, |
|
"loss": 0.7815, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.5257623554153522, |
|
"grad_norm": 0.28042981028556824, |
|
"learning_rate": 9.792420682049174e-05, |
|
"loss": 0.7546, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5299684542586751, |
|
"grad_norm": 0.2857164144515991, |
|
"learning_rate": 9.654078645985722e-05, |
|
"loss": 0.7617, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.5341745531019979, |
|
"grad_norm": 0.29590827226638794, |
|
"learning_rate": 9.515802864893739e-05, |
|
"loss": 0.748, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.5383806519453207, |
|
"grad_norm": 0.29375162720680237, |
|
"learning_rate": 9.377619822995219e-05, |
|
"loss": 0.7532, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.5425867507886435, |
|
"grad_norm": 0.28436464071273804, |
|
"learning_rate": 9.239555986749645e-05, |
|
"loss": 0.7511, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.5467928496319664, |
|
"grad_norm": 0.29677248001098633, |
|
"learning_rate": 9.101637799784804e-05, |
|
"loss": 0.7456, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5509989484752892, |
|
"grad_norm": 0.27983585000038147, |
|
"learning_rate": 8.963891677832011e-05, |
|
"loss": 0.6888, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.555205047318612, |
|
"grad_norm": 0.27517008781433105, |
|
"learning_rate": 8.826344003666647e-05, |
|
"loss": 0.7431, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.5594111461619348, |
|
"grad_norm": 0.28197160363197327, |
|
"learning_rate": 8.689021122054996e-05, |
|
"loss": 0.7379, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.5636172450052577, |
|
"grad_norm": 0.29125264286994934, |
|
"learning_rate": 8.551949334708415e-05, |
|
"loss": 0.7639, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.5678233438485805, |
|
"grad_norm": 0.2851899266242981, |
|
"learning_rate": 8.415154895245697e-05, |
|
"loss": 0.7764, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.5720294426919033, |
|
"grad_norm": 0.2771802544593811, |
|
"learning_rate": 8.278664004164665e-05, |
|
"loss": 0.6961, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.576235541535226, |
|
"grad_norm": 0.27956414222717285, |
|
"learning_rate": 8.142502803823955e-05, |
|
"loss": 0.7454, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.580441640378549, |
|
"grad_norm": 0.30068668723106384, |
|
"learning_rate": 8.0066973734359e-05, |
|
"loss": 0.7683, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.5846477392218717, |
|
"grad_norm": 0.2820778489112854, |
|
"learning_rate": 7.871273724071553e-05, |
|
"loss": 0.7412, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.5888538380651945, |
|
"grad_norm": 0.2672085165977478, |
|
"learning_rate": 7.736257793678714e-05, |
|
"loss": 0.716, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5930599369085173, |
|
"grad_norm": 0.27900293469429016, |
|
"learning_rate": 7.601675442114009e-05, |
|
"loss": 0.7259, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.5972660357518401, |
|
"grad_norm": 0.2954063415527344, |
|
"learning_rate": 7.46755244618988e-05, |
|
"loss": 0.7047, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.601472134595163, |
|
"grad_norm": 0.3212134838104248, |
|
"learning_rate": 7.333914494737514e-05, |
|
"loss": 0.7657, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.6056782334384858, |
|
"grad_norm": 0.30651283264160156, |
|
"learning_rate": 7.200787183686625e-05, |
|
"loss": 0.7489, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.6098843322818086, |
|
"grad_norm": 0.26834797859191895, |
|
"learning_rate": 7.068196011162994e-05, |
|
"loss": 0.7484, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.6140904311251314, |
|
"grad_norm": 0.2777973711490631, |
|
"learning_rate": 6.936166372604773e-05, |
|
"loss": 0.7245, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.6182965299684543, |
|
"grad_norm": 0.293694406747818, |
|
"learning_rate": 6.804723555898458e-05, |
|
"loss": 0.7211, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.6225026288117771, |
|
"grad_norm": 0.28515610098838806, |
|
"learning_rate": 6.673892736535448e-05, |
|
"loss": 0.7439, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.6267087276550999, |
|
"grad_norm": 0.2929891049861908, |
|
"learning_rate": 6.543698972790117e-05, |
|
"loss": 0.7434, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.6309148264984227, |
|
"grad_norm": 0.29031944274902344, |
|
"learning_rate": 6.414167200920391e-05, |
|
"loss": 0.7176, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6351209253417456, |
|
"grad_norm": 0.2764637768268585, |
|
"learning_rate": 6.28532223039163e-05, |
|
"loss": 0.7503, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.6393270241850684, |
|
"grad_norm": 0.2900468707084656, |
|
"learning_rate": 6.157188739124834e-05, |
|
"loss": 0.6879, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.6435331230283912, |
|
"grad_norm": 0.2989012897014618, |
|
"learning_rate": 6.029791268770029e-05, |
|
"loss": 0.7135, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.647739221871714, |
|
"grad_norm": 0.2998535931110382, |
|
"learning_rate": 5.903154220005771e-05, |
|
"loss": 0.7171, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.6519453207150369, |
|
"grad_norm": 0.27283868193626404, |
|
"learning_rate": 5.777301847865629e-05, |
|
"loss": 0.7112, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.6561514195583596, |
|
"grad_norm": 0.2988041341304779, |
|
"learning_rate": 5.652258257092569e-05, |
|
"loss": 0.7444, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.6603575184016824, |
|
"grad_norm": 0.2845938205718994, |
|
"learning_rate": 5.528047397522133e-05, |
|
"loss": 0.716, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.6645636172450052, |
|
"grad_norm": 0.29695218801498413, |
|
"learning_rate": 5.404693059495285e-05, |
|
"loss": 0.7585, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.668769716088328, |
|
"grad_norm": 0.28558245301246643, |
|
"learning_rate": 5.282218869301788e-05, |
|
"loss": 0.6908, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.6729758149316509, |
|
"grad_norm": 0.280200719833374, |
|
"learning_rate": 5.160648284655032e-05, |
|
"loss": 0.7508, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.6771819137749737, |
|
"grad_norm": 0.2981257438659668, |
|
"learning_rate": 5.040004590199128e-05, |
|
"loss": 0.7147, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.6813880126182965, |
|
"grad_norm": 0.2873106598854065, |
|
"learning_rate": 4.920310893049146e-05, |
|
"loss": 0.7011, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.6855941114616193, |
|
"grad_norm": 0.2717635929584503, |
|
"learning_rate": 4.801590118365383e-05, |
|
"loss": 0.6668, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.6898002103049422, |
|
"grad_norm": 0.27607038617134094, |
|
"learning_rate": 4.683865004962452e-05, |
|
"loss": 0.7033, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.694006309148265, |
|
"grad_norm": 0.2881218194961548, |
|
"learning_rate": 4.567158100954083e-05, |
|
"loss": 0.7275, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.6982124079915878, |
|
"grad_norm": 0.2758018672466278, |
|
"learning_rate": 4.4514917594344184e-05, |
|
"loss": 0.737, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.7024185068349106, |
|
"grad_norm": 0.29527172446250916, |
|
"learning_rate": 4.3368881341967135e-05, |
|
"loss": 0.7433, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.7066246056782335, |
|
"grad_norm": 0.2847643792629242, |
|
"learning_rate": 4.223369175490162e-05, |
|
"loss": 0.7471, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.7108307045215563, |
|
"grad_norm": 0.2958676815032959, |
|
"learning_rate": 4.110956625815713e-05, |
|
"loss": 0.6838, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.7150368033648791, |
|
"grad_norm": 0.28350576758384705, |
|
"learning_rate": 3.9996720157617094e-05, |
|
"loss": 0.7306, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.7192429022082019, |
|
"grad_norm": 0.2808986008167267, |
|
"learning_rate": 3.8895366598800896e-05, |
|
"loss": 0.6823, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.7234490010515248, |
|
"grad_norm": 0.2684039771556854, |
|
"learning_rate": 3.780571652603949e-05, |
|
"loss": 0.7105, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.7276550998948476, |
|
"grad_norm": 0.28138425946235657, |
|
"learning_rate": 3.672797864207316e-05, |
|
"loss": 0.7221, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.7318611987381703, |
|
"grad_norm": 0.2772335708141327, |
|
"learning_rate": 3.566235936807808e-05, |
|
"loss": 0.6835, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.7360672975814931, |
|
"grad_norm": 0.27244430780410767, |
|
"learning_rate": 3.460906280413007e-05, |
|
"loss": 0.6577, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.7402733964248159, |
|
"grad_norm": 0.2977088689804077, |
|
"learning_rate": 3.3568290690113034e-05, |
|
"loss": 0.7213, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.7444794952681388, |
|
"grad_norm": 0.289736270904541, |
|
"learning_rate": 3.25402423670793e-05, |
|
"loss": 0.7154, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.7486855941114616, |
|
"grad_norm": 0.287818044424057, |
|
"learning_rate": 3.1525114739069415e-05, |
|
"loss": 0.6977, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.7528916929547844, |
|
"grad_norm": 0.31408464908599854, |
|
"learning_rate": 3.0523102235398714e-05, |
|
"loss": 0.781, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.7570977917981072, |
|
"grad_norm": 0.27790582180023193, |
|
"learning_rate": 2.9534396773417994e-05, |
|
"loss": 0.7169, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7570977917981072, |
|
"eval_loss": 0.9679059386253357, |
|
"eval_runtime": 66.127, |
|
"eval_samples_per_second": 40.77, |
|
"eval_steps_per_second": 20.385, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7613038906414301, |
|
"grad_norm": 0.28392866253852844, |
|
"learning_rate": 2.855918772175522e-05, |
|
"loss": 0.6662, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.7655099894847529, |
|
"grad_norm": 0.2941664159297943, |
|
"learning_rate": 2.7597661864045233e-05, |
|
"loss": 0.6816, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.7697160883280757, |
|
"grad_norm": 0.2740324139595032, |
|
"learning_rate": 2.6650003363154963e-05, |
|
"loss": 0.7046, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.7739221871713985, |
|
"grad_norm": 0.2933352291584015, |
|
"learning_rate": 2.5716393725910215e-05, |
|
"loss": 0.7208, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.7781282860147214, |
|
"grad_norm": 0.2843799591064453, |
|
"learning_rate": 2.47970117683313e-05, |
|
"loss": 0.685, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.7823343848580442, |
|
"grad_norm": 0.27152329683303833, |
|
"learning_rate": 2.389203358138419e-05, |
|
"loss": 0.7176, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.786540483701367, |
|
"grad_norm": 0.2916063964366913, |
|
"learning_rate": 2.3001632497253424e-05, |
|
"loss": 0.7439, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.7907465825446898, |
|
"grad_norm": 0.27915897965431213, |
|
"learning_rate": 2.2125979056143364e-05, |
|
"loss": 0.7, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.7949526813880127, |
|
"grad_norm": 0.30191752314567566, |
|
"learning_rate": 2.1265240973614486e-05, |
|
"loss": 0.7377, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.7991587802313355, |
|
"grad_norm": 0.286101758480072, |
|
"learning_rate": 2.0419583108460418e-05, |
|
"loss": 0.6916, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8033648790746583, |
|
"grad_norm": 0.2800692319869995, |
|
"learning_rate": 1.958916743113214e-05, |
|
"loss": 0.7374, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.807570977917981, |
|
"grad_norm": 0.27292168140411377, |
|
"learning_rate": 1.877415299271561e-05, |
|
"loss": 0.6757, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.8117770767613038, |
|
"grad_norm": 0.28094640374183655, |
|
"learning_rate": 1.7974695894468384e-05, |
|
"loss": 0.7024, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.8159831756046267, |
|
"grad_norm": 0.2871862053871155, |
|
"learning_rate": 1.7190949257921196e-05, |
|
"loss": 0.7173, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.8201892744479495, |
|
"grad_norm": 0.27189600467681885, |
|
"learning_rate": 1.642306319555027e-05, |
|
"loss": 0.7019, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.8243953732912723, |
|
"grad_norm": 0.28526559472084045, |
|
"learning_rate": 1.5671184782026106e-05, |
|
"loss": 0.7113, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.8286014721345951, |
|
"grad_norm": 0.2855590283870697, |
|
"learning_rate": 1.4935458026043959e-05, |
|
"loss": 0.6977, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.832807570977918, |
|
"grad_norm": 0.28118449449539185, |
|
"learning_rate": 1.4216023842741455e-05, |
|
"loss": 0.7241, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.8370136698212408, |
|
"grad_norm": 0.28818827867507935, |
|
"learning_rate": 1.3513020026709023e-05, |
|
"loss": 0.6964, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.8412197686645636, |
|
"grad_norm": 0.3235337436199188, |
|
"learning_rate": 1.2826581225597767e-05, |
|
"loss": 0.7406, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8454258675078864, |
|
"grad_norm": 0.2899198830127716, |
|
"learning_rate": 1.2156838914330072e-05, |
|
"loss": 0.7374, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.8496319663512093, |
|
"grad_norm": 0.28662335872650146, |
|
"learning_rate": 1.1503921369918091e-05, |
|
"loss": 0.7039, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.8538380651945321, |
|
"grad_norm": 0.2748032510280609, |
|
"learning_rate": 1.0867953646894525e-05, |
|
"loss": 0.7517, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.8580441640378549, |
|
"grad_norm": 0.27125102281570435, |
|
"learning_rate": 1.0249057553360742e-05, |
|
"loss": 0.6948, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.8622502628811777, |
|
"grad_norm": 0.2795623242855072, |
|
"learning_rate": 9.647351627656543e-06, |
|
"loss": 0.7123, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.8664563617245006, |
|
"grad_norm": 0.28939002752304077, |
|
"learning_rate": 9.062951115656403e-06, |
|
"loss": 0.7266, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.8706624605678234, |
|
"grad_norm": 0.2878707945346832, |
|
"learning_rate": 8.495967948696192e-06, |
|
"loss": 0.7335, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.8748685594111462, |
|
"grad_norm": 0.27489086985588074, |
|
"learning_rate": 7.946510722134692e-06, |
|
"loss": 0.692, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.879074658254469, |
|
"grad_norm": 0.2869216799736023, |
|
"learning_rate": 7.4146846745541506e-06, |
|
"loss": 0.7193, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.8832807570977917, |
|
"grad_norm": 0.2801933288574219, |
|
"learning_rate": 6.900591667603751e-06, |
|
"loss": 0.7178, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.8874868559411146, |
|
"grad_norm": 0.2767332196235657, |
|
"learning_rate": 6.40433016648988e-06, |
|
"loss": 0.7499, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.8916929547844374, |
|
"grad_norm": 0.2783336043357849, |
|
"learning_rate": 5.925995221116853e-06, |
|
"loss": 0.7152, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.8958990536277602, |
|
"grad_norm": 0.27832481265068054, |
|
"learning_rate": 5.465678447881828e-06, |
|
"loss": 0.6977, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.900105152471083, |
|
"grad_norm": 0.2835717499256134, |
|
"learning_rate": 5.023468012127364e-06, |
|
"loss": 0.7251, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.9043112513144059, |
|
"grad_norm": 0.27503538131713867, |
|
"learning_rate": 4.599448611254964e-06, |
|
"loss": 0.7166, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.9085173501577287, |
|
"grad_norm": 0.26619476079940796, |
|
"learning_rate": 4.193701458502807e-06, |
|
"loss": 0.7095, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.9127234490010515, |
|
"grad_norm": 0.2752280533313751, |
|
"learning_rate": 3.80630426739077e-06, |
|
"loss": 0.7412, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.9169295478443743, |
|
"grad_norm": 0.281093567609787, |
|
"learning_rate": 3.4373312368358944e-06, |
|
"loss": 0.7592, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.9211356466876972, |
|
"grad_norm": 0.28015753626823425, |
|
"learning_rate": 3.086853036940862e-06, |
|
"loss": 0.7104, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.92534174553102, |
|
"grad_norm": 0.2644014358520508, |
|
"learning_rate": 2.754936795458485e-06, |
|
"loss": 0.6985, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.9295478443743428, |
|
"grad_norm": 0.2755027413368225, |
|
"learning_rate": 2.4416460849345123e-06, |
|
"loss": 0.7159, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.9337539432176656, |
|
"grad_norm": 0.28020283579826355, |
|
"learning_rate": 2.1470409105315283e-06, |
|
"loss": 0.7389, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.9379600420609885, |
|
"grad_norm": 0.2773683965206146, |
|
"learning_rate": 1.8711776985360308e-06, |
|
"loss": 0.686, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.9421661409043113, |
|
"grad_norm": 0.2784758508205414, |
|
"learning_rate": 1.61410928555098e-06, |
|
"loss": 0.6857, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.9463722397476341, |
|
"grad_norm": 0.2857016623020172, |
|
"learning_rate": 1.3758849083759352e-06, |
|
"loss": 0.6982, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.9505783385909569, |
|
"grad_norm": 0.27618998289108276, |
|
"learning_rate": 1.1565501945766222e-06, |
|
"loss": 0.7328, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.9547844374342797, |
|
"grad_norm": 0.273423969745636, |
|
"learning_rate": 9.56147153745779e-07, |
|
"loss": 0.6762, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.9589905362776026, |
|
"grad_norm": 0.2603454291820526, |
|
"learning_rate": 7.747141694570026e-07, |
|
"loss": 0.6784, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.9631966351209253, |
|
"grad_norm": 0.2638219892978668, |
|
"learning_rate": 6.122859919130974e-07, |
|
"loss": 0.731, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.9674027339642481, |
|
"grad_norm": 0.28604456782341003, |
|
"learning_rate": 4.6889373129022085e-07, |
|
"loss": 0.6937, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.9716088328075709, |
|
"grad_norm": 0.2867179811000824, |
|
"learning_rate": 3.445648517793942e-07, |
|
"loss": 0.7492, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.9758149316508938, |
|
"grad_norm": 0.27991774678230286, |
|
"learning_rate": 2.3932316632614416e-07, |
|
"loss": 0.7411, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.9800210304942166, |
|
"grad_norm": 0.2658878266811371, |
|
"learning_rate": 1.5318883206962842e-07, |
|
"loss": 0.7317, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.9842271293375394, |
|
"grad_norm": 0.26533135771751404, |
|
"learning_rate": 8.617834648185774e-08, |
|
"loss": 0.6636, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.9884332281808622, |
|
"grad_norm": 0.26577314734458923, |
|
"learning_rate": 3.8304544207945495e-08, |
|
"loss": 0.7273, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.9926393270241851, |
|
"grad_norm": 0.2715383768081665, |
|
"learning_rate": 9.576594607807465e-09, |
|
"loss": 0.7253, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.9968454258675079, |
|
"grad_norm": 0.28140708804130554, |
|
"learning_rate": 0.0, |
|
"loss": 0.6756, |
|
"step": 237 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 237, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.319764496895181e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|