|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 200,
  "global_step": 789,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0012674271229404308,
      "grad_norm": 0.8615043469450224,
      "learning_rate": 2.531645569620253e-06,
      "loss": 0.479,
      "step": 1
    },
    {
      "epoch": 0.0063371356147021544,
      "grad_norm": 1.2064683000577734,
      "learning_rate": 1.2658227848101267e-05,
      "loss": 0.5395,
      "step": 5
    },
    {
      "epoch": 0.012674271229404309,
      "grad_norm": 1.0410453983480228,
      "learning_rate": 2.5316455696202533e-05,
      "loss": 0.6061,
      "step": 10
    },
    {
      "epoch": 0.019011406844106463,
      "grad_norm": 0.6487558463156515,
      "learning_rate": 3.79746835443038e-05,
      "loss": 0.5462,
      "step": 15
    },
    {
      "epoch": 0.025348542458808618,
      "grad_norm": 0.5370587969238408,
      "learning_rate": 5.0632911392405066e-05,
      "loss": 0.4583,
      "step": 20
    },
    {
      "epoch": 0.031685678073510776,
      "grad_norm": 0.5034416625324662,
      "learning_rate": 6.329113924050633e-05,
      "loss": 0.4883,
      "step": 25
    },
    {
      "epoch": 0.03802281368821293,
      "grad_norm": 0.39983399371170963,
      "learning_rate": 7.59493670886076e-05,
      "loss": 0.3405,
      "step": 30
    },
    {
      "epoch": 0.044359949302915085,
      "grad_norm": 0.43348705894690165,
      "learning_rate": 8.860759493670887e-05,
      "loss": 0.4164,
      "step": 35
    },
    {
      "epoch": 0.050697084917617236,
      "grad_norm": 0.38455150629020124,
      "learning_rate": 0.00010126582278481013,
      "loss": 0.3655,
      "step": 40
    },
    {
      "epoch": 0.057034220532319393,
      "grad_norm": 0.4620346635793738,
      "learning_rate": 0.0001139240506329114,
      "loss": 0.321,
      "step": 45
    },
    {
      "epoch": 0.06337135614702155,
      "grad_norm": 0.5487179791839946,
      "learning_rate": 0.00012658227848101267,
      "loss": 0.3917,
      "step": 50
    },
    {
      "epoch": 0.0697084917617237,
      "grad_norm": 0.4322988318215336,
      "learning_rate": 0.00013924050632911395,
      "loss": 0.2954,
      "step": 55
    },
    {
      "epoch": 0.07604562737642585,
      "grad_norm": 0.45659465975830865,
      "learning_rate": 0.0001518987341772152,
      "loss": 0.3362,
      "step": 60
    },
    {
      "epoch": 0.08238276299112801,
      "grad_norm": 0.5012029391973714,
      "learning_rate": 0.00016455696202531648,
      "loss": 0.3201,
      "step": 65
    },
    {
      "epoch": 0.08871989860583017,
      "grad_norm": 0.4257121777036391,
      "learning_rate": 0.00017721518987341773,
      "loss": 0.3103,
      "step": 70
    },
    {
      "epoch": 0.09505703422053231,
      "grad_norm": 0.49930786967179713,
      "learning_rate": 0.00018987341772151899,
      "loss": 0.3895,
      "step": 75
    },
    {
      "epoch": 0.10139416983523447,
      "grad_norm": 0.3987283812690162,
      "learning_rate": 0.00019999902106840922,
      "loss": 0.2607,
      "step": 80
    },
    {
      "epoch": 0.10773130544993663,
      "grad_norm": 0.4941124105408692,
      "learning_rate": 0.00019996476047513454,
      "loss": 0.3509,
      "step": 85
    },
    {
      "epoch": 0.11406844106463879,
      "grad_norm": 0.35089312674321954,
      "learning_rate": 0.00019988157246677513,
      "loss": 0.323,
      "step": 90
    },
    {
      "epoch": 0.12040557667934093,
      "grad_norm": 0.4238705680350401,
      "learning_rate": 0.00019974949775942134,
      "loss": 0.2834,
      "step": 95
    },
    {
      "epoch": 0.1267427122940431,
      "grad_norm": 0.45602469539148643,
      "learning_rate": 0.00019956860099659346,
      "loss": 0.3492,
      "step": 100
    },
    {
      "epoch": 0.13307984790874525,
      "grad_norm": 0.40420721250262537,
      "learning_rate": 0.00019933897071760235,
      "loss": 0.2865,
      "step": 105
    },
    {
      "epoch": 0.1394169835234474,
      "grad_norm": 0.47242229343887376,
      "learning_rate": 0.00019906071931421413,
      "loss": 0.3047,
      "step": 110
    },
    {
      "epoch": 0.14575411913814956,
      "grad_norm": 0.3662230634874705,
      "learning_rate": 0.00019873398297564037,
      "loss": 0.3183,
      "step": 115
    },
    {
      "epoch": 0.1520912547528517,
      "grad_norm": 0.3786550636833906,
      "learning_rate": 0.00019835892162188066,
      "loss": 0.2903,
      "step": 120
    },
    {
      "epoch": 0.15842839036755388,
      "grad_norm": 0.42556611928998644,
      "learning_rate": 0.00019793571882545047,
      "loss": 0.3298,
      "step": 125
    },
    {
      "epoch": 0.16476552598225602,
      "grad_norm": 0.3477579859642712,
      "learning_rate": 0.0001974645817215322,
      "loss": 0.2811,
      "step": 130
    },
    {
      "epoch": 0.17110266159695817,
      "grad_norm": 0.338149840344514,
      "learning_rate": 0.0001969457409065933,
      "loss": 0.3218,
      "step": 135
    },
    {
      "epoch": 0.17743979721166034,
      "grad_norm": 0.31604418234996284,
      "learning_rate": 0.0001963794503255219,
      "loss": 0.3181,
      "step": 140
    },
    {
      "epoch": 0.18377693282636248,
      "grad_norm": 0.37941148877648684,
      "learning_rate": 0.00019576598714733431,
      "loss": 0.2708,
      "step": 145
    },
    {
      "epoch": 0.19011406844106463,
      "grad_norm": 0.41662892297732984,
      "learning_rate": 0.00019510565162951537,
      "loss": 0.3133,
      "step": 150
    },
    {
      "epoch": 0.1964512040557668,
      "grad_norm": 0.33440472234808516,
      "learning_rate": 0.0001943987669710586,
      "loss": 0.3027,
      "step": 155
    },
    {
      "epoch": 0.20278833967046894,
      "grad_norm": 0.3858834779514916,
      "learning_rate": 0.0001936456791542776,
      "loss": 0.3148,
      "step": 160
    },
    {
      "epoch": 0.20912547528517111,
      "grad_norm": 0.3588666651678752,
      "learning_rate": 0.000192846756775466,
      "loss": 0.3038,
      "step": 165
    },
    {
      "epoch": 0.21546261089987326,
      "grad_norm": 0.3923595420574365,
      "learning_rate": 0.00019200239086448933,
      "loss": 0.251,
      "step": 170
    },
    {
      "epoch": 0.2217997465145754,
      "grad_norm": 0.3720411933468231,
      "learning_rate": 0.0001911129946933968,
      "loss": 0.3066,
      "step": 175
    },
    {
      "epoch": 0.22813688212927757,
      "grad_norm": 0.32287305381502635,
      "learning_rate": 0.00019017900357414669,
      "loss": 0.3021,
      "step": 180
    },
    {
      "epoch": 0.23447401774397972,
      "grad_norm": 0.3574163723783811,
      "learning_rate": 0.00018920087464554427,
      "loss": 0.3156,
      "step": 185
    },
    {
      "epoch": 0.24081115335868186,
      "grad_norm": 0.4152681195493882,
      "learning_rate": 0.0001881790866494969,
      "loss": 0.2839,
      "step": 190
    },
    {
      "epoch": 0.24714828897338403,
      "grad_norm": 0.36834582922377895,
      "learning_rate": 0.00018711413969669526,
      "loss": 0.2945,
      "step": 195
    },
    {
      "epoch": 0.2534854245880862,
      "grad_norm": 0.39981519235640095,
      "learning_rate": 0.00018600655502183612,
      "loss": 0.3189,
      "step": 200
    },
    {
      "epoch": 0.2534854245880862,
      "eval_loss": 0.28277337551116943,
      "eval_runtime": 1084.4496,
      "eval_samples_per_second": 3.689,
      "eval_steps_per_second": 0.115,
      "step": 200
    },
    {
      "epoch": 0.2598225602027883,
      "grad_norm": 0.31219177577589374,
      "learning_rate": 0.0001848568747285054,
      "loss": 0.2299,
      "step": 205
    },
    {
      "epoch": 0.2661596958174905,
      "grad_norm": 0.37017559919032633,
      "learning_rate": 0.00018366566152384773,
      "loss": 0.2753,
      "step": 210
    },
    {
      "epoch": 0.27249683143219267,
      "grad_norm": 0.29894743373031823,
      "learning_rate": 0.00018243349844315117,
      "loss": 0.2734,
      "step": 215
    },
    {
      "epoch": 0.2788339670468948,
      "grad_norm": 0.3227139696142255,
      "learning_rate": 0.00018116098856448253,
      "loss": 0.2595,
      "step": 220
    },
    {
      "epoch": 0.28517110266159695,
      "grad_norm": 0.3780707564332847,
      "learning_rate": 0.00017984875471351302,
      "loss": 0.3107,
      "step": 225
    },
    {
      "epoch": 0.2915082382762991,
      "grad_norm": 0.31519655631233245,
      "learning_rate": 0.00017849743915867807,
      "loss": 0.2336,
      "step": 230
    },
    {
      "epoch": 0.29784537389100124,
      "grad_norm": 0.35840275346083816,
      "learning_rate": 0.00017710770329682144,
      "loss": 0.2939,
      "step": 235
    },
    {
      "epoch": 0.3041825095057034,
      "grad_norm": 0.2971942370747859,
      "learning_rate": 0.0001756802273294766,
      "loss": 0.2959,
      "step": 240
    },
    {
      "epoch": 0.3105196451204056,
      "grad_norm": 0.3247193570102783,
      "learning_rate": 0.0001742157099299445,
      "loss": 0.2373,
      "step": 245
    },
    {
      "epoch": 0.31685678073510776,
      "grad_norm": 0.38491700907647547,
      "learning_rate": 0.00017271486790133023,
      "loss": 0.3272,
      "step": 250
    },
    {
      "epoch": 0.3231939163498099,
      "grad_norm": 0.3075011053275653,
      "learning_rate": 0.00017117843582570608,
      "loss": 0.2377,
      "step": 255
    },
    {
      "epoch": 0.32953105196451205,
      "grad_norm": 0.31453423737207853,
      "learning_rate": 0.00016960716570457292,
      "loss": 0.2842,
      "step": 260
    },
    {
      "epoch": 0.3358681875792142,
      "grad_norm": 0.24817557882025235,
      "learning_rate": 0.00016800182659079568,
      "loss": 0.2721,
      "step": 265
    },
    {
      "epoch": 0.34220532319391633,
      "grad_norm": 0.3377039172873109,
      "learning_rate": 0.00016636320421219278,
      "loss": 0.2463,
      "step": 270
    },
    {
      "epoch": 0.3485424588086185,
      "grad_norm": 0.3661471684768712,
      "learning_rate": 0.00016469210058696446,
      "loss": 0.3025,
      "step": 275
    },
    {
      "epoch": 0.3548795944233207,
      "grad_norm": 0.27351558488185734,
      "learning_rate": 0.0001629893336311477,
      "loss": 0.2335,
      "step": 280
    },
    {
      "epoch": 0.3612167300380228,
      "grad_norm": 0.4358770290773004,
      "learning_rate": 0.00016125573675828983,
      "loss": 0.279,
      "step": 285
    },
    {
      "epoch": 0.36755386565272496,
      "grad_norm": 0.2774172437400498,
      "learning_rate": 0.00015949215847153717,
      "loss": 0.2581,
      "step": 290
    },
    {
      "epoch": 0.37389100126742714,
      "grad_norm": 0.28991353506835355,
      "learning_rate": 0.00015769946194833817,
      "loss": 0.2314,
      "step": 295
    },
    {
      "epoch": 0.38022813688212925,
      "grad_norm": 0.4117767724226869,
      "learning_rate": 0.00015587852461796376,
      "loss": 0.3218,
      "step": 300
    },
    {
      "epoch": 0.3865652724968314,
      "grad_norm": 0.31038917537934363,
      "learning_rate": 0.00015403023773205286,
      "loss": 0.2315,
      "step": 305
    },
    {
      "epoch": 0.3929024081115336,
      "grad_norm": 0.3358971163136089,
      "learning_rate": 0.00015215550592839218,
      "loss": 0.302,
      "step": 310
    },
    {
      "epoch": 0.39923954372623577,
      "grad_norm": 0.24064351622215968,
      "learning_rate": 0.00015025524678814427,
      "loss": 0.2795,
      "step": 315
    },
    {
      "epoch": 0.4055766793409379,
      "grad_norm": 0.32648334764287845,
      "learning_rate": 0.00014833039038674047,
      "loss": 0.2398,
      "step": 320
    },
    {
      "epoch": 0.41191381495564006,
      "grad_norm": 0.4100611479861272,
      "learning_rate": 0.0001463818788386588,
      "loss": 0.2849,
      "step": 325
    },
    {
      "epoch": 0.41825095057034223,
      "grad_norm": 0.333871597957756,
      "learning_rate": 0.00014441066583630906,
      "loss": 0.2297,
      "step": 330
    },
    {
      "epoch": 0.42458808618504434,
      "grad_norm": 0.34174204454819196,
      "learning_rate": 0.00014241771618325123,
      "loss": 0.2704,
      "step": 335
    },
    {
      "epoch": 0.4309252217997465,
      "grad_norm": 0.2539128471983557,
      "learning_rate": 0.00014040400532197583,
      "loss": 0.2579,
      "step": 340
    },
    {
      "epoch": 0.4372623574144487,
      "grad_norm": 0.29762852904975656,
      "learning_rate": 0.0001383705188564767,
      "loss": 0.255,
      "step": 345
    },
    {
      "epoch": 0.4435994930291508,
      "grad_norm": 0.3477101390224288,
      "learning_rate": 0.00013631825206985063,
      "loss": 0.2841,
      "step": 350
    },
    {
      "epoch": 0.449936628643853,
      "grad_norm": 0.2807172517529508,
      "learning_rate": 0.0001342482094371591,
      "loss": 0.2112,
      "step": 355
    },
    {
      "epoch": 0.45627376425855515,
      "grad_norm": 0.3838095185531826,
      "learning_rate": 0.00013216140413379167,
      "loss": 0.282,
      "step": 360
    },
    {
      "epoch": 0.46261089987325726,
      "grad_norm": 0.2421625693621495,
      "learning_rate": 0.00013005885753957048,
      "loss": 0.2473,
      "step": 365
    },
    {
      "epoch": 0.46894803548795944,
      "grad_norm": 0.32700829134754156,
      "learning_rate": 0.0001279415987388395,
      "loss": 0.2174,
      "step": 370
    },
    {
      "epoch": 0.4752851711026616,
      "grad_norm": 0.3713274528328548,
      "learning_rate": 0.0001258106640167826,
      "loss": 0.283,
      "step": 375
    },
    {
      "epoch": 0.4816223067173637,
      "grad_norm": 0.27730345855582755,
      "learning_rate": 0.0001236670963522172,
      "loss": 0.2021,
      "step": 380
    },
    {
      "epoch": 0.4879594423320659,
      "grad_norm": 0.34446917830376517,
      "learning_rate": 0.00012151194490711178,
      "loss": 0.2561,
      "step": 385
    },
    {
      "epoch": 0.49429657794676807,
      "grad_norm": 0.2522933072667094,
      "learning_rate": 0.00011934626451307726,
      "loss": 0.2433,
      "step": 390
    },
    {
      "epoch": 0.5006337135614702,
      "grad_norm": 0.3469006061112617,
      "learning_rate": 0.00011717111515508319,
      "loss": 0.2266,
      "step": 395
    },
    {
      "epoch": 0.5069708491761724,
      "grad_norm": 0.3778944814729631,
      "learning_rate": 0.00011498756145265144,
      "loss": 0.2906,
      "step": 400
    },
    {
      "epoch": 0.5069708491761724,
      "eval_loss": 0.2465018332004547,
      "eval_runtime": 1086.0248,
      "eval_samples_per_second": 3.683,
      "eval_steps_per_second": 0.115,
      "step": 400
    },
    {
      "epoch": 0.5133079847908745,
      "grad_norm": 0.2906640770941992,
      "learning_rate": 0.00011279667213878205,
      "loss": 0.2329,
      "step": 405
    },
    {
      "epoch": 0.5196451204055766,
      "grad_norm": 0.3562675808467966,
      "learning_rate": 0.00011059951953686535,
      "loss": 0.2727,
      "step": 410
    },
    {
      "epoch": 0.5259822560202788,
      "grad_norm": 0.23100289329465973,
      "learning_rate": 0.00010839717903583684,
      "loss": 0.2559,
      "step": 415
    },
    {
      "epoch": 0.532319391634981,
      "grad_norm": 0.31979110117969145,
      "learning_rate": 0.00010619072856383181,
      "loss": 0.2413,
      "step": 420
    },
    {
      "epoch": 0.5386565272496832,
      "grad_norm": 0.3131449805956835,
      "learning_rate": 0.00010398124806059701,
      "loss": 0.2807,
      "step": 425
    },
    {
      "epoch": 0.5449936628643853,
      "grad_norm": 0.30855801225213353,
      "learning_rate": 0.00010176981894891768,
      "loss": 0.1961,
      "step": 430
    },
    {
      "epoch": 0.5513307984790875,
      "grad_norm": 0.3422494582211193,
      "learning_rate": 9.955752360531896e-05,
      "loss": 0.2805,
      "step": 435
    },
    {
      "epoch": 0.5576679340937896,
      "grad_norm": 0.26688868743798966,
      "learning_rate": 9.734544483030026e-05,
      "loss": 0.2597,
      "step": 440
    },
    {
      "epoch": 0.5640050697084917,
      "grad_norm": 0.3161252521303147,
      "learning_rate": 9.513466531836221e-05,
      "loss": 0.2166,
      "step": 445
    },
    {
      "epoch": 0.5703422053231939,
      "grad_norm": 0.3401785018630494,
      "learning_rate": 9.292626712808556e-05,
      "loss": 0.2827,
      "step": 450
    },
    {
      "epoch": 0.5766793409378961,
      "grad_norm": 0.3036061299387351,
      "learning_rate": 9.072133115252112e-05,
      "loss": 0.2254,
      "step": 455
    },
    {
      "epoch": 0.5830164765525983,
      "grad_norm": 0.35415945685790207,
      "learning_rate": 8.85209365901505e-05,
      "loss": 0.2692,
      "step": 460
    },
    {
      "epoch": 0.5893536121673004,
      "grad_norm": 0.25401069755470806,
      "learning_rate": 8.632616041667577e-05,
      "loss": 0.2539,
      "step": 465
    },
    {
      "epoch": 0.5956907477820025,
      "grad_norm": 0.31003086790259704,
      "learning_rate": 8.41380768578976e-05,
      "loss": 0.2271,
      "step": 470
    },
    {
      "epoch": 0.6020278833967047,
      "grad_norm": 0.3237135373224526,
      "learning_rate": 8.195775686393897e-05,
      "loss": 0.2792,
      "step": 475
    },
    {
      "epoch": 0.6083650190114068,
      "grad_norm": 0.2952913468902984,
      "learning_rate": 7.978626758507217e-05,
      "loss": 0.2033,
      "step": 480
    },
    {
      "epoch": 0.614702154626109,
      "grad_norm": 0.32514176968318775,
      "learning_rate": 7.762467184940574e-05,
      "loss": 0.2632,
      "step": 485
    },
    {
      "epoch": 0.6210392902408112,
      "grad_norm": 0.2704128994845209,
      "learning_rate": 7.547402764268689e-05,
      "loss": 0.2657,
      "step": 490
    },
    {
      "epoch": 0.6273764258555133,
      "grad_norm": 0.3419677722251191,
      "learning_rate": 7.333538759047389e-05,
      "loss": 0.2432,
      "step": 495
    },
    {
      "epoch": 0.6337135614702155,
      "grad_norm": 0.3336718483492428,
      "learning_rate": 7.120979844293201e-05,
      "loss": 0.2789,
      "step": 500
    },
    {
      "epoch": 0.6400506970849176,
      "grad_norm": 0.28861366582406006,
      "learning_rate": 6.909830056250527e-05,
      "loss": 0.2185,
      "step": 505
    },
    {
      "epoch": 0.6463878326996197,
      "grad_norm": 0.3292639675337998,
      "learning_rate": 6.700192741471447e-05,
      "loss": 0.2622,
      "step": 510
    },
    {
      "epoch": 0.6527249683143219,
      "grad_norm": 0.2539285743910528,
      "learning_rate": 6.4921705062331e-05,
      "loss": 0.2442,
      "step": 515
    },
    {
      "epoch": 0.6590621039290241,
      "grad_norm": 0.3067269034698268,
      "learning_rate": 6.285865166317386e-05,
      "loss": 0.2143,
      "step": 520
    },
    {
      "epoch": 0.6653992395437263,
      "grad_norm": 0.3245132022152399,
      "learning_rate": 6.081377697177576e-05,
      "loss": 0.2719,
      "step": 525
    },
    {
      "epoch": 0.6717363751584284,
      "grad_norm": 0.27285602978719187,
      "learning_rate": 5.8788081845162246e-05,
      "loss": 0.205,
      "step": 530
    },
    {
      "epoch": 0.6780735107731305,
      "grad_norm": 0.33017438859174614,
      "learning_rate": 5.678255775298542e-05,
      "loss": 0.2595,
      "step": 535
    },
    {
      "epoch": 0.6844106463878327,
      "grad_norm": 0.22042843479196494,
      "learning_rate": 5.479818629225259e-05,
      "loss": 0.2508,
      "step": 540
    },
    {
      "epoch": 0.6907477820025348,
      "grad_norm": 0.3262775579247169,
      "learning_rate": 5.2835938706886966e-05,
      "loss": 0.2252,
      "step": 545
    },
    {
      "epoch": 0.697084917617237,
      "grad_norm": 0.3604718135367501,
      "learning_rate": 5.0896775412355434e-05,
      "loss": 0.2703,
      "step": 550
    },
    {
      "epoch": 0.7034220532319392,
      "grad_norm": 0.2849330234510489,
      "learning_rate": 4.89816455255966e-05,
      "loss": 0.1977,
      "step": 555
    },
    {
      "epoch": 0.7097591888466414,
      "grad_norm": 0.2992590502661103,
      "learning_rate": 4.7091486400478604e-05,
      "loss": 0.252,
      "step": 560
    },
    {
      "epoch": 0.7160963244613435,
      "grad_norm": 0.2424976560859796,
      "learning_rate": 4.5227223169014456e-05,
      "loss": 0.2465,
      "step": 565
    },
    {
      "epoch": 0.7224334600760456,
      "grad_norm": 0.35965307208087977,
      "learning_rate": 4.338976828855939e-05,
      "loss": 0.2249,
      "step": 570
    },
    {
      "epoch": 0.7287705956907478,
      "grad_norm": 0.3741830397170476,
      "learning_rate": 4.1580021095211486e-05,
      "loss": 0.284,
      "step": 575
    },
    {
      "epoch": 0.7351077313054499,
      "grad_norm": 0.2903803822883755,
      "learning_rate": 3.9798867363634814e-05,
      "loss": 0.2123,
      "step": 580
    },
    {
      "epoch": 0.7414448669201521,
      "grad_norm": 0.31710020807356026,
      "learning_rate": 3.804717887351991e-05,
      "loss": 0.2605,
      "step": 585
    },
    {
      "epoch": 0.7477820025348543,
      "grad_norm": 0.24733722916292933,
      "learning_rate": 3.632581298289427e-05,
      "loss": 0.2289,
      "step": 590
    },
    {
      "epoch": 0.7541191381495564,
      "grad_norm": 0.3001878742603873,
      "learning_rate": 3.4635612208491194e-05,
      "loss": 0.2322,
      "step": 595
    },
    {
      "epoch": 0.7604562737642585,
      "grad_norm": 0.3250883573895976,
      "learning_rate": 3.2977403813382926e-05,
      "loss": 0.2525,
      "step": 600
    },
    {
      "epoch": 0.7604562737642585,
      "eval_loss": 0.2325838953256607,
      "eval_runtime": 1086.1174,
      "eval_samples_per_second": 3.683,
      "eval_steps_per_second": 0.115,
      "step": 600
    },
    {
      "epoch": 0.7667934093789607,
      "grad_norm": 0.3246331902911399,
      "learning_rate": 3.135199940207947e-05,
      "loss": 0.2206,
      "step": 605
    },
    {
      "epoch": 0.7731305449936628,
      "grad_norm": 0.3364965552008111,
      "learning_rate": 2.976019452329153e-05,
      "loss": 0.2566,
      "step": 610
    },
    {
      "epoch": 0.779467680608365,
      "grad_norm": 0.22924356326780843,
      "learning_rate": 2.8202768280551894e-05,
      "loss": 0.2448,
      "step": 615
    },
    {
      "epoch": 0.7858048162230672,
      "grad_norm": 0.3207071242715804,
      "learning_rate": 2.6680482950885777e-05,
      "loss": 0.2269,
      "step": 620
    },
    {
      "epoch": 0.7921419518377694,
      "grad_norm": 0.37606836614121164,
      "learning_rate": 2.5194083611716935e-05,
      "loss": 0.27,
      "step": 625
    },
    {
      "epoch": 0.7984790874524715,
      "grad_norm": 0.2830616844795413,
      "learning_rate": 2.374429777619205e-05,
      "loss": 0.1848,
      "step": 630
    },
    {
      "epoch": 0.8048162230671736,
      "grad_norm": 0.3123281734582231,
      "learning_rate": 2.2331835037101823e-05,
      "loss": 0.2562,
      "step": 635
    },
    {
      "epoch": 0.8111533586818758,
      "grad_norm": 0.27281906149266577,
      "learning_rate": 2.0957386719573224e-05,
      "loss": 0.2417,
      "step": 640
    },
    {
      "epoch": 0.8174904942965779,
      "grad_norm": 0.3463842809980822,
      "learning_rate": 1.962162554270267e-05,
      "loss": 0.2108,
      "step": 645
    },
    {
      "epoch": 0.8238276299112801,
      "grad_norm": 0.35511851586585724,
      "learning_rate": 1.83252052902961e-05,
      "loss": 0.2789,
      "step": 650
    },
    {
      "epoch": 0.8301647655259823,
      "grad_norm": 0.2720460231999165,
      "learning_rate": 1.7068760490876422e-05,
      "loss": 0.1886,
      "step": 655
    },
    {
      "epoch": 0.8365019011406845,
      "grad_norm": 0.323785370919597,
      "learning_rate": 1.5852906107115893e-05,
      "loss": 0.2617,
      "step": 660
    },
    {
      "epoch": 0.8428390367553865,
      "grad_norm": 0.23954436901187928,
      "learning_rate": 1.4678237234844649e-05,
      "loss": 0.2501,
      "step": 665
    },
    {
      "epoch": 0.8491761723700887,
      "grad_norm": 0.3169133202131331,
      "learning_rate": 1.354532881178301e-05,
      "loss": 0.1988,
      "step": 670
    },
    {
      "epoch": 0.8555133079847909,
      "grad_norm": 0.3516256902668012,
      "learning_rate": 1.2454735336140167e-05,
      "loss": 0.2765,
      "step": 675
    },
    {
      "epoch": 0.861850443599493,
      "grad_norm": 0.2923347791144997,
      "learning_rate": 1.1406990595216971e-05,
      "loss": 0.2065,
      "step": 680
    },
    {
      "epoch": 0.8681875792141952,
      "grad_norm": 0.30823497046755327,
      "learning_rate": 1.0402607404145449e-05,
      "loss": 0.2623,
      "step": 685
    },
    {
      "epoch": 0.8745247148288974,
      "grad_norm": 0.25292131222665615,
      "learning_rate": 9.442077354893198e-06,
      "loss": 0.2461,
      "step": 690
    },
    {
      "epoch": 0.8808618504435995,
      "grad_norm": 0.32414756933713984,
      "learning_rate": 8.525870575655392e-06,
      "loss": 0.217,
      "step": 695
    },
    {
      "epoch": 0.8871989860583016,
      "grad_norm": 0.3423100582401461,
      "learning_rate": 7.654435500752055e-06,
      "loss": 0.2675,
      "step": 700
    },
    {
      "epoch": 0.8935361216730038,
      "grad_norm": 0.29544160874192754,
      "learning_rate": 6.828198651143425e-06,
      "loss": 0.1927,
      "step": 705
    },
    {
      "epoch": 0.899873257287706,
      "grad_norm": 0.32477510117980407,
      "learning_rate": 6.047564425670749e-06,
      "loss": 0.2613,
      "step": 710
    },
    {
      "epoch": 0.9062103929024081,
      "grad_norm": 0.23676655623304554,
      "learning_rate": 5.312914903124566e-06,
      "loss": 0.2414,
      "step": 715
    },
    {
      "epoch": 0.9125475285171103,
      "grad_norm": 0.2967989522178623,
      "learning_rate": 4.624609655237544e-06,
      "loss": 0.2002,
      "step": 720
    },
    {
      "epoch": 0.9188846641318125,
      "grad_norm": 0.340334151463555,
      "learning_rate": 3.982985570693354e-06,
      "loss": 0.2764,
      "step": 725
    },
    {
      "epoch": 0.9252217997465145,
      "grad_norm": 0.2776495387493925,
      "learning_rate": 3.388356690237582e-06,
      "loss": 0.1859,
      "step": 730
    },
    {
      "epoch": 0.9315589353612167,
      "grad_norm": 0.37073108669893173,
      "learning_rate": 2.84101405297158e-06,
      "loss": 0.2378,
      "step": 735
    },
    {
      "epoch": 0.9378960709759189,
      "grad_norm": 0.2570912693772944,
      "learning_rate": 2.341225553904336e-06,
      "loss": 0.2398,
      "step": 740
    },
    {
      "epoch": 0.944233206590621,
      "grad_norm": 0.3306131510091784,
      "learning_rate": 1.8892358128322018e-06,
      "loss": 0.2092,
      "step": 745
    },
    {
      "epoch": 0.9505703422053232,
      "grad_norm": 0.33888565708232804,
      "learning_rate": 1.4852660546105234e-06,
      "loss": 0.2873,
      "step": 750
    },
    {
      "epoch": 0.9569074778200254,
      "grad_norm": 0.2906867586955816,
      "learning_rate": 1.1295140008758864e-06,
      "loss": 0.2079,
      "step": 755
    },
    {
      "epoch": 0.9632446134347274,
      "grad_norm": 0.34402820600782946,
      "learning_rate": 8.221537732719275e-07,
      "loss": 0.2648,
      "step": 760
    },
    {
      "epoch": 0.9695817490494296,
      "grad_norm": 0.26017431588284967,
      "learning_rate": 5.633358082260954e-07,
      "loss": 0.2481,
      "step": 765
    },
    {
      "epoch": 0.9759188846641318,
      "grad_norm": 0.3339069885340444,
      "learning_rate": 3.5318678331904833e-07,
      "loss": 0.2237,
      "step": 770
    },
    {
      "epoch": 0.982256020278834,
      "grad_norm": 0.34754105161997045,
      "learning_rate": 1.9180955528270706e-07,
      "loss": 0.27,
      "step": 775
    },
    {
      "epoch": 0.9885931558935361,
      "grad_norm": 0.3246226295094946,
      "learning_rate": 7.928310965742425e-08,
      "loss": 0.2226,
      "step": 780
    },
    {
      "epoch": 0.9949302915082383,
      "grad_norm": 0.31594487456225406,
      "learning_rate": 1.5662522132742218e-08,
      "loss": 0.2444,
      "step": 785
    },
    {
      "epoch": 1.0,
      "step": 789,
      "total_flos": 1.0371979095834624e+16,
      "train_loss": 0.2726454405050314,
      "train_runtime": 20804.2809,
      "train_samples_per_second": 1.213,
      "train_steps_per_second": 0.038
    }
  ],
  "logging_steps": 5,
  "max_steps": 789,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0371979095834624e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}