|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.99510603588907, |
|
"eval_steps": 500, |
|
"global_step": 918, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03262642740619902, |
|
"grad_norm": 0.5258967280387878, |
|
"learning_rate": 5e-06, |
|
"loss": 1.8319, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06525285481239804, |
|
"grad_norm": 0.5200194120407104, |
|
"learning_rate": 9.5e-06, |
|
"loss": 1.7689, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.09787928221859707, |
|
"grad_norm": 0.56816166639328, |
|
"learning_rate": 1.45e-05, |
|
"loss": 1.7576, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.13050570962479607, |
|
"grad_norm": 0.5110853910446167, |
|
"learning_rate": 1.9500000000000003e-05, |
|
"loss": 1.6579, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1631321370309951, |
|
"grad_norm": 0.4745779037475586, |
|
"learning_rate": 2.45e-05, |
|
"loss": 1.5154, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.19575856443719414, |
|
"grad_norm": 0.5604385137557983, |
|
"learning_rate": 2.95e-05, |
|
"loss": 1.3892, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.22838499184339314, |
|
"grad_norm": 0.7449052333831787, |
|
"learning_rate": 3.45e-05, |
|
"loss": 1.2256, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.26101141924959215, |
|
"grad_norm": 0.7363050580024719, |
|
"learning_rate": 3.9500000000000005e-05, |
|
"loss": 0.982, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2936378466557912, |
|
"grad_norm": 0.5178012847900391, |
|
"learning_rate": 4.4500000000000004e-05, |
|
"loss": 0.7681, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3262642740619902, |
|
"grad_norm": 0.6690722703933716, |
|
"learning_rate": 4.9500000000000004e-05, |
|
"loss": 0.7226, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.35889070146818924, |
|
"grad_norm": 0.4482414424419403, |
|
"learning_rate": 4.944987775061125e-05, |
|
"loss": 0.6749, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3915171288743883, |
|
"grad_norm": 0.5283921957015991, |
|
"learning_rate": 4.883863080684597e-05, |
|
"loss": 0.6465, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.42414355628058725, |
|
"grad_norm": 0.5526198744773865, |
|
"learning_rate": 4.822738386308069e-05, |
|
"loss": 0.594, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4567699836867863, |
|
"grad_norm": 0.5214644074440002, |
|
"learning_rate": 4.761613691931541e-05, |
|
"loss": 0.5579, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4893964110929853, |
|
"grad_norm": 0.5322310924530029, |
|
"learning_rate": 4.7004889975550123e-05, |
|
"loss": 0.5305, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5220228384991843, |
|
"grad_norm": 0.7314450740814209, |
|
"learning_rate": 4.6393643031784844e-05, |
|
"loss": 0.4756, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5546492659053833, |
|
"grad_norm": 0.8103362917900085, |
|
"learning_rate": 4.5782396088019564e-05, |
|
"loss": 0.4592, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5872756933115824, |
|
"grad_norm": 0.6366791725158691, |
|
"learning_rate": 4.5171149144254284e-05, |
|
"loss": 0.4786, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6199021207177814, |
|
"grad_norm": 0.6715788245201111, |
|
"learning_rate": 4.4559902200489e-05, |
|
"loss": 0.4216, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6525285481239804, |
|
"grad_norm": 0.6936819553375244, |
|
"learning_rate": 4.394865525672372e-05, |
|
"loss": 0.3964, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6851549755301795, |
|
"grad_norm": 0.8318383693695068, |
|
"learning_rate": 4.333740831295844e-05, |
|
"loss": 0.4043, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7177814029363785, |
|
"grad_norm": 0.656146228313446, |
|
"learning_rate": 4.272616136919316e-05, |
|
"loss": 0.3881, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7504078303425775, |
|
"grad_norm": 0.9217523336410522, |
|
"learning_rate": 4.211491442542788e-05, |
|
"loss": 0.3378, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7830342577487766, |
|
"grad_norm": 0.7269819378852844, |
|
"learning_rate": 4.150366748166259e-05, |
|
"loss": 0.3052, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8156606851549756, |
|
"grad_norm": 0.7469998002052307, |
|
"learning_rate": 4.089242053789731e-05, |
|
"loss": 0.301, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8482871125611745, |
|
"grad_norm": 0.7021219730377197, |
|
"learning_rate": 4.028117359413203e-05, |
|
"loss": 0.32, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8809135399673735, |
|
"grad_norm": 0.6621549725532532, |
|
"learning_rate": 3.966992665036675e-05, |
|
"loss": 0.283, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.9135399673735726, |
|
"grad_norm": 1.2602078914642334, |
|
"learning_rate": 3.905867970660147e-05, |
|
"loss": 0.3075, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.9461663947797716, |
|
"grad_norm": 0.8152816295623779, |
|
"learning_rate": 3.8447432762836186e-05, |
|
"loss": 0.2951, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9787928221859706, |
|
"grad_norm": 0.7038506865501404, |
|
"learning_rate": 3.783618581907091e-05, |
|
"loss": 0.2964, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0114192495921697, |
|
"grad_norm": 1.1956801414489746, |
|
"learning_rate": 3.722493887530563e-05, |
|
"loss": 0.2707, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.0440456769983686, |
|
"grad_norm": 0.6589512228965759, |
|
"learning_rate": 3.661369193154035e-05, |
|
"loss": 0.2689, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0766721044045677, |
|
"grad_norm": 0.953842043876648, |
|
"learning_rate": 3.600244498777506e-05, |
|
"loss": 0.2898, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.1092985318107667, |
|
"grad_norm": 0.6870063543319702, |
|
"learning_rate": 3.539119804400978e-05, |
|
"loss": 0.2836, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.1419249592169658, |
|
"grad_norm": 0.9847205877304077, |
|
"learning_rate": 3.47799511002445e-05, |
|
"loss": 0.2707, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1745513866231647, |
|
"grad_norm": 1.0384355783462524, |
|
"learning_rate": 3.416870415647922e-05, |
|
"loss": 0.2578, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.2071778140293639, |
|
"grad_norm": 0.7339671850204468, |
|
"learning_rate": 3.355745721271394e-05, |
|
"loss": 0.2622, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.2398042414355628, |
|
"grad_norm": 0.8134469389915466, |
|
"learning_rate": 3.2946210268948655e-05, |
|
"loss": 0.2365, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.272430668841762, |
|
"grad_norm": 0.9134344458580017, |
|
"learning_rate": 3.2334963325183375e-05, |
|
"loss": 0.2586, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.3050570962479608, |
|
"grad_norm": 0.7556074261665344, |
|
"learning_rate": 3.1723716381418096e-05, |
|
"loss": 0.236, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.3376835236541598, |
|
"grad_norm": 1.0541133880615234, |
|
"learning_rate": 3.1112469437652816e-05, |
|
"loss": 0.2483, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.370309951060359, |
|
"grad_norm": 1.0119078159332275, |
|
"learning_rate": 3.0501222493887533e-05, |
|
"loss": 0.2462, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.4029363784665578, |
|
"grad_norm": 0.9008921980857849, |
|
"learning_rate": 2.988997555012225e-05, |
|
"loss": 0.2373, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.435562805872757, |
|
"grad_norm": 0.9207481741905212, |
|
"learning_rate": 2.927872860635697e-05, |
|
"loss": 0.2504, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.468189233278956, |
|
"grad_norm": 0.8675833344459534, |
|
"learning_rate": 2.866748166259169e-05, |
|
"loss": 0.2463, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.5008156606851548, |
|
"grad_norm": 1.7861591577529907, |
|
"learning_rate": 2.8056234718826407e-05, |
|
"loss": 0.2468, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.533442088091354, |
|
"grad_norm": 0.8898101449012756, |
|
"learning_rate": 2.7444987775061127e-05, |
|
"loss": 0.2254, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.566068515497553, |
|
"grad_norm": 0.9002168774604797, |
|
"learning_rate": 2.6833740831295844e-05, |
|
"loss": 0.2265, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.598694942903752, |
|
"grad_norm": 1.0384007692337036, |
|
"learning_rate": 2.6222493887530564e-05, |
|
"loss": 0.214, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.631321370309951, |
|
"grad_norm": 0.7849037647247314, |
|
"learning_rate": 2.561124694376528e-05, |
|
"loss": 0.2325, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.66394779771615, |
|
"grad_norm": 0.7699252367019653, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.2171, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.6965742251223492, |
|
"grad_norm": 0.9055914878845215, |
|
"learning_rate": 2.438875305623472e-05, |
|
"loss": 0.2322, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.7292006525285482, |
|
"grad_norm": 1.4799339771270752, |
|
"learning_rate": 2.3777506112469438e-05, |
|
"loss": 0.2165, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.761827079934747, |
|
"grad_norm": 0.9675979018211365, |
|
"learning_rate": 2.316625916870416e-05, |
|
"loss": 0.2218, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.7944535073409462, |
|
"grad_norm": 1.1401549577713013, |
|
"learning_rate": 2.2555012224938875e-05, |
|
"loss": 0.2123, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.8270799347471451, |
|
"grad_norm": 1.1033681631088257, |
|
"learning_rate": 2.1943765281173596e-05, |
|
"loss": 0.2285, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.8597063621533443, |
|
"grad_norm": 1.0548712015151978, |
|
"learning_rate": 2.1332518337408312e-05, |
|
"loss": 0.2136, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.8923327895595432, |
|
"grad_norm": 1.5485633611679077, |
|
"learning_rate": 2.0721271393643033e-05, |
|
"loss": 0.2055, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.9249592169657421, |
|
"grad_norm": 0.9844083786010742, |
|
"learning_rate": 2.0110024449877753e-05, |
|
"loss": 0.2142, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.9575856443719413, |
|
"grad_norm": 1.1740403175354004, |
|
"learning_rate": 1.949877750611247e-05, |
|
"loss": 0.1948, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.9902120717781404, |
|
"grad_norm": 0.9265509843826294, |
|
"learning_rate": 1.888753056234719e-05, |
|
"loss": 0.2142, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.0228384991843393, |
|
"grad_norm": 0.8141701221466064, |
|
"learning_rate": 1.8276283618581907e-05, |
|
"loss": 0.2023, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.0554649265905383, |
|
"grad_norm": 0.7996273040771484, |
|
"learning_rate": 1.7665036674816627e-05, |
|
"loss": 0.1924, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.088091353996737, |
|
"grad_norm": 0.9325422048568726, |
|
"learning_rate": 1.7053789731051344e-05, |
|
"loss": 0.1887, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.1207177814029365, |
|
"grad_norm": 0.9285069108009338, |
|
"learning_rate": 1.6442542787286064e-05, |
|
"loss": 0.1891, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.1533442088091355, |
|
"grad_norm": 1.1411644220352173, |
|
"learning_rate": 1.583129584352078e-05, |
|
"loss": 0.1898, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.1859706362153344, |
|
"grad_norm": 0.8324933052062988, |
|
"learning_rate": 1.5220048899755501e-05, |
|
"loss": 0.1854, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.2185970636215333, |
|
"grad_norm": 1.0230185985565186, |
|
"learning_rate": 1.460880195599022e-05, |
|
"loss": 0.1909, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.2512234910277327, |
|
"grad_norm": 1.1167818307876587, |
|
"learning_rate": 1.3997555012224938e-05, |
|
"loss": 0.2023, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.2838499184339316, |
|
"grad_norm": 0.8201693892478943, |
|
"learning_rate": 1.3386308068459657e-05, |
|
"loss": 0.1928, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.3164763458401305, |
|
"grad_norm": 1.2748645544052124, |
|
"learning_rate": 1.2775061124694377e-05, |
|
"loss": 0.2043, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.3491027732463294, |
|
"grad_norm": 0.9501346945762634, |
|
"learning_rate": 1.2163814180929096e-05, |
|
"loss": 0.1914, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.3817292006525284, |
|
"grad_norm": 0.8706419467926025, |
|
"learning_rate": 1.1552567237163816e-05, |
|
"loss": 0.1763, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.4143556280587277, |
|
"grad_norm": 1.8394954204559326, |
|
"learning_rate": 1.0941320293398534e-05, |
|
"loss": 0.1856, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.4469820554649266, |
|
"grad_norm": 0.9215448498725891, |
|
"learning_rate": 1.0330073349633253e-05, |
|
"loss": 0.181, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.4796084828711256, |
|
"grad_norm": 0.8978357911109924, |
|
"learning_rate": 9.718826405867972e-06, |
|
"loss": 0.18, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.5122349102773245, |
|
"grad_norm": 0.9564265608787537, |
|
"learning_rate": 9.10757946210269e-06, |
|
"loss": 0.1961, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.544861337683524, |
|
"grad_norm": 1.1424188613891602, |
|
"learning_rate": 8.496332518337409e-06, |
|
"loss": 0.1812, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.5774877650897228, |
|
"grad_norm": 1.0267794132232666, |
|
"learning_rate": 7.885085574572127e-06, |
|
"loss": 0.1861, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.6101141924959217, |
|
"grad_norm": 0.8952123522758484, |
|
"learning_rate": 7.273838630806847e-06, |
|
"loss": 0.1716, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.6427406199021206, |
|
"grad_norm": 1.2367416620254517, |
|
"learning_rate": 6.662591687041565e-06, |
|
"loss": 0.1863, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.6753670473083195, |
|
"grad_norm": 1.0904302597045898, |
|
"learning_rate": 6.051344743276284e-06, |
|
"loss": 0.1823, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.707993474714519, |
|
"grad_norm": 1.0542256832122803, |
|
"learning_rate": 5.440097799511003e-06, |
|
"loss": 0.1764, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.740619902120718, |
|
"grad_norm": 0.8970702886581421, |
|
"learning_rate": 4.828850855745722e-06, |
|
"loss": 0.1866, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.7732463295269167, |
|
"grad_norm": 0.8508768081665039, |
|
"learning_rate": 4.21760391198044e-06, |
|
"loss": 0.1791, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.8058727569331157, |
|
"grad_norm": 0.9417251944541931, |
|
"learning_rate": 3.606356968215159e-06, |
|
"loss": 0.18, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.838499184339315, |
|
"grad_norm": 0.8804249167442322, |
|
"learning_rate": 2.9951100244498777e-06, |
|
"loss": 0.1844, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.871125611745514, |
|
"grad_norm": 0.8565665483474731, |
|
"learning_rate": 2.3838630806845967e-06, |
|
"loss": 0.1781, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.903752039151713, |
|
"grad_norm": 0.9489305019378662, |
|
"learning_rate": 1.7726161369193154e-06, |
|
"loss": 0.1809, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.936378466557912, |
|
"grad_norm": 0.7006880044937134, |
|
"learning_rate": 1.1613691931540342e-06, |
|
"loss": 0.1712, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.9690048939641107, |
|
"grad_norm": 0.8745304942131042, |
|
"learning_rate": 5.501222493887531e-07, |
|
"loss": 0.1773, |
|
"step": 910 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 918, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.1375621636321444e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|