{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99510603588907, "eval_steps": 500, "global_step": 918, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03262642740619902, "grad_norm": 0.5258967280387878, "learning_rate": 5e-06, "loss": 1.8319, "step": 10 }, { "epoch": 0.06525285481239804, "grad_norm": 0.5200194120407104, "learning_rate": 9.5e-06, "loss": 1.7689, "step": 20 }, { "epoch": 0.09787928221859707, "grad_norm": 0.56816166639328, "learning_rate": 1.45e-05, "loss": 1.7576, "step": 30 }, { "epoch": 0.13050570962479607, "grad_norm": 0.5110853910446167, "learning_rate": 1.9500000000000003e-05, "loss": 1.6579, "step": 40 }, { "epoch": 0.1631321370309951, "grad_norm": 0.4745779037475586, "learning_rate": 2.45e-05, "loss": 1.5154, "step": 50 }, { "epoch": 0.19575856443719414, "grad_norm": 0.5604385137557983, "learning_rate": 2.95e-05, "loss": 1.3892, "step": 60 }, { "epoch": 0.22838499184339314, "grad_norm": 0.7449052333831787, "learning_rate": 3.45e-05, "loss": 1.2256, "step": 70 }, { "epoch": 0.26101141924959215, "grad_norm": 0.7363050580024719, "learning_rate": 3.9500000000000005e-05, "loss": 0.982, "step": 80 }, { "epoch": 0.2936378466557912, "grad_norm": 0.5178012847900391, "learning_rate": 4.4500000000000004e-05, "loss": 0.7681, "step": 90 }, { "epoch": 0.3262642740619902, "grad_norm": 0.6690722703933716, "learning_rate": 4.9500000000000004e-05, "loss": 0.7226, "step": 100 }, { "epoch": 0.35889070146818924, "grad_norm": 0.4482414424419403, "learning_rate": 4.944987775061125e-05, "loss": 0.6749, "step": 110 }, { "epoch": 0.3915171288743883, "grad_norm": 0.5283921957015991, "learning_rate": 4.883863080684597e-05, "loss": 0.6465, "step": 120 }, { "epoch": 0.42414355628058725, "grad_norm": 0.5526198744773865, "learning_rate": 4.822738386308069e-05, "loss": 0.594, "step": 130 }, { "epoch": 0.4567699836867863, "grad_norm": 0.5214644074440002, "learning_rate": 4.761613691931541e-05, "loss": 0.5579, "step": 140 }, { "epoch": 0.4893964110929853, "grad_norm": 0.5322310924530029, "learning_rate": 4.7004889975550123e-05, "loss": 0.5305, "step": 150 }, { "epoch": 0.5220228384991843, "grad_norm": 0.7314450740814209, "learning_rate": 4.6393643031784844e-05, "loss": 0.4756, "step": 160 }, { "epoch": 0.5546492659053833, "grad_norm": 0.8103362917900085, "learning_rate": 4.5782396088019564e-05, "loss": 0.4592, "step": 170 }, { "epoch": 0.5872756933115824, "grad_norm": 0.6366791725158691, "learning_rate": 4.5171149144254284e-05, "loss": 0.4786, "step": 180 }, { "epoch": 0.6199021207177814, "grad_norm": 0.6715788245201111, "learning_rate": 4.4559902200489e-05, "loss": 0.4216, "step": 190 }, { "epoch": 0.6525285481239804, "grad_norm": 0.6936819553375244, "learning_rate": 4.394865525672372e-05, "loss": 0.3964, "step": 200 }, { "epoch": 0.6851549755301795, "grad_norm": 0.8318383693695068, "learning_rate": 4.333740831295844e-05, "loss": 0.4043, "step": 210 }, { "epoch": 0.7177814029363785, "grad_norm": 0.656146228313446, "learning_rate": 4.272616136919316e-05, "loss": 0.3881, "step": 220 }, { "epoch": 0.7504078303425775, "grad_norm": 0.9217523336410522, "learning_rate": 4.211491442542788e-05, "loss": 0.3378, "step": 230 }, { "epoch": 0.7830342577487766, "grad_norm": 0.7269819378852844, "learning_rate": 4.150366748166259e-05, "loss": 0.3052, "step": 240 }, { "epoch": 0.8156606851549756, "grad_norm": 0.7469998002052307, "learning_rate": 4.089242053789731e-05, "loss": 0.301, "step": 250 }, { "epoch": 0.8482871125611745, "grad_norm": 0.7021219730377197, "learning_rate": 4.028117359413203e-05, "loss": 0.32, "step": 260 }, { "epoch": 0.8809135399673735, "grad_norm": 0.6621549725532532, "learning_rate": 3.966992665036675e-05, "loss": 0.283, "step": 270 }, { "epoch": 0.9135399673735726, "grad_norm": 1.2602078914642334, "learning_rate": 3.905867970660147e-05, "loss": 0.3075, "step": 280 }, { "epoch": 0.9461663947797716, "grad_norm": 0.8152816295623779, "learning_rate": 3.8447432762836186e-05, "loss": 0.2951, "step": 290 }, { "epoch": 0.9787928221859706, "grad_norm": 0.7038506865501404, "learning_rate": 3.783618581907091e-05, "loss": 0.2964, "step": 300 }, { "epoch": 1.0114192495921697, "grad_norm": 1.1956801414489746, "learning_rate": 3.722493887530563e-05, "loss": 0.2707, "step": 310 }, { "epoch": 1.0440456769983686, "grad_norm": 0.6589512228965759, "learning_rate": 3.661369193154035e-05, "loss": 0.2689, "step": 320 }, { "epoch": 1.0766721044045677, "grad_norm": 0.953842043876648, "learning_rate": 3.600244498777506e-05, "loss": 0.2898, "step": 330 }, { "epoch": 1.1092985318107667, "grad_norm": 0.6870063543319702, "learning_rate": 3.539119804400978e-05, "loss": 0.2836, "step": 340 }, { "epoch": 1.1419249592169658, "grad_norm": 0.9847205877304077, "learning_rate": 3.47799511002445e-05, "loss": 0.2707, "step": 350 }, { "epoch": 1.1745513866231647, "grad_norm": 1.0384355783462524, "learning_rate": 3.416870415647922e-05, "loss": 0.2578, "step": 360 }, { "epoch": 1.2071778140293639, "grad_norm": 0.7339671850204468, "learning_rate": 3.355745721271394e-05, "loss": 0.2622, "step": 370 }, { "epoch": 1.2398042414355628, "grad_norm": 0.8134469389915466, "learning_rate": 3.2946210268948655e-05, "loss": 0.2365, "step": 380 }, { "epoch": 1.272430668841762, "grad_norm": 0.9134344458580017, "learning_rate": 3.2334963325183375e-05, "loss": 0.2586, "step": 390 }, { "epoch": 1.3050570962479608, "grad_norm": 0.7556074261665344, "learning_rate": 3.1723716381418096e-05, "loss": 0.236, "step": 400 }, { "epoch": 1.3376835236541598, "grad_norm": 1.0541133880615234, "learning_rate": 3.1112469437652816e-05, "loss": 0.2483, "step": 410 }, { "epoch": 1.370309951060359, "grad_norm": 1.0119078159332275, "learning_rate": 3.0501222493887533e-05, "loss": 0.2462, "step": 420 }, { "epoch": 1.4029363784665578, "grad_norm": 0.9008921980857849, "learning_rate": 2.988997555012225e-05, "loss": 0.2373, "step": 430 }, { "epoch": 1.435562805872757, "grad_norm": 0.9207481741905212, "learning_rate": 2.927872860635697e-05, "loss": 0.2504, "step": 440 }, { "epoch": 1.468189233278956, "grad_norm": 0.8675833344459534, "learning_rate": 2.866748166259169e-05, "loss": 0.2463, "step": 450 }, { "epoch": 1.5008156606851548, "grad_norm": 1.7861591577529907, "learning_rate": 2.8056234718826407e-05, "loss": 0.2468, "step": 460 }, { "epoch": 1.533442088091354, "grad_norm": 0.8898101449012756, "learning_rate": 2.7444987775061127e-05, "loss": 0.2254, "step": 470 }, { "epoch": 1.566068515497553, "grad_norm": 0.9002168774604797, "learning_rate": 2.6833740831295844e-05, "loss": 0.2265, "step": 480 }, { "epoch": 1.598694942903752, "grad_norm": 1.0384007692337036, "learning_rate": 2.6222493887530564e-05, "loss": 0.214, "step": 490 }, { "epoch": 1.631321370309951, "grad_norm": 0.7849037647247314, "learning_rate": 2.561124694376528e-05, "loss": 0.2325, "step": 500 }, { "epoch": 1.66394779771615, "grad_norm": 0.7699252367019653, "learning_rate": 2.5e-05, "loss": 0.2171, "step": 510 }, { "epoch": 1.6965742251223492, "grad_norm": 0.9055914878845215, "learning_rate": 2.438875305623472e-05, "loss": 0.2322, "step": 520 }, { "epoch": 1.7292006525285482, "grad_norm": 1.4799339771270752, "learning_rate": 2.3777506112469438e-05, "loss": 0.2165, "step": 530 }, { "epoch": 1.761827079934747, "grad_norm": 0.9675979018211365, "learning_rate": 2.316625916870416e-05, "loss": 0.2218, "step": 540 }, { "epoch": 1.7944535073409462, "grad_norm": 1.1401549577713013, "learning_rate": 2.2555012224938875e-05, "loss": 0.2123, "step": 550 }, { "epoch": 1.8270799347471451, "grad_norm": 1.1033681631088257, "learning_rate": 2.1943765281173596e-05, "loss": 0.2285, "step": 560 }, { "epoch": 1.8597063621533443, "grad_norm": 1.0548712015151978, "learning_rate": 2.1332518337408312e-05, "loss": 0.2136, "step": 570 }, { "epoch": 1.8923327895595432, "grad_norm": 1.5485633611679077, "learning_rate": 2.0721271393643033e-05, "loss": 0.2055, "step": 580 }, { "epoch": 1.9249592169657421, "grad_norm": 0.9844083786010742, "learning_rate": 2.0110024449877753e-05, "loss": 0.2142, "step": 590 }, { "epoch": 1.9575856443719413, "grad_norm": 1.1740403175354004, "learning_rate": 1.949877750611247e-05, "loss": 0.1948, "step": 600 }, { "epoch": 1.9902120717781404, "grad_norm": 0.9265509843826294, "learning_rate": 1.888753056234719e-05, "loss": 0.2142, "step": 610 }, { "epoch": 2.0228384991843393, "grad_norm": 0.8141701221466064, "learning_rate": 1.8276283618581907e-05, "loss": 0.2023, "step": 620 }, { "epoch": 2.0554649265905383, "grad_norm": 0.7996273040771484, "learning_rate": 1.7665036674816627e-05, "loss": 0.1924, "step": 630 }, { "epoch": 2.088091353996737, "grad_norm": 0.9325422048568726, "learning_rate": 1.7053789731051344e-05, "loss": 0.1887, "step": 640 }, { "epoch": 2.1207177814029365, "grad_norm": 0.9285069108009338, "learning_rate": 1.6442542787286064e-05, "loss": 0.1891, "step": 650 }, { "epoch": 2.1533442088091355, "grad_norm": 1.1411644220352173, "learning_rate": 1.583129584352078e-05, "loss": 0.1898, "step": 660 }, { "epoch": 2.1859706362153344, "grad_norm": 0.8324933052062988, "learning_rate": 1.5220048899755501e-05, "loss": 0.1854, "step": 670 }, { "epoch": 2.2185970636215333, "grad_norm": 1.0230185985565186, "learning_rate": 1.460880195599022e-05, "loss": 0.1909, "step": 680 }, { "epoch": 2.2512234910277327, "grad_norm": 1.1167818307876587, "learning_rate": 1.3997555012224938e-05, "loss": 0.2023, "step": 690 }, { "epoch": 2.2838499184339316, "grad_norm": 0.8201693892478943, "learning_rate": 1.3386308068459657e-05, "loss": 0.1928, "step": 700 }, { "epoch": 2.3164763458401305, "grad_norm": 1.2748645544052124, "learning_rate": 1.2775061124694377e-05, "loss": 0.2043, "step": 710 }, { "epoch": 2.3491027732463294, "grad_norm": 0.9501346945762634, "learning_rate": 1.2163814180929096e-05, "loss": 0.1914, "step": 720 }, { "epoch": 2.3817292006525284, "grad_norm": 0.8706419467926025, "learning_rate": 1.1552567237163816e-05, "loss": 0.1763, "step": 730 }, { "epoch": 2.4143556280587277, "grad_norm": 1.8394954204559326, "learning_rate": 1.0941320293398534e-05, "loss": 0.1856, "step": 740 }, { "epoch": 2.4469820554649266, "grad_norm": 0.9215448498725891, "learning_rate": 1.0330073349633253e-05, "loss": 0.181, "step": 750 }, { "epoch": 2.4796084828711256, "grad_norm": 0.8978357911109924, "learning_rate": 9.718826405867972e-06, "loss": 0.18, "step": 760 }, { "epoch": 2.5122349102773245, "grad_norm": 0.9564265608787537, "learning_rate": 9.10757946210269e-06, "loss": 0.1961, "step": 770 }, { "epoch": 2.544861337683524, "grad_norm": 1.1424188613891602, "learning_rate": 8.496332518337409e-06, "loss": 0.1812, "step": 780 }, { "epoch": 2.5774877650897228, "grad_norm": 1.0267794132232666, "learning_rate": 7.885085574572127e-06, "loss": 0.1861, "step": 790 }, { "epoch": 2.6101141924959217, "grad_norm": 0.8952123522758484, "learning_rate": 7.273838630806847e-06, "loss": 0.1716, "step": 800 }, { "epoch": 2.6427406199021206, "grad_norm": 1.2367416620254517, "learning_rate": 6.662591687041565e-06, "loss": 0.1863, "step": 810 }, { "epoch": 2.6753670473083195, "grad_norm": 1.0904302597045898, "learning_rate": 6.051344743276284e-06, "loss": 0.1823, "step": 820 }, { "epoch": 2.707993474714519, "grad_norm": 1.0542256832122803, "learning_rate": 5.440097799511003e-06, "loss": 0.1764, "step": 830 }, { "epoch": 2.740619902120718, "grad_norm": 0.8970702886581421, "learning_rate": 4.828850855745722e-06, "loss": 0.1866, "step": 840 }, { "epoch": 2.7732463295269167, "grad_norm": 0.8508768081665039, "learning_rate": 4.21760391198044e-06, "loss": 0.1791, "step": 850 }, { "epoch": 2.8058727569331157, "grad_norm": 0.9417251944541931, "learning_rate": 3.606356968215159e-06, "loss": 0.18, "step": 860 }, { "epoch": 2.838499184339315, "grad_norm": 0.8804249167442322, "learning_rate": 2.9951100244498777e-06, "loss": 0.1844, "step": 870 }, { "epoch": 2.871125611745514, "grad_norm": 0.8565665483474731, "learning_rate": 2.3838630806845967e-06, "loss": 0.1781, "step": 880 }, { "epoch": 2.903752039151713, "grad_norm": 0.9489305019378662, "learning_rate": 1.7726161369193154e-06, "loss": 0.1809, "step": 890 }, { "epoch": 2.936378466557912, "grad_norm": 0.7006880044937134, "learning_rate": 1.1613691931540342e-06, "loss": 0.1712, "step": 900 }, { "epoch": 2.9690048939641107, "grad_norm": 0.8745304942131042, "learning_rate": 5.501222493887531e-07, "loss": 0.1773, "step": 910 } ], "logging_steps": 10, "max_steps": 918, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.1375621636321444e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }