{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9971181556195967, "eval_steps": 500, "global_step": 780, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03842459173871278, "grad_norm": 0.7461163997650146, "learning_rate": 8.333333333333334e-05, "loss": 1.2598, "step": 10 }, { "epoch": 0.07684918347742556, "grad_norm": 0.25502100586891174, "learning_rate": 0.0001666666666666667, "loss": 0.7563, "step": 20 }, { "epoch": 0.11527377521613832, "grad_norm": 0.14209185540676117, "learning_rate": 0.00019996891820008164, "loss": 0.63, "step": 30 }, { "epoch": 0.15369836695485112, "grad_norm": 0.10494557023048401, "learning_rate": 0.0001997790438338385, "loss": 0.5646, "step": 40 }, { "epoch": 0.19212295869356388, "grad_norm": 0.11090180277824402, "learning_rate": 0.0001994168902089112, "loss": 0.5158, "step": 50 }, { "epoch": 0.23054755043227665, "grad_norm": 0.09561982750892639, "learning_rate": 0.00019888308262251285, "loss": 0.5171, "step": 60 }, { "epoch": 0.2689721421709894, "grad_norm": 0.13890881836414337, "learning_rate": 0.0001981785427508966, "loss": 0.5013, "step": 70 }, { "epoch": 0.30739673390970224, "grad_norm": 0.09685279428958893, "learning_rate": 0.00019730448705798239, "loss": 0.4803, "step": 80 }, { "epoch": 0.345821325648415, "grad_norm": 0.0978529155254364, "learning_rate": 0.0001962624246950012, "loss": 0.4824, "step": 90 }, { "epoch": 0.38424591738712777, "grad_norm": 0.09823426604270935, "learning_rate": 0.0001950541548947829, "loss": 0.4765, "step": 100 }, { "epoch": 0.42267050912584053, "grad_norm": 0.11479681730270386, "learning_rate": 0.0001936817638651871, "loss": 0.4804, "step": 110 }, { "epoch": 0.4610951008645533, "grad_norm": 0.1102244183421135, "learning_rate": 0.00019214762118704076, "loss": 0.4735, "step": 120 }, { "epoch": 0.49951969260326606, "grad_norm": 0.09442220628261566, "learning_rate": 0.00019045437572280194, "loss": 0.4654, "step": 130 }, { "epoch": 0.5379442843419788, "grad_norm": 0.0998912900686264, "learning_rate": 0.00018860495104301345, "loss": 0.4714, "step": 140 }, { "epoch": 0.5763688760806917, "grad_norm": 0.12593407928943634, "learning_rate": 0.00018660254037844388, "loss": 0.4652, "step": 150 }, { "epoch": 0.6147934678194045, "grad_norm": 0.10841673612594604, "learning_rate": 0.0001844506011066308, "loss": 0.4633, "step": 160 }, { "epoch": 0.6532180595581172, "grad_norm": 0.09892784804105759, "learning_rate": 0.00018215284878234642, "loss": 0.461, "step": 170 }, { "epoch": 0.69164265129683, "grad_norm": 0.5387171506881714, "learning_rate": 0.00017971325072229226, "loss": 0.4591, "step": 180 }, { "epoch": 0.7300672430355427, "grad_norm": 0.11192867159843445, "learning_rate": 0.0001771360191551, "loss": 0.4592, "step": 190 }, { "epoch": 0.7684918347742555, "grad_norm": 0.10694364458322525, "learning_rate": 0.00017442560394846516, "loss": 0.4574, "step": 200 }, { "epoch": 0.8069164265129684, "grad_norm": 0.10873424261808395, "learning_rate": 0.00017158668492597186, "loss": 0.4492, "step": 210 }, { "epoch": 0.8453410182516811, "grad_norm": 0.11694315820932388, "learning_rate": 0.0001686241637868734, "loss": 0.4467, "step": 220 }, { "epoch": 0.8837656099903939, "grad_norm": 0.10100408643484116, "learning_rate": 0.000165543155642781, "loss": 0.4488, "step": 230 }, { "epoch": 0.9221902017291066, "grad_norm": 0.10397649556398392, "learning_rate": 0.00016234898018587337, "loss": 0.447, "step": 240 }, { "epoch": 0.9606147934678194, "grad_norm": 
0.10007993876934052, "learning_rate": 0.00015904715250387498, "loss": 0.4428, "step": 250 }, { "epoch": 0.9990393852065321, "grad_norm": 0.10865867137908936, "learning_rate": 0.00015564337355766412, "loss": 0.4452, "step": 260 }, { "epoch": 1.037463976945245, "grad_norm": 0.10476306080818176, "learning_rate": 0.0001521435203379498, "loss": 0.4367, "step": 270 }, { "epoch": 1.0758885686839577, "grad_norm": 0.10958375781774521, "learning_rate": 0.00014855363571801523, "loss": 0.4336, "step": 280 }, { "epoch": 1.1143131604226706, "grad_norm": 0.11801016330718994, "learning_rate": 0.00014487991802004623, "loss": 0.4346, "step": 290 }, { "epoch": 1.1527377521613833, "grad_norm": 0.11526134610176086, "learning_rate": 0.00014112871031306119, "loss": 0.4221, "step": 300 }, { "epoch": 1.191162343900096, "grad_norm": 0.10705429315567017, "learning_rate": 0.0001373064894609194, "loss": 0.4363, "step": 310 }, { "epoch": 1.229586935638809, "grad_norm": 0.09906008094549179, "learning_rate": 0.00013341985493931877, "loss": 0.4359, "step": 320 }, { "epoch": 1.2680115273775217, "grad_norm": 0.11935935914516449, "learning_rate": 0.00012947551744109043, "loss": 0.429, "step": 330 }, { "epoch": 1.3064361191162344, "grad_norm": 0.11651390045881271, "learning_rate": 0.0001254802872894655, "loss": 0.4295, "step": 340 }, { "epoch": 1.344860710854947, "grad_norm": 0.13374051451683044, "learning_rate": 0.00012144106267931876, "loss": 0.43, "step": 350 }, { "epoch": 1.38328530259366, "grad_norm": 0.10709749907255173, "learning_rate": 0.00011736481776669306, "loss": 0.4229, "step": 360 }, { "epoch": 1.4217098943323727, "grad_norm": 0.10747699439525604, "learning_rate": 0.00011325859062716795, "loss": 0.4255, "step": 370 }, { "epoch": 1.4601344860710854, "grad_norm": 0.1302700638771057, "learning_rate": 0.00010912947110386484, "loss": 0.4314, "step": 380 }, { "epoch": 1.4985590778097984, "grad_norm": 0.10743537545204163, "learning_rate": 0.00010498458856606972, "loss": 0.4242, "step": 390 }, { "epoch": 1.536983669548511, "grad_norm": 0.11519400030374527, "learning_rate": 0.00010083109959960973, "loss": 0.4216, "step": 400 }, { "epoch": 1.5754082612872238, "grad_norm": 0.11456304788589478, "learning_rate": 9.667617565023735e-05, "loss": 0.4315, "step": 410 }, { "epoch": 1.6138328530259365, "grad_norm": 0.10759314894676208, "learning_rate": 9.252699064135758e-05, "loss": 0.4199, "step": 420 }, { "epoch": 1.6522574447646494, "grad_norm": 0.1024770587682724, "learning_rate": 8.839070858747697e-05, "loss": 0.4272, "step": 430 }, { "epoch": 1.6906820365033621, "grad_norm": 0.10736548155546188, "learning_rate": 8.427447122476148e-05, "loss": 0.4232, "step": 440 }, { "epoch": 1.729106628242075, "grad_norm": 0.1060362458229065, "learning_rate": 8.018538568006027e-05, "loss": 0.4237, "step": 450 }, { "epoch": 1.7675312199807878, "grad_norm": 0.10463803261518478, "learning_rate": 7.613051219968623e-05, "loss": 0.4247, "step": 460 }, { "epoch": 1.8059558117195005, "grad_norm": 0.10327400267124176, "learning_rate": 7.211685195914097e-05, "loss": 0.4196, "step": 470 }, { "epoch": 1.8443804034582132, "grad_norm": 0.13895417749881744, "learning_rate": 6.815133497483157e-05, "loss": 0.4205, "step": 480 }, { "epoch": 1.882804995196926, "grad_norm": 0.10684759169816971, "learning_rate": 6.424080813865138e-05, "loss": 0.4224, "step": 490 }, { "epoch": 1.9212295869356388, "grad_norm": 0.13927388191223145, "learning_rate": 6.039202339608432e-05, "loss": 0.4196, "step": 500 }, { "epoch": 1.9596541786743515, "grad_norm": 0.10447521507740021, 
"learning_rate": 5.6611626088244194e-05, "loss": 0.4225, "step": 510 }, { "epoch": 1.9980787704130645, "grad_norm": 0.10929688066244125, "learning_rate": 5.290614347797802e-05, "loss": 0.4254, "step": 520 }, { "epoch": 2.036503362151777, "grad_norm": 0.10282639414072037, "learning_rate": 4.92819734798441e-05, "loss": 0.4115, "step": 530 }, { "epoch": 2.07492795389049, "grad_norm": 0.10815539956092834, "learning_rate": 4.574537361342407e-05, "loss": 0.4162, "step": 540 }, { "epoch": 2.1133525456292026, "grad_norm": 0.1221628338098526, "learning_rate": 4.23024501990417e-05, "loss": 0.4056, "step": 550 }, { "epoch": 2.1517771373679153, "grad_norm": 0.1123044565320015, "learning_rate": 3.89591478145437e-05, "loss": 0.4089, "step": 560 }, { "epoch": 2.1902017291066285, "grad_norm": 0.11408425867557526, "learning_rate": 3.5721239031346066e-05, "loss": 0.4113, "step": 570 }, { "epoch": 2.228626320845341, "grad_norm": 0.10744322091341019, "learning_rate": 3.259431444746846e-05, "loss": 0.4018, "step": 580 }, { "epoch": 2.267050912584054, "grad_norm": 0.11307461559772491, "learning_rate": 2.9583773034764826e-05, "loss": 0.405, "step": 590 }, { "epoch": 2.3054755043227666, "grad_norm": 0.11285313963890076, "learning_rate": 2.66948128170174e-05, "loss": 0.3995, "step": 600 }, { "epoch": 2.3439000960614793, "grad_norm": 0.15918630361557007, "learning_rate": 2.3932421894989167e-05, "loss": 0.4027, "step": 610 }, { "epoch": 2.382324687800192, "grad_norm": 0.11159035563468933, "learning_rate": 2.1301369833931117e-05, "loss": 0.4082, "step": 620 }, { "epoch": 2.4207492795389047, "grad_norm": 0.11359286308288574, "learning_rate": 1.880619942841435e-05, "loss": 0.4054, "step": 630 }, { "epoch": 2.459173871277618, "grad_norm": 0.11326448619365692, "learning_rate": 1.6451218858706374e-05, "loss": 0.4064, "step": 640 }, { "epoch": 2.4975984630163306, "grad_norm": 0.11316490173339844, "learning_rate": 1.4240494252234049e-05, "loss": 0.4111, "step": 650 }, { "epoch": 2.5360230547550433, "grad_norm": 0.11959807574748993, "learning_rate": 1.2177842662977135e-05, "loss": 0.4078, "step": 660 }, { "epoch": 2.574447646493756, "grad_norm": 0.10980040580034256, "learning_rate": 1.0266825480913611e-05, "loss": 0.408, "step": 670 }, { "epoch": 2.6128722382324687, "grad_norm": 0.11757558584213257, "learning_rate": 8.510742282896544e-06, "loss": 0.4019, "step": 680 }, { "epoch": 2.6512968299711814, "grad_norm": 0.11953994631767273, "learning_rate": 6.9126251355795864e-06, "loss": 0.4037, "step": 690 }, { "epoch": 2.689721421709894, "grad_norm": 0.13467754423618317, "learning_rate": 5.475233360227527e-06, "loss": 0.404, "step": 700 }, { "epoch": 2.7281460134486073, "grad_norm": 0.11414311081171036, "learning_rate": 4.20104876845111e-06, "loss": 0.4043, "step": 710 }, { "epoch": 2.76657060518732, "grad_norm": 0.11653583496809006, "learning_rate": 3.092271377092215e-06, "loss": 0.4036, "step": 720 }, { "epoch": 2.8049951969260327, "grad_norm": 0.1183491125702858, "learning_rate": 2.150815609657875e-06, "loss": 0.4053, "step": 730 }, { "epoch": 2.8434197886647454, "grad_norm": 0.142572820186615, "learning_rate": 1.378306990862177e-06, "loss": 0.4037, "step": 740 }, { "epoch": 2.881844380403458, "grad_norm": 0.11191695928573608, "learning_rate": 7.760793399827937e-07, "loss": 0.4035, "step": 750 }, { "epoch": 2.920268972142171, "grad_norm": 0.11203925311565399, "learning_rate": 3.451724678784518e-07, "loss": 0.409, "step": 760 }, { "epoch": 2.9586935638808836, "grad_norm": 0.11161793023347855, "learning_rate": 
8.633038164358454e-08, "loss": 0.3982, "step": 770 }, { "epoch": 2.9971181556195967, "grad_norm": 0.11390741169452667, "learning_rate": 0.0, "loss": 0.4073, "step": 780 } ], "logging_steps": 10, "max_steps": 780, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.794517495096279e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }