{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.807339449541285, "eval_steps": 500, "global_step": 1080, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08154943934760449, "grad_norm": 12.287543296813965, "learning_rate": 9.259259259259259e-07, "loss": 1.1833, "step": 10 }, { "epoch": 0.16309887869520898, "grad_norm": 11.78052043914795, "learning_rate": 1.8518518518518519e-06, "loss": 1.0856, "step": 20 }, { "epoch": 0.24464831804281345, "grad_norm": 15.526741981506348, "learning_rate": 2.7777777777777783e-06, "loss": 1.0896, "step": 30 }, { "epoch": 0.32619775739041795, "grad_norm": 8.213525772094727, "learning_rate": 3.7037037037037037e-06, "loss": 0.9945, "step": 40 }, { "epoch": 0.4077471967380224, "grad_norm": 12.257412910461426, "learning_rate": 4.62962962962963e-06, "loss": 1.0562, "step": 50 }, { "epoch": 0.4892966360856269, "grad_norm": 8.546279907226562, "learning_rate": 5.555555555555557e-06, "loss": 1.0866, "step": 60 }, { "epoch": 0.5708460754332314, "grad_norm": 13.490253448486328, "learning_rate": 6.481481481481482e-06, "loss": 1.151, "step": 70 }, { "epoch": 0.6523955147808359, "grad_norm": 19.235937118530273, "learning_rate": 7.4074074074074075e-06, "loss": 0.9591, "step": 80 }, { "epoch": 0.7339449541284404, "grad_norm": 6.614363193511963, "learning_rate": 8.333333333333334e-06, "loss": 1.1466, "step": 90 }, { "epoch": 0.8154943934760448, "grad_norm": 9.026179313659668, "learning_rate": 9.25925925925926e-06, "loss": 1.0284, "step": 100 }, { "epoch": 0.8970438328236493, "grad_norm": 10.798940658569336, "learning_rate": 9.999895536228031e-06, "loss": 1.2601, "step": 110 }, { "epoch": 0.9785932721712538, "grad_norm": 9.451533317565918, "learning_rate": 9.996239762521152e-06, "loss": 1.1091, "step": 120 }, { "epoch": 1.0601427115188584, "grad_norm": 6.230576992034912, "learning_rate": 9.987365164467767e-06, "loss": 0.9699, "step": 130 }, { "epoch": 1.1416921508664628, "grad_norm": 7.345333099365234, "learning_rate": 9.973281012033009e-06, "loss": 0.8974, "step": 140 }, { "epoch": 1.2232415902140672, "grad_norm": 8.226116180419922, "learning_rate": 9.954002016824226e-06, "loss": 0.9152, "step": 150 }, { "epoch": 1.3047910295616718, "grad_norm": 5.226821422576904, "learning_rate": 9.929548316723983e-06, "loss": 0.8205, "step": 160 }, { "epoch": 1.3863404689092762, "grad_norm": 14.963723182678223, "learning_rate": 9.899945454855007e-06, "loss": 0.852, "step": 170 }, { "epoch": 1.4678899082568808, "grad_norm": 6.257207870483398, "learning_rate": 9.86522435289912e-06, "loss": 0.7172, "step": 180 }, { "epoch": 1.5494393476044852, "grad_norm": 5.380673885345459, "learning_rate": 9.825421278797984e-06, "loss": 0.7586, "step": 190 }, { "epoch": 1.6309887869520896, "grad_norm": 6.626143455505371, "learning_rate": 9.7805778088694e-06, "loss": 0.802, "step": 200 }, { "epoch": 1.7125382262996942, "grad_norm": 5.537201881408691, "learning_rate": 9.730740784378755e-06, "loss": 0.8788, "step": 210 }, { "epoch": 1.7940876656472988, "grad_norm": 14.645645141601562, "learning_rate": 9.67596226261095e-06, "loss": 0.8143, "step": 220 }, { "epoch": 1.8756371049949032, "grad_norm": 5.771991729736328, "learning_rate": 9.616299462493952e-06, "loss": 0.8228, "step": 230 }, { "epoch": 1.9571865443425076, "grad_norm": 7.7364091873168945, "learning_rate": 9.551814704830734e-06, "loss": 0.7194, "step": 240 }, { "epoch": 2.038735983690112, "grad_norm": 7.201632499694824, "learning_rate": 9.482575347202047e-06, "loss": 0.6792, "step": 250 }, { "epoch": 2.120285423037717, "grad_norm": 10.6597318649292, "learning_rate": 9.40865371360804e-06, "loss": 0.5799, "step": 260 }, { "epoch": 2.2018348623853212, "grad_norm": 5.66484260559082, "learning_rate": 9.330127018922195e-06, "loss": 0.5307, "step": 270 }, { "epoch": 2.2833843017329256, "grad_norm": 11.373878479003906, "learning_rate": 9.247077288236488e-06, "loss": 0.5052, "step": 280 }, { "epoch": 2.36493374108053, "grad_norm": 7.9306864738464355, "learning_rate": 9.159591271182058e-06, "loss": 0.4944, "step": 290 }, { "epoch": 2.4464831804281344, "grad_norm": 4.888540267944336, "learning_rate": 9.067760351314838e-06, "loss": 0.5803, "step": 300 }, { "epoch": 2.528032619775739, "grad_norm": 5.445197582244873, "learning_rate": 8.97168045066082e-06, "loss": 0.4859, "step": 310 }, { "epoch": 2.6095820591233436, "grad_norm": 6.764598369598389, "learning_rate": 8.871451929520662e-06, "loss": 0.5261, "step": 320 }, { "epoch": 2.691131498470948, "grad_norm": 4.372009754180908, "learning_rate": 8.767179481638303e-06, "loss": 0.5705, "step": 330 }, { "epoch": 2.7726809378185524, "grad_norm": 5.150655746459961, "learning_rate": 8.658972024843063e-06, "loss": 0.5248, "step": 340 }, { "epoch": 2.8542303771661572, "grad_norm": 7.280877590179443, "learning_rate": 8.546942587279465e-06, "loss": 0.4764, "step": 350 }, { "epoch": 2.9357798165137616, "grad_norm": 6.407410621643066, "learning_rate": 8.43120818934367e-06, "loss": 0.5481, "step": 360 }, { "epoch": 3.017329255861366, "grad_norm": 3.3497812747955322, "learning_rate": 8.31188972144974e-06, "loss": 0.4215, "step": 370 }, { "epoch": 3.0988786952089704, "grad_norm": 8.869242668151855, "learning_rate": 8.18911181775353e-06, "loss": 0.3038, "step": 380 }, { "epoch": 3.180428134556575, "grad_norm": 7.462818622589111, "learning_rate": 8.063002725966014e-06, "loss": 0.3173, "step": 390 }, { "epoch": 3.261977573904179, "grad_norm": 4.446223258972168, "learning_rate": 7.93369417339209e-06, "loss": 0.3345, "step": 400 }, { "epoch": 3.343527013251784, "grad_norm": 4.583369255065918, "learning_rate": 7.801321229334764e-06, "loss": 0.3165, "step": 410 }, { "epoch": 3.4250764525993884, "grad_norm": 3.839564323425293, "learning_rate": 7.666022164008458e-06, "loss": 0.2921, "step": 420 }, { "epoch": 3.506625891946993, "grad_norm": 3.111229181289673, "learning_rate": 7.527938304108795e-06, "loss": 0.3062, "step": 430 }, { "epoch": 3.588175331294597, "grad_norm": 4.569026470184326, "learning_rate": 7.387213885189746e-06, "loss": 0.2924, "step": 440 }, { "epoch": 3.669724770642202, "grad_norm": 6.843806266784668, "learning_rate": 7.243995901002312e-06, "loss": 0.2929, "step": 450 }, { "epoch": 3.7512742099898064, "grad_norm": 3.574108839035034, "learning_rate": 7.098433949952146e-06, "loss": 0.3544, "step": 460 }, { "epoch": 3.832823649337411, "grad_norm": 3.691032886505127, "learning_rate": 6.950680078836475e-06, "loss": 0.3103, "step": 470 }, { "epoch": 3.914373088685015, "grad_norm": 3.9971020221710205, "learning_rate": 6.800888624023552e-06, "loss": 0.2742, "step": 480 }, { "epoch": 3.9959225280326196, "grad_norm": 2.556887149810791, "learning_rate": 6.649216050240539e-06, "loss": 0.3339, "step": 490 }, { "epoch": 4.077471967380224, "grad_norm": 2.1896212100982666, "learning_rate": 6.495820787138209e-06, "loss": 0.193, "step": 500 }, { "epoch": 4.077471967380224, "eval_loss": 1.6843377351760864, "eval_runtime": 4.0486, "eval_samples_per_second": 27.17, "eval_steps_per_second": 27.17, "step": 500 }, { "epoch": 4.159021406727828, "grad_norm": 4.995784282684326, "learning_rate": 6.340863063803187e-06, "loss": 0.2102, "step": 510 }, { "epoch": 4.240570846075434, "grad_norm": 3.022165536880493, "learning_rate": 6.184504741390596e-06, "loss": 0.1799, "step": 520 }, { "epoch": 4.322120285423038, "grad_norm": 3.73397159576416, "learning_rate": 6.02690914405191e-06, "loss": 0.1945, "step": 530 }, { "epoch": 4.4036697247706424, "grad_norm": 3.1206557750701904, "learning_rate": 5.8682408883346535e-06, "loss": 0.1186, "step": 540 }, { "epoch": 4.485219164118247, "grad_norm": 6.9254326820373535, "learning_rate": 5.708665711232103e-06, "loss": 0.1761, "step": 550 }, { "epoch": 4.566768603465851, "grad_norm": 1.7940961122512817, "learning_rate": 5.548350297062659e-06, "loss": 0.1747, "step": 560 }, { "epoch": 4.648318042813456, "grad_norm": 3.6487746238708496, "learning_rate": 5.387462103359655e-06, "loss": 0.205, "step": 570 }, { "epoch": 4.72986748216106, "grad_norm": 2.9038984775543213, "learning_rate": 5.2261691859535325e-06, "loss": 0.1384, "step": 580 }, { "epoch": 4.811416921508664, "grad_norm": 7.9458842277526855, "learning_rate": 5.064640023429042e-06, "loss": 0.1475, "step": 590 }, { "epoch": 4.892966360856269, "grad_norm": 3.393327236175537, "learning_rate": 4.903043341140879e-06, "loss": 0.1647, "step": 600 }, { "epoch": 4.974515800203873, "grad_norm": 5.027682781219482, "learning_rate": 4.741547934971528e-06, "loss": 0.1691, "step": 610 }, { "epoch": 5.0560652395514785, "grad_norm": 1.468493938446045, "learning_rate": 4.580322495015466e-06, "loss": 0.1022, "step": 620 }, { "epoch": 5.137614678899083, "grad_norm": 1.9386202096939087, "learning_rate": 4.4195354293738484e-06, "loss": 0.0677, "step": 630 }, { "epoch": 5.219164118246687, "grad_norm": 2.2345311641693115, "learning_rate": 4.259354688243758e-06, "loss": 0.0761, "step": 640 }, { "epoch": 5.300713557594292, "grad_norm": 2.380667209625244, "learning_rate": 4.099947588485744e-06, "loss": 0.0733, "step": 650 }, { "epoch": 5.382262996941896, "grad_norm": 1.2158432006835938, "learning_rate": 3.941480638852948e-06, "loss": 0.0841, "step": 660 }, { "epoch": 5.4638124362895, "grad_norm": 2.382153034210205, "learning_rate": 3.784119366064293e-06, "loss": 0.0929, "step": 670 }, { "epoch": 5.545361875637105, "grad_norm": 2.7093493938446045, "learning_rate": 3.6280281419034934e-06, "loss": 0.0634, "step": 680 }, { "epoch": 5.626911314984709, "grad_norm": 1.597835659980774, "learning_rate": 3.473370011524435e-06, "loss": 0.1201, "step": 690 }, { "epoch": 5.708460754332314, "grad_norm": 2.351442575454712, "learning_rate": 3.3203065231422904e-06, "loss": 0.0811, "step": 700 }, { "epoch": 5.790010193679919, "grad_norm": 2.5688395500183105, "learning_rate": 3.1689975592882603e-06, "loss": 0.0708, "step": 710 }, { "epoch": 5.871559633027523, "grad_norm": 3.0511674880981445, "learning_rate": 3.019601169804216e-06, "loss": 0.0846, "step": 720 }, { "epoch": 5.953109072375128, "grad_norm": 5.8331298828125, "learning_rate": 2.8722734067516637e-06, "loss": 0.0782, "step": 730 }, { "epoch": 6.034658511722732, "grad_norm": 1.1434911489486694, "learning_rate": 2.7271681614074973e-06, "loss": 0.0624, "step": 740 }, { "epoch": 6.116207951070336, "grad_norm": 2.7258777618408203, "learning_rate": 2.5844370035168077e-06, "loss": 0.0373, "step": 750 }, { "epoch": 6.197757390417941, "grad_norm": 0.5002675652503967, "learning_rate": 2.4442290229706344e-06, "loss": 0.0331, "step": 760 }, { "epoch": 6.279306829765545, "grad_norm": 0.7639961242675781, "learning_rate": 2.3066906740740626e-06, "loss": 0.0232, "step": 770 }, { "epoch": 6.36085626911315, "grad_norm": 2.4113121032714844, "learning_rate": 2.171965622567308e-06, "loss": 0.0356, "step": 780 }, { "epoch": 6.442405708460754, "grad_norm": 1.4164212942123413, "learning_rate": 2.0401945955596206e-06, "loss": 0.0599, "step": 790 }, { "epoch": 6.523955147808358, "grad_norm": 1.6531766653060913, "learning_rate": 1.9115152345327154e-06, "loss": 0.0298, "step": 800 }, { "epoch": 6.605504587155964, "grad_norm": 1.0268425941467285, "learning_rate": 1.7860619515673034e-06, "loss": 0.029, "step": 810 }, { "epoch": 6.687054026503568, "grad_norm": 1.7151083946228027, "learning_rate": 1.6639657889429017e-06, "loss": 0.0322, "step": 820 }, { "epoch": 6.7686034658511725, "grad_norm": 1.3095351457595825, "learning_rate": 1.5453542822575624e-06, "loss": 0.0276, "step": 830 }, { "epoch": 6.850152905198777, "grad_norm": 2.322570323944092, "learning_rate": 1.4303513272105057e-06, "loss": 0.0307, "step": 840 }, { "epoch": 6.931702344546381, "grad_norm": 5.283768653869629, "learning_rate": 1.3190770501868243e-06, "loss": 0.0439, "step": 850 }, { "epoch": 7.013251783893986, "grad_norm": 2.021864414215088, "learning_rate": 1.2116476827794104e-06, "loss": 0.0349, "step": 860 }, { "epoch": 7.09480122324159, "grad_norm": 1.7383687496185303, "learning_rate": 1.1081754403792e-06, "loss": 0.0141, "step": 870 }, { "epoch": 7.176350662589194, "grad_norm": 0.9524487853050232, "learning_rate": 1.008768404960535e-06, "loss": 0.0131, "step": 880 }, { "epoch": 7.257900101936799, "grad_norm": 0.6132211089134216, "learning_rate": 9.135304121840976e-07, "loss": 0.0156, "step": 890 }, { "epoch": 7.339449541284404, "grad_norm": 0.16791853308677673, "learning_rate": 8.225609429353187e-07, "loss": 0.0129, "step": 900 }, { "epoch": 7.4209989806320085, "grad_norm": 0.24683955311775208, "learning_rate": 7.35955019411585e-07, "loss": 0.0142, "step": 910 }, { "epoch": 7.502548419979613, "grad_norm": 0.177597314119339, "learning_rate": 6.53803105866761e-07, "loss": 0.0103, "step": 920 }, { "epoch": 7.584097859327217, "grad_norm": 1.1235181093215942, "learning_rate": 5.76191014116711e-07, "loss": 0.0166, "step": 930 }, { "epoch": 7.665647298674822, "grad_norm": 0.7732178568840027, "learning_rate": 5.031998139045352e-07, "loss": 0.0144, "step": 940 }, { "epoch": 7.747196738022426, "grad_norm": 1.0395855903625488, "learning_rate": 4.349057482191299e-07, "loss": 0.0165, "step": 950 }, { "epoch": 7.82874617737003, "grad_norm": 0.6988322734832764, "learning_rate": 3.7138015365554834e-07, "loss": 0.0118, "step": 960 }, { "epoch": 7.910295616717635, "grad_norm": 0.8800845742225647, "learning_rate": 3.1268938590032495e-07, "loss": 0.0141, "step": 970 }, { "epoch": 7.991845056065239, "grad_norm": 0.32935312390327454, "learning_rate": 2.5889475041961767e-07, "loss": 0.0096, "step": 980 }, { "epoch": 8.073394495412844, "grad_norm": 0.8593177199363708, "learning_rate": 2.1005243842255552e-07, "loss": 0.0207, "step": 990 }, { "epoch": 8.154943934760448, "grad_norm": 0.561518132686615, "learning_rate": 1.6621346816668993e-07, "loss": 0.0083, "step": 1000 }, { "epoch": 8.154943934760448, "eval_loss": 2.4749834537506104, "eval_runtime": 3.9418, "eval_samples_per_second": 27.906, "eval_steps_per_second": 27.906, "step": 1000 }, { "epoch": 8.236493374108052, "grad_norm": 0.1318514049053192, "learning_rate": 1.2742363166685035e-07, "loss": 0.0084, "step": 1010 }, { "epoch": 8.318042813455657, "grad_norm": 0.09888464212417603, "learning_rate": 9.372344686307655e-08, "loss": 0.0065, "step": 1020 }, { "epoch": 8.399592252803261, "grad_norm": 0.2378782033920288, "learning_rate": 6.514811529758747e-08, "loss": 0.0116, "step": 1030 }, { "epoch": 8.481141692150867, "grad_norm": 0.2287997156381607, "learning_rate": 4.172748534499449e-08, "loss": 0.0108, "step": 1040 }, { "epoch": 8.562691131498472, "grad_norm": 0.5172834992408752, "learning_rate": 2.3486021034170857e-08, "loss": 0.0088, "step": 1050 }, { "epoch": 8.644240570846076, "grad_norm": 0.272684246301651, "learning_rate": 1.044277649433989e-08, "loss": 0.0086, "step": 1060 }, { "epoch": 8.72579001019368, "grad_norm": 0.2911706864833832, "learning_rate": 2.611376052073511e-09, "loss": 0.0086, "step": 1070 }, { "epoch": 8.807339449541285, "grad_norm": 0.5013861060142517, "learning_rate": 0.0, "loss": 0.0063, "step": 1080 }, { "epoch": 8.807339449541285, "step": 1080, "total_flos": 1.2056852760477696e+16, "train_loss": 0.3479360826589443, "train_runtime": 1723.5404, "train_samples_per_second": 5.013, "train_steps_per_second": 0.627 } ], "logging_steps": 10, "max_steps": 1080, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2056852760477696e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }