{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9974293059125965, "eval_steps": 500, "global_step": 1749, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01713796058269066, "grad_norm": 0.4733077883720398, "learning_rate": 4.971412235563179e-05, "loss": 2.5254, "step": 10 }, { "epoch": 0.03427592116538132, "grad_norm": 0.44018083810806274, "learning_rate": 4.9428244711263584e-05, "loss": 1.8284, "step": 20 }, { "epoch": 0.05141388174807198, "grad_norm": 0.7213996648788452, "learning_rate": 4.914236706689537e-05, "loss": 1.2128, "step": 30 }, { "epoch": 0.06855184233076264, "grad_norm": 0.6227633953094482, "learning_rate": 4.8856489422527165e-05, "loss": 0.6727, "step": 40 }, { "epoch": 0.0856898029134533, "grad_norm": 0.2804420590400696, "learning_rate": 4.8570611778158946e-05, "loss": 0.3089, "step": 50 }, { "epoch": 0.10282776349614396, "grad_norm": 0.10756277292966843, "learning_rate": 4.828473413379074e-05, "loss": 0.2448, "step": 60 }, { "epoch": 0.11996572407883462, "grad_norm": 0.11257284879684448, "learning_rate": 4.799885648942253e-05, "loss": 0.2268, "step": 70 }, { "epoch": 0.13710368466152528, "grad_norm": 0.09793874621391296, "learning_rate": 4.771297884505432e-05, "loss": 0.2328, "step": 80 }, { "epoch": 0.15424164524421594, "grad_norm": 0.10177452117204666, "learning_rate": 4.742710120068611e-05, "loss": 0.2183, "step": 90 }, { "epoch": 0.1713796058269066, "grad_norm": 0.09770502895116806, "learning_rate": 4.7141223556317895e-05, "loss": 0.1971, "step": 100 }, { "epoch": 0.18851756640959727, "grad_norm": 0.11037880927324295, "learning_rate": 4.685534591194969e-05, "loss": 0.1996, "step": 110 }, { "epoch": 0.20565552699228792, "grad_norm": 0.11869871616363525, "learning_rate": 4.656946826758148e-05, "loss": 0.2044, "step": 120 }, { "epoch": 0.22279348757497858, "grad_norm": 0.10279687494039536, "learning_rate": 4.628359062321327e-05, "loss": 0.1782, "step": 130 }, { "epoch": 0.23993144815766923, "grad_norm": 0.10952438414096832, "learning_rate": 4.599771297884506e-05, "loss": 0.1837, "step": 140 }, { "epoch": 0.2570694087403599, "grad_norm": 0.13122445344924927, "learning_rate": 4.5711835334476845e-05, "loss": 0.1907, "step": 150 }, { "epoch": 0.27420736932305056, "grad_norm": 0.11156395077705383, "learning_rate": 4.542595769010863e-05, "loss": 0.1748, "step": 160 }, { "epoch": 0.2913453299057412, "grad_norm": 0.1657029241323471, "learning_rate": 4.5140080045740427e-05, "loss": 0.1704, "step": 170 }, { "epoch": 0.30848329048843187, "grad_norm": 0.11676887422800064, "learning_rate": 4.4854202401372214e-05, "loss": 0.16, "step": 180 }, { "epoch": 0.32562125107112255, "grad_norm": 0.17466452717781067, "learning_rate": 4.4568324757004e-05, "loss": 0.1705, "step": 190 }, { "epoch": 0.3427592116538132, "grad_norm": 0.1373518407344818, "learning_rate": 4.4282447112635795e-05, "loss": 0.1663, "step": 200 }, { "epoch": 0.35989717223650386, "grad_norm": 0.11815926432609558, "learning_rate": 4.399656946826758e-05, "loss": 0.1562, "step": 210 }, { "epoch": 0.37703513281919454, "grad_norm": 0.12364204227924347, "learning_rate": 4.3710691823899376e-05, "loss": 0.1655, "step": 220 }, { "epoch": 0.39417309340188517, "grad_norm": 0.11595863103866577, "learning_rate": 4.3424814179531164e-05, "loss": 0.1621, "step": 230 }, { "epoch": 0.41131105398457585, "grad_norm": 0.11320952326059341, "learning_rate": 4.313893653516296e-05, "loss": 0.1523, "step": 240 }, { "epoch": 0.4284490145672665, "grad_norm": 0.12280760705471039, "learning_rate": 4.285305889079474e-05, "loss": 0.1711, "step": 250 }, { "epoch": 0.44558697514995715, "grad_norm": 0.12085650116205215, "learning_rate": 4.256718124642653e-05, "loss": 0.1548, "step": 260 }, { "epoch": 0.46272493573264784, "grad_norm": 0.1491166651248932, "learning_rate": 4.228130360205832e-05, "loss": 0.1631, "step": 270 }, { "epoch": 0.47986289631533846, "grad_norm": 0.16156260669231415, "learning_rate": 4.199542595769011e-05, "loss": 0.1624, "step": 280 }, { "epoch": 0.49700085689802914, "grad_norm": 0.12156664580106735, "learning_rate": 4.17095483133219e-05, "loss": 0.1383, "step": 290 }, { "epoch": 0.5141388174807198, "grad_norm": 0.11533980071544647, "learning_rate": 4.142367066895369e-05, "loss": 0.1539, "step": 300 }, { "epoch": 0.5312767780634104, "grad_norm": 0.1386200189590454, "learning_rate": 4.113779302458548e-05, "loss": 0.1525, "step": 310 }, { "epoch": 0.5484147386461011, "grad_norm": 0.16111765801906586, "learning_rate": 4.085191538021727e-05, "loss": 0.14, "step": 320 }, { "epoch": 0.5655526992287918, "grad_norm": 0.151380717754364, "learning_rate": 4.0566037735849064e-05, "loss": 0.1492, "step": 330 }, { "epoch": 0.5826906598114824, "grad_norm": 0.15228472650051117, "learning_rate": 4.028016009148085e-05, "loss": 0.157, "step": 340 }, { "epoch": 0.5998286203941731, "grad_norm": 0.11199972033500671, "learning_rate": 3.999428244711264e-05, "loss": 0.1492, "step": 350 }, { "epoch": 0.6169665809768637, "grad_norm": 0.17204181849956512, "learning_rate": 3.9708404802744425e-05, "loss": 0.1504, "step": 360 }, { "epoch": 0.6341045415595544, "grad_norm": 0.13271720707416534, "learning_rate": 3.942252715837621e-05, "loss": 0.1604, "step": 370 }, { "epoch": 0.6512425021422451, "grad_norm": 0.15998151898384094, "learning_rate": 3.913664951400801e-05, "loss": 0.148, "step": 380 }, { "epoch": 0.6683804627249358, "grad_norm": 0.12898313999176025, "learning_rate": 3.8850771869639794e-05, "loss": 0.1492, "step": 390 }, { "epoch": 0.6855184233076264, "grad_norm": 0.13998836278915405, "learning_rate": 3.856489422527159e-05, "loss": 0.1548, "step": 400 }, { "epoch": 0.702656383890317, "grad_norm": 0.14940115809440613, "learning_rate": 3.8279016580903375e-05, "loss": 0.1432, "step": 410 }, { "epoch": 0.7197943444730077, "grad_norm": 0.13358236849308014, "learning_rate": 3.799313893653517e-05, "loss": 0.1459, "step": 420 }, { "epoch": 0.7369323050556984, "grad_norm": 0.1597578376531601, "learning_rate": 3.7707261292166957e-05, "loss": 0.1351, "step": 430 }, { "epoch": 0.7540702656383891, "grad_norm": 0.12782897055149078, "learning_rate": 3.7421383647798744e-05, "loss": 0.1392, "step": 440 }, { "epoch": 0.7712082262210797, "grad_norm": 0.13537828624248505, "learning_rate": 3.713550600343053e-05, "loss": 0.1402, "step": 450 }, { "epoch": 0.7883461868037703, "grad_norm": 0.17046277225017548, "learning_rate": 3.684962835906232e-05, "loss": 0.1227, "step": 460 }, { "epoch": 0.805484147386461, "grad_norm": 0.16829894483089447, "learning_rate": 3.656375071469411e-05, "loss": 0.133, "step": 470 }, { "epoch": 0.8226221079691517, "grad_norm": 0.17760640382766724, "learning_rate": 3.62778730703259e-05, "loss": 0.1361, "step": 480 }, { "epoch": 0.8397600685518424, "grad_norm": 0.1783646047115326, "learning_rate": 3.5991995425957694e-05, "loss": 0.1272, "step": 490 }, { "epoch": 0.856898029134533, "grad_norm": 0.1848060041666031, "learning_rate": 3.570611778158948e-05, "loss": 0.1281, "step": 500 }, { "epoch": 0.8740359897172236, "grad_norm": 0.1244303435087204, "learning_rate": 3.5420240137221275e-05, "loss": 0.1276, "step": 510 }, { "epoch": 0.8911739502999143, "grad_norm": 0.1454310566186905, "learning_rate": 3.513436249285306e-05, "loss": 0.1334, "step": 520 }, { "epoch": 0.908311910882605, "grad_norm": 0.17194361984729767, "learning_rate": 3.484848484848485e-05, "loss": 0.1271, "step": 530 }, { "epoch": 0.9254498714652957, "grad_norm": 0.15851227939128876, "learning_rate": 3.456260720411664e-05, "loss": 0.1304, "step": 540 }, { "epoch": 0.9425878320479862, "grad_norm": 0.1614447981119156, "learning_rate": 3.4276729559748424e-05, "loss": 0.1256, "step": 550 }, { "epoch": 0.9597257926306769, "grad_norm": 0.19016973674297333, "learning_rate": 3.399085191538022e-05, "loss": 0.1329, "step": 560 }, { "epoch": 0.9768637532133676, "grad_norm": 0.15122413635253906, "learning_rate": 3.3704974271012005e-05, "loss": 0.1228, "step": 570 }, { "epoch": 0.9940017137960583, "grad_norm": 0.15631020069122314, "learning_rate": 3.34190966266438e-05, "loss": 0.1235, "step": 580 }, { "epoch": 0.9991431019708654, "eval_loss": 0.15606513619422913, "eval_runtime": 439.0741, "eval_samples_per_second": 10.932, "eval_steps_per_second": 1.367, "step": 583 }, { "epoch": 1.0111396743787489, "grad_norm": 0.20745624601840973, "learning_rate": 3.313321898227559e-05, "loss": 0.1245, "step": 590 }, { "epoch": 1.0282776349614395, "grad_norm": 0.182315856218338, "learning_rate": 3.284734133790738e-05, "loss": 0.1112, "step": 600 }, { "epoch": 1.0454155955441302, "grad_norm": 0.1425042450428009, "learning_rate": 3.256146369353917e-05, "loss": 0.1186, "step": 610 }, { "epoch": 1.062553556126821, "grad_norm": 0.1898319274187088, "learning_rate": 3.2275586049170955e-05, "loss": 0.1203, "step": 620 }, { "epoch": 1.0796915167095116, "grad_norm": 0.17110544443130493, "learning_rate": 3.198970840480275e-05, "loss": 0.1223, "step": 630 }, { "epoch": 1.0968294772922023, "grad_norm": 0.17418451607227325, "learning_rate": 3.170383076043453e-05, "loss": 0.111, "step": 640 }, { "epoch": 1.113967437874893, "grad_norm": 0.19765284657478333, "learning_rate": 3.1417953116066324e-05, "loss": 0.1173, "step": 650 }, { "epoch": 1.1311053984575836, "grad_norm": 0.17013542354106903, "learning_rate": 3.113207547169811e-05, "loss": 0.1128, "step": 660 }, { "epoch": 1.1482433590402743, "grad_norm": 0.21173644065856934, "learning_rate": 3.0846197827329905e-05, "loss": 0.109, "step": 670 }, { "epoch": 1.165381319622965, "grad_norm": 0.13383643329143524, "learning_rate": 3.056032018296169e-05, "loss": 0.1092, "step": 680 }, { "epoch": 1.1825192802056554, "grad_norm": 0.22101104259490967, "learning_rate": 3.0274442538593483e-05, "loss": 0.1175, "step": 690 }, { "epoch": 1.1996572407883461, "grad_norm": 0.1745050698518753, "learning_rate": 2.9988564894225274e-05, "loss": 0.1096, "step": 700 }, { "epoch": 1.2167952013710368, "grad_norm": 0.2413550466299057, "learning_rate": 2.9702687249857064e-05, "loss": 0.1044, "step": 710 }, { "epoch": 1.2339331619537275, "grad_norm": 0.15909789502620697, "learning_rate": 2.9416809605488855e-05, "loss": 0.1076, "step": 720 }, { "epoch": 1.2510711225364182, "grad_norm": 0.21382008492946625, "learning_rate": 2.9130931961120646e-05, "loss": 0.1077, "step": 730 }, { "epoch": 1.2682090831191088, "grad_norm": 0.1845552772283554, "learning_rate": 2.884505431675243e-05, "loss": 0.102, "step": 740 }, { "epoch": 1.2853470437017995, "grad_norm": 0.19705334305763245, "learning_rate": 2.855917667238422e-05, "loss": 0.1048, "step": 750 }, { "epoch": 1.3024850042844902, "grad_norm": 0.16979779303073883, "learning_rate": 2.827329902801601e-05, "loss": 0.1012, "step": 760 }, { "epoch": 1.3196229648671807, "grad_norm": 0.18766264617443085, "learning_rate": 2.7987421383647798e-05, "loss": 0.1043, "step": 770 }, { "epoch": 1.3367609254498714, "grad_norm": 0.1593862771987915, "learning_rate": 2.770154373927959e-05, "loss": 0.0953, "step": 780 }, { "epoch": 1.353898886032562, "grad_norm": 0.19427362084388733, "learning_rate": 2.741566609491138e-05, "loss": 0.1008, "step": 790 }, { "epoch": 1.3710368466152527, "grad_norm": 0.21518777310848236, "learning_rate": 2.712978845054317e-05, "loss": 0.0973, "step": 800 }, { "epoch": 1.3881748071979434, "grad_norm": 0.2065572291612625, "learning_rate": 2.684391080617496e-05, "loss": 0.1035, "step": 810 }, { "epoch": 1.405312767780634, "grad_norm": 0.26617249846458435, "learning_rate": 2.655803316180675e-05, "loss": 0.0969, "step": 820 }, { "epoch": 1.4224507283633248, "grad_norm": 0.17817485332489014, "learning_rate": 2.627215551743854e-05, "loss": 0.0944, "step": 830 }, { "epoch": 1.4395886889460154, "grad_norm": 0.17743416130542755, "learning_rate": 2.5986277873070326e-05, "loss": 0.0984, "step": 840 }, { "epoch": 1.4567266495287061, "grad_norm": 0.23810917139053345, "learning_rate": 2.5700400228702117e-05, "loss": 0.099, "step": 850 }, { "epoch": 1.4738646101113968, "grad_norm": 0.18336652219295502, "learning_rate": 2.5414522584333904e-05, "loss": 0.0954, "step": 860 }, { "epoch": 1.4910025706940875, "grad_norm": 0.22074759006500244, "learning_rate": 2.5128644939965695e-05, "loss": 0.0915, "step": 870 }, { "epoch": 1.5081405312767782, "grad_norm": 0.2292771190404892, "learning_rate": 2.4842767295597485e-05, "loss": 0.0931, "step": 880 }, { "epoch": 1.5252784918594688, "grad_norm": 0.27021974325180054, "learning_rate": 2.4556889651229276e-05, "loss": 0.0933, "step": 890 }, { "epoch": 1.5424164524421595, "grad_norm": 0.28702589869499207, "learning_rate": 2.4271012006861067e-05, "loss": 0.0978, "step": 900 }, { "epoch": 1.5595544130248502, "grad_norm": 0.18202678859233856, "learning_rate": 2.3985134362492854e-05, "loss": 0.0908, "step": 910 }, { "epoch": 1.5766923736075407, "grad_norm": 0.15822364389896393, "learning_rate": 2.3699256718124644e-05, "loss": 0.0878, "step": 920 }, { "epoch": 1.5938303341902313, "grad_norm": 0.21551179885864258, "learning_rate": 2.3413379073756435e-05, "loss": 0.0883, "step": 930 }, { "epoch": 1.610968294772922, "grad_norm": 0.20409446954727173, "learning_rate": 2.3127501429388222e-05, "loss": 0.094, "step": 940 }, { "epoch": 1.6281062553556127, "grad_norm": 0.16509220004081726, "learning_rate": 2.2841623785020013e-05, "loss": 0.0909, "step": 950 }, { "epoch": 1.6452442159383034, "grad_norm": 0.1946859210729599, "learning_rate": 2.25557461406518e-05, "loss": 0.0898, "step": 960 }, { "epoch": 1.6623821765209938, "grad_norm": 0.16525417566299438, "learning_rate": 2.226986849628359e-05, "loss": 0.0842, "step": 970 }, { "epoch": 1.6795201371036845, "grad_norm": 0.26819315552711487, "learning_rate": 2.198399085191538e-05, "loss": 0.0897, "step": 980 }, { "epoch": 1.6966580976863752, "grad_norm": 0.273593008518219, "learning_rate": 2.1698113207547172e-05, "loss": 0.0895, "step": 990 }, { "epoch": 1.713796058269066, "grad_norm": 0.20378242433071136, "learning_rate": 2.1412235563178963e-05, "loss": 0.0912, "step": 1000 }, { "epoch": 1.7309340188517566, "grad_norm": 0.24366410076618195, "learning_rate": 2.112635791881075e-05, "loss": 0.0843, "step": 1010 }, { "epoch": 1.7480719794344473, "grad_norm": 0.2034538835287094, "learning_rate": 2.084048027444254e-05, "loss": 0.0874, "step": 1020 }, { "epoch": 1.765209940017138, "grad_norm": 0.2260463535785675, "learning_rate": 2.0554602630074328e-05, "loss": 0.0898, "step": 1030 }, { "epoch": 1.7823479005998286, "grad_norm": 0.165971040725708, "learning_rate": 2.026872498570612e-05, "loss": 0.0867, "step": 1040 }, { "epoch": 1.7994858611825193, "grad_norm": 0.20090453326702118, "learning_rate": 1.998284734133791e-05, "loss": 0.0891, "step": 1050 }, { "epoch": 1.81662382176521, "grad_norm": 0.24718008935451508, "learning_rate": 1.9696969696969697e-05, "loss": 0.0872, "step": 1060 }, { "epoch": 1.8337617823479007, "grad_norm": 0.2808043360710144, "learning_rate": 1.9411092052601487e-05, "loss": 0.0889, "step": 1070 }, { "epoch": 1.8508997429305913, "grad_norm": 0.19833321869373322, "learning_rate": 1.9125214408233278e-05, "loss": 0.0859, "step": 1080 }, { "epoch": 1.868037703513282, "grad_norm": 0.21585367619991302, "learning_rate": 1.883933676386507e-05, "loss": 0.0872, "step": 1090 }, { "epoch": 1.8851756640959727, "grad_norm": 0.16208423674106598, "learning_rate": 1.8553459119496856e-05, "loss": 0.0835, "step": 1100 }, { "epoch": 1.9023136246786634, "grad_norm": 0.195535346865654, "learning_rate": 1.8267581475128647e-05, "loss": 0.0785, "step": 1110 }, { "epoch": 1.919451585261354, "grad_norm": 0.22447216510772705, "learning_rate": 1.7981703830760434e-05, "loss": 0.08, "step": 1120 }, { "epoch": 1.9365895458440445, "grad_norm": 0.22212448716163635, "learning_rate": 1.7695826186392224e-05, "loss": 0.0826, "step": 1130 }, { "epoch": 1.9537275064267352, "grad_norm": 0.2077985554933548, "learning_rate": 1.7409948542024015e-05, "loss": 0.0822, "step": 1140 }, { "epoch": 1.9708654670094259, "grad_norm": 0.20502477884292603, "learning_rate": 1.7124070897655802e-05, "loss": 0.0817, "step": 1150 }, { "epoch": 1.9880034275921166, "grad_norm": 0.22330917418003082, "learning_rate": 1.6838193253287593e-05, "loss": 0.0837, "step": 1160 }, { "epoch": 2.0, "eval_loss": 0.19116699695587158, "eval_runtime": 439.6787, "eval_samples_per_second": 10.917, "eval_steps_per_second": 1.365, "step": 1167 }, { "epoch": 2.005141388174807, "grad_norm": 0.1825590431690216, "learning_rate": 1.6552315608919384e-05, "loss": 0.0764, "step": 1170 }, { "epoch": 2.0222793487574977, "grad_norm": 0.245217964053154, "learning_rate": 1.6266437964551174e-05, "loss": 0.0776, "step": 1180 }, { "epoch": 2.0394173093401884, "grad_norm": 0.185066357254982, "learning_rate": 1.5980560320182965e-05, "loss": 0.0736, "step": 1190 }, { "epoch": 2.056555269922879, "grad_norm": 0.1703159511089325, "learning_rate": 1.5694682675814752e-05, "loss": 0.075, "step": 1200 }, { "epoch": 2.0736932305055698, "grad_norm": 0.19329093396663666, "learning_rate": 1.540880503144654e-05, "loss": 0.0789, "step": 1210 }, { "epoch": 2.0908311910882604, "grad_norm": 0.22116591036319733, "learning_rate": 1.5122927387078332e-05, "loss": 0.0761, "step": 1220 }, { "epoch": 2.107969151670951, "grad_norm": 0.17945989966392517, "learning_rate": 1.4837049742710121e-05, "loss": 0.0766, "step": 1230 }, { "epoch": 2.125107112253642, "grad_norm": 0.21690496802330017, "learning_rate": 1.4551172098341912e-05, "loss": 0.0748, "step": 1240 }, { "epoch": 2.1422450728363325, "grad_norm": 0.17853769659996033, "learning_rate": 1.4265294453973699e-05, "loss": 0.0755, "step": 1250 }, { "epoch": 2.159383033419023, "grad_norm": 0.18670514225959778, "learning_rate": 1.397941680960549e-05, "loss": 0.0789, "step": 1260 }, { "epoch": 2.176520994001714, "grad_norm": 0.2004610300064087, "learning_rate": 1.3693539165237278e-05, "loss": 0.0771, "step": 1270 }, { "epoch": 2.1936589545844045, "grad_norm": 0.17286266386508942, "learning_rate": 1.3407661520869069e-05, "loss": 0.0757, "step": 1280 }, { "epoch": 2.210796915167095, "grad_norm": 0.21835772693157196, "learning_rate": 1.312178387650086e-05, "loss": 0.0764, "step": 1290 }, { "epoch": 2.227934875749786, "grad_norm": 0.2717360854148865, "learning_rate": 1.2835906232132647e-05, "loss": 0.0761, "step": 1300 }, { "epoch": 2.2450728363324766, "grad_norm": 0.17896392941474915, "learning_rate": 1.2550028587764438e-05, "loss": 0.0748, "step": 1310 }, { "epoch": 2.2622107969151672, "grad_norm": 0.20612064003944397, "learning_rate": 1.2264150943396227e-05, "loss": 0.0745, "step": 1320 }, { "epoch": 2.279348757497858, "grad_norm": 0.25621137022972107, "learning_rate": 1.1978273299028017e-05, "loss": 0.0754, "step": 1330 }, { "epoch": 2.2964867180805486, "grad_norm": 0.1826545149087906, "learning_rate": 1.1692395654659806e-05, "loss": 0.0738, "step": 1340 }, { "epoch": 2.3136246786632393, "grad_norm": 0.22315889596939087, "learning_rate": 1.1406518010291597e-05, "loss": 0.0738, "step": 1350 }, { "epoch": 2.33076263924593, "grad_norm": 0.2433796525001526, "learning_rate": 1.1120640365923384e-05, "loss": 0.0746, "step": 1360 }, { "epoch": 2.34790059982862, "grad_norm": 0.19135648012161255, "learning_rate": 1.0834762721555175e-05, "loss": 0.0745, "step": 1370 }, { "epoch": 2.365038560411311, "grad_norm": 0.21853765845298767, "learning_rate": 1.0548885077186965e-05, "loss": 0.0751, "step": 1380 }, { "epoch": 2.3821765209940016, "grad_norm": 0.23422804474830627, "learning_rate": 1.0263007432818754e-05, "loss": 0.0736, "step": 1390 }, { "epoch": 2.3993144815766922, "grad_norm": 0.22039854526519775, "learning_rate": 9.977129788450543e-06, "loss": 0.0722, "step": 1400 }, { "epoch": 2.416452442159383, "grad_norm": 0.20668627321720123, "learning_rate": 9.691252144082332e-06, "loss": 0.074, "step": 1410 }, { "epoch": 2.4335904027420736, "grad_norm": 0.3503040671348572, "learning_rate": 9.405374499714123e-06, "loss": 0.0786, "step": 1420 }, { "epoch": 2.4507283633247643, "grad_norm": 0.2581626772880554, "learning_rate": 9.119496855345912e-06, "loss": 0.0721, "step": 1430 }, { "epoch": 2.467866323907455, "grad_norm": 0.2439981997013092, "learning_rate": 8.833619210977703e-06, "loss": 0.0733, "step": 1440 }, { "epoch": 2.4850042844901457, "grad_norm": 0.16303370893001556, "learning_rate": 8.547741566609492e-06, "loss": 0.0727, "step": 1450 }, { "epoch": 2.5021422450728363, "grad_norm": 0.17133218050003052, "learning_rate": 8.26186392224128e-06, "loss": 0.0722, "step": 1460 }, { "epoch": 2.519280205655527, "grad_norm": 0.2298302799463272, "learning_rate": 7.975986277873071e-06, "loss": 0.0717, "step": 1470 }, { "epoch": 2.5364181662382177, "grad_norm": 0.22220905125141144, "learning_rate": 7.69010863350486e-06, "loss": 0.0763, "step": 1480 }, { "epoch": 2.5535561268209084, "grad_norm": 0.2196768969297409, "learning_rate": 7.40423098913665e-06, "loss": 0.0694, "step": 1490 }, { "epoch": 2.570694087403599, "grad_norm": 0.23876038193702698, "learning_rate": 7.11835334476844e-06, "loss": 0.0702, "step": 1500 }, { "epoch": 2.5878320479862897, "grad_norm": 0.2197275310754776, "learning_rate": 6.832475700400229e-06, "loss": 0.0731, "step": 1510 }, { "epoch": 2.6049700085689804, "grad_norm": 0.1521027684211731, "learning_rate": 6.5465980560320186e-06, "loss": 0.0706, "step": 1520 }, { "epoch": 2.622107969151671, "grad_norm": 0.2315615713596344, "learning_rate": 6.2607204116638075e-06, "loss": 0.0699, "step": 1530 }, { "epoch": 2.6392459297343613, "grad_norm": 0.22933197021484375, "learning_rate": 5.974842767295598e-06, "loss": 0.0726, "step": 1540 }, { "epoch": 2.656383890317052, "grad_norm": 0.23843450844287872, "learning_rate": 5.688965122927387e-06, "loss": 0.0696, "step": 1550 }, { "epoch": 2.6735218508997427, "grad_norm": 0.2547082304954529, "learning_rate": 5.403087478559177e-06, "loss": 0.0697, "step": 1560 }, { "epoch": 2.6906598114824334, "grad_norm": 0.19213077425956726, "learning_rate": 5.117209834190966e-06, "loss": 0.0725, "step": 1570 }, { "epoch": 2.707797772065124, "grad_norm": 0.19462022185325623, "learning_rate": 4.8313321898227566e-06, "loss": 0.0707, "step": 1580 }, { "epoch": 2.7249357326478147, "grad_norm": 0.22448916733264923, "learning_rate": 4.5454545454545455e-06, "loss": 0.0708, "step": 1590 }, { "epoch": 2.7420736932305054, "grad_norm": 0.22550085186958313, "learning_rate": 4.259576901086335e-06, "loss": 0.0705, "step": 1600 }, { "epoch": 2.759211653813196, "grad_norm": 0.24738526344299316, "learning_rate": 3.973699256718124e-06, "loss": 0.0716, "step": 1610 }, { "epoch": 2.776349614395887, "grad_norm": 0.20870988070964813, "learning_rate": 3.687821612349914e-06, "loss": 0.0674, "step": 1620 }, { "epoch": 2.7934875749785775, "grad_norm": 0.2011324018239975, "learning_rate": 3.4019439679817043e-06, "loss": 0.0692, "step": 1630 }, { "epoch": 2.810625535561268, "grad_norm": 0.25708264112472534, "learning_rate": 3.1160663236134933e-06, "loss": 0.0699, "step": 1640 }, { "epoch": 2.827763496143959, "grad_norm": 0.20108073949813843, "learning_rate": 2.830188679245283e-06, "loss": 0.0702, "step": 1650 }, { "epoch": 2.8449014567266495, "grad_norm": 0.21398206055164337, "learning_rate": 2.5443110348770725e-06, "loss": 0.0727, "step": 1660 }, { "epoch": 2.86203941730934, "grad_norm": 0.1945817768573761, "learning_rate": 2.2584333905088623e-06, "loss": 0.0739, "step": 1670 }, { "epoch": 2.879177377892031, "grad_norm": 0.20674385130405426, "learning_rate": 1.972555746140652e-06, "loss": 0.0738, "step": 1680 }, { "epoch": 2.8963153384747216, "grad_norm": 0.2226879894733429, "learning_rate": 1.6866781017724415e-06, "loss": 0.0698, "step": 1690 }, { "epoch": 2.9134532990574122, "grad_norm": 0.1832134872674942, "learning_rate": 1.400800457404231e-06, "loss": 0.0701, "step": 1700 }, { "epoch": 2.930591259640103, "grad_norm": 0.2704426348209381, "learning_rate": 1.1149228130360207e-06, "loss": 0.0681, "step": 1710 }, { "epoch": 2.9477292202227936, "grad_norm": 0.2618798315525055, "learning_rate": 8.290451686678103e-07, "loss": 0.0696, "step": 1720 }, { "epoch": 2.9648671808054843, "grad_norm": 0.16076813638210297, "learning_rate": 5.431675242995998e-07, "loss": 0.0729, "step": 1730 }, { "epoch": 2.982005141388175, "grad_norm": 0.20760373771190643, "learning_rate": 2.572898799313894e-07, "loss": 0.0693, "step": 1740 }, { "epoch": 2.9974293059125965, "eval_loss": 0.2091314196586609, "eval_runtime": 438.8842, "eval_samples_per_second": 10.937, "eval_steps_per_second": 1.367, "step": 1749 } ], "logging_steps": 10, "max_steps": 1749, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3059345020564275e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }