{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9974293059125965,
  "eval_steps": 500,
  "global_step": 1749,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01713796058269066,
      "grad_norm": 0.4733077883720398,
      "learning_rate": 4.971412235563179e-05,
      "loss": 2.5254,
      "step": 10
    },
    {
      "epoch": 0.03427592116538132,
      "grad_norm": 0.44018083810806274,
      "learning_rate": 4.9428244711263584e-05,
      "loss": 1.8284,
      "step": 20
    },
    {
      "epoch": 0.05141388174807198,
      "grad_norm": 0.7213996648788452,
      "learning_rate": 4.914236706689537e-05,
      "loss": 1.2128,
      "step": 30
    },
    {
      "epoch": 0.06855184233076264,
      "grad_norm": 0.6227633953094482,
      "learning_rate": 4.8856489422527165e-05,
      "loss": 0.6727,
      "step": 40
    },
    {
      "epoch": 0.0856898029134533,
      "grad_norm": 0.2804420590400696,
      "learning_rate": 4.8570611778158946e-05,
      "loss": 0.3089,
      "step": 50
    },
    {
      "epoch": 0.10282776349614396,
      "grad_norm": 0.10756277292966843,
      "learning_rate": 4.828473413379074e-05,
      "loss": 0.2448,
      "step": 60
    },
    {
      "epoch": 0.11996572407883462,
      "grad_norm": 0.11257284879684448,
      "learning_rate": 4.799885648942253e-05,
      "loss": 0.2268,
      "step": 70
    },
    {
      "epoch": 0.13710368466152528,
      "grad_norm": 0.09793874621391296,
      "learning_rate": 4.771297884505432e-05,
      "loss": 0.2328,
      "step": 80
    },
    {
      "epoch": 0.15424164524421594,
      "grad_norm": 0.10177452117204666,
      "learning_rate": 4.742710120068611e-05,
      "loss": 0.2183,
      "step": 90
    },
    {
      "epoch": 0.1713796058269066,
      "grad_norm": 0.09770502895116806,
      "learning_rate": 4.7141223556317895e-05,
      "loss": 0.1971,
      "step": 100
    },
    {
      "epoch": 0.18851756640959727,
      "grad_norm": 0.11037880927324295,
      "learning_rate": 4.685534591194969e-05,
      "loss": 0.1996,
      "step": 110
    },
    {
      "epoch": 0.20565552699228792,
      "grad_norm": 0.11869871616363525,
      "learning_rate": 4.656946826758148e-05,
      "loss": 0.2044,
      "step": 120
    },
    {
      "epoch": 0.22279348757497858,
      "grad_norm": 0.10279687494039536,
      "learning_rate": 4.628359062321327e-05,
      "loss": 0.1782,
      "step": 130
    },
    {
      "epoch": 0.23993144815766923,
      "grad_norm": 0.10952438414096832,
      "learning_rate": 4.599771297884506e-05,
      "loss": 0.1837,
      "step": 140
    },
    {
      "epoch": 0.2570694087403599,
      "grad_norm": 0.13122445344924927,
      "learning_rate": 4.5711835334476845e-05,
      "loss": 0.1907,
      "step": 150
    },
    {
      "epoch": 0.27420736932305056,
      "grad_norm": 0.11156395077705383,
      "learning_rate": 4.542595769010863e-05,
      "loss": 0.1748,
      "step": 160
    },
    {
      "epoch": 0.2913453299057412,
      "grad_norm": 0.1657029241323471,
      "learning_rate": 4.5140080045740427e-05,
      "loss": 0.1704,
      "step": 170
    },
    {
      "epoch": 0.30848329048843187,
      "grad_norm": 0.11676887422800064,
      "learning_rate": 4.4854202401372214e-05,
      "loss": 0.16,
      "step": 180
    },
    {
      "epoch": 0.32562125107112255,
      "grad_norm": 0.17466452717781067,
      "learning_rate": 4.4568324757004e-05,
      "loss": 0.1705,
      "step": 190
    },
    {
      "epoch": 0.3427592116538132,
      "grad_norm": 0.1373518407344818,
      "learning_rate": 4.4282447112635795e-05,
      "loss": 0.1663,
      "step": 200
    },
    {
      "epoch": 0.35989717223650386,
      "grad_norm": 0.11815926432609558,
      "learning_rate": 4.399656946826758e-05,
      "loss": 0.1562,
      "step": 210
    },
    {
      "epoch": 0.37703513281919454,
      "grad_norm": 0.12364204227924347,
      "learning_rate": 4.3710691823899376e-05,
      "loss": 0.1655,
      "step": 220
    },
    {
      "epoch": 0.39417309340188517,
      "grad_norm": 0.11595863103866577,
      "learning_rate": 4.3424814179531164e-05,
      "loss": 0.1621,
      "step": 230
    },
    {
      "epoch": 0.41131105398457585,
      "grad_norm": 0.11320952326059341,
      "learning_rate": 4.313893653516296e-05,
      "loss": 0.1523,
      "step": 240
    },
    {
      "epoch": 0.4284490145672665,
      "grad_norm": 0.12280760705471039,
      "learning_rate": 4.285305889079474e-05,
      "loss": 0.1711,
      "step": 250
    },
    {
      "epoch": 0.44558697514995715,
      "grad_norm": 0.12085650116205215,
      "learning_rate": 4.256718124642653e-05,
      "loss": 0.1548,
      "step": 260
    },
    {
      "epoch": 0.46272493573264784,
      "grad_norm": 0.1491166651248932,
      "learning_rate": 4.228130360205832e-05,
      "loss": 0.1631,
      "step": 270
    },
    {
      "epoch": 0.47986289631533846,
      "grad_norm": 0.16156260669231415,
      "learning_rate": 4.199542595769011e-05,
      "loss": 0.1624,
      "step": 280
    },
    {
      "epoch": 0.49700085689802914,
      "grad_norm": 0.12156664580106735,
      "learning_rate": 4.17095483133219e-05,
      "loss": 0.1383,
      "step": 290
    },
    {
      "epoch": 0.5141388174807198,
      "grad_norm": 0.11533980071544647,
      "learning_rate": 4.142367066895369e-05,
      "loss": 0.1539,
      "step": 300
    },
    {
      "epoch": 0.5312767780634104,
      "grad_norm": 0.1386200189590454,
      "learning_rate": 4.113779302458548e-05,
      "loss": 0.1525,
      "step": 310
    },
    {
      "epoch": 0.5484147386461011,
      "grad_norm": 0.16111765801906586,
      "learning_rate": 4.085191538021727e-05,
      "loss": 0.14,
      "step": 320
    },
    {
      "epoch": 0.5655526992287918,
      "grad_norm": 0.151380717754364,
      "learning_rate": 4.0566037735849064e-05,
      "loss": 0.1492,
      "step": 330
    },
    {
      "epoch": 0.5826906598114824,
      "grad_norm": 0.15228472650051117,
      "learning_rate": 4.028016009148085e-05,
      "loss": 0.157,
      "step": 340
    },
    {
      "epoch": 0.5998286203941731,
      "grad_norm": 0.11199972033500671,
      "learning_rate": 3.999428244711264e-05,
      "loss": 0.1492,
      "step": 350
    },
    {
      "epoch": 0.6169665809768637,
      "grad_norm": 0.17204181849956512,
      "learning_rate": 3.9708404802744425e-05,
      "loss": 0.1504,
      "step": 360
    },
    {
      "epoch": 0.6341045415595544,
      "grad_norm": 0.13271720707416534,
      "learning_rate": 3.942252715837621e-05,
      "loss": 0.1604,
      "step": 370
    },
    {
      "epoch": 0.6512425021422451,
      "grad_norm": 0.15998151898384094,
      "learning_rate": 3.913664951400801e-05,
      "loss": 0.148,
      "step": 380
    },
    {
      "epoch": 0.6683804627249358,
      "grad_norm": 0.12898313999176025,
      "learning_rate": 3.8850771869639794e-05,
      "loss": 0.1492,
      "step": 390
    },
    {
      "epoch": 0.6855184233076264,
      "grad_norm": 0.13998836278915405,
      "learning_rate": 3.856489422527159e-05,
      "loss": 0.1548,
      "step": 400
    },
    {
      "epoch": 0.702656383890317,
      "grad_norm": 0.14940115809440613,
      "learning_rate": 3.8279016580903375e-05,
      "loss": 0.1432,
      "step": 410
    },
    {
      "epoch": 0.7197943444730077,
      "grad_norm": 0.13358236849308014,
      "learning_rate": 3.799313893653517e-05,
      "loss": 0.1459,
      "step": 420
    },
    {
      "epoch": 0.7369323050556984,
      "grad_norm": 0.1597578376531601,
      "learning_rate": 3.7707261292166957e-05,
      "loss": 0.1351,
      "step": 430
    },
    {
      "epoch": 0.7540702656383891,
      "grad_norm": 0.12782897055149078,
      "learning_rate": 3.7421383647798744e-05,
      "loss": 0.1392,
      "step": 440
    },
    {
      "epoch": 0.7712082262210797,
      "grad_norm": 0.13537828624248505,
      "learning_rate": 3.713550600343053e-05,
      "loss": 0.1402,
      "step": 450
    },
    {
      "epoch": 0.7883461868037703,
      "grad_norm": 0.17046277225017548,
      "learning_rate": 3.684962835906232e-05,
      "loss": 0.1227,
      "step": 460
    },
    {
      "epoch": 0.805484147386461,
      "grad_norm": 0.16829894483089447,
      "learning_rate": 3.656375071469411e-05,
      "loss": 0.133,
      "step": 470
    },
    {
      "epoch": 0.8226221079691517,
      "grad_norm": 0.17760640382766724,
      "learning_rate": 3.62778730703259e-05,
      "loss": 0.1361,
      "step": 480
    },
    {
      "epoch": 0.8397600685518424,
      "grad_norm": 0.1783646047115326,
      "learning_rate": 3.5991995425957694e-05,
      "loss": 0.1272,
      "step": 490
    },
    {
      "epoch": 0.856898029134533,
      "grad_norm": 0.1848060041666031,
      "learning_rate": 3.570611778158948e-05,
      "loss": 0.1281,
      "step": 500
    },
    {
      "epoch": 0.8740359897172236,
      "grad_norm": 0.1244303435087204,
      "learning_rate": 3.5420240137221275e-05,
      "loss": 0.1276,
      "step": 510
    },
    {
      "epoch": 0.8911739502999143,
      "grad_norm": 0.1454310566186905,
      "learning_rate": 3.513436249285306e-05,
      "loss": 0.1334,
      "step": 520
    },
    {
      "epoch": 0.908311910882605,
      "grad_norm": 0.17194361984729767,
      "learning_rate": 3.484848484848485e-05,
      "loss": 0.1271,
      "step": 530
    },
    {
      "epoch": 0.9254498714652957,
      "grad_norm": 0.15851227939128876,
      "learning_rate": 3.456260720411664e-05,
      "loss": 0.1304,
      "step": 540
    },
    {
      "epoch": 0.9425878320479862,
      "grad_norm": 0.1614447981119156,
      "learning_rate": 3.4276729559748424e-05,
      "loss": 0.1256,
      "step": 550
    },
    {
      "epoch": 0.9597257926306769,
      "grad_norm": 0.19016973674297333,
      "learning_rate": 3.399085191538022e-05,
      "loss": 0.1329,
      "step": 560
    },
    {
      "epoch": 0.9768637532133676,
      "grad_norm": 0.15122413635253906,
      "learning_rate": 3.3704974271012005e-05,
      "loss": 0.1228,
      "step": 570
    },
    {
      "epoch": 0.9940017137960583,
      "grad_norm": 0.15631020069122314,
      "learning_rate": 3.34190966266438e-05,
      "loss": 0.1235,
      "step": 580
    },
    {
      "epoch": 0.9991431019708654,
      "eval_loss": 0.15606513619422913,
      "eval_runtime": 439.0741,
      "eval_samples_per_second": 10.932,
      "eval_steps_per_second": 1.367,
      "step": 583
    },
    {
      "epoch": 1.0111396743787489,
      "grad_norm": 0.20745624601840973,
      "learning_rate": 3.313321898227559e-05,
      "loss": 0.1245,
      "step": 590
    },
    {
      "epoch": 1.0282776349614395,
      "grad_norm": 0.182315856218338,
      "learning_rate": 3.284734133790738e-05,
      "loss": 0.1112,
      "step": 600
    },
    {
      "epoch": 1.0454155955441302,
      "grad_norm": 0.1425042450428009,
      "learning_rate": 3.256146369353917e-05,
      "loss": 0.1186,
      "step": 610
    },
    {
      "epoch": 1.062553556126821,
      "grad_norm": 0.1898319274187088,
      "learning_rate": 3.2275586049170955e-05,
      "loss": 0.1203,
      "step": 620
    },
    {
      "epoch": 1.0796915167095116,
      "grad_norm": 0.17110544443130493,
      "learning_rate": 3.198970840480275e-05,
      "loss": 0.1223,
      "step": 630
    },
    {
      "epoch": 1.0968294772922023,
      "grad_norm": 0.17418451607227325,
      "learning_rate": 3.170383076043453e-05,
      "loss": 0.111,
      "step": 640
    },
    {
      "epoch": 1.113967437874893,
      "grad_norm": 0.19765284657478333,
      "learning_rate": 3.1417953116066324e-05,
      "loss": 0.1173,
      "step": 650
    },
    {
      "epoch": 1.1311053984575836,
      "grad_norm": 0.17013542354106903,
      "learning_rate": 3.113207547169811e-05,
      "loss": 0.1128,
      "step": 660
    },
    {
      "epoch": 1.1482433590402743,
      "grad_norm": 0.21173644065856934,
      "learning_rate": 3.0846197827329905e-05,
      "loss": 0.109,
      "step": 670
    },
    {
      "epoch": 1.165381319622965,
      "grad_norm": 0.13383643329143524,
      "learning_rate": 3.056032018296169e-05,
      "loss": 0.1092,
      "step": 680
    },
    {
      "epoch": 1.1825192802056554,
      "grad_norm": 0.22101104259490967,
      "learning_rate": 3.0274442538593483e-05,
      "loss": 0.1175,
      "step": 690
    },
    {
      "epoch": 1.1996572407883461,
      "grad_norm": 0.1745050698518753,
      "learning_rate": 2.9988564894225274e-05,
      "loss": 0.1096,
      "step": 700
    },
    {
      "epoch": 1.2167952013710368,
      "grad_norm": 0.2413550466299057,
      "learning_rate": 2.9702687249857064e-05,
      "loss": 0.1044,
      "step": 710
    },
    {
      "epoch": 1.2339331619537275,
      "grad_norm": 0.15909789502620697,
      "learning_rate": 2.9416809605488855e-05,
      "loss": 0.1076,
      "step": 720
    },
    {
      "epoch": 1.2510711225364182,
      "grad_norm": 0.21382008492946625,
      "learning_rate": 2.9130931961120646e-05,
      "loss": 0.1077,
      "step": 730
    },
    {
      "epoch": 1.2682090831191088,
      "grad_norm": 0.1845552772283554,
      "learning_rate": 2.884505431675243e-05,
      "loss": 0.102,
      "step": 740
    },
    {
      "epoch": 1.2853470437017995,
      "grad_norm": 0.19705334305763245,
      "learning_rate": 2.855917667238422e-05,
      "loss": 0.1048,
      "step": 750
    },
    {
      "epoch": 1.3024850042844902,
      "grad_norm": 0.16979779303073883,
      "learning_rate": 2.827329902801601e-05,
      "loss": 0.1012,
      "step": 760
    },
    {
      "epoch": 1.3196229648671807,
      "grad_norm": 0.18766264617443085,
      "learning_rate": 2.7987421383647798e-05,
      "loss": 0.1043,
      "step": 770
    },
    {
      "epoch": 1.3367609254498714,
      "grad_norm": 0.1593862771987915,
      "learning_rate": 2.770154373927959e-05,
      "loss": 0.0953,
      "step": 780
    },
    {
      "epoch": 1.353898886032562,
      "grad_norm": 0.19427362084388733,
      "learning_rate": 2.741566609491138e-05,
      "loss": 0.1008,
      "step": 790
    },
    {
      "epoch": 1.3710368466152527,
      "grad_norm": 0.21518777310848236,
      "learning_rate": 2.712978845054317e-05,
      "loss": 0.0973,
      "step": 800
    },
    {
      "epoch": 1.3881748071979434,
      "grad_norm": 0.2065572291612625,
      "learning_rate": 2.684391080617496e-05,
      "loss": 0.1035,
      "step": 810
    },
    {
      "epoch": 1.405312767780634,
      "grad_norm": 0.26617249846458435,
      "learning_rate": 2.655803316180675e-05,
      "loss": 0.0969,
      "step": 820
    },
    {
      "epoch": 1.4224507283633248,
      "grad_norm": 0.17817485332489014,
      "learning_rate": 2.627215551743854e-05,
      "loss": 0.0944,
      "step": 830
    },
    {
      "epoch": 1.4395886889460154,
      "grad_norm": 0.17743416130542755,
      "learning_rate": 2.5986277873070326e-05,
      "loss": 0.0984,
      "step": 840
    },
    {
      "epoch": 1.4567266495287061,
      "grad_norm": 0.23810917139053345,
      "learning_rate": 2.5700400228702117e-05,
      "loss": 0.099,
      "step": 850
    },
    {
      "epoch": 1.4738646101113968,
      "grad_norm": 0.18336652219295502,
      "learning_rate": 2.5414522584333904e-05,
      "loss": 0.0954,
      "step": 860
    },
    {
      "epoch": 1.4910025706940875,
      "grad_norm": 0.22074759006500244,
      "learning_rate": 2.5128644939965695e-05,
      "loss": 0.0915,
      "step": 870
    },
    {
      "epoch": 1.5081405312767782,
      "grad_norm": 0.2292771190404892,
      "learning_rate": 2.4842767295597485e-05,
      "loss": 0.0931,
      "step": 880
    },
    {
      "epoch": 1.5252784918594688,
      "grad_norm": 0.27021974325180054,
      "learning_rate": 2.4556889651229276e-05,
      "loss": 0.0933,
      "step": 890
    },
    {
      "epoch": 1.5424164524421595,
      "grad_norm": 0.28702589869499207,
      "learning_rate": 2.4271012006861067e-05,
      "loss": 0.0978,
      "step": 900
    },
    {
      "epoch": 1.5595544130248502,
      "grad_norm": 0.18202678859233856,
      "learning_rate": 2.3985134362492854e-05,
      "loss": 0.0908,
      "step": 910
    },
    {
      "epoch": 1.5766923736075407,
      "grad_norm": 0.15822364389896393,
      "learning_rate": 2.3699256718124644e-05,
      "loss": 0.0878,
      "step": 920
    },
    {
      "epoch": 1.5938303341902313,
      "grad_norm": 0.21551179885864258,
      "learning_rate": 2.3413379073756435e-05,
      "loss": 0.0883,
      "step": 930
    },
    {
      "epoch": 1.610968294772922,
      "grad_norm": 0.20409446954727173,
      "learning_rate": 2.3127501429388222e-05,
      "loss": 0.094,
      "step": 940
    },
    {
      "epoch": 1.6281062553556127,
      "grad_norm": 0.16509220004081726,
      "learning_rate": 2.2841623785020013e-05,
      "loss": 0.0909,
      "step": 950
    },
    {
      "epoch": 1.6452442159383034,
      "grad_norm": 0.1946859210729599,
      "learning_rate": 2.25557461406518e-05,
      "loss": 0.0898,
      "step": 960
    },
    {
      "epoch": 1.6623821765209938,
      "grad_norm": 0.16525417566299438,
      "learning_rate": 2.226986849628359e-05,
      "loss": 0.0842,
      "step": 970
    },
    {
      "epoch": 1.6795201371036845,
      "grad_norm": 0.26819315552711487,
      "learning_rate": 2.198399085191538e-05,
      "loss": 0.0897,
      "step": 980
    },
    {
      "epoch": 1.6966580976863752,
      "grad_norm": 0.273593008518219,
      "learning_rate": 2.1698113207547172e-05,
      "loss": 0.0895,
      "step": 990
    },
    {
      "epoch": 1.713796058269066,
      "grad_norm": 0.20378242433071136,
      "learning_rate": 2.1412235563178963e-05,
      "loss": 0.0912,
      "step": 1000
    },
    {
      "epoch": 1.7309340188517566,
      "grad_norm": 0.24366410076618195,
      "learning_rate": 2.112635791881075e-05,
      "loss": 0.0843,
      "step": 1010
    },
    {
      "epoch": 1.7480719794344473,
      "grad_norm": 0.2034538835287094,
      "learning_rate": 2.084048027444254e-05,
      "loss": 0.0874,
      "step": 1020
    },
    {
      "epoch": 1.765209940017138,
      "grad_norm": 0.2260463535785675,
      "learning_rate": 2.0554602630074328e-05,
      "loss": 0.0898,
      "step": 1030
    },
    {
      "epoch": 1.7823479005998286,
      "grad_norm": 0.165971040725708,
      "learning_rate": 2.026872498570612e-05,
      "loss": 0.0867,
      "step": 1040
    },
    {
      "epoch": 1.7994858611825193,
      "grad_norm": 0.20090453326702118,
      "learning_rate": 1.998284734133791e-05,
      "loss": 0.0891,
      "step": 1050
    },
    {
      "epoch": 1.81662382176521,
      "grad_norm": 0.24718008935451508,
      "learning_rate": 1.9696969696969697e-05,
      "loss": 0.0872,
      "step": 1060
    },
    {
      "epoch": 1.8337617823479007,
      "grad_norm": 0.2808043360710144,
      "learning_rate": 1.9411092052601487e-05,
      "loss": 0.0889,
      "step": 1070
    },
    {
      "epoch": 1.8508997429305913,
      "grad_norm": 0.19833321869373322,
      "learning_rate": 1.9125214408233278e-05,
      "loss": 0.0859,
      "step": 1080
    },
    {
      "epoch": 1.868037703513282,
      "grad_norm": 0.21585367619991302,
      "learning_rate": 1.883933676386507e-05,
      "loss": 0.0872,
      "step": 1090
    },
    {
      "epoch": 1.8851756640959727,
      "grad_norm": 0.16208423674106598,
      "learning_rate": 1.8553459119496856e-05,
      "loss": 0.0835,
      "step": 1100
    },
    {
      "epoch": 1.9023136246786634,
      "grad_norm": 0.195535346865654,
      "learning_rate": 1.8267581475128647e-05,
      "loss": 0.0785,
      "step": 1110
    },
    {
      "epoch": 1.919451585261354,
      "grad_norm": 0.22447216510772705,
      "learning_rate": 1.7981703830760434e-05,
      "loss": 0.08,
      "step": 1120
    },
    {
      "epoch": 1.9365895458440445,
      "grad_norm": 0.22212448716163635,
      "learning_rate": 1.7695826186392224e-05,
      "loss": 0.0826,
      "step": 1130
    },
    {
      "epoch": 1.9537275064267352,
      "grad_norm": 0.2077985554933548,
      "learning_rate": 1.7409948542024015e-05,
      "loss": 0.0822,
      "step": 1140
    },
    {
      "epoch": 1.9708654670094259,
      "grad_norm": 0.20502477884292603,
      "learning_rate": 1.7124070897655802e-05,
      "loss": 0.0817,
      "step": 1150
    },
    {
      "epoch": 1.9880034275921166,
      "grad_norm": 0.22330917418003082,
      "learning_rate": 1.6838193253287593e-05,
      "loss": 0.0837,
      "step": 1160
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.19116699695587158,
      "eval_runtime": 439.6787,
      "eval_samples_per_second": 10.917,
      "eval_steps_per_second": 1.365,
      "step": 1167
    },
    {
      "epoch": 2.005141388174807,
      "grad_norm": 0.1825590431690216,
      "learning_rate": 1.6552315608919384e-05,
      "loss": 0.0764,
      "step": 1170
    },
    {
      "epoch": 2.0222793487574977,
      "grad_norm": 0.245217964053154,
      "learning_rate": 1.6266437964551174e-05,
      "loss": 0.0776,
      "step": 1180
    },
    {
      "epoch": 2.0394173093401884,
      "grad_norm": 0.185066357254982,
      "learning_rate": 1.5980560320182965e-05,
      "loss": 0.0736,
      "step": 1190
    },
    {
      "epoch": 2.056555269922879,
      "grad_norm": 0.1703159511089325,
      "learning_rate": 1.5694682675814752e-05,
      "loss": 0.075,
      "step": 1200
    },
    {
      "epoch": 2.0736932305055698,
      "grad_norm": 0.19329093396663666,
      "learning_rate": 1.540880503144654e-05,
      "loss": 0.0789,
      "step": 1210
    },
    {
      "epoch": 2.0908311910882604,
      "grad_norm": 0.22116591036319733,
      "learning_rate": 1.5122927387078332e-05,
      "loss": 0.0761,
      "step": 1220
    },
    {
      "epoch": 2.107969151670951,
      "grad_norm": 0.17945989966392517,
      "learning_rate": 1.4837049742710121e-05,
      "loss": 0.0766,
      "step": 1230
    },
    {
      "epoch": 2.125107112253642,
      "grad_norm": 0.21690496802330017,
      "learning_rate": 1.4551172098341912e-05,
      "loss": 0.0748,
      "step": 1240
    },
    {
      "epoch": 2.1422450728363325,
      "grad_norm": 0.17853769659996033,
      "learning_rate": 1.4265294453973699e-05,
      "loss": 0.0755,
      "step": 1250
    },
    {
      "epoch": 2.159383033419023,
      "grad_norm": 0.18670514225959778,
      "learning_rate": 1.397941680960549e-05,
      "loss": 0.0789,
      "step": 1260
    },
    {
      "epoch": 2.176520994001714,
      "grad_norm": 0.2004610300064087,
      "learning_rate": 1.3693539165237278e-05,
      "loss": 0.0771,
      "step": 1270
    },
    {
      "epoch": 2.1936589545844045,
      "grad_norm": 0.17286266386508942,
      "learning_rate": 1.3407661520869069e-05,
      "loss": 0.0757,
      "step": 1280
    },
    {
      "epoch": 2.210796915167095,
      "grad_norm": 0.21835772693157196,
      "learning_rate": 1.312178387650086e-05,
      "loss": 0.0764,
      "step": 1290
    },
    {
      "epoch": 2.227934875749786,
      "grad_norm": 0.2717360854148865,
      "learning_rate": 1.2835906232132647e-05,
      "loss": 0.0761,
      "step": 1300
    },
    {
      "epoch": 2.2450728363324766,
      "grad_norm": 0.17896392941474915,
      "learning_rate": 1.2550028587764438e-05,
      "loss": 0.0748,
      "step": 1310
    },
    {
      "epoch": 2.2622107969151672,
      "grad_norm": 0.20612064003944397,
      "learning_rate": 1.2264150943396227e-05,
      "loss": 0.0745,
      "step": 1320
    },
    {
      "epoch": 2.279348757497858,
      "grad_norm": 0.25621137022972107,
      "learning_rate": 1.1978273299028017e-05,
      "loss": 0.0754,
      "step": 1330
    },
    {
      "epoch": 2.2964867180805486,
      "grad_norm": 0.1826545149087906,
      "learning_rate": 1.1692395654659806e-05,
      "loss": 0.0738,
      "step": 1340
    },
    {
      "epoch": 2.3136246786632393,
      "grad_norm": 0.22315889596939087,
      "learning_rate": 1.1406518010291597e-05,
      "loss": 0.0738,
      "step": 1350
    },
    {
      "epoch": 2.33076263924593,
      "grad_norm": 0.2433796525001526,
      "learning_rate": 1.1120640365923384e-05,
      "loss": 0.0746,
      "step": 1360
    },
    {
      "epoch": 2.34790059982862,
      "grad_norm": 0.19135648012161255,
      "learning_rate": 1.0834762721555175e-05,
      "loss": 0.0745,
      "step": 1370
    },
    {
      "epoch": 2.365038560411311,
      "grad_norm": 0.21853765845298767,
      "learning_rate": 1.0548885077186965e-05,
      "loss": 0.0751,
      "step": 1380
    },
    {
      "epoch": 2.3821765209940016,
      "grad_norm": 0.23422804474830627,
      "learning_rate": 1.0263007432818754e-05,
      "loss": 0.0736,
      "step": 1390
    },
    {
      "epoch": 2.3993144815766922,
      "grad_norm": 0.22039854526519775,
      "learning_rate": 9.977129788450543e-06,
      "loss": 0.0722,
      "step": 1400
    },
    {
      "epoch": 2.416452442159383,
      "grad_norm": 0.20668627321720123,
      "learning_rate": 9.691252144082332e-06,
      "loss": 0.074,
      "step": 1410
    },
    {
      "epoch": 2.4335904027420736,
      "grad_norm": 0.3503040671348572,
      "learning_rate": 9.405374499714123e-06,
      "loss": 0.0786,
      "step": 1420
    },
    {
      "epoch": 2.4507283633247643,
      "grad_norm": 0.2581626772880554,
      "learning_rate": 9.119496855345912e-06,
      "loss": 0.0721,
      "step": 1430
    },
    {
      "epoch": 2.467866323907455,
      "grad_norm": 0.2439981997013092,
      "learning_rate": 8.833619210977703e-06,
      "loss": 0.0733,
      "step": 1440
    },
    {
      "epoch": 2.4850042844901457,
      "grad_norm": 0.16303370893001556,
      "learning_rate": 8.547741566609492e-06,
      "loss": 0.0727,
      "step": 1450
    },
    {
      "epoch": 2.5021422450728363,
      "grad_norm": 0.17133218050003052,
      "learning_rate": 8.26186392224128e-06,
      "loss": 0.0722,
      "step": 1460
    },
    {
      "epoch": 2.519280205655527,
      "grad_norm": 0.2298302799463272,
      "learning_rate": 7.975986277873071e-06,
      "loss": 0.0717,
      "step": 1470
    },
    {
      "epoch": 2.5364181662382177,
      "grad_norm": 0.22220905125141144,
      "learning_rate": 7.69010863350486e-06,
      "loss": 0.0763,
      "step": 1480
    },
    {
      "epoch": 2.5535561268209084,
      "grad_norm": 0.2196768969297409,
      "learning_rate": 7.40423098913665e-06,
      "loss": 0.0694,
      "step": 1490
    },
    {
      "epoch": 2.570694087403599,
      "grad_norm": 0.23876038193702698,
      "learning_rate": 7.11835334476844e-06,
      "loss": 0.0702,
      "step": 1500
    },
    {
      "epoch": 2.5878320479862897,
      "grad_norm": 0.2197275310754776,
      "learning_rate": 6.832475700400229e-06,
      "loss": 0.0731,
      "step": 1510
    },
    {
      "epoch": 2.6049700085689804,
      "grad_norm": 0.1521027684211731,
      "learning_rate": 6.5465980560320186e-06,
      "loss": 0.0706,
      "step": 1520
    },
    {
      "epoch": 2.622107969151671,
      "grad_norm": 0.2315615713596344,
      "learning_rate": 6.2607204116638075e-06,
      "loss": 0.0699,
      "step": 1530
    },
    {
      "epoch": 2.6392459297343613,
      "grad_norm": 0.22933197021484375,
      "learning_rate": 5.974842767295598e-06,
      "loss": 0.0726,
      "step": 1540
    },
    {
      "epoch": 2.656383890317052,
      "grad_norm": 0.23843450844287872,
      "learning_rate": 5.688965122927387e-06,
      "loss": 0.0696,
      "step": 1550
    },
    {
      "epoch": 2.6735218508997427,
      "grad_norm": 0.2547082304954529,
      "learning_rate": 5.403087478559177e-06,
      "loss": 0.0697,
      "step": 1560
    },
    {
      "epoch": 2.6906598114824334,
      "grad_norm": 0.19213077425956726,
      "learning_rate": 5.117209834190966e-06,
      "loss": 0.0725,
      "step": 1570
    },
    {
      "epoch": 2.707797772065124,
      "grad_norm": 0.19462022185325623,
      "learning_rate": 4.8313321898227566e-06,
      "loss": 0.0707,
      "step": 1580
    },
    {
      "epoch": 2.7249357326478147,
      "grad_norm": 0.22448916733264923,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 0.0708,
      "step": 1590
    },
    {
      "epoch": 2.7420736932305054,
      "grad_norm": 0.22550085186958313,
      "learning_rate": 4.259576901086335e-06,
      "loss": 0.0705,
      "step": 1600
    },
    {
      "epoch": 2.759211653813196,
      "grad_norm": 0.24738526344299316,
      "learning_rate": 3.973699256718124e-06,
      "loss": 0.0716,
      "step": 1610
    },
    {
      "epoch": 2.776349614395887,
      "grad_norm": 0.20870988070964813,
      "learning_rate": 3.687821612349914e-06,
      "loss": 0.0674,
      "step": 1620
    },
    {
      "epoch": 2.7934875749785775,
      "grad_norm": 0.2011324018239975,
      "learning_rate": 3.4019439679817043e-06,
      "loss": 0.0692,
      "step": 1630
    },
    {
      "epoch": 2.810625535561268,
      "grad_norm": 0.25708264112472534,
      "learning_rate": 3.1160663236134933e-06,
      "loss": 0.0699,
      "step": 1640
    },
    {
      "epoch": 2.827763496143959,
      "grad_norm": 0.20108073949813843,
      "learning_rate": 2.830188679245283e-06,
      "loss": 0.0702,
      "step": 1650
    },
    {
      "epoch": 2.8449014567266495,
      "grad_norm": 0.21398206055164337,
      "learning_rate": 2.5443110348770725e-06,
      "loss": 0.0727,
      "step": 1660
    },
    {
      "epoch": 2.86203941730934,
      "grad_norm": 0.1945817768573761,
      "learning_rate": 2.2584333905088623e-06,
      "loss": 0.0739,
      "step": 1670
    },
    {
      "epoch": 2.879177377892031,
      "grad_norm": 0.20674385130405426,
      "learning_rate": 1.972555746140652e-06,
      "loss": 0.0738,
      "step": 1680
    },
    {
      "epoch": 2.8963153384747216,
      "grad_norm": 0.2226879894733429,
      "learning_rate": 1.6866781017724415e-06,
      "loss": 0.0698,
      "step": 1690
    },
    {
      "epoch": 2.9134532990574122,
      "grad_norm": 0.1832134872674942,
      "learning_rate": 1.400800457404231e-06,
      "loss": 0.0701,
      "step": 1700
    },
    {
      "epoch": 2.930591259640103,
      "grad_norm": 0.2704426348209381,
      "learning_rate": 1.1149228130360207e-06,
      "loss": 0.0681,
      "step": 1710
    },
    {
      "epoch": 2.9477292202227936,
      "grad_norm": 0.2618798315525055,
      "learning_rate": 8.290451686678103e-07,
      "loss": 0.0696,
      "step": 1720
    },
    {
      "epoch": 2.9648671808054843,
      "grad_norm": 0.16076813638210297,
      "learning_rate": 5.431675242995998e-07,
      "loss": 0.0729,
      "step": 1730
    },
    {
      "epoch": 2.982005141388175,
      "grad_norm": 0.20760373771190643,
      "learning_rate": 2.572898799313894e-07,
      "loss": 0.0693,
      "step": 1740
    },
    {
      "epoch": 2.9974293059125965,
      "eval_loss": 0.2091314196586609,
      "eval_runtime": 438.8842,
      "eval_samples_per_second": 10.937,
      "eval_steps_per_second": 1.367,
      "step": 1749
    }
  ],
  "logging_steps": 10,
  "max_steps": 1749,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3059345020564275e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}