|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9991431019708654, |
|
"eval_steps": 500, |
|
"global_step": 583, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01713796058269066, |
|
"grad_norm": 0.4733077883720398, |
|
"learning_rate": 4.971412235563179e-05, |
|
"loss": 2.5254, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03427592116538132, |
|
"grad_norm": 0.44018083810806274, |
|
"learning_rate": 4.9428244711263584e-05, |
|
"loss": 1.8284, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05141388174807198, |
|
"grad_norm": 0.7213996648788452, |
|
"learning_rate": 4.914236706689537e-05, |
|
"loss": 1.2128, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06855184233076264, |
|
"grad_norm": 0.6227633953094482, |
|
"learning_rate": 4.8856489422527165e-05, |
|
"loss": 0.6727, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0856898029134533, |
|
"grad_norm": 0.2804420590400696, |
|
"learning_rate": 4.8570611778158946e-05, |
|
"loss": 0.3089, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.10282776349614396, |
|
"grad_norm": 0.10756277292966843, |
|
"learning_rate": 4.828473413379074e-05, |
|
"loss": 0.2448, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11996572407883462, |
|
"grad_norm": 0.11257284879684448, |
|
"learning_rate": 4.799885648942253e-05, |
|
"loss": 0.2268, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.13710368466152528, |
|
"grad_norm": 0.09793874621391296, |
|
"learning_rate": 4.771297884505432e-05, |
|
"loss": 0.2328, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.15424164524421594, |
|
"grad_norm": 0.10177452117204666, |
|
"learning_rate": 4.742710120068611e-05, |
|
"loss": 0.2183, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1713796058269066, |
|
"grad_norm": 0.09770502895116806, |
|
"learning_rate": 4.7141223556317895e-05, |
|
"loss": 0.1971, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18851756640959727, |
|
"grad_norm": 0.11037880927324295, |
|
"learning_rate": 4.685534591194969e-05, |
|
"loss": 0.1996, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.20565552699228792, |
|
"grad_norm": 0.11869871616363525, |
|
"learning_rate": 4.656946826758148e-05, |
|
"loss": 0.2044, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.22279348757497858, |
|
"grad_norm": 0.10279687494039536, |
|
"learning_rate": 4.628359062321327e-05, |
|
"loss": 0.1782, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.23993144815766923, |
|
"grad_norm": 0.10952438414096832, |
|
"learning_rate": 4.599771297884506e-05, |
|
"loss": 0.1837, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2570694087403599, |
|
"grad_norm": 0.13122445344924927, |
|
"learning_rate": 4.5711835334476845e-05, |
|
"loss": 0.1907, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.27420736932305056, |
|
"grad_norm": 0.11156395077705383, |
|
"learning_rate": 4.542595769010863e-05, |
|
"loss": 0.1748, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2913453299057412, |
|
"grad_norm": 0.1657029241323471, |
|
"learning_rate": 4.5140080045740427e-05, |
|
"loss": 0.1704, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.30848329048843187, |
|
"grad_norm": 0.11676887422800064, |
|
"learning_rate": 4.4854202401372214e-05, |
|
"loss": 0.16, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.32562125107112255, |
|
"grad_norm": 0.17466452717781067, |
|
"learning_rate": 4.4568324757004e-05, |
|
"loss": 0.1705, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3427592116538132, |
|
"grad_norm": 0.1373518407344818, |
|
"learning_rate": 4.4282447112635795e-05, |
|
"loss": 0.1663, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.35989717223650386, |
|
"grad_norm": 0.11815926432609558, |
|
"learning_rate": 4.399656946826758e-05, |
|
"loss": 0.1562, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.37703513281919454, |
|
"grad_norm": 0.12364204227924347, |
|
"learning_rate": 4.3710691823899376e-05, |
|
"loss": 0.1655, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.39417309340188517, |
|
"grad_norm": 0.11595863103866577, |
|
"learning_rate": 4.3424814179531164e-05, |
|
"loss": 0.1621, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.41131105398457585, |
|
"grad_norm": 0.11320952326059341, |
|
"learning_rate": 4.313893653516296e-05, |
|
"loss": 0.1523, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4284490145672665, |
|
"grad_norm": 0.12280760705471039, |
|
"learning_rate": 4.285305889079474e-05, |
|
"loss": 0.1711, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.44558697514995715, |
|
"grad_norm": 0.12085650116205215, |
|
"learning_rate": 4.256718124642653e-05, |
|
"loss": 0.1548, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.46272493573264784, |
|
"grad_norm": 0.1491166651248932, |
|
"learning_rate": 4.228130360205832e-05, |
|
"loss": 0.1631, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.47986289631533846, |
|
"grad_norm": 0.16156260669231415, |
|
"learning_rate": 4.199542595769011e-05, |
|
"loss": 0.1624, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.49700085689802914, |
|
"grad_norm": 0.12156664580106735, |
|
"learning_rate": 4.17095483133219e-05, |
|
"loss": 0.1383, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5141388174807198, |
|
"grad_norm": 0.11533980071544647, |
|
"learning_rate": 4.142367066895369e-05, |
|
"loss": 0.1539, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5312767780634104, |
|
"grad_norm": 0.1386200189590454, |
|
"learning_rate": 4.113779302458548e-05, |
|
"loss": 0.1525, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5484147386461011, |
|
"grad_norm": 0.16111765801906586, |
|
"learning_rate": 4.085191538021727e-05, |
|
"loss": 0.14, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5655526992287918, |
|
"grad_norm": 0.151380717754364, |
|
"learning_rate": 4.0566037735849064e-05, |
|
"loss": 0.1492, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5826906598114824, |
|
"grad_norm": 0.15228472650051117, |
|
"learning_rate": 4.028016009148085e-05, |
|
"loss": 0.157, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5998286203941731, |
|
"grad_norm": 0.11199972033500671, |
|
"learning_rate": 3.999428244711264e-05, |
|
"loss": 0.1492, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6169665809768637, |
|
"grad_norm": 0.17204181849956512, |
|
"learning_rate": 3.9708404802744425e-05, |
|
"loss": 0.1504, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6341045415595544, |
|
"grad_norm": 0.13271720707416534, |
|
"learning_rate": 3.942252715837621e-05, |
|
"loss": 0.1604, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6512425021422451, |
|
"grad_norm": 0.15998151898384094, |
|
"learning_rate": 3.913664951400801e-05, |
|
"loss": 0.148, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6683804627249358, |
|
"grad_norm": 0.12898313999176025, |
|
"learning_rate": 3.8850771869639794e-05, |
|
"loss": 0.1492, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6855184233076264, |
|
"grad_norm": 0.13998836278915405, |
|
"learning_rate": 3.856489422527159e-05, |
|
"loss": 0.1548, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.702656383890317, |
|
"grad_norm": 0.14940115809440613, |
|
"learning_rate": 3.8279016580903375e-05, |
|
"loss": 0.1432, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7197943444730077, |
|
"grad_norm": 0.13358236849308014, |
|
"learning_rate": 3.799313893653517e-05, |
|
"loss": 0.1459, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7369323050556984, |
|
"grad_norm": 0.1597578376531601, |
|
"learning_rate": 3.7707261292166957e-05, |
|
"loss": 0.1351, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.7540702656383891, |
|
"grad_norm": 0.12782897055149078, |
|
"learning_rate": 3.7421383647798744e-05, |
|
"loss": 0.1392, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.7712082262210797, |
|
"grad_norm": 0.13537828624248505, |
|
"learning_rate": 3.713550600343053e-05, |
|
"loss": 0.1402, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7883461868037703, |
|
"grad_norm": 0.17046277225017548, |
|
"learning_rate": 3.684962835906232e-05, |
|
"loss": 0.1227, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.805484147386461, |
|
"grad_norm": 0.16829894483089447, |
|
"learning_rate": 3.656375071469411e-05, |
|
"loss": 0.133, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.8226221079691517, |
|
"grad_norm": 0.17760640382766724, |
|
"learning_rate": 3.62778730703259e-05, |
|
"loss": 0.1361, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.8397600685518424, |
|
"grad_norm": 0.1783646047115326, |
|
"learning_rate": 3.5991995425957694e-05, |
|
"loss": 0.1272, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.856898029134533, |
|
"grad_norm": 0.1848060041666031, |
|
"learning_rate": 3.570611778158948e-05, |
|
"loss": 0.1281, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.8740359897172236, |
|
"grad_norm": 0.1244303435087204, |
|
"learning_rate": 3.5420240137221275e-05, |
|
"loss": 0.1276, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.8911739502999143, |
|
"grad_norm": 0.1454310566186905, |
|
"learning_rate": 3.513436249285306e-05, |
|
"loss": 0.1334, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.908311910882605, |
|
"grad_norm": 0.17194361984729767, |
|
"learning_rate": 3.484848484848485e-05, |
|
"loss": 0.1271, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.9254498714652957, |
|
"grad_norm": 0.15851227939128876, |
|
"learning_rate": 3.456260720411664e-05, |
|
"loss": 0.1304, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.9425878320479862, |
|
"grad_norm": 0.1614447981119156, |
|
"learning_rate": 3.4276729559748424e-05, |
|
"loss": 0.1256, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.9597257926306769, |
|
"grad_norm": 0.19016973674297333, |
|
"learning_rate": 3.399085191538022e-05, |
|
"loss": 0.1329, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.9768637532133676, |
|
"grad_norm": 0.15122413635253906, |
|
"learning_rate": 3.3704974271012005e-05, |
|
"loss": 0.1228, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.9940017137960583, |
|
"grad_norm": 0.15631020069122314, |
|
"learning_rate": 3.34190966266438e-05, |
|
"loss": 0.1235, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.9991431019708654, |
|
"eval_loss": 0.15606513619422913, |
|
"eval_runtime": 439.0741, |
|
"eval_samples_per_second": 10.932, |
|
"eval_steps_per_second": 1.367, |
|
"step": 583 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1749, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.356382410355507e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|