{
  "best_metric": 0.0001007633691187948,
  "best_model_checkpoint": "autotrain-ljk9o-0hizk/checkpoint-4800",
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 4800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015625,
      "grad_norm": 34.15324783325195,
      "learning_rate": 2.1875e-06,
      "loss": 4.8362,
      "step": 25
    },
    {
      "epoch": 0.03125,
      "grad_norm": 7.099372863769531,
      "learning_rate": 4.6875000000000004e-06,
      "loss": 4.6659,
      "step": 50
    },
    {
      "epoch": 0.046875,
      "grad_norm": 7.394688606262207,
      "learning_rate": 7.2916666666666674e-06,
      "loss": 4.4834,
      "step": 75
    },
    {
      "epoch": 0.0625,
      "grad_norm": 5.589704513549805,
      "learning_rate": 9.895833333333333e-06,
      "loss": 4.2105,
      "step": 100
    },
    {
      "epoch": 0.078125,
      "grad_norm": 5.1627020835876465,
      "learning_rate": 1.25e-05,
      "loss": 3.8238,
      "step": 125
    },
    {
      "epoch": 0.09375,
      "grad_norm": 14.816465377807617,
      "learning_rate": 1.5104166666666667e-05,
      "loss": 3.542,
      "step": 150
    },
    {
      "epoch": 0.109375,
      "grad_norm": 3.7660512924194336,
      "learning_rate": 1.7708333333333335e-05,
      "loss": 3.1228,
      "step": 175
    },
    {
      "epoch": 0.125,
      "grad_norm": 6.042110919952393,
      "learning_rate": 2.0312500000000002e-05,
      "loss": 2.8363,
      "step": 200
    },
    {
      "epoch": 0.140625,
      "grad_norm": 4.053712368011475,
      "learning_rate": 2.2916666666666667e-05,
      "loss": 2.6232,
      "step": 225
    },
    {
      "epoch": 0.15625,
      "grad_norm": 3.718096971511841,
      "learning_rate": 2.552083333333333e-05,
      "loss": 2.285,
      "step": 250
    },
    {
      "epoch": 0.171875,
      "grad_norm": 4.003573417663574,
      "learning_rate": 2.8125000000000003e-05,
      "loss": 2.0495,
      "step": 275
    },
    {
      "epoch": 0.1875,
      "grad_norm": 3.6954550743103027,
      "learning_rate": 3.072916666666667e-05,
      "loss": 1.79,
      "step": 300
    },
    {
      "epoch": 0.203125,
      "grad_norm": 4.259199142456055,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 1.5362,
      "step": 325
    },
    {
      "epoch": 0.21875,
      "grad_norm": 4.389467239379883,
      "learning_rate": 3.5833333333333335e-05,
      "loss": 1.3678,
      "step": 350
    },
    {
      "epoch": 0.234375,
      "grad_norm": 4.271336555480957,
      "learning_rate": 3.8437500000000006e-05,
      "loss": 1.0995,
      "step": 375
    },
    {
      "epoch": 0.25,
      "grad_norm": 4.2241291999816895,
      "learning_rate": 4.104166666666667e-05,
      "loss": 0.9856,
      "step": 400
    },
    {
      "epoch": 0.265625,
      "grad_norm": 4.329673767089844,
      "learning_rate": 4.3645833333333335e-05,
      "loss": 0.8101,
      "step": 425
    },
    {
      "epoch": 0.28125,
      "grad_norm": 12.796660423278809,
      "learning_rate": 4.6250000000000006e-05,
      "loss": 0.6484,
      "step": 450
    },
    {
      "epoch": 0.296875,
      "grad_norm": 3.313157558441162,
      "learning_rate": 4.885416666666667e-05,
      "loss": 0.5902,
      "step": 475
    },
    {
      "epoch": 0.3125,
      "grad_norm": 24.351720809936523,
      "learning_rate": 4.983796296296296e-05,
      "loss": 0.5107,
      "step": 500
    },
    {
      "epoch": 0.328125,
      "grad_norm": 2.632058620452881,
      "learning_rate": 4.954861111111112e-05,
      "loss": 0.4271,
      "step": 525
    },
    {
      "epoch": 0.34375,
      "grad_norm": 5.7889628410339355,
      "learning_rate": 4.925925925925926e-05,
      "loss": 0.3724,
      "step": 550
    },
    {
      "epoch": 0.359375,
      "grad_norm": 2.4024550914764404,
      "learning_rate": 4.896990740740741e-05,
      "loss": 0.3208,
      "step": 575
    },
    {
      "epoch": 0.375,
      "grad_norm": 2.1066699028015137,
      "learning_rate": 4.8680555555555554e-05,
      "loss": 0.2709,
      "step": 600
    },
    {
      "epoch": 0.390625,
      "grad_norm": 2.587980270385742,
      "learning_rate": 4.839120370370371e-05,
      "loss": 0.2463,
      "step": 625
    },
    {
      "epoch": 0.40625,
      "grad_norm": 3.222367763519287,
      "learning_rate": 4.8101851851851854e-05,
      "loss": 0.2429,
      "step": 650
    },
    {
      "epoch": 0.421875,
      "grad_norm": 1.7287578582763672,
      "learning_rate": 4.7812500000000003e-05,
      "loss": 0.1956,
      "step": 675
    },
    {
      "epoch": 0.4375,
      "grad_norm": 1.7069084644317627,
      "learning_rate": 4.752314814814815e-05,
      "loss": 0.1838,
      "step": 700
    },
    {
      "epoch": 0.453125,
      "grad_norm": 3.9924962520599365,
      "learning_rate": 4.72337962962963e-05,
      "loss": 0.1825,
      "step": 725
    },
    {
      "epoch": 0.46875,
      "grad_norm": 1.7422994375228882,
      "learning_rate": 4.6944444444444446e-05,
      "loss": 0.1525,
      "step": 750
    },
    {
      "epoch": 0.484375,
      "grad_norm": 2.156935691833496,
      "learning_rate": 4.6655092592592596e-05,
      "loss": 0.1334,
      "step": 775
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.77634596824646,
      "learning_rate": 4.6365740740740746e-05,
      "loss": 0.1109,
      "step": 800
    },
    {
      "epoch": 0.515625,
      "grad_norm": 1.2669907808303833,
      "learning_rate": 4.607638888888889e-05,
      "loss": 0.1152,
      "step": 825
    },
    {
      "epoch": 0.53125,
      "grad_norm": 2.9084904193878174,
      "learning_rate": 4.578703703703704e-05,
      "loss": 0.0947,
      "step": 850
    },
    {
      "epoch": 0.546875,
      "grad_norm": 1.7987799644470215,
      "learning_rate": 4.549768518518518e-05,
      "loss": 0.1078,
      "step": 875
    },
    {
      "epoch": 0.5625,
      "grad_norm": 1.0232861042022705,
      "learning_rate": 4.520833333333334e-05,
      "loss": 0.0877,
      "step": 900
    },
    {
      "epoch": 0.578125,
      "grad_norm": 1.5493382215499878,
      "learning_rate": 4.491898148148148e-05,
      "loss": 0.0963,
      "step": 925
    },
    {
      "epoch": 0.59375,
      "grad_norm": 1.3363285064697266,
      "learning_rate": 4.462962962962963e-05,
      "loss": 0.0735,
      "step": 950
    },
    {
      "epoch": 0.609375,
      "grad_norm": 2.2493879795074463,
      "learning_rate": 4.4340277777777775e-05,
      "loss": 0.0806,
      "step": 975
    },
    {
      "epoch": 0.625,
      "grad_norm": 1.1489849090576172,
      "learning_rate": 4.405092592592593e-05,
      "loss": 0.0853,
      "step": 1000
    },
    {
      "epoch": 0.640625,
      "grad_norm": 1.4875305891036987,
      "learning_rate": 4.3761574074074075e-05,
      "loss": 0.0715,
      "step": 1025
    },
    {
      "epoch": 0.65625,
      "grad_norm": 1.5242434740066528,
      "learning_rate": 4.3472222222222225e-05,
      "loss": 0.0617,
      "step": 1050
    },
    {
      "epoch": 0.671875,
      "grad_norm": 0.6194405555725098,
      "learning_rate": 4.318287037037037e-05,
      "loss": 0.0537,
      "step": 1075
    },
    {
      "epoch": 0.6875,
      "grad_norm": 10.070043563842773,
      "learning_rate": 4.2893518518518525e-05,
      "loss": 0.0563,
      "step": 1100
    },
    {
      "epoch": 0.703125,
      "grad_norm": 0.7786325812339783,
      "learning_rate": 4.260416666666667e-05,
      "loss": 0.0528,
      "step": 1125
    },
    {
      "epoch": 0.71875,
      "grad_norm": 1.3423603773117065,
      "learning_rate": 4.231481481481482e-05,
      "loss": 0.0503,
      "step": 1150
    },
    {
      "epoch": 0.734375,
      "grad_norm": 0.9283588528633118,
      "learning_rate": 4.202546296296296e-05,
      "loss": 0.0547,
      "step": 1175
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.4316812753677368,
      "learning_rate": 4.173611111111112e-05,
      "loss": 0.0469,
      "step": 1200
    },
    {
      "epoch": 0.765625,
      "grad_norm": 0.4404491186141968,
      "learning_rate": 4.144675925925926e-05,
      "loss": 0.0403,
      "step": 1225
    },
    {
      "epoch": 0.78125,
      "grad_norm": 1.7463942766189575,
      "learning_rate": 4.115740740740741e-05,
      "loss": 0.0423,
      "step": 1250
    },
    {
      "epoch": 0.796875,
      "grad_norm": 0.5837883353233337,
      "learning_rate": 4.0868055555555554e-05,
      "loss": 0.0318,
      "step": 1275
    },
    {
      "epoch": 0.8125,
      "grad_norm": 1.1707327365875244,
      "learning_rate": 4.057870370370371e-05,
      "loss": 0.0386,
      "step": 1300
    },
    {
      "epoch": 0.828125,
      "grad_norm": 1.5415633916854858,
      "learning_rate": 4.028935185185185e-05,
      "loss": 0.0435,
      "step": 1325
    },
    {
      "epoch": 0.84375,
      "grad_norm": 0.8029876351356506,
      "learning_rate": 4e-05,
      "loss": 0.0332,
      "step": 1350
    },
    {
      "epoch": 0.859375,
      "grad_norm": 0.6900579929351807,
      "learning_rate": 3.9710648148148146e-05,
      "loss": 0.0357,
      "step": 1375
    },
    {
      "epoch": 0.875,
      "grad_norm": 1.1126506328582764,
      "learning_rate": 3.94212962962963e-05,
      "loss": 0.0329,
      "step": 1400
    },
    {
      "epoch": 0.890625,
      "grad_norm": 1.2710646390914917,
      "learning_rate": 3.9131944444444446e-05,
      "loss": 0.0299,
      "step": 1425
    },
    {
      "epoch": 0.90625,
      "grad_norm": 2.057583808898926,
      "learning_rate": 3.8842592592592596e-05,
      "loss": 0.0335,
      "step": 1450
    },
    {
      "epoch": 0.921875,
      "grad_norm": 0.6628808975219727,
      "learning_rate": 3.855324074074074e-05,
      "loss": 0.0263,
      "step": 1475
    },
    {
      "epoch": 0.9375,
      "grad_norm": 1.8219642639160156,
      "learning_rate": 3.826388888888889e-05,
      "loss": 0.0352,
      "step": 1500
    },
    {
      "epoch": 0.953125,
      "grad_norm": 0.6485552191734314,
      "learning_rate": 3.797453703703704e-05,
      "loss": 0.0226,
      "step": 1525
    },
    {
      "epoch": 0.96875,
      "grad_norm": 0.4729490280151367,
      "learning_rate": 3.768518518518518e-05,
      "loss": 0.0292,
      "step": 1550
    },
    {
      "epoch": 0.984375,
      "grad_norm": 2.796936511993408,
      "learning_rate": 3.739583333333334e-05,
      "loss": 0.0254,
      "step": 1575
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.35814589262008667,
      "learning_rate": 3.710648148148148e-05,
      "loss": 0.024,
      "step": 1600
    },
    {
      "epoch": 1.0,
      "eval_gen_len": 19.0,
      "eval_loss": 0.002919314429163933,
      "eval_rouge1": 36.1041,
      "eval_rouge2": 31.2121,
      "eval_rougeL": 36.1363,
      "eval_rougeLsum": 36.1113,
      "eval_runtime": 41.8038,
      "eval_samples_per_second": 19.137,
      "eval_steps_per_second": 4.784,
      "step": 1600
    },
    {
      "epoch": 1.015625,
      "grad_norm": 0.4471568465232849,
      "learning_rate": 3.681712962962963e-05,
      "loss": 0.022,
      "step": 1625
    },
    {
      "epoch": 1.03125,
      "grad_norm": 0.44426223635673523,
      "learning_rate": 3.6527777777777775e-05,
      "loss": 0.0258,
      "step": 1650
    },
    {
      "epoch": 1.046875,
      "grad_norm": 0.21920345723628998,
      "learning_rate": 3.623842592592593e-05,
      "loss": 0.0219,
      "step": 1675
    },
    {
      "epoch": 1.0625,
      "grad_norm": 1.597996473312378,
      "learning_rate": 3.5949074074074075e-05,
      "loss": 0.0167,
      "step": 1700
    },
    {
      "epoch": 1.078125,
      "grad_norm": 0.2239530384540558,
      "learning_rate": 3.5659722222222225e-05,
      "loss": 0.0215,
      "step": 1725
    },
    {
      "epoch": 1.09375,
      "grad_norm": 0.22859439253807068,
      "learning_rate": 3.537037037037037e-05,
      "loss": 0.021,
      "step": 1750
    },
    {
      "epoch": 1.109375,
      "grad_norm": 0.43176373839378357,
      "learning_rate": 3.5081018518518524e-05,
      "loss": 0.0159,
      "step": 1775
    },
    {
      "epoch": 1.125,
      "grad_norm": 0.2172795981168747,
      "learning_rate": 3.479166666666667e-05,
      "loss": 0.0182,
      "step": 1800
    },
    {
      "epoch": 1.140625,
      "grad_norm": 0.10611604899168015,
      "learning_rate": 3.450231481481482e-05,
      "loss": 0.0233,
      "step": 1825
    },
    {
      "epoch": 1.15625,
      "grad_norm": 0.26522374153137207,
      "learning_rate": 3.421296296296296e-05,
      "loss": 0.0255,
      "step": 1850
    },
    {
      "epoch": 1.171875,
      "grad_norm": 0.2485981285572052,
      "learning_rate": 3.392361111111112e-05,
      "loss": 0.0226,
      "step": 1875
    },
    {
      "epoch": 1.1875,
      "grad_norm": 0.4683358371257782,
      "learning_rate": 3.363425925925926e-05,
      "loss": 0.0167,
      "step": 1900
    },
    {
      "epoch": 1.203125,
      "grad_norm": 0.15683135390281677,
      "learning_rate": 3.334490740740741e-05,
      "loss": 0.0155,
      "step": 1925
    },
    {
      "epoch": 1.21875,
      "grad_norm": 0.0837627425789833,
      "learning_rate": 3.3055555555555553e-05,
      "loss": 0.0144,
      "step": 1950
    },
    {
      "epoch": 1.234375,
      "grad_norm": 0.2218899428844452,
      "learning_rate": 3.276620370370371e-05,
      "loss": 0.0144,
      "step": 1975
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.38553592562675476,
      "learning_rate": 3.247685185185185e-05,
      "loss": 0.0115,
      "step": 2000
    },
    {
      "epoch": 1.265625,
      "grad_norm": 0.2546806335449219,
      "learning_rate": 3.21875e-05,
      "loss": 0.0133,
      "step": 2025
    },
    {
      "epoch": 1.28125,
      "grad_norm": 0.9946944117546082,
      "learning_rate": 3.1898148148148146e-05,
      "loss": 0.0155,
      "step": 2050
    },
    {
      "epoch": 1.296875,
      "grad_norm": 0.4023357927799225,
      "learning_rate": 3.16087962962963e-05,
      "loss": 0.0133,
      "step": 2075
    },
    {
      "epoch": 1.3125,
      "grad_norm": 0.1124168410897255,
      "learning_rate": 3.1319444444444446e-05,
      "loss": 0.0141,
      "step": 2100
    },
    {
      "epoch": 1.328125,
      "grad_norm": 0.14160649478435516,
      "learning_rate": 3.1030092592592596e-05,
      "loss": 0.0094,
      "step": 2125
    },
    {
      "epoch": 1.34375,
      "grad_norm": 0.27373650670051575,
      "learning_rate": 3.074074074074074e-05,
      "loss": 0.012,
      "step": 2150
    },
    {
      "epoch": 1.359375,
      "grad_norm": 0.09323440492153168,
      "learning_rate": 3.045138888888889e-05,
      "loss": 0.0118,
      "step": 2175
    },
    {
      "epoch": 1.375,
      "grad_norm": 0.3543163537979126,
      "learning_rate": 3.016203703703704e-05,
      "loss": 0.0149,
      "step": 2200
    },
    {
      "epoch": 1.390625,
      "grad_norm": 0.13677853345870972,
      "learning_rate": 2.9872685185185185e-05,
      "loss": 0.0103,
      "step": 2225
    },
    {
      "epoch": 1.40625,
      "grad_norm": 0.4921596646308899,
      "learning_rate": 2.9583333333333335e-05,
      "loss": 0.0105,
      "step": 2250
    },
    {
      "epoch": 1.421875,
      "grad_norm": 0.23629239201545715,
      "learning_rate": 2.9293981481481482e-05,
      "loss": 0.0121,
      "step": 2275
    },
    {
      "epoch": 1.4375,
      "grad_norm": 0.3946538269519806,
      "learning_rate": 2.900462962962963e-05,
      "loss": 0.0105,
      "step": 2300
    },
    {
      "epoch": 1.453125,
      "grad_norm": 0.1448473185300827,
      "learning_rate": 2.8715277777777778e-05,
      "loss": 0.0113,
      "step": 2325
    },
    {
      "epoch": 1.46875,
      "grad_norm": 2.191969394683838,
      "learning_rate": 2.8425925925925928e-05,
      "loss": 0.0106,
      "step": 2350
    },
    {
      "epoch": 1.484375,
      "grad_norm": 13.945716857910156,
      "learning_rate": 2.8136574074074075e-05,
      "loss": 0.0085,
      "step": 2375
    },
    {
      "epoch": 1.5,
      "grad_norm": 2.3949015140533447,
      "learning_rate": 2.7847222222222224e-05,
      "loss": 0.011,
      "step": 2400
    },
    {
      "epoch": 1.515625,
      "grad_norm": 0.1495410054922104,
      "learning_rate": 2.755787037037037e-05,
      "loss": 0.0099,
      "step": 2425
    },
    {
      "epoch": 1.53125,
      "grad_norm": 1.6874018907546997,
      "learning_rate": 2.726851851851852e-05,
      "loss": 0.013,
      "step": 2450
    },
    {
      "epoch": 1.546875,
      "grad_norm": 0.153639018535614,
      "learning_rate": 2.6979166666666667e-05,
      "loss": 0.0068,
      "step": 2475
    },
    {
      "epoch": 1.5625,
      "grad_norm": 0.05537520349025726,
      "learning_rate": 2.6689814814814817e-05,
      "loss": 0.0101,
      "step": 2500
    },
    {
      "epoch": 1.578125,
      "grad_norm": 0.37778735160827637,
      "learning_rate": 2.6400462962962964e-05,
      "loss": 0.0083,
      "step": 2525
    },
    {
      "epoch": 1.59375,
      "grad_norm": 0.16083496809005737,
      "learning_rate": 2.6111111111111114e-05,
      "loss": 0.0109,
      "step": 2550
    },
    {
      "epoch": 1.609375,
      "grad_norm": 0.19876737892627716,
      "learning_rate": 2.582175925925926e-05,
      "loss": 0.0091,
      "step": 2575
    },
    {
      "epoch": 1.625,
      "grad_norm": 0.20663735270500183,
      "learning_rate": 2.553240740740741e-05,
      "loss": 0.0107,
      "step": 2600
    },
    {
      "epoch": 1.640625,
      "grad_norm": 0.5358327627182007,
      "learning_rate": 2.5243055555555557e-05,
      "loss": 0.0082,
      "step": 2625
    },
    {
      "epoch": 1.65625,
      "grad_norm": 0.35017523169517517,
      "learning_rate": 2.4953703703703703e-05,
      "loss": 0.0119,
      "step": 2650
    },
    {
      "epoch": 1.671875,
      "grad_norm": 0.26763439178466797,
      "learning_rate": 2.4664351851851853e-05,
      "loss": 0.0081,
      "step": 2675
    },
    {
      "epoch": 1.6875,
      "grad_norm": 1.2130630016326904,
      "learning_rate": 2.4375e-05,
      "loss": 0.0077,
      "step": 2700
    },
    {
      "epoch": 1.703125,
      "grad_norm": 0.10127498209476471,
      "learning_rate": 2.408564814814815e-05,
      "loss": 0.0078,
      "step": 2725
    },
    {
      "epoch": 1.71875,
      "grad_norm": 0.18934841454029083,
      "learning_rate": 2.3796296296296296e-05,
      "loss": 0.0076,
      "step": 2750
    },
    {
      "epoch": 1.734375,
      "grad_norm": 0.2767048180103302,
      "learning_rate": 2.3506944444444446e-05,
      "loss": 0.0087,
      "step": 2775
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.714077353477478,
      "learning_rate": 2.3217592592592592e-05,
      "loss": 0.0091,
      "step": 2800
    },
    {
      "epoch": 1.765625,
      "grad_norm": 0.07030107080936432,
      "learning_rate": 2.2928240740740742e-05,
      "loss": 0.0083,
      "step": 2825
    },
    {
      "epoch": 1.78125,
      "grad_norm": 1.842840313911438,
      "learning_rate": 2.263888888888889e-05,
      "loss": 0.0073,
      "step": 2850
    },
    {
      "epoch": 1.796875,
      "grad_norm": 0.057295847684144974,
      "learning_rate": 2.234953703703704e-05,
      "loss": 0.0157,
      "step": 2875
    },
    {
      "epoch": 1.8125,
      "grad_norm": 1.018971562385559,
      "learning_rate": 2.2060185185185185e-05,
      "loss": 0.0105,
      "step": 2900
    },
    {
      "epoch": 1.828125,
      "grad_norm": 1.8360518217086792,
      "learning_rate": 2.1770833333333335e-05,
      "loss": 0.0084,
      "step": 2925
    },
    {
      "epoch": 1.84375,
      "grad_norm": 0.0763489380478859,
      "learning_rate": 2.148148148148148e-05,
      "loss": 0.006,
      "step": 2950
    },
    {
      "epoch": 1.859375,
      "grad_norm": 0.7448757290840149,
      "learning_rate": 2.119212962962963e-05,
      "loss": 0.0067,
      "step": 2975
    },
    {
      "epoch": 1.875,
      "grad_norm": 0.03100624494254589,
      "learning_rate": 2.0902777777777778e-05,
      "loss": 0.007,
      "step": 3000
    },
    {
      "epoch": 1.890625,
      "grad_norm": 0.18291109800338745,
      "learning_rate": 2.0613425925925928e-05,
      "loss": 0.0067,
      "step": 3025
    },
    {
      "epoch": 1.90625,
      "grad_norm": 1.1112022399902344,
      "learning_rate": 2.0324074074074074e-05,
      "loss": 0.0094,
      "step": 3050
    },
    {
      "epoch": 1.921875,
      "grad_norm": 0.09381380677223206,
      "learning_rate": 2.0034722222222224e-05,
      "loss": 0.0072,
      "step": 3075
    },
    {
      "epoch": 1.9375,
      "grad_norm": 0.13314466178417206,
      "learning_rate": 1.974537037037037e-05,
      "loss": 0.0053,
      "step": 3100
    },
    {
      "epoch": 1.953125,
      "grad_norm": 0.40121424198150635,
      "learning_rate": 1.945601851851852e-05,
      "loss": 0.009,
      "step": 3125
    },
    {
      "epoch": 1.96875,
      "grad_norm": 0.44097986817359924,
      "learning_rate": 1.9166666666666667e-05,
      "loss": 0.0068,
      "step": 3150
    },
    {
      "epoch": 1.984375,
      "grad_norm": 0.22938844561576843,
      "learning_rate": 1.8877314814814814e-05,
      "loss": 0.0078,
      "step": 3175
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.10912167280912399,
      "learning_rate": 1.8587962962962964e-05,
      "loss": 0.0082,
      "step": 3200
    },
    {
      "epoch": 2.0,
      "eval_gen_len": 19.0,
      "eval_loss": 0.00017511926125735044,
      "eval_rouge1": 36.1041,
      "eval_rouge2": 31.2121,
      "eval_rougeL": 36.1363,
      "eval_rougeLsum": 36.1113,
      "eval_runtime": 41.5369,
      "eval_samples_per_second": 19.26,
      "eval_steps_per_second": 4.815,
      "step": 3200
    },
    {
      "epoch": 2.015625,
      "grad_norm": 0.09053827077150345,
      "learning_rate": 1.829861111111111e-05,
      "loss": 0.0071,
      "step": 3225
    },
    {
      "epoch": 2.03125,
      "grad_norm": 2.461913585662842,
      "learning_rate": 1.800925925925926e-05,
      "loss": 0.0061,
      "step": 3250
    },
    {
      "epoch": 2.046875,
      "grad_norm": 0.19066256284713745,
      "learning_rate": 1.7719907407407407e-05,
      "loss": 0.0066,
      "step": 3275
    },
    {
      "epoch": 2.0625,
      "grad_norm": 0.1872449815273285,
      "learning_rate": 1.7430555555555556e-05,
      "loss": 0.0057,
      "step": 3300
    },
    {
      "epoch": 2.078125,
      "grad_norm": 0.08601183444261551,
      "learning_rate": 1.7141203703703703e-05,
      "loss": 0.007,
      "step": 3325
    },
    {
      "epoch": 2.09375,
      "grad_norm": 0.27893713116645813,
      "learning_rate": 1.6851851851851853e-05,
      "loss": 0.0061,
      "step": 3350
    },
    {
      "epoch": 2.109375,
      "grad_norm": 0.44320690631866455,
      "learning_rate": 1.65625e-05,
      "loss": 0.0055,
      "step": 3375
    },
    {
      "epoch": 2.125,
      "grad_norm": 0.09852916747331619,
      "learning_rate": 1.627314814814815e-05,
      "loss": 0.0046,
      "step": 3400
    },
    {
      "epoch": 2.140625,
      "grad_norm": 0.03615964949131012,
      "learning_rate": 1.5983796296296296e-05,
      "loss": 0.0051,
      "step": 3425
    },
    {
      "epoch": 2.15625,
      "grad_norm": 0.11186737567186356,
      "learning_rate": 1.5694444444444446e-05,
      "loss": 0.0037,
      "step": 3450
    },
    {
      "epoch": 2.171875,
      "grad_norm": 0.29233425855636597,
      "learning_rate": 1.5405092592592592e-05,
      "loss": 0.0057,
      "step": 3475
    },
    {
      "epoch": 2.1875,
      "grad_norm": 0.06819481402635574,
      "learning_rate": 1.5115740740740742e-05,
      "loss": 0.0083,
      "step": 3500
    },
    {
      "epoch": 2.203125,
      "grad_norm": 0.2904689610004425,
      "learning_rate": 1.482638888888889e-05,
      "loss": 0.0037,
      "step": 3525
    },
    {
      "epoch": 2.21875,
      "grad_norm": 0.8704804182052612,
      "learning_rate": 1.4537037037037039e-05,
      "loss": 0.0049,
      "step": 3550
    },
    {
      "epoch": 2.234375,
      "grad_norm": 0.5658326745033264,
      "learning_rate": 1.4247685185185187e-05,
      "loss": 0.0039,
      "step": 3575
    },
    {
      "epoch": 2.25,
      "grad_norm": 0.3811335563659668,
      "learning_rate": 1.3958333333333335e-05,
      "loss": 0.0047,
      "step": 3600
    },
    {
      "epoch": 2.265625,
      "grad_norm": 0.26812756061553955,
      "learning_rate": 1.3668981481481483e-05,
      "loss": 0.0064,
      "step": 3625
    },
    {
      "epoch": 2.28125,
      "grad_norm": 0.07358690351247787,
      "learning_rate": 1.3379629629629631e-05,
      "loss": 0.0051,
      "step": 3650
    },
    {
      "epoch": 2.296875,
      "grad_norm": 0.46232402324676514,
      "learning_rate": 1.309027777777778e-05,
      "loss": 0.0053,
      "step": 3675
    },
    {
      "epoch": 2.3125,
      "grad_norm": 0.041453029960393906,
      "learning_rate": 1.2800925925925928e-05,
      "loss": 0.0063,
      "step": 3700
    },
    {
      "epoch": 2.328125,
      "grad_norm": 0.22587814927101135,
      "learning_rate": 1.2511574074074076e-05,
      "loss": 0.0033,
      "step": 3725
    },
    {
      "epoch": 2.34375,
      "grad_norm": 1.0070526599884033,
      "learning_rate": 1.2222222222222222e-05,
      "loss": 0.0052,
      "step": 3750
    },
    {
      "epoch": 2.359375,
      "grad_norm": 0.11478688567876816,
      "learning_rate": 1.193287037037037e-05,
      "loss": 0.004,
      "step": 3775
    },
    {
      "epoch": 2.375,
      "grad_norm": 0.3056720495223999,
      "learning_rate": 1.1643518518518519e-05,
      "loss": 0.0066,
      "step": 3800
    },
    {
      "epoch": 2.390625,
      "grad_norm": 0.30260908603668213,
      "learning_rate": 1.1354166666666667e-05,
      "loss": 0.0041,
      "step": 3825
    },
    {
      "epoch": 2.40625,
      "grad_norm": 1.0675514936447144,
      "learning_rate": 1.1064814814814815e-05,
      "loss": 0.0042,
      "step": 3850
    },
    {
      "epoch": 2.421875,
      "grad_norm": 0.17632050812244415,
      "learning_rate": 1.0775462962962963e-05,
      "loss": 0.0044,
      "step": 3875
    },
    {
      "epoch": 2.4375,
      "grad_norm": 0.13342522084712982,
      "learning_rate": 1.0486111111111112e-05,
      "loss": 0.0076,
      "step": 3900
    },
    {
      "epoch": 2.453125,
      "grad_norm": 0.05666012316942215,
      "learning_rate": 1.019675925925926e-05,
      "loss": 0.0051,
      "step": 3925
    },
    {
      "epoch": 2.46875,
      "grad_norm": 0.17459043860435486,
      "learning_rate": 9.907407407407408e-06,
      "loss": 0.0043,
      "step": 3950
    },
    {
      "epoch": 2.484375,
      "grad_norm": 0.044436413794755936,
      "learning_rate": 9.618055555555556e-06,
      "loss": 0.0074,
      "step": 3975
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.08852487057447433,
      "learning_rate": 9.328703703703705e-06,
      "loss": 0.0044,
      "step": 4000
    },
    {
      "epoch": 2.515625,
      "grad_norm": 0.04410694167017937,
      "learning_rate": 9.039351851851851e-06,
      "loss": 0.005,
      "step": 4025
    },
    {
      "epoch": 2.53125,
      "grad_norm": 0.07558827102184296,
      "learning_rate": 8.75e-06,
      "loss": 0.0047,
      "step": 4050
    },
    {
      "epoch": 2.546875,
      "grad_norm": 0.41048064827919006,
      "learning_rate": 8.460648148148147e-06,
      "loss": 0.0034,
      "step": 4075
    },
    {
      "epoch": 2.5625,
      "grad_norm": 0.16999337077140808,
      "learning_rate": 8.171296296296296e-06,
      "loss": 0.0047,
      "step": 4100
    },
    {
      "epoch": 2.578125,
      "grad_norm": 0.08021432161331177,
      "learning_rate": 7.881944444444444e-06,
      "loss": 0.0055,
      "step": 4125
    },
    {
      "epoch": 2.59375,
      "grad_norm": 0.09371667355298996,
      "learning_rate": 7.592592592592593e-06,
      "loss": 0.0037,
      "step": 4150
    },
    {
      "epoch": 2.609375,
      "grad_norm": 0.41227176785469055,
      "learning_rate": 7.303240740740741e-06,
      "loss": 0.0047,
      "step": 4175
    },
    {
      "epoch": 2.625,
      "grad_norm": 0.617947518825531,
      "learning_rate": 7.013888888888889e-06,
      "loss": 0.0046,
      "step": 4200
    },
    {
      "epoch": 2.640625,
      "grad_norm": 0.08738510310649872,
      "learning_rate": 6.7245370370370375e-06,
      "loss": 0.0044,
      "step": 4225
    },
    {
      "epoch": 2.65625,
      "grad_norm": 2.8047690391540527,
      "learning_rate": 6.435185185185186e-06,
      "loss": 0.0043,
      "step": 4250
    },
    {
      "epoch": 2.671875,
      "grad_norm": 0.6523553729057312,
      "learning_rate": 6.145833333333333e-06,
      "loss": 0.0042,
      "step": 4275
    },
    {
      "epoch": 2.6875,
      "grad_norm": 1.3934566974639893,
      "learning_rate": 5.856481481481481e-06,
      "loss": 0.0044,
      "step": 4300
    },
    {
      "epoch": 2.703125,
      "grad_norm": 0.10399406403303146,
      "learning_rate": 5.5671296296296295e-06,
      "loss": 0.0046,
      "step": 4325
    },
    {
      "epoch": 2.71875,
      "grad_norm": 0.8010254502296448,
      "learning_rate": 5.277777777777778e-06,
      "loss": 0.0047,
      "step": 4350
    },
    {
      "epoch": 2.734375,
      "grad_norm": 0.34063494205474854,
      "learning_rate": 4.988425925925926e-06,
      "loss": 0.004,
      "step": 4375
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.027729365974664688,
      "learning_rate": 4.699074074074074e-06,
      "loss": 0.0031,
      "step": 4400
    },
    {
      "epoch": 2.765625,
      "grad_norm": 0.029330110177397728,
      "learning_rate": 4.409722222222222e-06,
      "loss": 0.0028,
      "step": 4425
    },
    {
      "epoch": 2.78125,
      "grad_norm": 0.040268149226903915,
      "learning_rate": 4.1203703703703705e-06,
      "loss": 0.0048,
      "step": 4450
    },
    {
      "epoch": 2.796875,
      "grad_norm": 0.14908038079738617,
      "learning_rate": 3.831018518518519e-06,
      "loss": 0.0038,
      "step": 4475
    },
    {
      "epoch": 2.8125,
      "grad_norm": 0.3301020860671997,
      "learning_rate": 3.541666666666667e-06,
      "loss": 0.0039,
      "step": 4500
    },
    {
      "epoch": 2.828125,
      "grad_norm": 0.07975717633962631,
      "learning_rate": 3.252314814814815e-06,
      "loss": 0.0041,
      "step": 4525
    },
    {
      "epoch": 2.84375,
      "grad_norm": 0.031999628990888596,
      "learning_rate": 2.9629629629629633e-06,
      "loss": 0.0035,
      "step": 4550
    },
    {
      "epoch": 2.859375,
      "grad_norm": 0.051941078156232834,
      "learning_rate": 2.673611111111111e-06,
      "loss": 0.0029,
      "step": 4575
    },
    {
      "epoch": 2.875,
      "grad_norm": 0.07033877074718475,
      "learning_rate": 2.3842592592592593e-06,
      "loss": 0.0042,
      "step": 4600
    },
    {
      "epoch": 2.890625,
      "grad_norm": 0.11819066107273102,
      "learning_rate": 2.0949074074074075e-06,
      "loss": 0.0053,
      "step": 4625
    },
    {
      "epoch": 2.90625,
      "grad_norm": 0.28392651677131653,
      "learning_rate": 1.8055555555555555e-06,
      "loss": 0.004,
      "step": 4650
    },
    {
      "epoch": 2.921875,
      "grad_norm": 0.10968532413244247,
      "learning_rate": 1.5162037037037037e-06,
      "loss": 0.0053,
      "step": 4675
    },
    {
      "epoch": 2.9375,
      "grad_norm": 0.08729392290115356,
      "learning_rate": 1.226851851851852e-06,
      "loss": 0.0051,
      "step": 4700
    },
    {
      "epoch": 2.953125,
      "grad_norm": 0.2821725010871887,
      "learning_rate": 9.375e-07,
      "loss": 0.0034,
      "step": 4725
    },
    {
      "epoch": 2.96875,
      "grad_norm": 0.027782775461673737,
      "learning_rate": 6.481481481481481e-07,
      "loss": 0.003,
      "step": 4750
    },
    {
      "epoch": 2.984375,
      "grad_norm": 0.027040518820285797,
      "learning_rate": 3.5879629629629633e-07,
      "loss": 0.0031,
      "step": 4775
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.5342923998832703,
      "learning_rate": 6.944444444444445e-08,
      "loss": 0.0044,
      "step": 4800
    },
    {
      "epoch": 3.0,
      "eval_gen_len": 19.0,
      "eval_loss": 0.0001007633691187948,
      "eval_rouge1": 36.1041,
      "eval_rouge2": 31.2121,
      "eval_rougeL": 36.1363,
      "eval_rougeLsum": 36.1113,
      "eval_runtime": 41.5379,
      "eval_samples_per_second": 19.26,
      "eval_steps_per_second": 4.815,
      "step": 4800
    }
  ],
  "logging_steps": 25,
  "max_steps": 4800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.01
      },
      "attributes": {
        "early_stopping_patience_counter": 2
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 86510373765120.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}