{
"best_metric": 0.0001007633691187948,
"best_model_checkpoint": "autotrain-ljk9o-0hizk/checkpoint-4800",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 4800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015625,
"grad_norm": 34.15324783325195,
"learning_rate": 2.1875e-06,
"loss": 4.8362,
"step": 25
},
{
"epoch": 0.03125,
"grad_norm": 7.099372863769531,
"learning_rate": 4.6875000000000004e-06,
"loss": 4.6659,
"step": 50
},
{
"epoch": 0.046875,
"grad_norm": 7.394688606262207,
"learning_rate": 7.2916666666666674e-06,
"loss": 4.4834,
"step": 75
},
{
"epoch": 0.0625,
"grad_norm": 5.589704513549805,
"learning_rate": 9.895833333333333e-06,
"loss": 4.2105,
"step": 100
},
{
"epoch": 0.078125,
"grad_norm": 5.1627020835876465,
"learning_rate": 1.25e-05,
"loss": 3.8238,
"step": 125
},
{
"epoch": 0.09375,
"grad_norm": 14.816465377807617,
"learning_rate": 1.5104166666666667e-05,
"loss": 3.542,
"step": 150
},
{
"epoch": 0.109375,
"grad_norm": 3.7660512924194336,
"learning_rate": 1.7708333333333335e-05,
"loss": 3.1228,
"step": 175
},
{
"epoch": 0.125,
"grad_norm": 6.042110919952393,
"learning_rate": 2.0312500000000002e-05,
"loss": 2.8363,
"step": 200
},
{
"epoch": 0.140625,
"grad_norm": 4.053712368011475,
"learning_rate": 2.2916666666666667e-05,
"loss": 2.6232,
"step": 225
},
{
"epoch": 0.15625,
"grad_norm": 3.718096971511841,
"learning_rate": 2.552083333333333e-05,
"loss": 2.285,
"step": 250
},
{
"epoch": 0.171875,
"grad_norm": 4.003573417663574,
"learning_rate": 2.8125000000000003e-05,
"loss": 2.0495,
"step": 275
},
{
"epoch": 0.1875,
"grad_norm": 3.6954550743103027,
"learning_rate": 3.072916666666667e-05,
"loss": 1.79,
"step": 300
},
{
"epoch": 0.203125,
"grad_norm": 4.259199142456055,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.5362,
"step": 325
},
{
"epoch": 0.21875,
"grad_norm": 4.389467239379883,
"learning_rate": 3.5833333333333335e-05,
"loss": 1.3678,
"step": 350
},
{
"epoch": 0.234375,
"grad_norm": 4.271336555480957,
"learning_rate": 3.8437500000000006e-05,
"loss": 1.0995,
"step": 375
},
{
"epoch": 0.25,
"grad_norm": 4.2241291999816895,
"learning_rate": 4.104166666666667e-05,
"loss": 0.9856,
"step": 400
},
{
"epoch": 0.265625,
"grad_norm": 4.329673767089844,
"learning_rate": 4.3645833333333335e-05,
"loss": 0.8101,
"step": 425
},
{
"epoch": 0.28125,
"grad_norm": 12.796660423278809,
"learning_rate": 4.6250000000000006e-05,
"loss": 0.6484,
"step": 450
},
{
"epoch": 0.296875,
"grad_norm": 3.313157558441162,
"learning_rate": 4.885416666666667e-05,
"loss": 0.5902,
"step": 475
},
{
"epoch": 0.3125,
"grad_norm": 24.351720809936523,
"learning_rate": 4.983796296296296e-05,
"loss": 0.5107,
"step": 500
},
{
"epoch": 0.328125,
"grad_norm": 2.632058620452881,
"learning_rate": 4.954861111111112e-05,
"loss": 0.4271,
"step": 525
},
{
"epoch": 0.34375,
"grad_norm": 5.7889628410339355,
"learning_rate": 4.925925925925926e-05,
"loss": 0.3724,
"step": 550
},
{
"epoch": 0.359375,
"grad_norm": 2.4024550914764404,
"learning_rate": 4.896990740740741e-05,
"loss": 0.3208,
"step": 575
},
{
"epoch": 0.375,
"grad_norm": 2.1066699028015137,
"learning_rate": 4.8680555555555554e-05,
"loss": 0.2709,
"step": 600
},
{
"epoch": 0.390625,
"grad_norm": 2.587980270385742,
"learning_rate": 4.839120370370371e-05,
"loss": 0.2463,
"step": 625
},
{
"epoch": 0.40625,
"grad_norm": 3.222367763519287,
"learning_rate": 4.8101851851851854e-05,
"loss": 0.2429,
"step": 650
},
{
"epoch": 0.421875,
"grad_norm": 1.7287578582763672,
"learning_rate": 4.7812500000000003e-05,
"loss": 0.1956,
"step": 675
},
{
"epoch": 0.4375,
"grad_norm": 1.7069084644317627,
"learning_rate": 4.752314814814815e-05,
"loss": 0.1838,
"step": 700
},
{
"epoch": 0.453125,
"grad_norm": 3.9924962520599365,
"learning_rate": 4.72337962962963e-05,
"loss": 0.1825,
"step": 725
},
{
"epoch": 0.46875,
"grad_norm": 1.7422994375228882,
"learning_rate": 4.6944444444444446e-05,
"loss": 0.1525,
"step": 750
},
{
"epoch": 0.484375,
"grad_norm": 2.156935691833496,
"learning_rate": 4.6655092592592596e-05,
"loss": 0.1334,
"step": 775
},
{
"epoch": 0.5,
"grad_norm": 1.77634596824646,
"learning_rate": 4.6365740740740746e-05,
"loss": 0.1109,
"step": 800
},
{
"epoch": 0.515625,
"grad_norm": 1.2669907808303833,
"learning_rate": 4.607638888888889e-05,
"loss": 0.1152,
"step": 825
},
{
"epoch": 0.53125,
"grad_norm": 2.9084904193878174,
"learning_rate": 4.578703703703704e-05,
"loss": 0.0947,
"step": 850
},
{
"epoch": 0.546875,
"grad_norm": 1.7987799644470215,
"learning_rate": 4.549768518518518e-05,
"loss": 0.1078,
"step": 875
},
{
"epoch": 0.5625,
"grad_norm": 1.0232861042022705,
"learning_rate": 4.520833333333334e-05,
"loss": 0.0877,
"step": 900
},
{
"epoch": 0.578125,
"grad_norm": 1.5493382215499878,
"learning_rate": 4.491898148148148e-05,
"loss": 0.0963,
"step": 925
},
{
"epoch": 0.59375,
"grad_norm": 1.3363285064697266,
"learning_rate": 4.462962962962963e-05,
"loss": 0.0735,
"step": 950
},
{
"epoch": 0.609375,
"grad_norm": 2.2493879795074463,
"learning_rate": 4.4340277777777775e-05,
"loss": 0.0806,
"step": 975
},
{
"epoch": 0.625,
"grad_norm": 1.1489849090576172,
"learning_rate": 4.405092592592593e-05,
"loss": 0.0853,
"step": 1000
},
{
"epoch": 0.640625,
"grad_norm": 1.4875305891036987,
"learning_rate": 4.3761574074074075e-05,
"loss": 0.0715,
"step": 1025
},
{
"epoch": 0.65625,
"grad_norm": 1.5242434740066528,
"learning_rate": 4.3472222222222225e-05,
"loss": 0.0617,
"step": 1050
},
{
"epoch": 0.671875,
"grad_norm": 0.6194405555725098,
"learning_rate": 4.318287037037037e-05,
"loss": 0.0537,
"step": 1075
},
{
"epoch": 0.6875,
"grad_norm": 10.070043563842773,
"learning_rate": 4.2893518518518525e-05,
"loss": 0.0563,
"step": 1100
},
{
"epoch": 0.703125,
"grad_norm": 0.7786325812339783,
"learning_rate": 4.260416666666667e-05,
"loss": 0.0528,
"step": 1125
},
{
"epoch": 0.71875,
"grad_norm": 1.3423603773117065,
"learning_rate": 4.231481481481482e-05,
"loss": 0.0503,
"step": 1150
},
{
"epoch": 0.734375,
"grad_norm": 0.9283588528633118,
"learning_rate": 4.202546296296296e-05,
"loss": 0.0547,
"step": 1175
},
{
"epoch": 0.75,
"grad_norm": 1.4316812753677368,
"learning_rate": 4.173611111111112e-05,
"loss": 0.0469,
"step": 1200
},
{
"epoch": 0.765625,
"grad_norm": 0.4404491186141968,
"learning_rate": 4.144675925925926e-05,
"loss": 0.0403,
"step": 1225
},
{
"epoch": 0.78125,
"grad_norm": 1.7463942766189575,
"learning_rate": 4.115740740740741e-05,
"loss": 0.0423,
"step": 1250
},
{
"epoch": 0.796875,
"grad_norm": 0.5837883353233337,
"learning_rate": 4.0868055555555554e-05,
"loss": 0.0318,
"step": 1275
},
{
"epoch": 0.8125,
"grad_norm": 1.1707327365875244,
"learning_rate": 4.057870370370371e-05,
"loss": 0.0386,
"step": 1300
},
{
"epoch": 0.828125,
"grad_norm": 1.5415633916854858,
"learning_rate": 4.028935185185185e-05,
"loss": 0.0435,
"step": 1325
},
{
"epoch": 0.84375,
"grad_norm": 0.8029876351356506,
"learning_rate": 4e-05,
"loss": 0.0332,
"step": 1350
},
{
"epoch": 0.859375,
"grad_norm": 0.6900579929351807,
"learning_rate": 3.9710648148148146e-05,
"loss": 0.0357,
"step": 1375
},
{
"epoch": 0.875,
"grad_norm": 1.1126506328582764,
"learning_rate": 3.94212962962963e-05,
"loss": 0.0329,
"step": 1400
},
{
"epoch": 0.890625,
"grad_norm": 1.2710646390914917,
"learning_rate": 3.9131944444444446e-05,
"loss": 0.0299,
"step": 1425
},
{
"epoch": 0.90625,
"grad_norm": 2.057583808898926,
"learning_rate": 3.8842592592592596e-05,
"loss": 0.0335,
"step": 1450
},
{
"epoch": 0.921875,
"grad_norm": 0.6628808975219727,
"learning_rate": 3.855324074074074e-05,
"loss": 0.0263,
"step": 1475
},
{
"epoch": 0.9375,
"grad_norm": 1.8219642639160156,
"learning_rate": 3.826388888888889e-05,
"loss": 0.0352,
"step": 1500
},
{
"epoch": 0.953125,
"grad_norm": 0.6485552191734314,
"learning_rate": 3.797453703703704e-05,
"loss": 0.0226,
"step": 1525
},
{
"epoch": 0.96875,
"grad_norm": 0.4729490280151367,
"learning_rate": 3.768518518518518e-05,
"loss": 0.0292,
"step": 1550
},
{
"epoch": 0.984375,
"grad_norm": 2.796936511993408,
"learning_rate": 3.739583333333334e-05,
"loss": 0.0254,
"step": 1575
},
{
"epoch": 1.0,
"grad_norm": 0.35814589262008667,
"learning_rate": 3.710648148148148e-05,
"loss": 0.024,
"step": 1600
},
{
"epoch": 1.0,
"eval_gen_len": 19.0,
"eval_loss": 0.002919314429163933,
"eval_rouge1": 36.1041,
"eval_rouge2": 31.2121,
"eval_rougeL": 36.1363,
"eval_rougeLsum": 36.1113,
"eval_runtime": 41.8038,
"eval_samples_per_second": 19.137,
"eval_steps_per_second": 4.784,
"step": 1600
},
{
"epoch": 1.015625,
"grad_norm": 0.4471568465232849,
"learning_rate": 3.681712962962963e-05,
"loss": 0.022,
"step": 1625
},
{
"epoch": 1.03125,
"grad_norm": 0.44426223635673523,
"learning_rate": 3.6527777777777775e-05,
"loss": 0.0258,
"step": 1650
},
{
"epoch": 1.046875,
"grad_norm": 0.21920345723628998,
"learning_rate": 3.623842592592593e-05,
"loss": 0.0219,
"step": 1675
},
{
"epoch": 1.0625,
"grad_norm": 1.597996473312378,
"learning_rate": 3.5949074074074075e-05,
"loss": 0.0167,
"step": 1700
},
{
"epoch": 1.078125,
"grad_norm": 0.2239530384540558,
"learning_rate": 3.5659722222222225e-05,
"loss": 0.0215,
"step": 1725
},
{
"epoch": 1.09375,
"grad_norm": 0.22859439253807068,
"learning_rate": 3.537037037037037e-05,
"loss": 0.021,
"step": 1750
},
{
"epoch": 1.109375,
"grad_norm": 0.43176373839378357,
"learning_rate": 3.5081018518518524e-05,
"loss": 0.0159,
"step": 1775
},
{
"epoch": 1.125,
"grad_norm": 0.2172795981168747,
"learning_rate": 3.479166666666667e-05,
"loss": 0.0182,
"step": 1800
},
{
"epoch": 1.140625,
"grad_norm": 0.10611604899168015,
"learning_rate": 3.450231481481482e-05,
"loss": 0.0233,
"step": 1825
},
{
"epoch": 1.15625,
"grad_norm": 0.26522374153137207,
"learning_rate": 3.421296296296296e-05,
"loss": 0.0255,
"step": 1850
},
{
"epoch": 1.171875,
"grad_norm": 0.2485981285572052,
"learning_rate": 3.392361111111112e-05,
"loss": 0.0226,
"step": 1875
},
{
"epoch": 1.1875,
"grad_norm": 0.4683358371257782,
"learning_rate": 3.363425925925926e-05,
"loss": 0.0167,
"step": 1900
},
{
"epoch": 1.203125,
"grad_norm": 0.15683135390281677,
"learning_rate": 3.334490740740741e-05,
"loss": 0.0155,
"step": 1925
},
{
"epoch": 1.21875,
"grad_norm": 0.0837627425789833,
"learning_rate": 3.3055555555555553e-05,
"loss": 0.0144,
"step": 1950
},
{
"epoch": 1.234375,
"grad_norm": 0.2218899428844452,
"learning_rate": 3.276620370370371e-05,
"loss": 0.0144,
"step": 1975
},
{
"epoch": 1.25,
"grad_norm": 0.38553592562675476,
"learning_rate": 3.247685185185185e-05,
"loss": 0.0115,
"step": 2000
},
{
"epoch": 1.265625,
"grad_norm": 0.2546806335449219,
"learning_rate": 3.21875e-05,
"loss": 0.0133,
"step": 2025
},
{
"epoch": 1.28125,
"grad_norm": 0.9946944117546082,
"learning_rate": 3.1898148148148146e-05,
"loss": 0.0155,
"step": 2050
},
{
"epoch": 1.296875,
"grad_norm": 0.4023357927799225,
"learning_rate": 3.16087962962963e-05,
"loss": 0.0133,
"step": 2075
},
{
"epoch": 1.3125,
"grad_norm": 0.1124168410897255,
"learning_rate": 3.1319444444444446e-05,
"loss": 0.0141,
"step": 2100
},
{
"epoch": 1.328125,
"grad_norm": 0.14160649478435516,
"learning_rate": 3.1030092592592596e-05,
"loss": 0.0094,
"step": 2125
},
{
"epoch": 1.34375,
"grad_norm": 0.27373650670051575,
"learning_rate": 3.074074074074074e-05,
"loss": 0.012,
"step": 2150
},
{
"epoch": 1.359375,
"grad_norm": 0.09323440492153168,
"learning_rate": 3.045138888888889e-05,
"loss": 0.0118,
"step": 2175
},
{
"epoch": 1.375,
"grad_norm": 0.3543163537979126,
"learning_rate": 3.016203703703704e-05,
"loss": 0.0149,
"step": 2200
},
{
"epoch": 1.390625,
"grad_norm": 0.13677853345870972,
"learning_rate": 2.9872685185185185e-05,
"loss": 0.0103,
"step": 2225
},
{
"epoch": 1.40625,
"grad_norm": 0.4921596646308899,
"learning_rate": 2.9583333333333335e-05,
"loss": 0.0105,
"step": 2250
},
{
"epoch": 1.421875,
"grad_norm": 0.23629239201545715,
"learning_rate": 2.9293981481481482e-05,
"loss": 0.0121,
"step": 2275
},
{
"epoch": 1.4375,
"grad_norm": 0.3946538269519806,
"learning_rate": 2.900462962962963e-05,
"loss": 0.0105,
"step": 2300
},
{
"epoch": 1.453125,
"grad_norm": 0.1448473185300827,
"learning_rate": 2.8715277777777778e-05,
"loss": 0.0113,
"step": 2325
},
{
"epoch": 1.46875,
"grad_norm": 2.191969394683838,
"learning_rate": 2.8425925925925928e-05,
"loss": 0.0106,
"step": 2350
},
{
"epoch": 1.484375,
"grad_norm": 13.945716857910156,
"learning_rate": 2.8136574074074075e-05,
"loss": 0.0085,
"step": 2375
},
{
"epoch": 1.5,
"grad_norm": 2.3949015140533447,
"learning_rate": 2.7847222222222224e-05,
"loss": 0.011,
"step": 2400
},
{
"epoch": 1.515625,
"grad_norm": 0.1495410054922104,
"learning_rate": 2.755787037037037e-05,
"loss": 0.0099,
"step": 2425
},
{
"epoch": 1.53125,
"grad_norm": 1.6874018907546997,
"learning_rate": 2.726851851851852e-05,
"loss": 0.013,
"step": 2450
},
{
"epoch": 1.546875,
"grad_norm": 0.153639018535614,
"learning_rate": 2.6979166666666667e-05,
"loss": 0.0068,
"step": 2475
},
{
"epoch": 1.5625,
"grad_norm": 0.05537520349025726,
"learning_rate": 2.6689814814814817e-05,
"loss": 0.0101,
"step": 2500
},
{
"epoch": 1.578125,
"grad_norm": 0.37778735160827637,
"learning_rate": 2.6400462962962964e-05,
"loss": 0.0083,
"step": 2525
},
{
"epoch": 1.59375,
"grad_norm": 0.16083496809005737,
"learning_rate": 2.6111111111111114e-05,
"loss": 0.0109,
"step": 2550
},
{
"epoch": 1.609375,
"grad_norm": 0.19876737892627716,
"learning_rate": 2.582175925925926e-05,
"loss": 0.0091,
"step": 2575
},
{
"epoch": 1.625,
"grad_norm": 0.20663735270500183,
"learning_rate": 2.553240740740741e-05,
"loss": 0.0107,
"step": 2600
},
{
"epoch": 1.640625,
"grad_norm": 0.5358327627182007,
"learning_rate": 2.5243055555555557e-05,
"loss": 0.0082,
"step": 2625
},
{
"epoch": 1.65625,
"grad_norm": 0.35017523169517517,
"learning_rate": 2.4953703703703703e-05,
"loss": 0.0119,
"step": 2650
},
{
"epoch": 1.671875,
"grad_norm": 0.26763439178466797,
"learning_rate": 2.4664351851851853e-05,
"loss": 0.0081,
"step": 2675
},
{
"epoch": 1.6875,
"grad_norm": 1.2130630016326904,
"learning_rate": 2.4375e-05,
"loss": 0.0077,
"step": 2700
},
{
"epoch": 1.703125,
"grad_norm": 0.10127498209476471,
"learning_rate": 2.408564814814815e-05,
"loss": 0.0078,
"step": 2725
},
{
"epoch": 1.71875,
"grad_norm": 0.18934841454029083,
"learning_rate": 2.3796296296296296e-05,
"loss": 0.0076,
"step": 2750
},
{
"epoch": 1.734375,
"grad_norm": 0.2767048180103302,
"learning_rate": 2.3506944444444446e-05,
"loss": 0.0087,
"step": 2775
},
{
"epoch": 1.75,
"grad_norm": 0.714077353477478,
"learning_rate": 2.3217592592592592e-05,
"loss": 0.0091,
"step": 2800
},
{
"epoch": 1.765625,
"grad_norm": 0.07030107080936432,
"learning_rate": 2.2928240740740742e-05,
"loss": 0.0083,
"step": 2825
},
{
"epoch": 1.78125,
"grad_norm": 1.842840313911438,
"learning_rate": 2.263888888888889e-05,
"loss": 0.0073,
"step": 2850
},
{
"epoch": 1.796875,
"grad_norm": 0.057295847684144974,
"learning_rate": 2.234953703703704e-05,
"loss": 0.0157,
"step": 2875
},
{
"epoch": 1.8125,
"grad_norm": 1.018971562385559,
"learning_rate": 2.2060185185185185e-05,
"loss": 0.0105,
"step": 2900
},
{
"epoch": 1.828125,
"grad_norm": 1.8360518217086792,
"learning_rate": 2.1770833333333335e-05,
"loss": 0.0084,
"step": 2925
},
{
"epoch": 1.84375,
"grad_norm": 0.0763489380478859,
"learning_rate": 2.148148148148148e-05,
"loss": 0.006,
"step": 2950
},
{
"epoch": 1.859375,
"grad_norm": 0.7448757290840149,
"learning_rate": 2.119212962962963e-05,
"loss": 0.0067,
"step": 2975
},
{
"epoch": 1.875,
"grad_norm": 0.03100624494254589,
"learning_rate": 2.0902777777777778e-05,
"loss": 0.007,
"step": 3000
},
{
"epoch": 1.890625,
"grad_norm": 0.18291109800338745,
"learning_rate": 2.0613425925925928e-05,
"loss": 0.0067,
"step": 3025
},
{
"epoch": 1.90625,
"grad_norm": 1.1112022399902344,
"learning_rate": 2.0324074074074074e-05,
"loss": 0.0094,
"step": 3050
},
{
"epoch": 1.921875,
"grad_norm": 0.09381380677223206,
"learning_rate": 2.0034722222222224e-05,
"loss": 0.0072,
"step": 3075
},
{
"epoch": 1.9375,
"grad_norm": 0.13314466178417206,
"learning_rate": 1.974537037037037e-05,
"loss": 0.0053,
"step": 3100
},
{
"epoch": 1.953125,
"grad_norm": 0.40121424198150635,
"learning_rate": 1.945601851851852e-05,
"loss": 0.009,
"step": 3125
},
{
"epoch": 1.96875,
"grad_norm": 0.44097986817359924,
"learning_rate": 1.9166666666666667e-05,
"loss": 0.0068,
"step": 3150
},
{
"epoch": 1.984375,
"grad_norm": 0.22938844561576843,
"learning_rate": 1.8877314814814814e-05,
"loss": 0.0078,
"step": 3175
},
{
"epoch": 2.0,
"grad_norm": 0.10912167280912399,
"learning_rate": 1.8587962962962964e-05,
"loss": 0.0082,
"step": 3200
},
{
"epoch": 2.0,
"eval_gen_len": 19.0,
"eval_loss": 0.00017511926125735044,
"eval_rouge1": 36.1041,
"eval_rouge2": 31.2121,
"eval_rougeL": 36.1363,
"eval_rougeLsum": 36.1113,
"eval_runtime": 41.5369,
"eval_samples_per_second": 19.26,
"eval_steps_per_second": 4.815,
"step": 3200
},
{
"epoch": 2.015625,
"grad_norm": 0.09053827077150345,
"learning_rate": 1.829861111111111e-05,
"loss": 0.0071,
"step": 3225
},
{
"epoch": 2.03125,
"grad_norm": 2.461913585662842,
"learning_rate": 1.800925925925926e-05,
"loss": 0.0061,
"step": 3250
},
{
"epoch": 2.046875,
"grad_norm": 0.19066256284713745,
"learning_rate": 1.7719907407407407e-05,
"loss": 0.0066,
"step": 3275
},
{
"epoch": 2.0625,
"grad_norm": 0.1872449815273285,
"learning_rate": 1.7430555555555556e-05,
"loss": 0.0057,
"step": 3300
},
{
"epoch": 2.078125,
"grad_norm": 0.08601183444261551,
"learning_rate": 1.7141203703703703e-05,
"loss": 0.007,
"step": 3325
},
{
"epoch": 2.09375,
"grad_norm": 0.27893713116645813,
"learning_rate": 1.6851851851851853e-05,
"loss": 0.0061,
"step": 3350
},
{
"epoch": 2.109375,
"grad_norm": 0.44320690631866455,
"learning_rate": 1.65625e-05,
"loss": 0.0055,
"step": 3375
},
{
"epoch": 2.125,
"grad_norm": 0.09852916747331619,
"learning_rate": 1.627314814814815e-05,
"loss": 0.0046,
"step": 3400
},
{
"epoch": 2.140625,
"grad_norm": 0.03615964949131012,
"learning_rate": 1.5983796296296296e-05,
"loss": 0.0051,
"step": 3425
},
{
"epoch": 2.15625,
"grad_norm": 0.11186737567186356,
"learning_rate": 1.5694444444444446e-05,
"loss": 0.0037,
"step": 3450
},
{
"epoch": 2.171875,
"grad_norm": 0.29233425855636597,
"learning_rate": 1.5405092592592592e-05,
"loss": 0.0057,
"step": 3475
},
{
"epoch": 2.1875,
"grad_norm": 0.06819481402635574,
"learning_rate": 1.5115740740740742e-05,
"loss": 0.0083,
"step": 3500
},
{
"epoch": 2.203125,
"grad_norm": 0.2904689610004425,
"learning_rate": 1.482638888888889e-05,
"loss": 0.0037,
"step": 3525
},
{
"epoch": 2.21875,
"grad_norm": 0.8704804182052612,
"learning_rate": 1.4537037037037039e-05,
"loss": 0.0049,
"step": 3550
},
{
"epoch": 2.234375,
"grad_norm": 0.5658326745033264,
"learning_rate": 1.4247685185185187e-05,
"loss": 0.0039,
"step": 3575
},
{
"epoch": 2.25,
"grad_norm": 0.3811335563659668,
"learning_rate": 1.3958333333333335e-05,
"loss": 0.0047,
"step": 3600
},
{
"epoch": 2.265625,
"grad_norm": 0.26812756061553955,
"learning_rate": 1.3668981481481483e-05,
"loss": 0.0064,
"step": 3625
},
{
"epoch": 2.28125,
"grad_norm": 0.07358690351247787,
"learning_rate": 1.3379629629629631e-05,
"loss": 0.0051,
"step": 3650
},
{
"epoch": 2.296875,
"grad_norm": 0.46232402324676514,
"learning_rate": 1.309027777777778e-05,
"loss": 0.0053,
"step": 3675
},
{
"epoch": 2.3125,
"grad_norm": 0.041453029960393906,
"learning_rate": 1.2800925925925928e-05,
"loss": 0.0063,
"step": 3700
},
{
"epoch": 2.328125,
"grad_norm": 0.22587814927101135,
"learning_rate": 1.2511574074074076e-05,
"loss": 0.0033,
"step": 3725
},
{
"epoch": 2.34375,
"grad_norm": 1.0070526599884033,
"learning_rate": 1.2222222222222222e-05,
"loss": 0.0052,
"step": 3750
},
{
"epoch": 2.359375,
"grad_norm": 0.11478688567876816,
"learning_rate": 1.193287037037037e-05,
"loss": 0.004,
"step": 3775
},
{
"epoch": 2.375,
"grad_norm": 0.3056720495223999,
"learning_rate": 1.1643518518518519e-05,
"loss": 0.0066,
"step": 3800
},
{
"epoch": 2.390625,
"grad_norm": 0.30260908603668213,
"learning_rate": 1.1354166666666667e-05,
"loss": 0.0041,
"step": 3825
},
{
"epoch": 2.40625,
"grad_norm": 1.0675514936447144,
"learning_rate": 1.1064814814814815e-05,
"loss": 0.0042,
"step": 3850
},
{
"epoch": 2.421875,
"grad_norm": 0.17632050812244415,
"learning_rate": 1.0775462962962963e-05,
"loss": 0.0044,
"step": 3875
},
{
"epoch": 2.4375,
"grad_norm": 0.13342522084712982,
"learning_rate": 1.0486111111111112e-05,
"loss": 0.0076,
"step": 3900
},
{
"epoch": 2.453125,
"grad_norm": 0.05666012316942215,
"learning_rate": 1.019675925925926e-05,
"loss": 0.0051,
"step": 3925
},
{
"epoch": 2.46875,
"grad_norm": 0.17459043860435486,
"learning_rate": 9.907407407407408e-06,
"loss": 0.0043,
"step": 3950
},
{
"epoch": 2.484375,
"grad_norm": 0.044436413794755936,
"learning_rate": 9.618055555555556e-06,
"loss": 0.0074,
"step": 3975
},
{
"epoch": 2.5,
"grad_norm": 0.08852487057447433,
"learning_rate": 9.328703703703705e-06,
"loss": 0.0044,
"step": 4000
},
{
"epoch": 2.515625,
"grad_norm": 0.04410694167017937,
"learning_rate": 9.039351851851851e-06,
"loss": 0.005,
"step": 4025
},
{
"epoch": 2.53125,
"grad_norm": 0.07558827102184296,
"learning_rate": 8.75e-06,
"loss": 0.0047,
"step": 4050
},
{
"epoch": 2.546875,
"grad_norm": 0.41048064827919006,
"learning_rate": 8.460648148148147e-06,
"loss": 0.0034,
"step": 4075
},
{
"epoch": 2.5625,
"grad_norm": 0.16999337077140808,
"learning_rate": 8.171296296296296e-06,
"loss": 0.0047,
"step": 4100
},
{
"epoch": 2.578125,
"grad_norm": 0.08021432161331177,
"learning_rate": 7.881944444444444e-06,
"loss": 0.0055,
"step": 4125
},
{
"epoch": 2.59375,
"grad_norm": 0.09371667355298996,
"learning_rate": 7.592592592592593e-06,
"loss": 0.0037,
"step": 4150
},
{
"epoch": 2.609375,
"grad_norm": 0.41227176785469055,
"learning_rate": 7.303240740740741e-06,
"loss": 0.0047,
"step": 4175
},
{
"epoch": 2.625,
"grad_norm": 0.617947518825531,
"learning_rate": 7.013888888888889e-06,
"loss": 0.0046,
"step": 4200
},
{
"epoch": 2.640625,
"grad_norm": 0.08738510310649872,
"learning_rate": 6.7245370370370375e-06,
"loss": 0.0044,
"step": 4225
},
{
"epoch": 2.65625,
"grad_norm": 2.8047690391540527,
"learning_rate": 6.435185185185186e-06,
"loss": 0.0043,
"step": 4250
},
{
"epoch": 2.671875,
"grad_norm": 0.6523553729057312,
"learning_rate": 6.145833333333333e-06,
"loss": 0.0042,
"step": 4275
},
{
"epoch": 2.6875,
"grad_norm": 1.3934566974639893,
"learning_rate": 5.856481481481481e-06,
"loss": 0.0044,
"step": 4300
},
{
"epoch": 2.703125,
"grad_norm": 0.10399406403303146,
"learning_rate": 5.5671296296296295e-06,
"loss": 0.0046,
"step": 4325
},
{
"epoch": 2.71875,
"grad_norm": 0.8010254502296448,
"learning_rate": 5.277777777777778e-06,
"loss": 0.0047,
"step": 4350
},
{
"epoch": 2.734375,
"grad_norm": 0.34063494205474854,
"learning_rate": 4.988425925925926e-06,
"loss": 0.004,
"step": 4375
},
{
"epoch": 2.75,
"grad_norm": 0.027729365974664688,
"learning_rate": 4.699074074074074e-06,
"loss": 0.0031,
"step": 4400
},
{
"epoch": 2.765625,
"grad_norm": 0.029330110177397728,
"learning_rate": 4.409722222222222e-06,
"loss": 0.0028,
"step": 4425
},
{
"epoch": 2.78125,
"grad_norm": 0.040268149226903915,
"learning_rate": 4.1203703703703705e-06,
"loss": 0.0048,
"step": 4450
},
{
"epoch": 2.796875,
"grad_norm": 0.14908038079738617,
"learning_rate": 3.831018518518519e-06,
"loss": 0.0038,
"step": 4475
},
{
"epoch": 2.8125,
"grad_norm": 0.3301020860671997,
"learning_rate": 3.541666666666667e-06,
"loss": 0.0039,
"step": 4500
},
{
"epoch": 2.828125,
"grad_norm": 0.07975717633962631,
"learning_rate": 3.252314814814815e-06,
"loss": 0.0041,
"step": 4525
},
{
"epoch": 2.84375,
"grad_norm": 0.031999628990888596,
"learning_rate": 2.9629629629629633e-06,
"loss": 0.0035,
"step": 4550
},
{
"epoch": 2.859375,
"grad_norm": 0.051941078156232834,
"learning_rate": 2.673611111111111e-06,
"loss": 0.0029,
"step": 4575
},
{
"epoch": 2.875,
"grad_norm": 0.07033877074718475,
"learning_rate": 2.3842592592592593e-06,
"loss": 0.0042,
"step": 4600
},
{
"epoch": 2.890625,
"grad_norm": 0.11819066107273102,
"learning_rate": 2.0949074074074075e-06,
"loss": 0.0053,
"step": 4625
},
{
"epoch": 2.90625,
"grad_norm": 0.28392651677131653,
"learning_rate": 1.8055555555555555e-06,
"loss": 0.004,
"step": 4650
},
{
"epoch": 2.921875,
"grad_norm": 0.10968532413244247,
"learning_rate": 1.5162037037037037e-06,
"loss": 0.0053,
"step": 4675
},
{
"epoch": 2.9375,
"grad_norm": 0.08729392290115356,
"learning_rate": 1.226851851851852e-06,
"loss": 0.0051,
"step": 4700
},
{
"epoch": 2.953125,
"grad_norm": 0.2821725010871887,
"learning_rate": 9.375e-07,
"loss": 0.0034,
"step": 4725
},
{
"epoch": 2.96875,
"grad_norm": 0.027782775461673737,
"learning_rate": 6.481481481481481e-07,
"loss": 0.003,
"step": 4750
},
{
"epoch": 2.984375,
"grad_norm": 0.027040518820285797,
"learning_rate": 3.5879629629629633e-07,
"loss": 0.0031,
"step": 4775
},
{
"epoch": 3.0,
"grad_norm": 0.5342923998832703,
"learning_rate": 6.944444444444445e-08,
"loss": 0.0044,
"step": 4800
},
{
"epoch": 3.0,
"eval_gen_len": 19.0,
"eval_loss": 0.0001007633691187948,
"eval_rouge1": 36.1041,
"eval_rouge2": 31.2121,
"eval_rougeL": 36.1363,
"eval_rougeLsum": 36.1113,
"eval_runtime": 41.5379,
"eval_samples_per_second": 19.26,
"eval_steps_per_second": 4.815,
"step": 4800
}
],
"logging_steps": 25,
"max_steps": 4800,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.01
},
"attributes": {
"early_stopping_patience_counter": 2
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 86510373765120.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}