diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,4710 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 660,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.004545454545454545,
+      "grad_norm": 3.4694509506225586,
+      "learning_rate": 0.0004992424242424243,
+      "loss": 1.6877,
+      "step": 1
+    },
+    {
+      "epoch": 0.00909090909090909,
+      "grad_norm": 2.844703435897827,
+      "learning_rate": 0.0004984848484848485,
+      "loss": 1.7528,
+      "step": 2
+    },
+    {
+      "epoch": 0.013636363636363636,
+      "grad_norm": 4.147863388061523,
+      "learning_rate": 0.0004977272727272727,
+      "loss": 2.6111,
+      "step": 3
+    },
+    {
+      "epoch": 0.01818181818181818,
+      "grad_norm": 2.755852699279785,
+      "learning_rate": 0.000496969696969697,
+      "loss": 1.9464,
+      "step": 4
+    },
+    {
+      "epoch": 0.022727272727272728,
+      "grad_norm": 4.124767780303955,
+      "learning_rate": 0.0004962121212121212,
+      "loss": 2.2121,
+      "step": 5
+    },
+    {
+      "epoch": 0.02727272727272727,
+      "grad_norm": 3.923773765563965,
+      "learning_rate": 0.0004954545454545455,
+      "loss": 2.6991,
+      "step": 6
+    },
+    {
+      "epoch": 0.031818181818181815,
+      "grad_norm": 4.66182279586792,
+      "learning_rate": 0.0004946969696969697,
+      "loss": 2.176,
+      "step": 7
+    },
+    {
+      "epoch": 0.03636363636363636,
+      "grad_norm": 3.7830166816711426,
+      "learning_rate": 0.000493939393939394,
+      "loss": 2.7265,
+      "step": 8
+    },
+    {
+      "epoch": 0.04090909090909091,
+      "grad_norm": 3.966615676879883,
+      "learning_rate": 0.0004931818181818182,
+      "loss": 3.1926,
+      "step": 9
+    },
+    {
+      "epoch": 0.045454545454545456,
+      "grad_norm": 3.281916618347168,
+      "learning_rate": 0.0004924242424242425,
+      "loss": 2.1706,
+      "step": 10
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 4.288072109222412,
+      "learning_rate": 0.0004916666666666666,
+      "loss": 3.1348,
+      "step": 11
+    },
+    {
+      "epoch": 0.05454545454545454,
+      "grad_norm": 3.2815868854522705,
+      "learning_rate": 0.0004909090909090909,
+      "loss": 2.2711,
+      "step": 12
+    },
+    {
+      "epoch": 0.05909090909090909,
+      "grad_norm": 2.75382924079895,
+      "learning_rate": 0.0004901515151515152,
+      "loss": 1.762,
+      "step": 13
+    },
+    {
+      "epoch": 0.06363636363636363,
+      "grad_norm": 2.6767005920410156,
+      "learning_rate": 0.0004893939393939393,
+      "loss": 1.591,
+      "step": 14
+    },
+    {
+      "epoch": 0.06818181818181818,
+      "grad_norm": 3.613719940185547,
+      "learning_rate": 0.0004886363636363636,
+      "loss": 2.1644,
+      "step": 15
+    },
+    {
+      "epoch": 0.07272727272727272,
+      "grad_norm": 3.3433680534362793,
+      "learning_rate": 0.00048787878787878784,
+      "loss": 2.9727,
+      "step": 16
+    },
+    {
+      "epoch": 0.07727272727272727,
+      "grad_norm": 3.7183644771575928,
+      "learning_rate": 0.0004871212121212121,
+      "loss": 1.8928,
+      "step": 17
+    },
+    {
+      "epoch": 0.08181818181818182,
+      "grad_norm": 4.1484575271606445,
+      "learning_rate": 0.0004863636363636364,
+      "loss": 2.6002,
+      "step": 18
+    },
+    {
+      "epoch": 0.08636363636363636,
+      "grad_norm": 3.281487464904785,
+      "learning_rate": 0.0004856060606060606,
+      "loss": 1.9074,
+      "step": 19
+    },
+    {
+      "epoch": 0.09090909090909091,
+      "grad_norm": 3.0067665576934814,
+      "learning_rate": 0.0004848484848484849,
+      "loss": 2.0375,
+      "step": 20
+    },
+    {
+      "epoch": 0.09545454545454546,
+      "grad_norm": 2.8053739070892334,
+      "learning_rate": 0.00048409090909090906,
+      "loss": 1.7248,
+      "step": 21
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 2.619422435760498,
+      "learning_rate": 0.00048333333333333334,
+      "loss": 2.6251,
+      "step": 22
+    },
+    {
+      "epoch": 0.10454545454545454,
+      "grad_norm": 3.1278717517852783,
+      "learning_rate": 0.0004825757575757576,
+      "loss": 2.7065,
+      "step": 23
+    },
+    {
+      "epoch": 0.10909090909090909,
+      "grad_norm": 2.723963975906372,
+      "learning_rate": 0.00048181818181818184,
+      "loss": 1.8838,
+      "step": 24
+    },
+    {
+      "epoch": 0.11363636363636363,
+      "grad_norm": 2.6069819927215576,
+      "learning_rate": 0.0004810606060606061,
+      "loss": 1.9516,
+      "step": 25
+    },
+    {
+      "epoch": 0.11818181818181818,
+      "grad_norm": 2.426720380783081,
+      "learning_rate": 0.0004803030303030303,
+      "loss": 2.1439,
+      "step": 26
+    },
+    {
+      "epoch": 0.12272727272727273,
+      "grad_norm": 2.3633666038513184,
+      "learning_rate": 0.00047954545454545456,
+      "loss": 2.0113,
+      "step": 27
+    },
+    {
+      "epoch": 0.12727272727272726,
+      "grad_norm": 2.988654136657715,
+      "learning_rate": 0.0004787878787878788,
+      "loss": 2.1691,
+      "step": 28
+    },
+    {
+      "epoch": 0.1318181818181818,
+      "grad_norm": 2.713346481323242,
+      "learning_rate": 0.00047803030303030306,
+      "loss": 2.1206,
+      "step": 29
+    },
+    {
+      "epoch": 0.13636363636363635,
+      "grad_norm": 2.9896864891052246,
+      "learning_rate": 0.0004772727272727273,
+      "loss": 2.139,
+      "step": 30
+    },
+    {
+      "epoch": 0.1409090909090909,
+      "grad_norm": 2.7176098823547363,
+      "learning_rate": 0.0004765151515151515,
+      "loss": 2.2194,
+      "step": 31
+    },
+    {
+      "epoch": 0.14545454545454545,
+      "grad_norm": 2.6730499267578125,
+      "learning_rate": 0.0004757575757575758,
+      "loss": 1.5875,
+      "step": 32
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 3.921717643737793,
+      "learning_rate": 0.000475,
+      "loss": 1.9037,
+      "step": 33
+    },
+    {
+      "epoch": 0.15454545454545454,
+      "grad_norm": 2.800473690032959,
+      "learning_rate": 0.0004742424242424243,
+      "loss": 1.8628,
+      "step": 34
+    },
+    {
+      "epoch": 0.1590909090909091,
+      "grad_norm": 2.7188827991485596,
+      "learning_rate": 0.0004734848484848485,
+      "loss": 2.1262,
+      "step": 35
+    },
+    {
+      "epoch": 0.16363636363636364,
+      "grad_norm": 2.794339895248413,
+      "learning_rate": 0.0004727272727272727,
+      "loss": 2.3508,
+      "step": 36
+    },
+    {
+      "epoch": 0.16818181818181818,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004727272727272727,
+      "loss": 2.4565,
+      "step": 37
+    },
+    {
+      "epoch": 0.17272727272727273,
+      "grad_norm": 3.1815218925476074,
+      "learning_rate": 0.000471969696969697,
+      "loss": 2.2242,
+      "step": 38
+    },
+    {
+      "epoch": 0.17727272727272728,
+      "grad_norm": 3.4017906188964844,
+      "learning_rate": 0.0004712121212121212,
+      "loss": 2.1975,
+      "step": 39
+    },
+    {
+      "epoch": 0.18181818181818182,
+      "grad_norm": 2.7533328533172607,
+      "learning_rate": 0.00047045454545454544,
+      "loss": 2.1,
+      "step": 40
+    },
+    {
+      "epoch": 0.18636363636363637,
+      "grad_norm": 2.8896608352661133,
+      "learning_rate": 0.0004696969696969697,
+      "loss": 1.8716,
+      "step": 41
+    },
+    {
+      "epoch": 0.19090909090909092,
+      "grad_norm": 2.501896858215332,
+      "learning_rate": 0.00046893939393939394,
+      "loss": 2.3533,
+      "step": 42
+    },
+    {
+      "epoch": 0.19545454545454546,
+      "grad_norm": 2.2779133319854736,
+      "learning_rate": 0.0004681818181818182,
+      "loss": 1.8062,
+      "step": 43
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 2.78344988822937,
+      "learning_rate": 0.00046742424242424244,
+      "loss": 2.4259,
+      "step": 44
+    },
+    {
+      "epoch": 0.20454545454545456,
+      "grad_norm": 2.230311393737793,
+      "learning_rate": 0.00046666666666666666,
+      "loss": 1.8689,
+      "step": 45
+    },
+    {
+      "epoch": 0.20909090909090908,
+      "grad_norm": 3.4158501625061035,
+      "learning_rate": 0.0004659090909090909,
+      "loss": 2.3214,
+      "step": 46
+    },
+    {
+      "epoch": 0.21363636363636362,
+      "grad_norm": 2.355423927307129,
+      "learning_rate": 0.00046515151515151516,
+      "loss": 1.7059,
+      "step": 47
+    },
+    {
+      "epoch": 0.21818181818181817,
+      "grad_norm": 2.495224952697754,
+      "learning_rate": 0.00046439393939393944,
+      "loss": 1.3574,
+      "step": 48
+    },
+    {
+      "epoch": 0.22272727272727272,
+      "grad_norm": 2.829482078552246,
+      "learning_rate": 0.00046363636363636366,
+      "loss": 2.2364,
+      "step": 49
+    },
+    {
+      "epoch": 0.22727272727272727,
+      "grad_norm": 2.390627861022949,
+      "learning_rate": 0.0004628787878787879,
+      "loss": 1.5226,
+      "step": 50
+    },
+    {
+      "epoch": 0.2318181818181818,
+      "grad_norm": 2.2006781101226807,
+      "learning_rate": 0.0004621212121212121,
+      "loss": 1.6848,
+      "step": 51
+    },
+    {
+      "epoch": 0.23636363636363636,
+      "grad_norm": 2.737412452697754,
+      "learning_rate": 0.0004613636363636364,
+      "loss": 1.9071,
+      "step": 52
+    },
+    {
+      "epoch": 0.2409090909090909,
+      "grad_norm": 3.4992029666900635,
+      "learning_rate": 0.00046060606060606066,
+      "loss": 2.0172,
+      "step": 53
+    },
+    {
+      "epoch": 0.24545454545454545,
+      "grad_norm": 3.196709394454956,
+      "learning_rate": 0.0004598484848484848,
+      "loss": 2.6357,
+      "step": 54
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 3.9549436569213867,
+      "learning_rate": 0.0004590909090909091,
+      "loss": 2.9822,
+      "step": 55
+    },
+    {
+      "epoch": 0.2545454545454545,
+      "grad_norm": 2.788527488708496,
+      "learning_rate": 0.0004583333333333333,
+      "loss": 2.3458,
+      "step": 56
+    },
+    {
+      "epoch": 0.2590909090909091,
+      "grad_norm": 3.2539544105529785,
+      "learning_rate": 0.0004575757575757576,
+      "loss": 1.9336,
+      "step": 57
+    },
+    {
+      "epoch": 0.2636363636363636,
+      "grad_norm": 2.859744071960449,
+      "learning_rate": 0.0004568181818181819,
+      "loss": 2.4852,
+      "step": 58
+    },
+    {
+      "epoch": 0.2681818181818182,
+      "grad_norm": 2.6832542419433594,
+      "learning_rate": 0.00045606060606060605,
+      "loss": 2.0347,
+      "step": 59
+    },
+    {
+      "epoch": 0.2727272727272727,
+      "grad_norm": 4.593046188354492,
+      "learning_rate": 0.0004553030303030303,
+      "loss": 2.2415,
+      "step": 60
+    },
+    {
+      "epoch": 0.2772727272727273,
+      "grad_norm": 3.3459599018096924,
+      "learning_rate": 0.00045454545454545455,
+      "loss": 2.2843,
+      "step": 61
+    },
+    {
+      "epoch": 0.2818181818181818,
+      "grad_norm": 2.1371498107910156,
+      "learning_rate": 0.0004537878787878788,
+      "loss": 1.9185,
+      "step": 62
+    },
+    {
+      "epoch": 0.2863636363636364,
+      "grad_norm": 2.3603177070617676,
+      "learning_rate": 0.000453030303030303,
+      "loss": 1.9426,
+      "step": 63
+    },
+    {
+      "epoch": 0.2909090909090909,
+      "grad_norm": 2.5435550212860107,
+      "learning_rate": 0.00045227272727272727,
+      "loss": 2.4551,
+      "step": 64
+    },
+    {
+      "epoch": 0.29545454545454547,
+      "grad_norm": 2.5501880645751953,
+      "learning_rate": 0.00045151515151515154,
+      "loss": 1.9113,
+      "step": 65
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 2.8549928665161133,
+      "learning_rate": 0.00045075757575757577,
+      "loss": 2.2465,
+      "step": 66
+    },
+    {
+      "epoch": 0.30454545454545456,
+      "grad_norm": 2.396857976913452,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 2.1836,
+      "step": 67
+    },
+    {
+      "epoch": 0.3090909090909091,
+      "grad_norm": 2.7043912410736084,
+      "learning_rate": 0.0004492424242424242,
+      "loss": 2.1715,
+      "step": 68
+    },
+    {
+      "epoch": 0.31363636363636366,
+      "grad_norm": 3.1579270362854004,
+      "learning_rate": 0.0004484848484848485,
+      "loss": 2.4971,
+      "step": 69
+    },
+    {
+      "epoch": 0.3181818181818182,
+      "grad_norm": 2.3673815727233887,
+      "learning_rate": 0.00044772727272727276,
+      "loss": 1.7927,
+      "step": 70
+    },
+    {
+      "epoch": 0.32272727272727275,
+      "grad_norm": 2.776143789291382,
+      "learning_rate": 0.000446969696969697,
+      "loss": 1.9065,
+      "step": 71
+    },
+    {
+      "epoch": 0.32727272727272727,
+      "grad_norm": 2.937574625015259,
+      "learning_rate": 0.00044621212121212126,
+      "loss": 1.6579,
+      "step": 72
+    },
+    {
+      "epoch": 0.33181818181818185,
+      "grad_norm": 2.937641143798828,
+      "learning_rate": 0.00044545454545454543,
+      "loss": 2.2259,
+      "step": 73
+    },
+    {
+      "epoch": 0.33636363636363636,
+      "grad_norm": 2.305844306945801,
+      "learning_rate": 0.0004446969696969697,
+      "loss": 2.0766,
+      "step": 74
+    },
+    {
+      "epoch": 0.3409090909090909,
+      "grad_norm": 3.1322784423828125,
+      "learning_rate": 0.000443939393939394,
+      "loss": 2.3208,
+      "step": 75
+    },
+    {
+      "epoch": 0.34545454545454546,
+      "grad_norm": 2.7713890075683594,
+      "learning_rate": 0.0004431818181818182,
+      "loss": 2.4553,
+      "step": 76
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 2.275108575820923,
+      "learning_rate": 0.00044242424242424243,
+      "loss": 1.4249,
+      "step": 77
+    },
+    {
+      "epoch": 0.35454545454545455,
+      "grad_norm": 2.502997875213623,
+      "learning_rate": 0.00044166666666666665,
+      "loss": 2.4343,
+      "step": 78
+    },
+    {
+      "epoch": 0.35909090909090907,
+      "grad_norm": 2.1204617023468018,
+      "learning_rate": 0.00044090909090909093,
+      "loss": 1.9248,
+      "step": 79
+    },
+    {
+      "epoch": 0.36363636363636365,
+      "grad_norm": 2.9564898014068604,
+      "learning_rate": 0.00044015151515151515,
+      "loss": 1.6672,
+      "step": 80
+    },
+    {
+      "epoch": 0.36818181818181817,
+      "grad_norm": 3.0879478454589844,
+      "learning_rate": 0.0004393939393939394,
+      "loss": 2.0625,
+      "step": 81
+    },
+    {
+      "epoch": 0.37272727272727274,
+      "grad_norm": 3.1532368659973145,
+      "learning_rate": 0.00043863636363636365,
+      "loss": 2.3648,
+      "step": 82
+    },
+    {
+      "epoch": 0.37727272727272726,
+      "grad_norm": 2.1850852966308594,
+      "learning_rate": 0.00043787878787878787,
+      "loss": 2.4397,
+      "step": 83
+    },
+    {
+      "epoch": 0.38181818181818183,
+      "grad_norm": 2.241144895553589,
+      "learning_rate": 0.00043712121212121215,
+      "loss": 2.5193,
+      "step": 84
+    },
+    {
+      "epoch": 0.38636363636363635,
+      "grad_norm": 3.6500165462493896,
+      "learning_rate": 0.00043636363636363637,
+      "loss": 2.1096,
+      "step": 85
+    },
+    {
+      "epoch": 0.39090909090909093,
+      "grad_norm": 2.0548837184906006,
+      "learning_rate": 0.0004356060606060606,
+      "loss": 2.359,
+      "step": 86
+    },
+    {
+      "epoch": 0.39545454545454545,
+      "grad_norm": 2.4512407779693604,
+      "learning_rate": 0.00043484848484848487,
+      "loss": 1.5353,
+      "step": 87
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.9420820474624634,
+      "learning_rate": 0.0004340909090909091,
+      "loss": 1.688,
+      "step": 88
+    },
+    {
+      "epoch": 0.40454545454545454,
+      "grad_norm": 1.845475673675537,
+      "learning_rate": 0.00043333333333333337,
+      "loss": 1.6642,
+      "step": 89
+    },
+    {
+      "epoch": 0.4090909090909091,
+      "grad_norm": 2.2735133171081543,
+      "learning_rate": 0.0004325757575757576,
+      "loss": 1.89,
+      "step": 90
+    },
+    {
+      "epoch": 0.41363636363636364,
+      "grad_norm": 2.203105926513672,
+      "learning_rate": 0.0004318181818181818,
+      "loss": 1.9556,
+      "step": 91
+    },
+    {
+      "epoch": 0.41818181818181815,
+      "grad_norm": 1.8748105764389038,
+      "learning_rate": 0.00043106060606060603,
+      "loss": 1.5452,
+      "step": 92
+    },
+    {
+      "epoch": 0.42272727272727273,
+      "grad_norm": 2.8958442211151123,
+      "learning_rate": 0.0004303030303030303,
+      "loss": 1.9343,
+      "step": 93
+    },
+    {
+      "epoch": 0.42727272727272725,
+      "grad_norm": 2.7512269020080566,
+      "learning_rate": 0.0004295454545454546,
+      "loss": 2.4008,
+      "step": 94
+    },
+    {
+      "epoch": 0.4318181818181818,
+      "grad_norm": 2.748307228088379,
+      "learning_rate": 0.00042878787878787876,
+      "loss": 2.3614,
+      "step": 95
+    },
+    {
+      "epoch": 0.43636363636363634,
+      "grad_norm": 3.7091145515441895,
+      "learning_rate": 0.00042803030303030303,
+      "loss": 1.5435,
+      "step": 96
+    },
+    {
+      "epoch": 0.4409090909090909,
+      "grad_norm": 2.0227293968200684,
+      "learning_rate": 0.00042727272727272726,
+      "loss": 1.5536,
+      "step": 97
+    },
+    {
+      "epoch": 0.44545454545454544,
+      "grad_norm": 1.868477702140808,
+      "learning_rate": 0.00042651515151515153,
+      "loss": 2.0019,
+      "step": 98
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.2410340309143066,
+      "learning_rate": 0.0004257575757575758,
+      "loss": 2.0278,
+      "step": 99
+    },
+    {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 2.4206206798553467,
+      "learning_rate": 0.000425,
+      "loss": 2.6757,
+      "step": 100
+    },
+    {
+      "epoch": 0.4590909090909091,
+      "grad_norm": 2.6481056213378906,
+      "learning_rate": 0.00042424242424242425,
+      "loss": 2.004,
+      "step": 101
+    },
+    {
+      "epoch": 0.4636363636363636,
+      "grad_norm": 2.493495225906372,
+      "learning_rate": 0.0004234848484848485,
+      "loss": 2.2102,
+      "step": 102
+    },
+    {
+      "epoch": 0.4681818181818182,
+      "grad_norm": 2.588595390319824,
+      "learning_rate": 0.00042272727272727275,
+      "loss": 2.3133,
+      "step": 103
+    },
+    {
+      "epoch": 0.4727272727272727,
+      "grad_norm": 2.185718536376953,
+      "learning_rate": 0.00042196969696969703,
+      "loss": 2.5506,
+      "step": 104
+    },
+    {
+      "epoch": 0.4772727272727273,
+      "grad_norm": 2.155470132827759,
+      "learning_rate": 0.0004212121212121212,
+      "loss": 2.2074,
+      "step": 105
+    },
+    {
+      "epoch": 0.4818181818181818,
+      "grad_norm": 2.518435001373291,
+      "learning_rate": 0.0004204545454545455,
+      "loss": 1.8589,
+      "step": 106
+    },
+    {
+      "epoch": 0.4863636363636364,
+      "grad_norm": 2.5512635707855225,
+      "learning_rate": 0.0004196969696969697,
+      "loss": 1.9953,
+      "step": 107
+    },
+    {
+      "epoch": 0.4909090909090909,
+      "grad_norm": 2.238809108734131,
+      "learning_rate": 0.00041893939393939397,
+      "loss": 2.2441,
+      "step": 108
+    },
+    {
+      "epoch": 0.4954545454545455,
+      "grad_norm": 1.8442784547805786,
+      "learning_rate": 0.00041818181818181814,
+      "loss": 1.8682,
+      "step": 109
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 2.4844954013824463,
+      "learning_rate": 0.0004174242424242424,
+      "loss": 1.9522,
+      "step": 110
+    },
+    {
+      "epoch": 0.5045454545454545,
+      "grad_norm": 1.9704878330230713,
+      "learning_rate": 0.0004166666666666667,
+      "loss": 1.4167,
+      "step": 111
+    },
+    {
+      "epoch": 0.509090909090909,
+      "grad_norm": 2.2447972297668457,
+      "learning_rate": 0.0004159090909090909,
+      "loss": 1.7897,
+      "step": 112
+    },
+    {
+      "epoch": 0.5136363636363637,
+      "grad_norm": 2.530410051345825,
+      "learning_rate": 0.0004151515151515152,
+      "loss": 2.5473,
+      "step": 113
+    },
+    {
+      "epoch": 0.5181818181818182,
+      "grad_norm": 2.450526714324951,
+      "learning_rate": 0.00041439393939393936,
+      "loss": 2.0436,
+      "step": 114
+    },
+    {
+      "epoch": 0.5227272727272727,
+      "grad_norm": 2.4212632179260254,
+      "learning_rate": 0.00041363636363636364,
+      "loss": 2.1118,
+      "step": 115
+    },
+    {
+      "epoch": 0.5272727272727272,
+      "grad_norm": 1.9820351600646973,
+      "learning_rate": 0.0004128787878787879,
+      "loss": 1.9614,
+      "step": 116
+    },
+    {
+      "epoch": 0.5318181818181819,
+      "grad_norm": 2.467961549758911,
+      "learning_rate": 0.00041212121212121214,
+      "loss": 1.9572,
+      "step": 117
+    },
+    {
+      "epoch": 0.5363636363636364,
+      "grad_norm": 2.2693068981170654,
+      "learning_rate": 0.00041136363636363636,
+      "loss": 1.9033,
+      "step": 118
+    },
+    {
+      "epoch": 0.5409090909090909,
+      "grad_norm": 2.299119710922241,
+      "learning_rate": 0.0004106060606060606,
+      "loss": 1.9848,
+      "step": 119
+    },
+    {
+      "epoch": 0.5454545454545454,
+      "grad_norm": 2.0330560207366943,
+      "learning_rate": 0.00040984848484848486,
+      "loss": 1.3706,
+      "step": 120
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 2.197603702545166,
+      "learning_rate": 0.00040909090909090913,
+      "loss": 2.1687,
+      "step": 121
+    },
+    {
+      "epoch": 0.5545454545454546,
+      "grad_norm": 2.7206549644470215,
+      "learning_rate": 0.00040833333333333336,
+      "loss": 2.3247,
+      "step": 122
+    },
+    {
+      "epoch": 0.5590909090909091,
+      "grad_norm": 2.882654905319214,
+      "learning_rate": 0.0004075757575757576,
+      "loss": 1.6946,
+      "step": 123
+    },
+    {
+      "epoch": 0.5636363636363636,
+      "grad_norm": 2.3815231323242188,
+      "learning_rate": 0.0004068181818181818,
+      "loss": 1.862,
+      "step": 124
+    },
+    {
+      "epoch": 0.5681818181818182,
+      "grad_norm": 2.4142932891845703,
+      "learning_rate": 0.0004060606060606061,
+      "loss": 2.0066,
+      "step": 125
+    },
+    {
+      "epoch": 0.5727272727272728,
+      "grad_norm": 2.6641104221343994,
+      "learning_rate": 0.0004053030303030303,
+      "loss": 1.9456,
+      "step": 126
+    },
+    {
+      "epoch": 0.5772727272727273,
+      "grad_norm": 2.983633518218994,
+      "learning_rate": 0.0004045454545454546,
+      "loss": 2.0049,
+      "step": 127
+    },
+    {
+      "epoch": 0.5818181818181818,
+      "grad_norm": 2.1993696689605713,
+      "learning_rate": 0.0004037878787878788,
+      "loss": 1.6259,
+      "step": 128
+    },
+    {
+      "epoch": 0.5863636363636363,
+      "grad_norm": 3.0398480892181396,
+      "learning_rate": 0.000403030303030303,
+      "loss": 1.6535,
+      "step": 129
+    },
+    {
+      "epoch": 0.5909090909090909,
+      "grad_norm": 2.298558235168457,
+      "learning_rate": 0.0004022727272727273,
+      "loss": 1.8693,
+      "step": 130
+    },
+    {
+      "epoch": 0.5954545454545455,
+      "grad_norm": 2.5126214027404785,
+      "learning_rate": 0.0004015151515151515,
+      "loss": 2.1854,
+      "step": 131
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 2.0419557094573975,
+      "learning_rate": 0.00040075757575757574,
+      "loss": 1.5857,
+      "step": 132
+    },
+    {
+      "epoch": 0.6045454545454545,
+      "grad_norm": 2.4304699897766113,
+      "learning_rate": 0.0004,
+      "loss": 2.347,
+      "step": 133
+    },
+    {
+      "epoch": 0.6090909090909091,
+      "grad_norm": 3.098036050796509,
+      "learning_rate": 0.00039924242424242424,
+      "loss": 2.9748,
+      "step": 134
+    },
+    {
+      "epoch": 0.6136363636363636,
+      "grad_norm": 2.4403679370880127,
+      "learning_rate": 0.0003984848484848485,
+      "loss": 1.9342,
+      "step": 135
+    },
+    {
+      "epoch": 0.6181818181818182,
+      "grad_norm": 2.832394599914551,
+      "learning_rate": 0.00039772727272727274,
+      "loss": 2.132,
+      "step": 136
+    },
+    {
+      "epoch": 0.6227272727272727,
+      "grad_norm": 2.601243019104004,
+      "learning_rate": 0.00039696969696969696,
+      "loss": 2.3074,
+      "step": 137
+    },
+    {
+      "epoch": 0.6272727272727273,
+      "grad_norm": 2.2306132316589355,
+      "learning_rate": 0.00039621212121212124,
+      "loss": 1.6065,
+      "step": 138
+    },
+    {
+      "epoch": 0.6318181818181818,
+      "grad_norm": 2.393157720565796,
+      "learning_rate": 0.00039545454545454546,
+      "loss": 1.7411,
+      "step": 139
+    },
+    {
+      "epoch": 0.6363636363636364,
+      "grad_norm": 2.174208164215088,
+      "learning_rate": 0.00039469696969696974,
+      "loss": 1.3876,
+      "step": 140
+    },
+    {
+      "epoch": 0.6409090909090909,
+      "grad_norm": 2.3376457691192627,
+      "learning_rate": 0.0003939393939393939,
+      "loss": 2.3752,
+      "step": 141
+    },
+    {
+      "epoch": 0.6454545454545455,
+      "grad_norm": 2.141479969024658,
+      "learning_rate": 0.0003931818181818182,
+      "loss": 1.3948,
+      "step": 142
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 2.4302890300750732,
+      "learning_rate": 0.0003924242424242424,
+      "loss": 2.5493,
+      "step": 143
+    },
+    {
+      "epoch": 0.6545454545454545,
+      "grad_norm": 1.9080986976623535,
+      "learning_rate": 0.0003916666666666667,
+      "loss": 1.4652,
+      "step": 144
+    },
+    {
+      "epoch": 0.6590909090909091,
+      "grad_norm": 2.438420057296753,
+      "learning_rate": 0.00039090909090909096,
+      "loss": 2.0606,
+      "step": 145
+    },
+    {
+      "epoch": 0.6636363636363637,
+      "grad_norm": 2.028294563293457,
+      "learning_rate": 0.0003901515151515151,
+      "loss": 1.8798,
+      "step": 146
+    },
+    {
+      "epoch": 0.6681818181818182,
+      "grad_norm": 2.4235315322875977,
+      "learning_rate": 0.0003893939393939394,
+      "loss": 1.8855,
+      "step": 147
+    },
+    {
+      "epoch": 0.6727272727272727,
+      "grad_norm": 2.430391788482666,
+      "learning_rate": 0.0003886363636363636,
+      "loss": 1.7753,
+      "step": 148
+    },
+    {
+      "epoch": 0.6772727272727272,
+      "grad_norm": 2.053199052810669,
+      "learning_rate": 0.0003878787878787879,
+      "loss": 2.1466,
+      "step": 149
+    },
+    {
+      "epoch": 0.6818181818181818,
+      "grad_norm": 2.067093849182129,
+      "learning_rate": 0.0003871212121212122,
+      "loss": 1.7715,
+      "step": 150
+    },
+    {
+      "epoch": 0.6863636363636364,
+      "grad_norm": 2.047165632247925,
+      "learning_rate": 0.00038636363636363635,
+      "loss": 1.8703,
+      "step": 151
+    },
+    {
+      "epoch": 0.6909090909090909,
+      "grad_norm": 2.567540168762207,
+      "learning_rate": 0.0003856060606060606,
+      "loss": 1.7973,
+      "step": 152
+    },
+    {
+      "epoch": 0.6954545454545454,
+      "grad_norm": 2.5282599925994873,
+      "learning_rate": 0.00038484848484848485,
+      "loss": 2.422,
+      "step": 153
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 2.0428948402404785,
+      "learning_rate": 0.0003840909090909091,
+      "loss": 1.5709,
+      "step": 154
+    },
+    {
+      "epoch": 0.7045454545454546,
+      "grad_norm": 2.032672166824341,
+      "learning_rate": 0.00038333333333333334,
+      "loss": 1.8584,
+      "step": 155
+    },
+    {
+      "epoch": 0.7090909090909091,
+      "grad_norm": 2.4448535442352295,
+      "learning_rate": 0.00038257575757575757,
+      "loss": 2.069,
+      "step": 156
+    },
+    {
+      "epoch": 0.7136363636363636,
+      "grad_norm": 1.6503087282180786,
+      "learning_rate": 0.00038181818181818184,
+      "loss": 1.5194,
+      "step": 157
+    },
+    {
+      "epoch": 0.7181818181818181,
+      "grad_norm": 2.5853593349456787,
+      "learning_rate": 0.00038106060606060607,
+      "loss": 2.4603,
+      "step": 158
+    },
+    {
+      "epoch": 0.7227272727272728,
+      "grad_norm": 2.353992223739624,
+      "learning_rate": 0.00038030303030303034,
+      "loss": 1.4417,
+      "step": 159
+    },
+    {
+      "epoch": 0.7272727272727273,
+      "grad_norm": 2.382633686065674,
+      "learning_rate": 0.0003795454545454545,
+      "loss": 1.9239,
+      "step": 160
+    },
+    {
+      "epoch": 0.7318181818181818,
+      "grad_norm": 2.7274303436279297,
+      "learning_rate": 0.0003787878787878788,
+      "loss": 2.1116,
+      "step": 161
+    },
+    {
+      "epoch": 0.7363636363636363,
+      "grad_norm": 2.0137476921081543,
+      "learning_rate": 0.00037803030303030306,
+      "loss": 1.5707,
+      "step": 162
+    },
+    {
+      "epoch": 0.740909090909091,
+      "grad_norm": 1.977155089378357,
+      "learning_rate": 0.0003772727272727273,
+      "loss": 1.4972,
+      "step": 163
+    },
+    {
+      "epoch": 0.7454545454545455,
+      "grad_norm": 2.5506880283355713,
+      "learning_rate": 0.0003765151515151515,
+      "loss": 2.4583,
+      "step": 164
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 2.2664661407470703,
+      "learning_rate": 0.00037575757575757573,
+      "loss": 2.1239,
+      "step": 165
+    },
+    {
+      "epoch": 0.7545454545454545,
+      "grad_norm": 2.393831968307495,
+      "learning_rate": 0.000375,
+      "loss": 2.2104,
+      "step": 166
+    },
+    {
+      "epoch": 0.759090909090909,
+      "grad_norm": 2.358670711517334,
+      "learning_rate": 0.0003742424242424243,
+      "loss": 1.7545,
+      "step": 167
+    },
+    {
+      "epoch": 0.7636363636363637,
+      "grad_norm": 2.0985164642333984,
+      "learning_rate": 0.0003734848484848485,
+      "loss": 1.7337,
+      "step": 168
+    },
+    {
+      "epoch": 0.7681818181818182,
+      "grad_norm": 1.711176872253418,
+      "learning_rate": 0.00037272727272727273,
+      "loss": 1.3195,
+      "step": 169
+    },
+    {
+      "epoch": 0.7727272727272727,
+      "grad_norm": 2.20684814453125,
+      "learning_rate": 0.00037196969696969695,
+      "loss": 1.7633,
+      "step": 170
+    },
+    {
+      "epoch": 0.7772727272727272,
+      "grad_norm": 2.0569570064544678,
+      "learning_rate": 0.00037121212121212123,
+      "loss": 1.8354,
+      "step": 171
+    },
+    {
+      "epoch": 0.7818181818181819,
+      "grad_norm": 2.4895520210266113,
+      "learning_rate": 0.0003704545454545455,
+      "loss": 2.6706,
+      "step": 172
+    },
+    {
+      "epoch": 0.7863636363636364,
+      "grad_norm": 2.3134992122650146,
+      "learning_rate": 0.00036969696969696967,
+      "loss": 2.077,
+      "step": 173
+    },
+    {
+      "epoch": 0.7909090909090909,
+      "grad_norm": 1.876047968864441,
+      "learning_rate": 0.00036893939393939395,
+      "loss": 1.5816,
+      "step": 174
+    },
+    {
+      "epoch": 0.7954545454545454,
+      "grad_norm": 2.301314353942871,
+      "learning_rate": 0.00036818181818181817,
+      "loss": 2.9433,
+      "step": 175
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 2.4783785343170166,
+      "learning_rate": 0.00036742424242424245,
+      "loss": 2.3913,
+      "step": 176
+    },
+    {
+      "epoch": 0.8045454545454546,
+      "grad_norm": 2.3966879844665527,
+      "learning_rate": 0.00036666666666666667,
+      "loss": 2.6103,
+      "step": 177
+    },
+    {
+      "epoch": 0.8090909090909091,
+      "grad_norm": 2.1050190925598145,
+      "learning_rate": 0.0003659090909090909,
+      "loss": 1.6801,
+      "step": 178
+    },
+    {
+      "epoch": 0.8136363636363636,
+      "grad_norm": 2.3336639404296875,
+      "learning_rate": 0.00036515151515151517,
+      "loss": 1.936,
+      "step": 179
+    },
+    {
+      "epoch": 0.8181818181818182,
+      "grad_norm": 2.4867429733276367,
+      "learning_rate": 0.0003643939393939394,
+      "loss": 2.0285,
+      "step": 180
+    },
+    {
+      "epoch": 0.8227272727272728,
+      "grad_norm": 1.9529379606246948,
+      "learning_rate": 0.00036363636363636367,
+      "loss": 1.5503,
+      "step": 181
+    },
+    {
+      "epoch": 0.8272727272727273,
+      "grad_norm": 2.095381736755371,
+      "learning_rate": 0.00036287878787878784,
+      "loss": 2.5626,
+      "step": 182
+    },
+    {
+      "epoch": 0.8318181818181818,
+      "grad_norm": 2.156099319458008,
+      "learning_rate": 0.0003621212121212121,
+      "loss": 1.8788,
+      "step": 183
+    },
+    {
+      "epoch": 0.8363636363636363,
+      "grad_norm": 1.8726741075515747,
+      "learning_rate": 0.0003613636363636364,
+      "loss": 1.6386,
+      "step": 184
+    },
+    {
+      "epoch": 0.8409090909090909,
+      "grad_norm": 2.6056482791900635,
+      "learning_rate": 0.0003606060606060606,
+      "loss": 1.7965,
+      "step": 185
+    },
+    {
+      "epoch": 0.8454545454545455,
+      "grad_norm": 2.65775728225708,
+      "learning_rate": 0.0003598484848484849,
+      "loss": 1.775,
+      "step": 186
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.9309563636779785,
+      "learning_rate": 0.00035909090909090906,
+      "loss": 1.7575,
+      "step": 187
+    },
+    {
+      "epoch": 0.8545454545454545,
+      "grad_norm": 1.874107003211975,
+      "learning_rate": 0.00035833333333333333,
+      "loss": 1.4686,
+      "step": 188
+    },
+    {
+      "epoch": 0.8590909090909091,
+      "grad_norm": 2.125084400177002,
+      "learning_rate": 0.0003575757575757576,
+      "loss": 1.3713,
+      "step": 189
+    },
+    {
+      "epoch": 0.8636363636363636,
+      "grad_norm": 2.016660690307617,
+      "learning_rate": 0.00035681818181818183,
+      "loss": 1.6914,
+      "step": 190
+    },
+    {
+      "epoch": 0.8681818181818182,
+      "grad_norm": 2.8699893951416016,
+      "learning_rate": 0.0003560606060606061,
+      "loss": 1.6524,
+      "step": 191
+    },
+    {
+      "epoch": 0.8727272727272727,
+      "grad_norm": 2.2474772930145264,
+      "learning_rate": 0.0003553030303030303,
+      "loss": 1.7333,
+      "step": 192
+    },
+    {
+      "epoch": 0.8772727272727273,
+      "grad_norm": 2.6996030807495117,
+      "learning_rate": 0.00035454545454545455,
+      "loss": 1.5828,
+      "step": 193
+    },
+    {
+      "epoch": 0.8818181818181818,
+      "grad_norm": 2.196274757385254,
+      "learning_rate": 0.0003537878787878788,
+      "loss": 1.6058,
+      "step": 194
+    },
+    {
+      "epoch": 0.8863636363636364,
+      "grad_norm": 2.4350290298461914,
+      "learning_rate": 0.00035303030303030305,
+      "loss": 2.0724,
+      "step": 195
+    },
+    {
+      "epoch": 0.8909090909090909,
+      "grad_norm": 2.047480821609497,
+      "learning_rate": 0.0003522727272727273,
+      "loss": 1.7112,
+      "step": 196
+    },
+    {
+      "epoch": 0.8954545454545455,
+      "grad_norm": 2.4547033309936523,
+      "learning_rate": 0.0003515151515151515,
+      "loss": 1.7747,
+      "step": 197
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 2.9125847816467285,
+      "learning_rate": 0.0003507575757575758,
+      "loss": 2.0878,
+      "step": 198
+    },
+    {
+      "epoch": 0.9045454545454545,
+      "grad_norm": 2.168196678161621,
+      "learning_rate": 0.00035,
+      "loss": 1.5592,
+      "step": 199
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 2.364847183227539,
+      "learning_rate": 0.0003492424242424243,
+      "loss": 2.0301,
+      "step": 200
+    },
+    {
+      "epoch": 0.9136363636363637,
+      "grad_norm": 2.743267059326172,
+      "learning_rate": 0.0003484848484848485,
+      "loss": 1.8784,
+      "step": 201
+    },
+    {
+      "epoch": 0.9181818181818182,
+      "grad_norm": 2.2784361839294434,
+      "learning_rate": 0.0003477272727272727,
+      "loss": 1.5936,
+      "step": 202
+    },
+    {
+      "epoch": 0.9227272727272727,
+      "grad_norm": 2.875943422317505,
+      "learning_rate": 0.000346969696969697,
+      "loss": 1.9961,
+      "step": 203
+    },
+    {
+      "epoch": 0.9272727272727272,
+      "grad_norm": 2.0056071281433105,
+      "learning_rate": 0.0003462121212121212,
+      "loss": 1.8795,
+      "step": 204
+    },
+    {
+      "epoch": 0.9318181818181818,
+      "grad_norm": 2.3547298908233643,
+      "learning_rate": 0.00034545454545454544,
+      "loss": 2.1429,
+      "step": 205
+    },
+    {
+      "epoch": 0.9363636363636364,
+      "grad_norm": 2.7082138061523438,
+      "learning_rate": 0.0003446969696969697,
+      "loss": 1.7504,
+      "step": 206
+    },
+    {
+      "epoch": 0.9409090909090909,
+      "grad_norm": 2.335139751434326,
+      "learning_rate": 0.00034393939393939394,
+      "loss": 1.9774,
+      "step": 207
+    },
+    {
+      "epoch": 0.9454545454545454,
+      "grad_norm": 2.3550143241882324,
+      "learning_rate": 0.0003431818181818182,
+      "loss": 1.8602,
+      "step": 208
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 2.622682809829712,
+      "learning_rate": 0.00034242424242424244,
+      "loss": 2.2283,
+      "step": 209
+    },
+    {
+      "epoch": 0.9545454545454546,
+      "grad_norm": 1.7282129526138306,
+      "learning_rate": 0.00034166666666666666,
+      "loss": 1.6025,
+      "step": 210
+    },
+    {
+      "epoch": 0.9590909090909091,
+      "grad_norm": 2.8252415657043457,
+      "learning_rate": 0.0003409090909090909,
+      "loss": 1.7649,
+      "step": 211
+    },
+    {
+      "epoch": 0.9636363636363636,
+      "grad_norm": 2.146219491958618,
+      "learning_rate": 0.00034015151515151516,
+      "loss": 1.9742,
+      "step": 212
+    },
+    {
+      "epoch": 0.9681818181818181,
+      "grad_norm": 2.124577045440674,
+      "learning_rate": 0.00033939393939393943,
+      "loss": 1.7412,
+      "step": 213
+    },
+    {
+      "epoch": 0.9727272727272728,
+      "grad_norm": 1.7649497985839844,
+      "learning_rate": 0.00033863636363636366,
+      "loss": 1.2667,
+      "step": 214
+    },
+    {
+      "epoch": 0.9772727272727273,
+      "grad_norm": 2.375659465789795,
+      "learning_rate": 0.0003378787878787879,
+      "loss": 1.7142,
+      "step": 215
+    },
+    {
+      "epoch": 0.9818181818181818,
+      "grad_norm": 1.9995368719100952,
+      "learning_rate": 0.0003371212121212121,
+      "loss": 1.4613,
+      "step": 216
+    },
+    {
+      "epoch": 0.9863636363636363,
+      "grad_norm": 2.2640981674194336,
+      "learning_rate": 0.0003363636363636364,
+      "loss": 1.9474,
+      "step": 217
+    },
+    {
+      "epoch": 0.990909090909091,
+      "grad_norm": 2.1680893898010254,
+      "learning_rate": 0.00033560606060606066,
+      "loss": 2.5352,
+      "step": 218
+    },
+    {
+      "epoch": 0.9954545454545455,
+      "grad_norm": 2.3969366550445557,
+      "learning_rate": 0.0003348484848484848,
+      "loss": 1.9012,
+      "step": 219
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 3.3696913719177246,
+      "learning_rate": 0.0003340909090909091,
+      "loss": 2.2928,
+      "step": 220
+    },
+    {
+      "epoch": 1.0,
+      "eval_f1": 0.891,
+      "eval_gen_len": 41.9182,
+      "eval_loss": 1.8093845844268799,
+      "eval_precision": 0.8891,
+      "eval_recall": 0.8931,
+      "eval_rouge1": 0.466,
+      "eval_rouge2": 0.2146,
+      "eval_rougeL": 0.3912,
+      "eval_rougeLsum": 0.4301,
+      "eval_runtime": 25.1921,
+      "eval_samples_per_second": 4.366,
+      "eval_steps_per_second": 0.556,
+      "step": 220
+    },
+    {
+      "epoch": 1.0045454545454546,
+      "grad_norm": 1.7403843402862549,
+      "learning_rate": 0.0003333333333333333,
+      "loss": 1.7294,
+      "step": 221
+    },
+    {
+      "epoch": 1.009090909090909,
+      "grad_norm": 1.5273452997207642,
+      "learning_rate": 0.0003325757575757576,
+      "loss": 1.3279,
+      "step": 222
+    },
+    {
+      "epoch": 1.0136363636363637,
+      "grad_norm": 1.427538275718689,
+      "learning_rate": 0.0003318181818181819,
+      "loss": 0.9647,
+      "step": 223
+    },
+    {
+      "epoch": 1.018181818181818,
+      "grad_norm": 1.5605067014694214,
+      "learning_rate": 0.00033106060606060604,
+      "loss": 1.3178,
+      "step": 224
+    },
+    {
+      "epoch": 1.0227272727272727,
+      "grad_norm": 1.6737922430038452,
+      "learning_rate": 0.0003303030303030303,
+      "loss": 1.403,
+      "step": 225
+    },
+    {
+      "epoch": 1.0272727272727273,
+      "grad_norm": 2.3249313831329346,
+      "learning_rate": 0.00032954545454545454,
+      "loss": 1.4907,
+      "step": 226
+    },
+    {
+      "epoch": 1.0318181818181817,
+      "grad_norm": 1.9939628839492798,
+      "learning_rate": 0.0003287878787878788,
+      "loss": 1.795,
+      "step": 227
+    },
+    {
+      "epoch": 1.0363636363636364,
+      "grad_norm": 2.482421398162842,
+      "learning_rate": 0.000328030303030303,
+      "loss": 1.7309,
+      "step": 228
+    },
+    {
+      "epoch": 1.040909090909091,
+      "grad_norm": 1.6090010404586792,
+      "learning_rate": 0.00032727272727272726,
+      "loss": 1.0976,
+      "step": 229
+    },
+    {
+      "epoch": 1.0454545454545454,
+      "grad_norm": 1.5481090545654297,
+      "learning_rate": 0.00032651515151515154,
+      "loss": 1.1785,
+      "step": 230
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 2.0420186519622803,
+      "learning_rate": 0.00032575757575757576,
+      "loss": 1.1853,
+      "step": 231
+    },
+    {
+      "epoch": 1.0545454545454545,
+      "grad_norm": 2.5020453929901123,
+      "learning_rate": 0.00032500000000000004,
+      "loss": 1.8213,
+      "step": 232
+    },
+    {
+      "epoch": 1.059090909090909,
+      "grad_norm": 1.5312837362289429,
+      "learning_rate": 0.0003242424242424242,
+      "loss": 1.016,
+      "step": 233
+    },
+    {
+      "epoch": 1.0636363636363637,
+      "grad_norm": 2.062110185623169,
+      "learning_rate": 0.0003234848484848485,
+      "loss": 1.2245,
+      "step": 234
+    },
+    {
+      "epoch": 1.0681818181818181,
+      "grad_norm": 1.7521977424621582,
+      "learning_rate": 0.00032272727272727276,
+      "loss": 1.4904,
+      "step": 235
+    },
+    {
+      "epoch": 1.0727272727272728,
+      "grad_norm": 1.431998372077942,
+      "learning_rate": 0.000321969696969697,
+      "loss": 1.2364,
+      "step": 236
+    },
+    {
+      "epoch": 1.0772727272727272,
+      "grad_norm": 1.8957371711730957,
+      "learning_rate": 0.00032121212121212126,
+      "loss": 1.9241,
+      "step": 237
+    },
+    {
+      "epoch": 1.0818181818181818,
+      "grad_norm": 1.6720540523529053,
+      "learning_rate": 0.00032045454545454543,
+      "loss": 1.0261,
+      "step": 238
+    },
+    {
+      "epoch": 1.0863636363636364,
+      "grad_norm": 1.8503271341323853,
+      "learning_rate": 0.0003196969696969697,
+      "loss": 1.6694,
+      "step": 239
+    },
+    {
+      "epoch": 1.0909090909090908,
+      "grad_norm": 2.055772066116333,
+      "learning_rate": 0.000318939393939394,
+      "loss": 1.6855,
+      "step": 240
+    },
+    {
+      "epoch": 1.0954545454545455,
+      "grad_norm": 4.034445285797119,
+      "learning_rate": 0.0003181818181818182,
+      "loss": 1.8592,
+      "step": 241
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 2.6941239833831787,
+      "learning_rate": 0.0003174242424242424,
+      "loss": 1.1528,
+      "step": 242
+    },
+    {
+      "epoch": 1.1045454545454545,
+      "grad_norm": 1.8258893489837646,
+      "learning_rate": 0.00031666666666666665,
+      "loss": 1.1762,
+      "step": 243
+    },
+    {
+      "epoch": 1.1090909090909091,
+      "grad_norm": 2.721888303756714,
+      "learning_rate": 0.0003159090909090909,
+      "loss": 1.5523,
+      "step": 244
+    },
+    {
+      "epoch": 1.1136363636363635,
+      "grad_norm": 2.35798978805542,
+      "learning_rate": 0.00031515151515151515,
+      "loss": 1.7533,
+      "step": 245
+    },
+    {
+      "epoch": 1.1181818181818182,
+      "grad_norm": 2.089695453643799,
+      "learning_rate": 0.0003143939393939394,
+      "loss": 1.4344,
+      "step": 246
+    },
+    {
+      "epoch": 1.1227272727272728,
+      "grad_norm": 2.0275492668151855,
+      "learning_rate": 0.00031363636363636365,
+      "loss": 1.5359,
+      "step": 247
+    },
+    {
+      "epoch": 1.1272727272727272,
+      "grad_norm": 2.290893077850342,
+      "learning_rate": 0.00031287878787878787,
+      "loss": 1.8292,
+      "step": 248
+    },
+    {
+      "epoch": 1.1318181818181818,
+      "grad_norm": 2.3136603832244873,
+      "learning_rate": 0.00031212121212121214,
+      "loss": 1.6828,
+      "step": 249
+    },
+    {
+      "epoch": 1.1363636363636362,
+      "grad_norm": 2.1181459426879883,
+      "learning_rate": 0.00031136363636363637,
+      "loss": 1.1531,
+      "step": 250
+    },
+    {
+      "epoch": 1.1409090909090909,
+      "grad_norm": 1.9240480661392212,
+      "learning_rate": 0.0003106060606060606,
+      "loss": 1.2515,
+      "step": 251
+    },
+    {
+      "epoch": 1.1454545454545455,
+      "grad_norm": 2.522502899169922,
+      "learning_rate": 0.00030984848484848487,
+      "loss": 1.4942,
+      "step": 252
+    },
+    {
+      "epoch": 1.15,
+      "grad_norm": 1.5959993600845337,
+      "learning_rate": 0.0003090909090909091,
+      "loss": 1.1412,
+      "step": 253
+    },
+    {
+      "epoch": 1.1545454545454545,
+      "grad_norm": 1.740268588066101,
+      "learning_rate": 0.00030833333333333337,
+      "loss": 1.7009,
+      "step": 254
+    },
+    {
+      "epoch": 1.1590909090909092,
+      "grad_norm": 2.1791181564331055,
+      "learning_rate": 0.0003075757575757576,
+      "loss": 1.4727,
+      "step": 255
+    },
+    {
+      "epoch": 1.1636363636363636,
+      "grad_norm": 2.2325475215911865,
+      "learning_rate": 0.0003068181818181818,
+      "loss": 1.4379,
+      "step": 256
+    },
+    {
+      "epoch": 1.1681818181818182,
+      "grad_norm": 2.206281900405884,
+      "learning_rate": 0.00030606060606060603,
+      "loss": 1.5069,
+      "step": 257
+    },
+    {
+      "epoch": 1.1727272727272728,
+      "grad_norm": 2.6821632385253906,
+      "learning_rate": 0.0003053030303030303,
+      "loss": 1.7888,
+      "step": 258
+    },
+    {
+      "epoch": 1.1772727272727272,
+      "grad_norm": 2.567087173461914,
+      "learning_rate": 0.0003045454545454546,
+      "loss": 1.7501,
+      "step": 259
+    },
+    {
+      "epoch": 1.1818181818181819,
+      "grad_norm": 2.131887674331665,
+      "learning_rate": 0.00030378787878787875,
+      "loss": 1.3294,
+      "step": 260
+    },
+    {
+      "epoch": 1.1863636363636363,
+      "grad_norm": 1.5638073682785034,
+      "learning_rate": 0.00030303030303030303,
+      "loss": 1.3679,
+      "step": 261
+    },
+    {
+      "epoch": 1.190909090909091,
+      "grad_norm": 2.208799362182617,
+      "learning_rate": 0.00030227272727272725,
+      "loss": 1.2585,
+      "step": 262
+    },
+    {
+      "epoch": 1.1954545454545455,
+      "grad_norm": 2.379058599472046,
+      "learning_rate": 0.00030151515151515153,
+      "loss": 1.4692,
+      "step": 263
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 2.2137858867645264,
+      "learning_rate": 0.0003007575757575758,
+      "loss": 1.5648,
+      "step": 264
+    },
+    {
+      "epoch": 1.2045454545454546,
+      "grad_norm": 1.94793701171875,
+      "learning_rate": 0.0003,
+      "loss": 1.2642,
+      "step": 265
+    },
+    {
+      "epoch": 1.209090909090909,
+      "grad_norm": 2.8321635723114014,
+      "learning_rate": 0.00029924242424242425,
+      "loss": 1.4013,
+      "step": 266
+    },
+    {
+      "epoch": 1.2136363636363636,
+      "grad_norm": 2.6073920726776123,
+      "learning_rate": 0.00029848484848484847,
+      "loss": 1.6666,
+      "step": 267
+    },
+    {
+      "epoch": 1.2181818181818183,
+      "grad_norm": 1.6753661632537842,
+      "learning_rate": 0.00029772727272727275,
+      "loss": 1.2472,
+      "step": 268
+    },
+    {
+      "epoch": 1.2227272727272727,
+      "grad_norm": 2.296635866165161,
+      "learning_rate": 0.000296969696969697,
+      "loss": 1.8099,
+      "step": 269
+    },
+    {
+      "epoch": 1.2272727272727273,
+      "grad_norm": 2.8359079360961914,
+      "learning_rate": 0.0002962121212121212,
+      "loss": 1.5522,
+      "step": 270
+    },
+    {
+      "epoch": 1.231818181818182,
+      "grad_norm": 2.3741962909698486,
+      "learning_rate": 0.00029545454545454547,
+      "loss": 1.5737,
+      "step": 271
+    },
+    {
+      "epoch": 1.2363636363636363,
+      "grad_norm": 1.9859591722488403,
+      "learning_rate": 0.0002946969696969697,
+      "loss": 1.5659,
+      "step": 272
+    },
+    {
+      "epoch": 1.240909090909091,
+      "grad_norm": 1.9343222379684448,
+      "learning_rate": 0.00029393939393939397,
+      "loss": 1.1204,
+      "step": 273
+    },
+    {
+      "epoch": 1.2454545454545454,
+      "grad_norm": 1.6376460790634155,
+      "learning_rate": 0.00029318181818181814,
+      "loss": 0.8886,
+      "step": 274
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 1.9865474700927734,
+      "learning_rate": 0.0002924242424242424,
+      "loss": 1.5425,
+      "step": 275
+    },
+    {
+      "epoch": 1.2545454545454544,
+      "grad_norm": 2.1017825603485107,
+      "learning_rate": 0.0002916666666666667,
+      "loss": 1.1125,
+      "step": 276
+    },
+    {
+      "epoch": 1.259090909090909,
+      "grad_norm": 2.349350690841675,
+      "learning_rate": 0.0002909090909090909,
+      "loss": 1.6496,
+      "step": 277
+    },
+    {
+      "epoch": 1.2636363636363637,
+      "grad_norm": 1.8741109371185303,
+      "learning_rate": 0.0002901515151515152,
+      "loss": 1.3404,
+      "step": 278
+    },
+    {
+      "epoch": 1.268181818181818,
+      "grad_norm": 2.2605037689208984,
+      "learning_rate": 0.00028939393939393936,
+      "loss": 1.5495,
+      "step": 279
+    },
+    {
+      "epoch": 1.2727272727272727,
+      "grad_norm": 2.0851080417633057,
+      "learning_rate": 0.00028863636363636363,
+      "loss": 1.501,
+      "step": 280
+    },
+    {
+      "epoch": 1.2772727272727273,
+      "grad_norm": 2.2092325687408447,
+      "learning_rate": 0.0002878787878787879,
+      "loss": 1.5655,
+      "step": 281
+    },
+    {
+      "epoch": 1.2818181818181817,
+      "grad_norm": 2.343780755996704,
+      "learning_rate": 0.00028712121212121213,
+      "loss": 1.4229,
+      "step": 282
+    },
+    {
+      "epoch": 1.2863636363636364,
+      "grad_norm": 1.684411883354187,
+      "learning_rate": 0.00028636363636363636,
+      "loss": 1.4132,
+      "step": 283
+    },
+    {
+      "epoch": 1.290909090909091,
+      "grad_norm": 2.034984827041626,
+      "learning_rate": 0.0002856060606060606,
+      "loss": 1.1224,
+      "step": 284
+    },
+    {
+      "epoch": 1.2954545454545454,
+      "grad_norm": 1.9973840713500977,
+      "learning_rate": 0.00028484848484848485,
+      "loss": 1.4387,
+      "step": 285
+    },
+    {
+      "epoch": 1.3,
+      "grad_norm": 2.2674074172973633,
+      "learning_rate": 0.00028409090909090913,
+      "loss": 1.6697,
+      "step": 286
+    },
+    {
+      "epoch": 1.3045454545454547,
+      "grad_norm": 1.596279501914978,
+      "learning_rate": 0.00028333333333333335,
+      "loss": 1.0433,
+      "step": 287
+    },
+    {
+      "epoch": 1.309090909090909,
+      "grad_norm": 1.874055027961731,
+      "learning_rate": 0.0002825757575757576,
+      "loss": 1.291,
+      "step": 288
+    },
+    {
+      "epoch": 1.3136363636363637,
+      "grad_norm": 2.2551302909851074,
+      "learning_rate": 0.0002818181818181818,
+      "loss": 1.3771,
+      "step": 289
+    },
+    {
+      "epoch": 1.3181818181818181,
+      "grad_norm": 2.502380847930908,
+      "learning_rate": 0.0002810606060606061,
+      "loss": 1.853,
+      "step": 290
+    },
+    {
+      "epoch": 1.3227272727272728,
+      "grad_norm": 1.9750282764434814,
+      "learning_rate": 0.0002803030303030303,
+      "loss": 1.4369,
+      "step": 291
+    },
+    {
+      "epoch": 1.3272727272727272,
+      "grad_norm": 2.4181363582611084,
+      "learning_rate": 0.0002795454545454546,
+      "loss": 1.3565,
+      "step": 292
+    },
+    {
+      "epoch": 1.3318181818181818,
+      "grad_norm": 2.0823867321014404,
+      "learning_rate": 0.0002787878787878788,
+      "loss": 1.4589,
+      "step": 293
+    },
+    {
+      "epoch": 1.3363636363636364,
+      "grad_norm": 2.147993326187134,
+      "learning_rate": 0.000278030303030303,
+      "loss": 1.3616,
+      "step": 294
+    },
+    {
+      "epoch": 1.3409090909090908,
+      "grad_norm": 3.184967517852783,
+      "learning_rate": 0.0002772727272727273,
+      "loss": 1.8248,
+      "step": 295
+    },
+    {
+      "epoch": 1.3454545454545455,
+      "grad_norm": 2.3890221118927,
+      "learning_rate": 0.0002765151515151515,
+      "loss": 1.4599,
+      "step": 296
+    },
+    {
+      "epoch": 1.35,
+      "grad_norm": 1.9724668264389038,
+      "learning_rate": 0.00027575757575757574,
+      "loss": 1.2248,
+      "step": 297
+    },
+    {
+      "epoch": 1.3545454545454545,
+      "grad_norm": 2.1539180278778076,
+      "learning_rate": 0.000275,
+      "loss": 1.1712,
+      "step": 298
+    },
+    {
+      "epoch": 1.3590909090909091,
+      "grad_norm": 2.107490062713623,
+      "learning_rate": 0.00027424242424242424,
+      "loss": 1.3786,
+      "step": 299
+    },
+    {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 2.052065372467041,
+      "learning_rate": 0.0002734848484848485,
+      "loss": 1.2121,
+      "step": 300
+    },
+    {
+      "epoch": 1.3681818181818182,
+      "grad_norm": 2.310871124267578,
+      "learning_rate": 0.00027272727272727274,
+      "loss": 1.4206,
+      "step": 301
+    },
+    {
+      "epoch": 1.3727272727272728,
+      "grad_norm": 2.1283962726593018,
+      "learning_rate": 0.00027196969696969696,
+      "loss": 1.8294,
+      "step": 302
+    },
+    {
+      "epoch": 1.3772727272727272,
+      "grad_norm": 1.676561951637268,
+      "learning_rate": 0.00027121212121212124,
+      "loss": 0.9432,
+      "step": 303
+    },
+    {
+      "epoch": 1.3818181818181818,
+      "grad_norm": 2.4148755073547363,
+      "learning_rate": 0.00027045454545454546,
+      "loss": 1.8412,
+      "step": 304
+    },
+    {
+      "epoch": 1.3863636363636362,
+      "grad_norm": 1.668143391609192,
+      "learning_rate": 0.00026969696969696974,
+      "loss": 0.9952,
+      "step": 305
+    },
+    {
+      "epoch": 1.3909090909090909,
+      "grad_norm": 2.411818504333496,
+      "learning_rate": 0.0002689393939393939,
+      "loss": 1.657,
+      "step": 306
+    },
+    {
+      "epoch": 1.3954545454545455,
+      "grad_norm": 2.2723898887634277,
+      "learning_rate": 0.0002681818181818182,
+      "loss": 1.6628,
+      "step": 307
+    },
+    {
+      "epoch": 1.4,
+      "grad_norm": 1.8225884437561035,
+      "learning_rate": 0.0002674242424242424,
+      "loss": 1.3039,
+      "step": 308
+    },
+    {
+      "epoch": 1.4045454545454545,
+      "grad_norm": 1.860181450843811,
+      "learning_rate": 0.0002666666666666667,
+      "loss": 1.4974,
+      "step": 309
+    },
+    {
+      "epoch": 1.4090909090909092,
+      "grad_norm": 2.22611927986145,
+      "learning_rate": 0.00026590909090909096,
+      "loss": 1.3242,
+      "step": 310
+    },
+    {
+      "epoch": 1.4136363636363636,
+      "grad_norm": 2.4301326274871826,
+      "learning_rate": 0.0002651515151515151,
+      "loss": 1.4631,
+      "step": 311
+    },
+    {
+      "epoch": 1.4181818181818182,
+      "grad_norm": 2.2716891765594482,
+      "learning_rate": 0.0002643939393939394,
+      "loss": 1.4076,
+      "step": 312
+    },
+    {
+      "epoch": 1.4227272727272728,
+      "grad_norm": 1.8279646635055542,
+      "learning_rate": 0.0002636363636363636,
+      "loss": 1.1232,
+      "step": 313
+    },
+    {
+      "epoch": 1.4272727272727272,
+      "grad_norm": 1.9681382179260254,
+      "learning_rate": 0.0002628787878787879,
+      "loss": 1.0339,
+      "step": 314
+    },
+    {
+      "epoch": 1.4318181818181819,
+      "grad_norm": 2.138864278793335,
+      "learning_rate": 0.0002621212121212122,
+      "loss": 1.4739,
+      "step": 315
+    },
+    {
+      "epoch": 1.4363636363636363,
+      "grad_norm": 1.997995376586914,
+      "learning_rate": 0.00026136363636363634,
+      "loss": 1.6025,
+      "step": 316
+    },
+    {
+      "epoch": 1.440909090909091,
+      "grad_norm": 2.493382215499878,
+      "learning_rate": 0.0002606060606060606,
+      "loss": 1.9215,
+      "step": 317
+    },
+    {
+      "epoch": 1.4454545454545453,
+      "grad_norm": 2.0182077884674072,
+      "learning_rate": 0.00025984848484848484,
+      "loss": 0.8709,
+      "step": 318
+    },
+    {
+      "epoch": 1.45,
+      "grad_norm": 2.0383524894714355,
+      "learning_rate": 0.0002590909090909091,
+      "loss": 1.1791,
+      "step": 319
+    },
+    {
+      "epoch": 1.4545454545454546,
+      "grad_norm": 2.159406900405884,
+      "learning_rate": 0.00025833333333333334,
+      "loss": 1.896,
+      "step": 320
+    },
+    {
+      "epoch": 1.459090909090909,
+      "grad_norm": 2.154700756072998,
+      "learning_rate": 0.00025757575757575756,
+      "loss": 1.4738,
+      "step": 321
+    },
+    {
+      "epoch": 1.4636363636363636,
+      "grad_norm": 2.071272134780884,
+      "learning_rate": 0.00025681818181818184,
+      "loss": 1.4454,
+      "step": 322
+    },
+    {
+      "epoch": 1.4681818181818183,
+      "grad_norm": 2.1091556549072266,
+      "learning_rate": 0.00025606060606060606,
+      "loss": 1.2386,
+      "step": 323
+    },
+    {
+      "epoch": 1.4727272727272727,
+      "grad_norm": 1.8080791234970093,
+      "learning_rate": 0.00025530303030303034,
+      "loss": 0.9288,
+      "step": 324
+    },
+    {
+      "epoch": 1.4772727272727273,
+      "grad_norm": 2.170426607131958,
+      "learning_rate": 0.0002545454545454545,
+      "loss": 1.6025,
+      "step": 325
+    },
+    {
+      "epoch": 1.481818181818182,
+      "grad_norm": 2.9302620887756348,
+      "learning_rate": 0.0002537878787878788,
+      "loss": 2.0105,
+      "step": 326
+    },
+    {
+      "epoch": 1.4863636363636363,
+      "grad_norm": 2.4640023708343506,
+      "learning_rate": 0.00025303030303030306,
+      "loss": 1.5101,
+      "step": 327
+    },
+    {
+      "epoch": 1.490909090909091,
+      "grad_norm": 1.6185390949249268,
+      "learning_rate": 0.0002522727272727273,
+      "loss": 0.9489,
+      "step": 328
+    },
+    {
+      "epoch": 1.4954545454545456,
+      "grad_norm": 1.4413659572601318,
+      "learning_rate": 0.0002515151515151515,
+      "loss": 0.8982,
+      "step": 329
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 1.884208083152771,
+      "learning_rate": 0.00025075757575757573,
+      "loss": 1.2771,
+      "step": 330
+    },
+    {
+      "epoch": 1.5045454545454544,
+      "grad_norm": 1.9692877531051636,
+      "learning_rate": 0.00025,
+      "loss": 1.6345,
+      "step": 331
+    },
+    {
+      "epoch": 1.509090909090909,
+      "grad_norm": 2.3343496322631836,
+      "learning_rate": 0.00024924242424242423,
+      "loss": 1.6179,
+      "step": 332
+    },
+    {
+      "epoch": 1.5136363636363637,
+      "grad_norm": 2.2422614097595215,
+      "learning_rate": 0.0002484848484848485,
+      "loss": 1.3785,
+      "step": 333
+    },
+    {
+      "epoch": 1.518181818181818,
+      "grad_norm": 2.130425214767456,
+      "learning_rate": 0.0002477272727272727,
+      "loss": 1.6261,
+      "step": 334
+    },
+    {
+      "epoch": 1.5227272727272727,
+      "grad_norm": 2.13519287109375,
+      "learning_rate": 0.000246969696969697,
+      "loss": 1.4136,
+      "step": 335
+    },
+    {
+      "epoch": 1.5272727272727273,
+      "grad_norm": 1.268389344215393,
+      "learning_rate": 0.0002462121212121212,
+      "loss": 0.9329,
+      "step": 336
+    },
+    {
+      "epoch": 1.5318181818181817,
+      "grad_norm": 2.094594955444336,
+      "learning_rate": 0.00024545454545454545,
+      "loss": 1.3465,
+      "step": 337
+    },
+    {
+      "epoch": 1.5363636363636364,
+      "grad_norm": 2.156473159790039,
+      "learning_rate": 0.00024469696969696967,
+      "loss": 1.2741,
+      "step": 338
+    },
+    {
+      "epoch": 1.540909090909091,
+      "grad_norm": 1.898336410522461,
+      "learning_rate": 0.00024393939393939392,
+      "loss": 1.059,
+      "step": 339
+    },
+    {
+      "epoch": 1.5454545454545454,
+      "grad_norm": 1.8859503269195557,
+      "learning_rate": 0.0002431818181818182,
+      "loss": 1.4959,
+      "step": 340
+    },
+    {
+      "epoch": 1.55,
+      "grad_norm": 2.060011863708496,
+      "learning_rate": 0.00024242424242424245,
+      "loss": 1.3644,
+      "step": 341
+    },
+    {
+      "epoch": 1.5545454545454547,
+      "grad_norm": 2.3816416263580322,
+      "learning_rate": 0.00024166666666666667,
+      "loss": 1.4375,
+      "step": 342
+    },
+    {
+      "epoch": 1.559090909090909,
+      "grad_norm": 3.5078084468841553,
+      "learning_rate": 0.00024090909090909092,
+      "loss": 1.422,
+      "step": 343
+    },
+    {
+      "epoch": 1.5636363636363635,
+      "grad_norm": 2.221052885055542,
+      "learning_rate": 0.00024015151515151514,
+      "loss": 1.3024,
+      "step": 344
+    },
+    {
+      "epoch": 1.5681818181818183,
+      "grad_norm": 2.4711906909942627,
+      "learning_rate": 0.0002393939393939394,
+      "loss": 1.4838,
+      "step": 345
+    },
+    {
+      "epoch": 1.5727272727272728,
+      "grad_norm": 1.9803431034088135,
+      "learning_rate": 0.00023863636363636364,
+      "loss": 1.1055,
+      "step": 346
+    },
+    {
+      "epoch": 1.5772727272727272,
+      "grad_norm": 2.3196969032287598,
+      "learning_rate": 0.0002378787878787879,
+      "loss": 1.8282,
+      "step": 347
+    },
+    {
+      "epoch": 1.5818181818181818,
+      "grad_norm": 2.094829797744751,
+      "learning_rate": 0.00023712121212121214,
+      "loss": 1.2518,
+      "step": 348
+    },
+    {
+      "epoch": 1.5863636363636364,
+      "grad_norm": 2.0144240856170654,
+      "learning_rate": 0.00023636363636363636,
+      "loss": 1.4561,
+      "step": 349
+    },
+    {
+      "epoch": 1.5909090909090908,
+      "grad_norm": 1.8540327548980713,
+      "learning_rate": 0.0002356060606060606,
+      "loss": 1.3567,
+      "step": 350
+    },
+    {
+      "epoch": 1.5954545454545455,
+      "grad_norm": 2.6601638793945312,
+      "learning_rate": 0.00023484848484848486,
+      "loss": 1.7279,
+      "step": 351
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 2.5020227432250977,
+      "learning_rate": 0.0002340909090909091,
+      "loss": 1.7535,
+      "step": 352
+    },
+    {
+      "epoch": 1.6045454545454545,
+      "grad_norm": 2.0830986499786377,
+      "learning_rate": 0.00023333333333333333,
+      "loss": 1.2985,
+      "step": 353
+    },
+    {
+      "epoch": 1.6090909090909091,
+      "grad_norm": 1.8451659679412842,
+      "learning_rate": 0.00023257575757575758,
+      "loss": 1.0175,
+      "step": 354
+    },
+    {
+      "epoch": 1.6136363636363638,
+      "grad_norm": 2.271484375,
+      "learning_rate": 0.00023181818181818183,
+      "loss": 1.4269,
+      "step": 355
+    },
+    {
+      "epoch": 1.6181818181818182,
+      "grad_norm": 4.305004596710205,
+      "learning_rate": 0.00023106060606060605,
+      "loss": 1.3775,
+      "step": 356
+    },
+    {
+      "epoch": 1.6227272727272726,
+      "grad_norm": 1.8406462669372559,
+      "learning_rate": 0.00023030303030303033,
+      "loss": 1.2472,
+      "step": 357
+    },
+    {
+      "epoch": 1.6272727272727274,
+      "grad_norm": 2.6075923442840576,
+      "learning_rate": 0.00022954545454545455,
+      "loss": 1.3993,
+      "step": 358
+    },
+    {
+      "epoch": 1.6318181818181818,
+      "grad_norm": 2.324907064437866,
+      "learning_rate": 0.0002287878787878788,
+      "loss": 1.3076,
+      "step": 359
+    },
+    {
+      "epoch": 1.6363636363636362,
+      "grad_norm": 1.954463005065918,
+      "learning_rate": 0.00022803030303030302,
+      "loss": 1.4135,
+      "step": 360
+    },
+    {
+      "epoch": 1.6409090909090909,
+      "grad_norm": 1.8350000381469727,
+      "learning_rate": 0.00022727272727272727,
+      "loss": 1.2618,
+      "step": 361
+    },
+    {
+      "epoch": 1.6454545454545455,
+      "grad_norm": 2.2729427814483643,
+      "learning_rate": 0.0002265151515151515,
+      "loss": 1.2483,
+      "step": 362
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.7203510999679565,
+      "learning_rate": 0.00022575757575757577,
+      "loss": 0.9189,
+      "step": 363
+    },
+    {
+      "epoch": 1.6545454545454545,
+      "grad_norm": 2.2685649394989014,
+      "learning_rate": 0.00022500000000000002,
+      "loss": 1.4897,
+      "step": 364
+    },
+    {
+      "epoch": 1.6590909090909092,
+      "grad_norm": 2.502887010574341,
+      "learning_rate": 0.00022424242424242424,
+      "loss": 1.315,
+      "step": 365
+    },
+    {
+      "epoch": 1.6636363636363636,
+      "grad_norm": 1.8945139646530151,
+      "learning_rate": 0.0002234848484848485,
+      "loss": 1.1686,
+      "step": 366
+    },
+    {
+      "epoch": 1.6681818181818182,
+      "grad_norm": 2.207409620285034,
+      "learning_rate": 0.00022272727272727272,
+      "loss": 1.9896,
+      "step": 367
+    },
+    {
+      "epoch": 1.6727272727272728,
+      "grad_norm": 2.028022527694702,
+      "learning_rate": 0.000221969696969697,
+      "loss": 1.5135,
+      "step": 368
+    },
+    {
+      "epoch": 1.6772727272727272,
+      "grad_norm": 2.403134822845459,
+      "learning_rate": 0.00022121212121212121,
+      "loss": 1.4201,
+      "step": 369
+    },
+    {
+      "epoch": 1.6818181818181817,
+      "grad_norm": 2.022304058074951,
+      "learning_rate": 0.00022045454545454546,
+      "loss": 1.672,
+      "step": 370
+    },
+    {
+      "epoch": 1.6863636363636365,
+      "grad_norm": 2.3457093238830566,
+      "learning_rate": 0.0002196969696969697,
+      "loss": 1.4657,
+      "step": 371
+    },
+    {
+      "epoch": 1.690909090909091,
+      "grad_norm": 2.0904908180236816,
+      "learning_rate": 0.00021893939393939394,
+      "loss": 1.5864,
+      "step": 372
+    },
+    {
+      "epoch": 1.6954545454545453,
+      "grad_norm": 1.5914294719696045,
+      "learning_rate": 0.00021818181818181818,
+      "loss": 1.2828,
+      "step": 373
+    },
+    {
+      "epoch": 1.7,
+      "grad_norm": 1.8519418239593506,
+      "learning_rate": 0.00021742424242424243,
+      "loss": 1.5195,
+      "step": 374
+    },
+    {
+      "epoch": 1.7045454545454546,
+      "grad_norm": 2.2076525688171387,
+      "learning_rate": 0.00021666666666666668,
+      "loss": 1.247,
+      "step": 375
+    },
+    {
+      "epoch": 1.709090909090909,
+      "grad_norm": 1.8584811687469482,
+      "learning_rate": 0.0002159090909090909,
+      "loss": 1.6614,
+      "step": 376
+    },
+    {
+      "epoch": 1.7136363636363636,
+      "grad_norm": 1.9943331480026245,
+      "learning_rate": 0.00021515151515151516,
+      "loss": 1.2409,
+      "step": 377
+    },
+    {
+      "epoch": 1.7181818181818183,
+      "grad_norm": 1.5293704271316528,
+      "learning_rate": 0.00021439393939393938,
+      "loss": 1.1563,
+      "step": 378
+    },
+    {
+      "epoch": 1.7227272727272727,
+      "grad_norm": 2.835527181625366,
+      "learning_rate": 0.00021363636363636363,
+      "loss": 1.3795,
+      "step": 379
+    },
+    {
+      "epoch": 1.7272727272727273,
+      "grad_norm": 2.491914987564087,
+      "learning_rate": 0.0002128787878787879,
+      "loss": 1.3017,
+      "step": 380
+    },
+    {
+      "epoch": 1.731818181818182,
+      "grad_norm": 3.2870216369628906,
+      "learning_rate": 0.00021212121212121213,
+      "loss": 1.1947,
+      "step": 381
+    },
+    {
+      "epoch": 1.7363636363636363,
+      "grad_norm": 2.1510424613952637,
+      "learning_rate": 0.00021136363636363638,
+      "loss": 1.7048,
+      "step": 382
+    },
+    {
+      "epoch": 1.740909090909091,
+      "grad_norm": 2.2372002601623535,
+      "learning_rate": 0.0002106060606060606,
+      "loss": 1.1627,
+      "step": 383
+    },
+    {
+      "epoch": 1.7454545454545456,
+      "grad_norm": 2.217090368270874,
+      "learning_rate": 0.00020984848484848485,
+      "loss": 1.8424,
+      "step": 384
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 1.8897899389266968,
+      "learning_rate": 0.00020909090909090907,
+      "loss": 1.3006,
+      "step": 385
+    },
+    {
+      "epoch": 1.7545454545454544,
+      "grad_norm": 1.7469165325164795,
+      "learning_rate": 0.00020833333333333335,
+      "loss": 1.496,
+      "step": 386
+    },
+    {
+      "epoch": 1.759090909090909,
+      "grad_norm": 1.8982353210449219,
+      "learning_rate": 0.0002075757575757576,
+      "loss": 1.2682,
+      "step": 387
+    },
+    {
+      "epoch": 1.7636363636363637,
+      "grad_norm": 2.0795273780822754,
+      "learning_rate": 0.00020681818181818182,
+      "loss": 2.2314,
+      "step": 388
+    },
+    {
+      "epoch": 1.768181818181818,
+      "grad_norm": 1.6244702339172363,
+      "learning_rate": 0.00020606060606060607,
+      "loss": 0.9585,
+      "step": 389
+    },
+    {
+      "epoch": 1.7727272727272727,
+      "grad_norm": 2.3025052547454834,
+      "learning_rate": 0.0002053030303030303,
+      "loss": 1.6639,
+      "step": 390
+    },
+    {
+      "epoch": 1.7772727272727273,
+      "grad_norm": 2.418119192123413,
+      "learning_rate": 0.00020454545454545457,
+      "loss": 1.8274,
+      "step": 391
+    },
+    {
+      "epoch": 1.7818181818181817,
+      "grad_norm": 1.70542573928833,
+      "learning_rate": 0.0002037878787878788,
+      "loss": 1.3115,
+      "step": 392
+    },
+    {
+      "epoch": 1.7863636363636364,
+      "grad_norm": 1.9681594371795654,
+      "learning_rate": 0.00020303030303030304,
+      "loss": 0.973,
+      "step": 393
+    },
+    {
+      "epoch": 1.790909090909091,
+      "grad_norm": 1.856879711151123,
+      "learning_rate": 0.0002022727272727273,
+      "loss": 1.2958,
+      "step": 394
+    },
+    {
+      "epoch": 1.7954545454545454,
+      "grad_norm": 2.1984550952911377,
+      "learning_rate": 0.0002015151515151515,
+      "loss": 1.5643,
+      "step": 395
+    },
+    {
+      "epoch": 1.8,
+      "grad_norm": 2.6989824771881104,
+      "learning_rate": 0.00020075757575757576,
+      "loss": 1.4334,
+      "step": 396
+    },
+    {
+      "epoch": 1.8045454545454547,
+      "grad_norm": 2.3298850059509277,
+      "learning_rate": 0.0002,
+      "loss": 1.5145,
+      "step": 397
+    },
+    {
+      "epoch": 1.809090909090909,
+      "grad_norm": 1.9490818977355957,
+      "learning_rate": 0.00019924242424242426,
+      "loss": 1.5346,
+      "step": 398
+    },
+    {
+      "epoch": 1.8136363636363635,
+      "grad_norm": 2.0060818195343018,
+      "learning_rate": 0.00019848484848484848,
+      "loss": 1.2493,
+      "step": 399
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 2.1615042686462402,
+      "learning_rate": 0.00019772727272727273,
+      "loss": 1.8834,
+      "step": 400
+    },
+    {
+      "epoch": 1.8227272727272728,
+      "grad_norm": 2.4424972534179688,
+      "learning_rate": 0.00019696969696969695,
+      "loss": 1.6012,
+      "step": 401
+    },
+    {
+      "epoch": 1.8272727272727272,
+      "grad_norm": 1.83026921749115,
+      "learning_rate": 0.0001962121212121212,
+      "loss": 1.4308,
+      "step": 402
+    },
+    {
+      "epoch": 1.8318181818181818,
+      "grad_norm": 2.6793453693389893,
+      "learning_rate": 0.00019545454545454548,
+      "loss": 1.6356,
+      "step": 403
+    },
+    {
+      "epoch": 1.8363636363636364,
+      "grad_norm": 2.1211740970611572,
+      "learning_rate": 0.0001946969696969697,
+      "loss": 1.0859,
+      "step": 404
+    },
+    {
+      "epoch": 1.8409090909090908,
+      "grad_norm": 1.6658772230148315,
+      "learning_rate": 0.00019393939393939395,
+      "loss": 1.3467,
+      "step": 405
+    },
+    {
+      "epoch": 1.8454545454545455,
+      "grad_norm": 1.7903032302856445,
+      "learning_rate": 0.00019318181818181817,
+      "loss": 1.0118,
+      "step": 406
+    },
+    {
+      "epoch": 1.85,
+      "grad_norm": 1.7358275651931763,
+      "learning_rate": 0.00019242424242424242,
+      "loss": 1.4404,
+      "step": 407
+    },
+    {
+      "epoch": 1.8545454545454545,
+      "grad_norm": 2.0745861530303955,
+      "learning_rate": 0.00019166666666666667,
+      "loss": 1.3409,
+      "step": 408
+    },
+    {
+      "epoch": 1.8590909090909091,
+      "grad_norm": 2.1365160942077637,
+      "learning_rate": 0.00019090909090909092,
+      "loss": 1.4658,
+      "step": 409
+    },
+    {
+      "epoch": 1.8636363636363638,
+      "grad_norm": 2.040371894836426,
+      "learning_rate": 0.00019015151515151517,
+      "loss": 1.3165,
+      "step": 410
+    },
+    {
+      "epoch": 1.8681818181818182,
+      "grad_norm": 1.9335429668426514,
+      "learning_rate": 0.0001893939393939394,
+      "loss": 1.5063,
+      "step": 411
+    },
+    {
+      "epoch": 1.8727272727272726,
+      "grad_norm": 2.0025243759155273,
+      "learning_rate": 0.00018863636363636364,
+      "loss": 1.2584,
+      "step": 412
+    },
+    {
+      "epoch": 1.8772727272727274,
+      "grad_norm": 1.7558890581130981,
+      "learning_rate": 0.00018787878787878787,
+      "loss": 1.0937,
+      "step": 413
+    },
+    {
+      "epoch": 1.8818181818181818,
+      "grad_norm": 1.886003851890564,
+      "learning_rate": 0.00018712121212121214,
+      "loss": 1.3969,
+      "step": 414
+    },
+    {
+      "epoch": 1.8863636363636362,
+      "grad_norm": 2.6999497413635254,
+      "learning_rate": 0.00018636363636363636,
+      "loss": 1.1411,
+      "step": 415
+    },
+    {
+      "epoch": 1.8909090909090909,
+      "grad_norm": 1.923196792602539,
+      "learning_rate": 0.00018560606060606061,
+      "loss": 1.6597,
+      "step": 416
+    },
+    {
+      "epoch": 1.8954545454545455,
+      "grad_norm": 2.261504650115967,
+      "learning_rate": 0.00018484848484848484,
+      "loss": 1.738,
+      "step": 417
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 1.9908875226974487,
+      "learning_rate": 0.00018409090909090909,
+      "loss": 1.1275,
+      "step": 418
+    },
+    {
+      "epoch": 1.9045454545454545,
+      "grad_norm": 2.348240852355957,
+      "learning_rate": 0.00018333333333333334,
+      "loss": 1.5368,
+      "step": 419
+    },
+    {
+      "epoch": 1.9090909090909092,
+      "grad_norm": 3.1023001670837402,
+      "learning_rate": 0.00018257575757575758,
+      "loss": 1.1828,
+      "step": 420
+    },
+    {
+      "epoch": 1.9136363636363636,
+      "grad_norm": 2.2243757247924805,
+      "learning_rate": 0.00018181818181818183,
+      "loss": 1.4783,
+      "step": 421
+    },
+    {
+      "epoch": 1.9181818181818182,
+      "grad_norm": 1.824317216873169,
+      "learning_rate": 0.00018106060606060606,
+      "loss": 1.178,
+      "step": 422
+    },
+    {
+      "epoch": 1.9227272727272728,
+      "grad_norm": 2.1963822841644287,
+      "learning_rate": 0.0001803030303030303,
+      "loss": 1.5811,
+      "step": 423
+    },
+    {
+      "epoch": 1.9272727272727272,
+      "grad_norm": 1.8573740720748901,
+      "learning_rate": 0.00017954545454545453,
+      "loss": 1.2361,
+      "step": 424
+    },
+    {
+      "epoch": 1.9318181818181817,
+      "grad_norm": 2.223315715789795,
+      "learning_rate": 0.0001787878787878788,
+      "loss": 1.3236,
+      "step": 425
+    },
+    {
+      "epoch": 1.9363636363636365,
+      "grad_norm": 2.0890495777130127,
+      "learning_rate": 0.00017803030303030305,
+      "loss": 1.4358,
+      "step": 426
+    },
+    {
+      "epoch": 1.940909090909091,
+      "grad_norm": 1.8097957372665405,
+      "learning_rate": 0.00017727272727272728,
+      "loss": 1.0634,
+      "step": 427
+    },
+    {
+      "epoch": 1.9454545454545453,
+      "grad_norm": 2.378750801086426,
+      "learning_rate": 0.00017651515151515153,
+      "loss": 1.8565,
+      "step": 428
+    },
+    {
+      "epoch": 1.95,
+      "grad_norm": 1.7777960300445557,
+      "learning_rate": 0.00017575757575757575,
+      "loss": 1.5755,
+      "step": 429
+    },
+    {
+      "epoch": 1.9545454545454546,
+      "grad_norm": 2.1310245990753174,
+      "learning_rate": 0.000175,
+      "loss": 1.4119,
+      "step": 430
+    },
+    {
+      "epoch": 1.959090909090909,
+      "grad_norm": 2.506479024887085,
+      "learning_rate": 0.00017424242424242425,
+      "loss": 1.5025,
+      "step": 431
+    },
+    {
+      "epoch": 1.9636363636363636,
+      "grad_norm": 2.091891288757324,
+      "learning_rate": 0.0001734848484848485,
+      "loss": 1.383,
+      "step": 432
+    },
+    {
+      "epoch": 1.9681818181818183,
+      "grad_norm": 1.7450625896453857,
+      "learning_rate": 0.00017272727272727272,
+      "loss": 1.4122,
+      "step": 433
+    },
+    {
+      "epoch": 1.9727272727272727,
+      "grad_norm": 1.7834117412567139,
+      "learning_rate": 0.00017196969696969697,
+      "loss": 1.0932,
+      "step": 434
+    },
+    {
+      "epoch": 1.9772727272727273,
+      "grad_norm": 1.6854950189590454,
+      "learning_rate": 0.00017121212121212122,
+      "loss": 0.9985,
+      "step": 435
+    },
+    {
+      "epoch": 1.981818181818182,
+      "grad_norm": 1.4569097757339478,
+      "learning_rate": 0.00017045454545454544,
+      "loss": 1.1354,
+      "step": 436
+    },
+    {
+      "epoch": 1.9863636363636363,
+      "grad_norm": 2.3083584308624268,
+      "learning_rate": 0.00016969696969696972,
+      "loss": 1.4856,
+      "step": 437
+    },
+    {
+      "epoch": 1.990909090909091,
+      "grad_norm": 2.1282572746276855,
+      "learning_rate": 0.00016893939393939394,
+      "loss": 1.942,
+      "step": 438
+    },
+    {
+      "epoch": 1.9954545454545456,
+      "grad_norm": 2.199822187423706,
+      "learning_rate": 0.0001681818181818182,
+      "loss": 1.4891,
+      "step": 439
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 2.0030460357666016,
+      "learning_rate": 0.0001674242424242424,
+      "loss": 1.2939,
+      "step": 440
+    },
+    {
+      "epoch": 2.0,
+      "eval_f1": 0.8928,
+      "eval_gen_len": 42.0,
+      "eval_loss": 1.7928985357284546,
+      "eval_precision": 0.8914,
+      "eval_recall": 0.8944,
+      "eval_rouge1": 0.4605,
+      "eval_rouge2": 0.2125,
+      "eval_rougeL": 0.3897,
+      "eval_rougeLsum": 0.4259,
+      "eval_runtime": 25.1108,
+      "eval_samples_per_second": 4.381,
+      "eval_steps_per_second": 0.558,
+      "step": 440
+    },
+    {
+      "epoch": 2.0045454545454544,
+      "grad_norm": 1.4425781965255737,
+      "learning_rate": 0.00016666666666666666,
+      "loss": 0.9057,
+      "step": 441
+    },
+    {
+      "epoch": 2.0090909090909093,
+      "grad_norm": 1.579765796661377,
+      "learning_rate": 0.00016590909090909094,
+      "loss": 0.7069,
+      "step": 442
+    },
+    {
+      "epoch": 2.0136363636363637,
+      "grad_norm": 1.8639825582504272,
+      "learning_rate": 0.00016515151515151516,
+      "loss": 1.1531,
+      "step": 443
+    },
+    {
+      "epoch": 2.018181818181818,
+      "grad_norm": 1.4890676736831665,
+      "learning_rate": 0.0001643939393939394,
+      "loss": 0.8112,
+      "step": 444
+    },
+    {
+      "epoch": 2.022727272727273,
+      "grad_norm": 1.7381901741027832,
+      "learning_rate": 0.00016363636363636363,
+      "loss": 1.2108,
+      "step": 445
+    },
+    {
+      "epoch": 2.0272727272727273,
+      "grad_norm": 1.6125924587249756,
+      "learning_rate": 0.00016287878787878788,
+      "loss": 1.0529,
+      "step": 446
+    },
+    {
+      "epoch": 2.0318181818181817,
+      "grad_norm": 1.8624428510665894,
+      "learning_rate": 0.0001621212121212121,
+      "loss": 1.006,
+      "step": 447
+    },
+    {
+      "epoch": 2.036363636363636,
+      "grad_norm": 1.719439148902893,
+      "learning_rate": 0.00016136363636363638,
+      "loss": 1.0881,
+      "step": 448
+    },
+    {
+      "epoch": 2.040909090909091,
+      "grad_norm": 2.446216106414795,
+      "learning_rate": 0.00016060606060606063,
+      "loss": 1.2399,
+      "step": 449
+    },
+    {
+      "epoch": 2.0454545454545454,
+      "grad_norm": 1.703517198562622,
+      "learning_rate": 0.00015984848484848485,
+      "loss": 0.931,
+      "step": 450
+    },
+    {
+      "epoch": 2.05,
+      "grad_norm": 1.780228853225708,
+      "learning_rate": 0.0001590909090909091,
+      "loss": 0.9769,
+      "step": 451
+    },
+    {
+      "epoch": 2.0545454545454547,
+      "grad_norm": 2.015679121017456,
+      "learning_rate": 0.00015833333333333332,
+      "loss": 1.6044,
+      "step": 452
+    },
+    {
+      "epoch": 2.059090909090909,
+      "grad_norm": 2.084481716156006,
+      "learning_rate": 0.00015757575757575757,
+      "loss": 0.9933,
+      "step": 453
+    },
+    {
+      "epoch": 2.0636363636363635,
+      "grad_norm": 2.3098299503326416,
+      "learning_rate": 0.00015681818181818182,
+      "loss": 0.9405,
+      "step": 454
+    },
+    {
+      "epoch": 2.0681818181818183,
+      "grad_norm": 1.8041385412216187,
+      "learning_rate": 0.00015606060606060607,
+      "loss": 1.1748,
+      "step": 455
+    },
+    {
+      "epoch": 2.0727272727272728,
+      "grad_norm": 1.693158745765686,
+      "learning_rate": 0.0001553030303030303,
+      "loss": 0.9358,
+      "step": 456
+    },
+    {
+      "epoch": 2.077272727272727,
+      "grad_norm": 1.5484883785247803,
+      "learning_rate": 0.00015454545454545454,
+      "loss": 1.0664,
+      "step": 457
+    },
+    {
+      "epoch": 2.081818181818182,
+      "grad_norm": 1.4313092231750488,
+      "learning_rate": 0.0001537878787878788,
+      "loss": 0.6624,
+      "step": 458
+    },
+    {
+      "epoch": 2.0863636363636364,
+      "grad_norm": 2.218092679977417,
+      "learning_rate": 0.00015303030303030302,
+      "loss": 0.9856,
+      "step": 459
+    },
+    {
+      "epoch": 2.090909090909091,
+      "grad_norm": 2.030869960784912,
+      "learning_rate": 0.0001522727272727273,
+      "loss": 1.143,
+      "step": 460
+    },
+    {
+      "epoch": 2.0954545454545457,
+      "grad_norm": 2.190603017807007,
+      "learning_rate": 0.00015151515151515152,
+      "loss": 1.077,
+      "step": 461
+    },
+    {
+      "epoch": 2.1,
+      "grad_norm": 1.3030821084976196,
+      "learning_rate": 0.00015075757575757576,
+      "loss": 0.6711,
+      "step": 462
+    },
+    {
+      "epoch": 2.1045454545454545,
+      "grad_norm": 1.8678494691848755,
+      "learning_rate": 0.00015,
+      "loss": 1.0674,
+      "step": 463
+    },
+    {
+      "epoch": 2.109090909090909,
+      "grad_norm": 1.407085657119751,
+      "learning_rate": 0.00014924242424242424,
+      "loss": 0.7024,
+      "step": 464
+    },
+    {
+      "epoch": 2.1136363636363638,
+      "grad_norm": 2.004911422729492,
+      "learning_rate": 0.0001484848484848485,
+      "loss": 0.795,
+      "step": 465
+    },
+    {
+      "epoch": 2.118181818181818,
+      "grad_norm": 2.25128173828125,
+      "learning_rate": 0.00014772727272727274,
+      "loss": 1.2232,
+      "step": 466
+    },
+    {
+      "epoch": 2.1227272727272726,
+      "grad_norm": 1.960771918296814,
+      "learning_rate": 0.00014696969696969698,
+      "loss": 1.0019,
+      "step": 467
+    },
+    {
+      "epoch": 2.1272727272727274,
+      "grad_norm": 1.9563887119293213,
+      "learning_rate": 0.0001462121212121212,
+      "loss": 0.9798,
+      "step": 468
+    },
+    {
+      "epoch": 2.131818181818182,
+      "grad_norm": 1.687361240386963,
+      "learning_rate": 0.00014545454545454546,
+      "loss": 0.755,
+      "step": 469
+    },
+    {
+      "epoch": 2.1363636363636362,
+      "grad_norm": 2.191286325454712,
+      "learning_rate": 0.00014469696969696968,
+      "loss": 1.0018,
+      "step": 470
+    },
+    {
+      "epoch": 2.140909090909091,
+      "grad_norm": 2.046880006790161,
+      "learning_rate": 0.00014393939393939396,
+      "loss": 1.2281,
+      "step": 471
+    },
+    {
+      "epoch": 2.1454545454545455,
+      "grad_norm": 2.4996211528778076,
+      "learning_rate": 0.00014318181818181818,
+      "loss": 1.0795,
+      "step": 472
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 1.8937994241714478,
+      "learning_rate": 0.00014242424242424243,
+      "loss": 1.0556,
+      "step": 473
+    },
+    {
+      "epoch": 2.1545454545454543,
+      "grad_norm": 2.250491142272949,
+      "learning_rate": 0.00014166666666666668,
+      "loss": 0.8816,
+      "step": 474
+    },
+    {
+      "epoch": 2.159090909090909,
+      "grad_norm": NaN,
+      "learning_rate": 0.00014166666666666668,
+      "loss": 0.0,
+      "step": 475
+    },
+    {
+      "epoch": 2.1636363636363636,
+      "grad_norm": 2.231706142425537,
+      "learning_rate": 0.0001409090909090909,
+      "loss": 1.2344,
+      "step": 476
+    },
+    {
+      "epoch": 2.168181818181818,
+      "grad_norm": 2.2170498371124268,
+      "learning_rate": 0.00014015151515151515,
+      "loss": 1.4409,
+      "step": 477
+    },
+    {
+      "epoch": 2.172727272727273,
+      "grad_norm": 2.3106095790863037,
+      "learning_rate": 0.0001393939393939394,
+      "loss": 0.9081,
+      "step": 478
+    },
+    {
+      "epoch": 2.1772727272727272,
+      "grad_norm": 1.9665738344192505,
+      "learning_rate": 0.00013863636363636365,
+      "loss": 1.3029,
+      "step": 479
+    },
+    {
+      "epoch": 2.1818181818181817,
+      "grad_norm": 2.321331739425659,
+      "learning_rate": 0.00013787878787878787,
+      "loss": 1.4714,
+      "step": 480
+    },
+    {
+      "epoch": 2.1863636363636365,
+      "grad_norm": 2.0038533210754395,
+      "learning_rate": 0.00013712121212121212,
+      "loss": 1.0879,
+      "step": 481
+    },
+    {
+      "epoch": 2.190909090909091,
+      "grad_norm": 1.6077767610549927,
+      "learning_rate": 0.00013636363636363637,
+      "loss": 0.6456,
+      "step": 482
+    },
+    {
+      "epoch": 2.1954545454545453,
+      "grad_norm": 1.5018125772476196,
+      "learning_rate": 0.00013560606060606062,
+      "loss": 0.6937,
+      "step": 483
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 1.5473967790603638,
+      "learning_rate": 0.00013484848484848487,
+      "loss": 0.6191,
+      "step": 484
+    },
+    {
+      "epoch": 2.2045454545454546,
+      "grad_norm": 2.5554354190826416,
+      "learning_rate": 0.0001340909090909091,
+      "loss": 1.4345,
+      "step": 485
+    },
+    {
+      "epoch": 2.209090909090909,
+      "grad_norm": 2.1666858196258545,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 1.4182,
+      "step": 486
+    },
+    {
+      "epoch": 2.213636363636364,
+      "grad_norm": 2.2915759086608887,
+      "learning_rate": 0.00013257575757575756,
+      "loss": 0.8752,
+      "step": 487
+    },
+    {
+      "epoch": 2.2181818181818183,
+      "grad_norm": 2.24314546585083,
+      "learning_rate": 0.0001318181818181818,
+      "loss": 1.3214,
+      "step": 488
+    },
+    {
+      "epoch": 2.2227272727272727,
+      "grad_norm": 2.269216537475586,
+      "learning_rate": 0.0001310606060606061,
+      "loss": 0.9968,
+      "step": 489
+    },
+    {
+      "epoch": 2.227272727272727,
+      "grad_norm": 2.3108322620391846,
+      "learning_rate": 0.0001303030303030303,
+      "loss": 0.9695,
+      "step": 490
+    },
+    {
+      "epoch": 2.231818181818182,
+      "grad_norm": 2.3146250247955322,
+      "learning_rate": 0.00012954545454545456,
+      "loss": 1.4007,
+      "step": 491
+    },
+    {
+      "epoch": 2.2363636363636363,
+      "grad_norm": 1.9747002124786377,
+      "learning_rate": 0.00012878787878787878,
+      "loss": 0.8876,
+      "step": 492
+    },
+    {
+      "epoch": 2.2409090909090907,
+      "grad_norm": 2.0410826206207275,
+      "learning_rate": 0.00012803030303030303,
+      "loss": 0.9588,
+      "step": 493
+    },
+    {
+      "epoch": 2.2454545454545456,
+      "grad_norm": 2.2743778228759766,
+      "learning_rate": 0.00012727272727272725,
+      "loss": 1.2062,
+      "step": 494
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 2.272749662399292,
+      "learning_rate": 0.00012651515151515153,
+      "loss": 0.975,
+      "step": 495
+    },
+    {
+      "epoch": 2.2545454545454544,
+      "grad_norm": 2.297175884246826,
+      "learning_rate": 0.00012575757575757575,
+      "loss": 1.0806,
+      "step": 496
+    },
+    {
+      "epoch": 2.2590909090909093,
+      "grad_norm": 2.2274718284606934,
+      "learning_rate": 0.000125,
+      "loss": 0.9391,
+      "step": 497
+    },
+    {
+      "epoch": 2.2636363636363637,
+      "grad_norm": 2.4175453186035156,
+      "learning_rate": 0.00012424242424242425,
+      "loss": 1.2736,
+      "step": 498
+    },
+    {
+      "epoch": 2.268181818181818,
+      "grad_norm": 1.7530089616775513,
+      "learning_rate": 0.0001234848484848485,
+      "loss": 1.1917,
+      "step": 499
+    },
+    {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 2.598747730255127,
+      "learning_rate": 0.00012272727272727272,
+      "loss": 1.5901,
+      "step": 500
+    },
+    {
+      "epoch": 2.2772727272727273,
+      "grad_norm": 2.0590171813964844,
+      "learning_rate": 0.00012196969696969696,
+      "loss": 1.0049,
+      "step": 501
+    },
+    {
+      "epoch": 2.2818181818181817,
+      "grad_norm": 1.6530457735061646,
+      "learning_rate": 0.00012121212121212122,
+      "loss": 0.6991,
+      "step": 502
+    },
+    {
+      "epoch": 2.286363636363636,
+      "grad_norm": 1.4000625610351562,
+      "learning_rate": 0.00012045454545454546,
+      "loss": 0.7258,
+      "step": 503
+    },
+    {
+      "epoch": 2.290909090909091,
+      "grad_norm": 3.4282798767089844,
+      "learning_rate": 0.0001196969696969697,
+      "loss": 0.7331,
+      "step": 504
+    },
+    {
+      "epoch": 2.2954545454545454,
+      "grad_norm": 2.0328640937805176,
+      "learning_rate": 0.00011893939393939394,
+      "loss": 1.0245,
+      "step": 505
+    },
+    {
+      "epoch": 2.3,
+      "grad_norm": 2.639125347137451,
+      "learning_rate": 0.00011818181818181818,
+      "loss": 0.939,
+      "step": 506
+    },
+    {
+      "epoch": 2.3045454545454547,
+      "grad_norm": 2.069645643234253,
+      "learning_rate": 0.00011742424242424243,
+      "loss": 1.186,
+      "step": 507
+    },
+    {
+      "epoch": 2.309090909090909,
+      "grad_norm": 2.103675603866577,
+      "learning_rate": 0.00011666666666666667,
+      "loss": 1.0986,
+      "step": 508
+    },
+    {
+      "epoch": 2.3136363636363635,
+      "grad_norm": 2.022813320159912,
+      "learning_rate": 0.00011590909090909091,
+      "loss": 1.1106,
+      "step": 509
+    },
+    {
+      "epoch": 2.3181818181818183,
+      "grad_norm": 2.1240341663360596,
+      "learning_rate": 0.00011515151515151516,
+      "loss": 0.9754,
+      "step": 510
+    },
+    {
+      "epoch": 2.3227272727272728,
+      "grad_norm": 2.11362361907959,
+      "learning_rate": 0.0001143939393939394,
+      "loss": 1.243,
+      "step": 511
+    },
+    {
+      "epoch": 2.327272727272727,
+      "grad_norm": 1.9033676385879517,
+      "learning_rate": 0.00011363636363636364,
+      "loss": 0.7314,
+      "step": 512
+    },
+    {
+      "epoch": 2.331818181818182,
+      "grad_norm": 2.7902817726135254,
+      "learning_rate": 0.00011287878787878789,
+      "loss": 1.2161,
+      "step": 513
+    },
+    {
+      "epoch": 2.3363636363636364,
+      "grad_norm": 2.1139214038848877,
+      "learning_rate": 0.00011212121212121212,
+      "loss": 1.4216,
+      "step": 514
+    },
+    {
+      "epoch": 2.340909090909091,
+      "grad_norm": 2.2380800247192383,
+      "learning_rate": 0.00011136363636363636,
+      "loss": 1.0319,
+      "step": 515
+    },
+    {
+      "epoch": 2.3454545454545457,
+      "grad_norm": 1.9591755867004395,
+      "learning_rate": 0.00011060606060606061,
+      "loss": 0.7923,
+      "step": 516
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 2.6767358779907227,
+      "learning_rate": 0.00010984848484848486,
+      "loss": 0.9721,
+      "step": 517
+    },
+    {
+      "epoch": 2.3545454545454545,
+      "grad_norm": 2.350008487701416,
+      "learning_rate": 0.00010909090909090909,
+      "loss": 1.1793,
+      "step": 518
+    },
+    {
+      "epoch": 2.359090909090909,
+      "grad_norm": 2.0240652561187744,
+      "learning_rate": 0.00010833333333333334,
+      "loss": 1.1184,
+      "step": 519
+    },
+    {
+      "epoch": 2.3636363636363638,
+      "grad_norm": 2.058748960494995,
+      "learning_rate": 0.00010757575757575758,
+      "loss": 1.1886,
+      "step": 520
+    },
+    {
+      "epoch": 2.368181818181818,
+      "grad_norm": 1.7921950817108154,
+      "learning_rate": 0.00010681818181818181,
+      "loss": 0.8511,
+      "step": 521
+    },
+    {
+      "epoch": 2.3727272727272726,
+      "grad_norm": 2.027445077896118,
+      "learning_rate": 0.00010606060606060606,
+      "loss": 0.8641,
+      "step": 522
+    },
+    {
+      "epoch": 2.3772727272727274,
+      "grad_norm": 1.8156445026397705,
+      "learning_rate": 0.0001053030303030303,
+      "loss": 0.8234,
+      "step": 523
+    },
+    {
+      "epoch": 2.381818181818182,
+      "grad_norm": 2.3511455059051514,
+      "learning_rate": 0.00010454545454545454,
+      "loss": 1.048,
+      "step": 524
+    },
+    {
+      "epoch": 2.3863636363636362,
+      "grad_norm": 1.489744782447815,
+      "learning_rate": 0.0001037878787878788,
+      "loss": 0.4886,
+      "step": 525
+    },
+    {
+      "epoch": 2.390909090909091,
+      "grad_norm": 2.0359721183776855,
+      "learning_rate": 0.00010303030303030303,
+      "loss": 1.0011,
+      "step": 526
+    },
+    {
+      "epoch": 2.3954545454545455,
+      "grad_norm": 2.8290212154388428,
+      "learning_rate": 0.00010227272727272728,
+      "loss": 1.4443,
+      "step": 527
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 1.991904616355896,
+      "learning_rate": 0.00010151515151515152,
+      "loss": 0.9877,
+      "step": 528
+    },
+    {
+      "epoch": 2.4045454545454543,
+      "grad_norm": 1.8174313306808472,
+      "learning_rate": 0.00010075757575757576,
+      "loss": 0.9048,
+      "step": 529
+    },
+    {
+      "epoch": 2.409090909090909,
+      "grad_norm": 1.66022527217865,
+      "learning_rate": 0.0001,
+      "loss": 0.9039,
+      "step": 530
+    },
+    {
+      "epoch": 2.4136363636363636,
+      "grad_norm": 1.6025142669677734,
+      "learning_rate": 9.924242424242424e-05,
+      "loss": 0.8169,
+      "step": 531
+    },
+    {
+      "epoch": 2.418181818181818,
+      "grad_norm": 1.871733546257019,
+      "learning_rate": 9.848484848484848e-05,
+      "loss": 1.039,
+      "step": 532
+    },
+    {
+      "epoch": 2.422727272727273,
+      "grad_norm": 2.35320782661438,
+      "learning_rate": 9.772727272727274e-05,
+      "loss": 1.3449,
+      "step": 533
+    },
+    {
+      "epoch": 2.4272727272727272,
+      "grad_norm": 1.9311728477478027,
+      "learning_rate": 9.696969696969698e-05,
+      "loss": 1.0332,
+      "step": 534
+    },
+    {
+      "epoch": 2.4318181818181817,
+      "grad_norm": 1.6838319301605225,
+      "learning_rate": 9.621212121212121e-05,
+      "loss": 0.6631,
+      "step": 535
+    },
+    {
+      "epoch": 2.4363636363636365,
+      "grad_norm": 1.9957849979400635,
+      "learning_rate": 9.545454545454546e-05,
+      "loss": 1.0397,
+      "step": 536
+    },
+    {
+      "epoch": 2.440909090909091,
+      "grad_norm": 2.338730573654175,
+      "learning_rate": 9.46969696969697e-05,
+      "loss": 1.1155,
+      "step": 537
+    },
+    {
+      "epoch": 2.4454545454545453,
+      "grad_norm": 2.0578792095184326,
+      "learning_rate": 9.393939393939393e-05,
+      "loss": 1.0634,
+      "step": 538
+    },
+    {
+      "epoch": 2.45,
+      "grad_norm": 2.0512609481811523,
+      "learning_rate": 9.318181818181818e-05,
+      "loss": 0.9052,
+      "step": 539
+    },
+    {
+      "epoch": 2.4545454545454546,
+      "grad_norm": 2.2808845043182373,
+      "learning_rate": 9.242424242424242e-05,
+      "loss": 1.2479,
+      "step": 540
+    },
+    {
+      "epoch": 2.459090909090909,
+      "grad_norm": 1.7963327169418335,
+      "learning_rate": 9.166666666666667e-05,
+      "loss": 0.8655,
+      "step": 541
+    },
+    {
+      "epoch": 2.463636363636364,
+      "grad_norm": 2.378777265548706,
+      "learning_rate": 9.090909090909092e-05,
+      "loss": 1.1019,
+      "step": 542
+    },
+    {
+      "epoch": 2.4681818181818183,
+      "grad_norm": 1.7346596717834473,
+      "learning_rate": 9.015151515151515e-05,
+      "loss": 0.6478,
+      "step": 543
+    },
+    {
+      "epoch": 2.4727272727272727,
+      "grad_norm": 1.8121107816696167,
+      "learning_rate": 8.93939393939394e-05,
+      "loss": 0.9549,
+      "step": 544
+    },
+    {
+      "epoch": 2.4772727272727275,
+      "grad_norm": 1.9102083444595337,
+      "learning_rate": 8.863636363636364e-05,
+      "loss": 0.9103,
+      "step": 545
+    },
+    {
+      "epoch": 2.481818181818182,
+      "grad_norm": 2.3148677349090576,
+      "learning_rate": 8.787878787878787e-05,
+      "loss": 1.1075,
+      "step": 546
+    },
+    {
+      "epoch": 2.4863636363636363,
+      "grad_norm": 2.3098530769348145,
+      "learning_rate": 8.712121212121212e-05,
+      "loss": 1.0885,
+      "step": 547
+    },
+    {
+      "epoch": 2.4909090909090907,
+      "grad_norm": 2.061582565307617,
+      "learning_rate": 8.636363636363636e-05,
+      "loss": 0.7894,
+      "step": 548
+    },
+    {
+      "epoch": 2.4954545454545456,
+      "grad_norm": 2.3829803466796875,
+      "learning_rate": 8.560606060606061e-05,
+      "loss": 1.2397,
+      "step": 549
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 2.111055612564087,
+      "learning_rate": 8.484848484848486e-05,
+      "loss": 1.0463,
+      "step": 550
+    },
+    {
+      "epoch": 2.5045454545454544,
+      "grad_norm": 1.883468508720398,
+      "learning_rate": 8.40909090909091e-05,
+      "loss": 0.9837,
+      "step": 551
+    },
+    {
+      "epoch": 2.509090909090909,
+      "grad_norm": 1.8480087518692017,
+      "learning_rate": 8.333333333333333e-05,
+      "loss": 0.897,
+      "step": 552
+    },
+    {
+      "epoch": 2.5136363636363637,
+      "grad_norm": 1.9513871669769287,
+      "learning_rate": 8.257575757575758e-05,
+      "loss": 0.9668,
+      "step": 553
+    },
+    {
+      "epoch": 2.518181818181818,
+      "grad_norm": 1.5687415599822998,
+      "learning_rate": 8.181818181818182e-05,
+      "loss": 0.8729,
+      "step": 554
+    },
+    {
+      "epoch": 2.5227272727272725,
+      "grad_norm": 1.959887981414795,
+      "learning_rate": 8.106060606060605e-05,
+      "loss": 0.9612,
+      "step": 555
+    },
+    {
+      "epoch": 2.5272727272727273,
+      "grad_norm": 2.1609091758728027,
+      "learning_rate": 8.030303030303031e-05,
+      "loss": 1.133,
+      "step": 556
+    },
+    {
+      "epoch": 2.5318181818181817,
+      "grad_norm": 2.534611225128174,
+      "learning_rate": 7.954545454545455e-05,
+      "loss": 1.3566,
+      "step": 557
+    },
+    {
+      "epoch": 2.536363636363636,
+      "grad_norm": 2.731877088546753,
+      "learning_rate": 7.878787878787879e-05,
+      "loss": 1.1991,
+      "step": 558
+    },
+    {
+      "epoch": 2.540909090909091,
+      "grad_norm": 2.1953938007354736,
+      "learning_rate": 7.803030303030304e-05,
+      "loss": 0.932,
+      "step": 559
+    },
+    {
+      "epoch": 2.5454545454545454,
+      "grad_norm": 2.260007381439209,
+      "learning_rate": 7.727272727272727e-05,
+      "loss": 1.0682,
+      "step": 560
+    },
+    {
+      "epoch": 2.55,
+      "grad_norm": 2.9932045936584473,
+      "learning_rate": 7.651515151515151e-05,
+      "loss": 1.2489,
+      "step": 561
+    },
+    {
+      "epoch": 2.5545454545454547,
+      "grad_norm": 2.4135005474090576,
+      "learning_rate": 7.575757575757576e-05,
+      "loss": 0.7289,
+      "step": 562
+    },
+    {
+      "epoch": 2.559090909090909,
+      "grad_norm": 2.2235300540924072,
+      "learning_rate": 7.5e-05,
+      "loss": 0.7027,
+      "step": 563
+    },
+    {
+      "epoch": 2.5636363636363635,
+      "grad_norm": 2.6621127128601074,
+      "learning_rate": 7.424242424242426e-05,
+      "loss": 1.2601,
+      "step": 564
+    },
+    {
+      "epoch": 2.5681818181818183,
+      "grad_norm": 2.574686050415039,
+      "learning_rate": 7.348484848484849e-05,
+      "loss": 1.1076,
+      "step": 565
+    },
+    {
+      "epoch": 2.5727272727272728,
+      "grad_norm": 2.416339874267578,
+      "learning_rate": 7.272727272727273e-05,
+      "loss": 0.9473,
+      "step": 566
+    },
+    {
+      "epoch": 2.577272727272727,
+      "grad_norm": 1.7082793712615967,
+      "learning_rate": 7.196969696969698e-05,
+      "loss": 0.7671,
+      "step": 567
+    },
+    {
+      "epoch": 2.581818181818182,
+      "grad_norm": 2.220196008682251,
+      "learning_rate": 7.121212121212121e-05,
+      "loss": 1.1754,
+      "step": 568
+    },
+    {
+      "epoch": 2.5863636363636364,
+      "grad_norm": 2.26267409324646,
+      "learning_rate": 7.045454545454545e-05,
+      "loss": 1.4229,
+      "step": 569
+    },
+    {
+      "epoch": 2.590909090909091,
+      "grad_norm": 1.7881556749343872,
+      "learning_rate": 6.96969696969697e-05,
+      "loss": 0.8333,
+      "step": 570
+    },
+    {
+      "epoch": 2.5954545454545457,
+      "grad_norm": 2.156179904937744,
+      "learning_rate": 6.893939393939393e-05,
+      "loss": 1.1788,
+      "step": 571
+    },
+    {
+      "epoch": 2.6,
+      "grad_norm": 1.6093627214431763,
+      "learning_rate": 6.818181818181818e-05,
+      "loss": 0.6442,
+      "step": 572
+    },
+    {
+      "epoch": 2.6045454545454545,
+      "grad_norm": 1.984737753868103,
+      "learning_rate": 6.742424242424243e-05,
+      "loss": 0.9969,
+      "step": 573
+    },
+    {
+      "epoch": 2.6090909090909093,
+      "grad_norm": 1.958917498588562,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 0.8534,
+      "step": 574
+    },
+    {
+      "epoch": 2.6136363636363638,
+      "grad_norm": 3.060192346572876,
+      "learning_rate": 6.59090909090909e-05,
+      "loss": 1.4748,
+      "step": 575
+    },
+    {
+      "epoch": 2.618181818181818,
+      "grad_norm": 1.9157240390777588,
+      "learning_rate": 6.515151515151516e-05,
+      "loss": 0.8512,
+      "step": 576
+    },
+    {
+      "epoch": 2.6227272727272726,
+      "grad_norm": 1.6468448638916016,
+      "learning_rate": 6.439393939393939e-05,
+      "loss": 0.7536,
+      "step": 577
+    },
+    {
+      "epoch": 2.6272727272727274,
+      "grad_norm": 1.9406344890594482,
+      "learning_rate": 6.363636363636363e-05,
+      "loss": 0.9798,
+      "step": 578
+    },
+    {
+      "epoch": 2.631818181818182,
+      "grad_norm": 1.8992547988891602,
+      "learning_rate": 6.287878787878788e-05,
+      "loss": 1.1394,
+      "step": 579
+    },
+    {
+      "epoch": 2.6363636363636362,
+      "grad_norm": 1.8168598413467407,
+      "learning_rate": 6.212121212121213e-05,
+      "loss": 0.8848,
+      "step": 580
+    },
+    {
+      "epoch": 2.6409090909090907,
+      "grad_norm": 2.8009986877441406,
+      "learning_rate": 6.136363636363636e-05,
+      "loss": 1.1817,
+      "step": 581
+    },
+    {
+      "epoch": 2.6454545454545455,
+      "grad_norm": 1.8650470972061157,
+      "learning_rate": 6.060606060606061e-05,
+      "loss": 0.9148,
+      "step": 582
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 2.132161855697632,
+      "learning_rate": 5.984848484848485e-05,
+      "loss": 1.0103,
+      "step": 583
+    },
+    {
+      "epoch": 2.6545454545454543,
+      "grad_norm": 2.488576650619507,
+      "learning_rate": 5.909090909090909e-05,
+      "loss": 1.5804,
+      "step": 584
+    },
+    {
+      "epoch": 2.659090909090909,
+      "grad_norm": 1.7953377962112427,
+      "learning_rate": 5.833333333333333e-05,
+      "loss": 0.8419,
+      "step": 585
+    },
+    {
+      "epoch": 2.6636363636363636,
+      "grad_norm": 2.563900947570801,
+      "learning_rate": 5.757575757575758e-05,
+      "loss": 1.1122,
+      "step": 586
+    },
+    {
+      "epoch": 2.668181818181818,
+      "grad_norm": 2.112504243850708,
+      "learning_rate": 5.681818181818182e-05,
+      "loss": 0.8345,
+      "step": 587
+    },
+    {
+      "epoch": 2.672727272727273,
+      "grad_norm": 2.874629020690918,
+      "learning_rate": 5.606060606060606e-05,
+      "loss": 1.257,
+      "step": 588
+    },
+    {
+      "epoch": 2.6772727272727272,
+      "grad_norm": 2.3965139389038086,
+      "learning_rate": 5.5303030303030304e-05,
+      "loss": 1.4174,
+      "step": 589
+    },
+    {
+      "epoch": 2.6818181818181817,
+      "grad_norm": 2.149787425994873,
+      "learning_rate": 5.4545454545454546e-05,
+      "loss": 1.0162,
+      "step": 590
+    },
+    {
+      "epoch": 2.6863636363636365,
+      "grad_norm": 3.67689847946167,
+      "learning_rate": 5.378787878787879e-05,
+      "loss": 0.9925,
+      "step": 591
+    },
+    {
+      "epoch": 2.690909090909091,
+      "grad_norm": 2.144545316696167,
+      "learning_rate": 5.303030303030303e-05,
+      "loss": 1.2257,
+      "step": 592
+    },
+    {
+      "epoch": 2.6954545454545453,
+      "grad_norm": 1.9149094820022583,
+      "learning_rate": 5.227272727272727e-05,
+      "loss": 0.7236,
+      "step": 593
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 2.803966999053955,
+      "learning_rate": 5.151515151515152e-05,
+      "loss": 1.1317,
+      "step": 594
+    },
+    {
+      "epoch": 2.7045454545454546,
+      "grad_norm": 2.1107089519500732,
+      "learning_rate": 5.075757575757576e-05,
+      "loss": 0.9265,
+      "step": 595
+    },
+    {
+      "epoch": 2.709090909090909,
+      "grad_norm": 2.037118911743164,
+      "learning_rate": 5e-05,
+      "loss": 0.6859,
+      "step": 596
+    },
+    {
+      "epoch": 2.713636363636364,
+      "grad_norm": 2.310952663421631,
+      "learning_rate": 4.924242424242424e-05,
+      "loss": 0.98,
+      "step": 597
+    },
+    {
+      "epoch": 2.7181818181818183,
+      "grad_norm": 1.9925788640975952,
+      "learning_rate": 4.848484848484849e-05,
+      "loss": 0.8919,
+      "step": 598
+    },
+    {
+      "epoch": 2.7227272727272727,
+      "grad_norm": 2.466705083847046,
+      "learning_rate": 4.772727272727273e-05,
+      "loss": 1.3115,
+      "step": 599
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 2.8948001861572266,
+      "learning_rate": 4.6969696969696966e-05,
+      "loss": 1.4843,
+      "step": 600
+    },
+    {
+      "epoch": 2.731818181818182,
+      "grad_norm": 1.8009178638458252,
+      "learning_rate": 4.621212121212121e-05,
+      "loss": 0.8387,
+      "step": 601
+    },
+    {
+      "epoch": 2.7363636363636363,
+      "grad_norm": 1.7695908546447754,
+      "learning_rate": 4.545454545454546e-05,
+      "loss": 0.6376,
+      "step": 602
+    },
+    {
+      "epoch": 2.740909090909091,
+      "grad_norm": 2.255938768386841,
+      "learning_rate": 4.46969696969697e-05,
+      "loss": 0.9277,
+      "step": 603
+    },
+    {
+      "epoch": 2.7454545454545456,
+      "grad_norm": 2.6216013431549072,
+      "learning_rate": 4.393939393939394e-05,
+      "loss": 0.8539,
+      "step": 604
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 2.33111310005188,
+      "learning_rate": 4.318181818181818e-05,
+      "loss": 0.766,
+      "step": 605
+    },
+    {
+      "epoch": 2.7545454545454544,
+      "grad_norm": 2.256770610809326,
+      "learning_rate": 4.242424242424243e-05,
+      "loss": 0.9658,
+      "step": 606
+    },
+    {
+      "epoch": 2.759090909090909,
+      "grad_norm": 2.4762847423553467,
+      "learning_rate": 4.1666666666666665e-05,
+      "loss": 0.8902,
+      "step": 607
+    },
+    {
+      "epoch": 2.7636363636363637,
+      "grad_norm": 1.8913813829421997,
+      "learning_rate": 4.090909090909091e-05,
+      "loss": 0.6938,
+      "step": 608
+    },
+    {
+      "epoch": 2.768181818181818,
+      "grad_norm": 1.928743839263916,
+      "learning_rate": 4.015151515151516e-05,
+      "loss": 0.637,
+      "step": 609
+    },
+    {
+      "epoch": 2.7727272727272725,
+      "grad_norm": 3.3095438480377197,
+      "learning_rate": 3.939393939393939e-05,
+      "loss": 0.9913,
+      "step": 610
+    },
+    {
+      "epoch": 2.7772727272727273,
+      "grad_norm": 2.611701011657715,
+      "learning_rate": 3.8636363636363636e-05,
+      "loss": 1.1573,
+      "step": 611
+    },
+    {
+      "epoch": 2.7818181818181817,
+      "grad_norm": 2.022073745727539,
+      "learning_rate": 3.787878787878788e-05,
+      "loss": 0.9537,
+      "step": 612
+    },
+    {
+      "epoch": 2.786363636363636,
+      "grad_norm": 2.9264447689056396,
+      "learning_rate": 3.712121212121213e-05,
+      "loss": 1.4692,
+      "step": 613
+    },
+    {
+      "epoch": 2.790909090909091,
+      "grad_norm": 2.22469425201416,
+      "learning_rate": 3.6363636363636364e-05,
+      "loss": 1.0733,
+      "step": 614
+    },
+    {
+      "epoch": 2.7954545454545454,
+      "grad_norm": 2.8329367637634277,
+      "learning_rate": 3.560606060606061e-05,
+      "loss": 1.5153,
+      "step": 615
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 1.8949047327041626,
+      "learning_rate": 3.484848484848485e-05,
+      "loss": 0.6316,
+      "step": 616
+    },
+    {
+      "epoch": 2.8045454545454547,
+      "grad_norm": 2.597440242767334,
+      "learning_rate": 3.409090909090909e-05,
+      "loss": 1.6049,
+      "step": 617
+    },
+    {
+      "epoch": 2.809090909090909,
+      "grad_norm": 2.0482330322265625,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 1.0083,
+      "step": 618
+    },
+    {
+      "epoch": 2.8136363636363635,
+      "grad_norm": 1.7359944581985474,
+      "learning_rate": 3.257575757575758e-05,
+      "loss": 0.9097,
+      "step": 619
+    },
+    {
+      "epoch": 2.8181818181818183,
+      "grad_norm": 1.9825539588928223,
+      "learning_rate": 3.1818181818181814e-05,
+      "loss": 0.7107,
+      "step": 620
+    },
+    {
+      "epoch": 2.8227272727272728,
+      "grad_norm": 1.7564197778701782,
+      "learning_rate": 3.106060606060606e-05,
+      "loss": 0.9089,
+      "step": 621
+    },
+    {
+      "epoch": 2.827272727272727,
+      "grad_norm": 2.735137939453125,
+      "learning_rate": 3.0303030303030306e-05,
+      "loss": 1.4978,
+      "step": 622
+    },
+    {
+      "epoch": 2.831818181818182,
+      "grad_norm": 2.702873706817627,
+      "learning_rate": 2.9545454545454545e-05,
+      "loss": 1.2556,
+      "step": 623
+    },
+    {
+      "epoch": 2.8363636363636364,
+      "grad_norm": 1.9755101203918457,
+      "learning_rate": 2.878787878787879e-05,
+      "loss": 0.8022,
+      "step": 624
+    },
+    {
+      "epoch": 2.840909090909091,
+      "grad_norm": 2.0104050636291504,
+      "learning_rate": 2.803030303030303e-05,
+      "loss": 0.8993,
+      "step": 625
+    },
+    {
+      "epoch": 2.8454545454545457,
+      "grad_norm": 2.915588855743408,
+      "learning_rate": 2.7272727272727273e-05,
+      "loss": 1.7374,
+      "step": 626
+    },
+    {
+      "epoch": 2.85,
+      "grad_norm": 2.5364370346069336,
+      "learning_rate": 2.6515151515151516e-05,
+      "loss": 0.7967,
+      "step": 627
+    },
+    {
+      "epoch": 2.8545454545454545,
+      "grad_norm": 2.731673002243042,
+      "learning_rate": 2.575757575757576e-05,
+      "loss": 1.5847,
+      "step": 628
+    },
+    {
+      "epoch": 2.8590909090909093,
+      "grad_norm": 2.1468403339385986,
+      "learning_rate": 2.5e-05,
+      "loss": 1.0101,
+      "step": 629
+    },
+    {
+      "epoch": 2.8636363636363638,
+      "grad_norm": 1.8162040710449219,
+      "learning_rate": 2.4242424242424244e-05,
+      "loss": 0.6688,
+      "step": 630
+    },
+    {
+      "epoch": 2.868181818181818,
+      "grad_norm": 2.285930871963501,
+      "learning_rate": 2.3484848484848483e-05,
+      "loss": 1.1906,
+      "step": 631
+    },
+    {
+      "epoch": 2.8727272727272726,
+      "grad_norm": 2.0448861122131348,
+      "learning_rate": 2.272727272727273e-05,
+      "loss": 0.8868,
+      "step": 632
+    },
+    {
+      "epoch": 2.8772727272727274,
+      "grad_norm": 2.98807692527771,
+      "learning_rate": 2.196969696969697e-05,
+      "loss": 1.1679,
+      "step": 633
+    },
+    {
+      "epoch": 2.881818181818182,
+      "grad_norm": 1.9618700742721558,
+      "learning_rate": 2.1212121212121215e-05,
+      "loss": 1.0659,
+      "step": 634
+    },
+    {
+      "epoch": 2.8863636363636362,
+      "grad_norm": 2.200741767883301,
+      "learning_rate": 2.0454545454545454e-05,
+      "loss": 1.0582,
+      "step": 635
+    },
+    {
+      "epoch": 2.8909090909090907,
+      "grad_norm": 2.1953506469726562,
+      "learning_rate": 1.9696969696969697e-05,
+      "loss": 1.4356,
+      "step": 636
+    },
+    {
+      "epoch": 2.8954545454545455,
+      "grad_norm": 2.1912357807159424,
+      "learning_rate": 1.893939393939394e-05,
+      "loss": 1.0532,
+      "step": 637
+    },
+    {
+      "epoch": 2.9,
+      "grad_norm": 2.125601053237915,
+      "learning_rate": 1.8181818181818182e-05,
+      "loss": 1.458,
+      "step": 638
+    },
+    {
+      "epoch": 2.9045454545454543,
+      "grad_norm": 1.9430787563323975,
+      "learning_rate": 1.7424242424242425e-05,
+      "loss": 0.8313,
+      "step": 639
+    },
+    {
+      "epoch": 2.909090909090909,
+      "grad_norm": 1.7255802154541016,
+      "learning_rate": 1.6666666666666667e-05,
+      "loss": 0.7894,
+      "step": 640
+    },
+    {
+      "epoch": 2.9136363636363636,
+      "grad_norm": 2.259798526763916,
+      "learning_rate": 1.5909090909090907e-05,
+      "loss": 1.0942,
+      "step": 641
+    },
+    {
+      "epoch": 2.918181818181818,
+      "grad_norm": 2.4443533420562744,
+      "learning_rate": 1.5151515151515153e-05,
+      "loss": 1.5392,
+      "step": 642
+    },
+    {
+      "epoch": 2.922727272727273,
+      "grad_norm": 2.437310218811035,
+      "learning_rate": 1.4393939393939396e-05,
+      "loss": 0.9475,
+      "step": 643
+    },
+    {
+      "epoch": 2.9272727272727272,
+      "grad_norm": 2.1248443126678467,
+      "learning_rate": 1.3636363636363637e-05,
+      "loss": 0.6917,
+      "step": 644
+    },
+    {
+      "epoch": 2.9318181818181817,
+      "grad_norm": 2.0161659717559814,
+      "learning_rate": 1.287878787878788e-05,
+      "loss": 1.0465,
+      "step": 645
+    },
+    {
+      "epoch": 2.9363636363636365,
+      "grad_norm": 1.825695514678955,
+      "learning_rate": 1.2121212121212122e-05,
+      "loss": 0.8001,
+      "step": 646
+    },
+    {
+      "epoch": 2.940909090909091,
+      "grad_norm": 2.58219575881958,
+      "learning_rate": 1.1363636363636365e-05,
+      "loss": 0.9649,
+      "step": 647
+    },
+    {
+      "epoch": 2.9454545454545453,
+      "grad_norm": 1.9554407596588135,
+      "learning_rate": 1.0606060606060607e-05,
+      "loss": 1.1447,
+      "step": 648
+    },
+    {
+      "epoch": 2.95,
+      "grad_norm": 2.89900541305542,
+      "learning_rate": 9.848484848484848e-06,
+      "loss": 0.9461,
+      "step": 649
+    },
+    {
+      "epoch": 2.9545454545454546,
+      "grad_norm": 1.8475868701934814,
+      "learning_rate": 9.090909090909091e-06,
+      "loss": 1.1863,
+      "step": 650
+    },
+    {
+      "epoch": 2.959090909090909,
+      "grad_norm": 2.264302968978882,
+      "learning_rate": 8.333333333333334e-06,
+      "loss": 0.9459,
+      "step": 651
+    },
+    {
+      "epoch": 2.963636363636364,
+      "grad_norm": 2.157198190689087,
+      "learning_rate": 7.5757575757575764e-06,
+      "loss": 1.4461,
+      "step": 652
+    },
+    {
+      "epoch": 2.9681818181818183,
+      "grad_norm": 2.3027210235595703,
+      "learning_rate": 6.818181818181818e-06,
+      "loss": 0.8407,
+      "step": 653
+    },
+    {
+      "epoch": 2.9727272727272727,
+      "grad_norm": 1.786800503730774,
+      "learning_rate": 6.060606060606061e-06,
+      "loss": 0.7051,
+      "step": 654
+    },
+    {
+      "epoch": 2.9772727272727275,
+      "grad_norm": 2.4173872470855713,
+      "learning_rate": 5.303030303030304e-06,
+      "loss": 1.0276,
+      "step": 655
+    },
+    {
+      "epoch": 2.981818181818182,
+      "grad_norm": 3.755701780319214,
+      "learning_rate": 4.5454545454545455e-06,
+      "loss": 0.9572,
+      "step": 656
+    },
+    {
+      "epoch": 2.9863636363636363,
+      "grad_norm": 2.0097804069519043,
+      "learning_rate": 3.7878787878787882e-06,
+      "loss": 0.9736,
+      "step": 657
+    },
+    {
+      "epoch": 2.990909090909091,
+      "grad_norm": 1.773881196975708,
+      "learning_rate": 3.0303030303030305e-06,
+      "loss": 0.8404,
+      "step": 658
+    },
+    {
+      "epoch": 2.9954545454545456,
+      "grad_norm": 2.139065980911255,
+      "learning_rate": 2.2727272727272728e-06,
+      "loss": 1.1301,
+      "step": 659
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 1.8846311569213867,
+      "learning_rate": 1.5151515151515152e-06,
+      "loss": 0.7227,
+      "step": 660
+    },
+    {
+      "epoch": 3.0,
+      "eval_f1": 0.8952,
+      "eval_gen_len": 41.8455,
+      "eval_loss": 1.8697103261947632,
+      "eval_precision": 0.8934,
+      "eval_recall": 0.8971,
+      "eval_rouge1": 0.4709,
+      "eval_rouge2": 0.2223,
+      "eval_rougeL": 0.3999,
+      "eval_rougeLsum": 0.4391,
+      "eval_runtime": 25.3276,
+      "eval_samples_per_second": 4.343,
+      "eval_steps_per_second": 0.553,
+      "step": 660
+    },
+    {
+      "epoch": 3.0,
+      "step": 660,
+      "total_flos": 2484005840363520.0,
+      "train_loss": 1.4827006761774872,
+      "train_runtime": 507.3071,
+      "train_samples_per_second": 5.198,
+      "train_steps_per_second": 1.301
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 660,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2484005840363520.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}