End of training

Browse files

Files changed (7) hide show

README.md +19 -17
adapter_1/adapter_config.json +32 -0
adapter_1/adapter_model.safetensors +3 -0
adapter_config.json +6 -3
adapter_model.safetensors +1 -1
trainer_state.json +1636 -0
training_args.bin +2 -2

README.md CHANGED Viewed

@@ -1,8 +1,9 @@
 ---
 license: apache-2.0
-base_model: openai/whisper-large-v3
 tags:
 - generated_from_trainer
 model-index:
 - name: Whisper-large-v3-Arabic-phoneme
   results: []
@@ -13,9 +14,9 @@ should probably proofread and complete it, then remove this comment. -->
 # Whisper-large-v3-Arabic-phoneme
-This model is a fine-tuned version of [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.2361
 ## Model description
@@ -48,21 +49,22 @@ The following hyperparameters were used during training:
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
-| 0.0747        | 1.0   | 546  | 0.2187          |
-| 0.0343        | 2.0   | 1092 | 0.1932          |
-| 0.0221        | 3.0   | 1638 | 0.1981          |
-| 0.0315        | 4.0   | 2184 | 0.1915          |
-| 0.0272        | 5.0   | 2730 | 0.2095          |
-| 0.0147        | 6.0   | 3276 | 0.2060          |
-| 0.0108        | 7.0   | 3822 | 0.2101          |
-| 0.0057        | 8.0   | 4368 | 0.2165          |
-| 0.0026        | 9.0   | 4914 | 0.2271          |
-| 0.002         | 10.0  | 5460 | 0.2361          |
 ### Framework versions
-- Transformers 4.35.2
-- Pytorch 2.1.0+cu121
-- Datasets 2.16.1
-- Tokenizers 0.15.0

 ---
 license: apache-2.0
+library_name: peft
 tags:
 - generated_from_trainer
+base_model: openai/whisper-large-v3
 model-index:
 - name: Whisper-large-v3-Arabic-phoneme
   results: []
 # Whisper-large-v3-Arabic-phoneme
+This model is a fine-tuned version of [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) on an unknown dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.2331
 ## Model description
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:-----:|:----:|:---------------:|
+| 0.0608        | 1.0   | 546  | 0.2074          |
+| 0.0508        | 2.0   | 1092 | 0.2211          |
+| 0.0287        | 3.0   | 1638 | 0.1681          |
+| 0.0148        | 4.0   | 2184 | 0.1938          |
+| 0.0263        | 5.0   | 2730 | 0.1846          |
+| 0.0168        | 6.0   | 3276 | 0.1899          |
+| 0.0086        | 7.0   | 3822 | 0.1975          |
+| 0.0102        | 8.0   | 4368 | 0.2170          |
+| 0.0023        | 9.0   | 4914 | 0.2294          |
+| 0.0024        | 10.0  | 5460 | 0.2331          |
 ### Framework versions
+- PEFT 0.10.0
+- Transformers 4.38.2
+- Pytorch 2.2.1+cu121
+- Datasets 2.18.0
+- Tokenizers 0.15.2

adapter_1/adapter_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": {
+    "base_model_class": "WhisperForConditionalGeneration",
+    "parent_library": "transformers.models.whisper.modeling_whisper"
+  },
+  "base_model_name_or_path": "openai/whisper-large-v3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_1/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08026702b7a1be7021b6a8e05de062904a5b406498ebe8c9213965d21f3676a2
+size 62969640

adapter_config.json CHANGED Viewed

@@ -9,6 +9,7 @@
   "fan_in_fan_out": false,
   "inference_mode": true,
   "init_lora_weights": true,
   "layers_pattern": null,
   "layers_to_transform": null,
   "loftq_config": {},
@@ -22,8 +23,10 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "q_proj",
-    "v_proj"
   ],
-  "task_type": null
 }

   "fan_in_fan_out": false,
   "inference_mode": true,
   "init_lora_weights": true,
+  "layer_replication": null,
   "layers_pattern": null,
   "layers_to_transform": null,
   "loftq_config": {},
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "v_proj",
+    "q_proj"
   ],
+  "task_type": null,
+  "use_dora": false,
+  "use_rslora": false
 }

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:01fa831ca803f1d3b293be2dd55e908d1ffbf7bb14621f30dd36d4f231b07417
 size 62969640

 version https://git-lfs.github.com/spec/v1
+oid sha256:f5ec4edfc2c7ee47d1229478f8a8d2c11bff55bd08585acde03693a1a00498f2
 size 62969640

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1636 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 10.0,
+  "eval_steps": 500,
+  "global_step": 5460,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.0899150371551514,
+      "learning_rate": 0.0005,
+      "loss": 2.9446,
+      "step": 25
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.7281716465950012,
+      "learning_rate": 0.001,
+      "loss": 0.7356,
+      "step": 50
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.8861620426177979,
+      "learning_rate": 0.0009963031423290203,
+      "loss": 0.5765,
+      "step": 75
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 2.8832926750183105,
+      "learning_rate": 0.0009916820702402958,
+      "loss": 0.3944,
+      "step": 100
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.077749252319336,
+      "learning_rate": 0.000987060998151571,
+      "loss": 0.1654,
+      "step": 125
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.5631522536277771,
+      "learning_rate": 0.0009824399260628465,
+      "loss": 0.1044,
+      "step": 150
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.25788772106170654,
+      "learning_rate": 0.000977818853974122,
+      "loss": 0.0853,
+      "step": 175
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.7229248285293579,
+      "learning_rate": 0.0009731977818853974,
+      "loss": 0.088,
+      "step": 200
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.39239415526390076,
+      "learning_rate": 0.0009685767097966729,
+      "loss": 0.0781,
+      "step": 225
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.23850314319133759,
+      "learning_rate": 0.0009639556377079483,
+      "loss": 0.0729,
+      "step": 250
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.17475274205207825,
+      "learning_rate": 0.0009593345656192237,
+      "loss": 0.0671,
+      "step": 275
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.7980681657791138,
+      "learning_rate": 0.0009547134935304991,
+      "loss": 0.0836,
+      "step": 300
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.20785190165042877,
+      "learning_rate": 0.0009500924214417745,
+      "loss": 0.0717,
+      "step": 325
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.22275149822235107,
+      "learning_rate": 0.0009454713493530499,
+      "loss": 0.0469,
+      "step": 350
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.39296770095825195,
+      "learning_rate": 0.0009408502772643253,
+      "loss": 0.0815,
+      "step": 375
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.3089096248149872,
+      "learning_rate": 0.0009362292051756007,
+      "loss": 0.0668,
+      "step": 400
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.343288779258728,
+      "learning_rate": 0.0009316081330868762,
+      "loss": 0.0481,
+      "step": 425
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.1959904432296753,
+      "learning_rate": 0.0009269870609981515,
+      "loss": 0.0554,
+      "step": 450
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.5835228562355042,
+      "learning_rate": 0.000922365988909427,
+      "loss": 0.079,
+      "step": 475
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.1368131786584854,
+      "learning_rate": 0.0009177449168207024,
+      "loss": 0.0575,
+      "step": 500
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.24407006800174713,
+      "learning_rate": 0.0009131238447319779,
+      "loss": 0.0608,
+      "step": 525
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.2074204683303833,
+      "eval_runtime": 457.9814,
+      "eval_samples_per_second": 1.775,
+      "eval_steps_per_second": 0.297,
+      "step": 546
+    },
+    {
+      "epoch": 1.01,
+      "grad_norm": 0.12714482843875885,
+      "learning_rate": 0.0009085027726432532,
+      "loss": 0.0468,
+      "step": 550
+    },
+    {
+      "epoch": 1.05,
+      "grad_norm": 0.3958694636821747,
+      "learning_rate": 0.0009038817005545287,
+      "loss": 0.0486,
+      "step": 575
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.0621350072324276,
+      "learning_rate": 0.0008992606284658041,
+      "loss": 0.0531,
+      "step": 600
+    },
+    {
+      "epoch": 1.14,
+      "grad_norm": 0.019190674647688866,
+      "learning_rate": 0.0008946395563770795,
+      "loss": 0.0404,
+      "step": 625
+    },
+    {
+      "epoch": 1.19,
+      "grad_norm": 0.10872188955545425,
+      "learning_rate": 0.000890018484288355,
+      "loss": 0.0516,
+      "step": 650
+    },
+    {
+      "epoch": 1.24,
+      "grad_norm": 0.21620219945907593,
+      "learning_rate": 0.0008853974121996303,
+      "loss": 0.0352,
+      "step": 675
+    },
+    {
+      "epoch": 1.28,
+      "grad_norm": 0.09226030856370926,
+      "learning_rate": 0.0008807763401109058,
+      "loss": 0.046,
+      "step": 700
+    },
+    {
+      "epoch": 1.33,
+      "grad_norm": 0.28338831663131714,
+      "learning_rate": 0.0008761552680221812,
+      "loss": 0.0541,
+      "step": 725
+    },
+    {
+      "epoch": 1.37,
+      "grad_norm": 0.29886457324028015,
+      "learning_rate": 0.0008715341959334566,
+      "loss": 0.0469,
+      "step": 750
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.18679702281951904,
+      "learning_rate": 0.000866913123844732,
+      "loss": 0.0479,
+      "step": 775
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 0.37105533480644226,
+      "learning_rate": 0.0008622920517560074,
+      "loss": 0.0482,
+      "step": 800
+    },
+    {
+      "epoch": 1.51,
+      "grad_norm": 0.2705371081829071,
+      "learning_rate": 0.0008576709796672828,
+      "loss": 0.0339,
+      "step": 825
+    },
+    {
+      "epoch": 1.56,
+      "grad_norm": 0.23306304216384888,
+      "learning_rate": 0.0008530499075785582,
+      "loss": 0.0333,
+      "step": 850
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.32292288541793823,
+      "learning_rate": 0.0008484288354898336,
+      "loss": 0.0631,
+      "step": 875
+    },
+    {
+      "epoch": 1.65,
+      "grad_norm": 1.583330512046814,
+      "learning_rate": 0.0008438077634011091,
+      "loss": 0.0554,
+      "step": 900
+    },
+    {
+      "epoch": 1.69,
+      "grad_norm": 0.2681499421596527,
+      "learning_rate": 0.0008391866913123844,
+      "loss": 0.0752,
+      "step": 925
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.49241504073143005,
+      "learning_rate": 0.0008345656192236599,
+      "loss": 0.0393,
+      "step": 950
+    },
+    {
+      "epoch": 1.79,
+      "grad_norm": 0.3337648808956146,
+      "learning_rate": 0.0008299445471349352,
+      "loss": 0.0774,
+      "step": 975
+    },
+    {
+      "epoch": 1.83,
+      "grad_norm": 0.3321288228034973,
+      "learning_rate": 0.0008253234750462108,
+      "loss": 0.0513,
+      "step": 1000
+    },
+    {
+      "epoch": 1.88,
+      "grad_norm": 0.03459502011537552,
+      "learning_rate": 0.0008207024029574861,
+      "loss": 0.0389,
+      "step": 1025
+    },
+    {
+      "epoch": 1.92,
+      "grad_norm": 0.3316815197467804,
+      "learning_rate": 0.0008160813308687616,
+      "loss": 0.0556,
+      "step": 1050
+    },
+    {
+      "epoch": 1.97,
+      "grad_norm": 0.3402951657772064,
+      "learning_rate": 0.000811460258780037,
+      "loss": 0.0508,
+      "step": 1075
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.22112932801246643,
+      "eval_runtime": 457.2021,
+      "eval_samples_per_second": 1.778,
+      "eval_steps_per_second": 0.297,
+      "step": 1092
+    },
+    {
+      "epoch": 2.01,
+      "grad_norm": 0.23782561719417572,
+      "learning_rate": 0.0008068391866913124,
+      "loss": 0.0493,
+      "step": 1100
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 1.8971781730651855,
+      "learning_rate": 0.0008022181146025879,
+      "loss": 0.0332,
+      "step": 1125
+    },
+    {
+      "epoch": 2.11,
+      "grad_norm": 0.3028140664100647,
+      "learning_rate": 0.0007975970425138632,
+      "loss": 0.0718,
+      "step": 1150
+    },
+    {
+      "epoch": 2.15,
+      "grad_norm": 0.9446136355400085,
+      "learning_rate": 0.0007929759704251387,
+      "loss": 0.0541,
+      "step": 1175
+    },
+    {
+      "epoch": 2.2,
+      "grad_norm": 0.1498022973537445,
+      "learning_rate": 0.000788354898336414,
+      "loss": 0.0383,
+      "step": 1200
+    },
+    {
+      "epoch": 2.24,
+      "grad_norm": 0.2323082983493805,
+      "learning_rate": 0.0007837338262476895,
+      "loss": 0.0326,
+      "step": 1225
+    },
+    {
+      "epoch": 2.29,
+      "grad_norm": 0.15694022178649902,
+      "learning_rate": 0.0007791127541589649,
+      "loss": 0.0743,
+      "step": 1250
+    },
+    {
+      "epoch": 2.34,
+      "grad_norm": 0.18881258368492126,
+      "learning_rate": 0.0007744916820702403,
+      "loss": 0.0398,
+      "step": 1275
+    },
+    {
+      "epoch": 2.38,
+      "grad_norm": 0.5623793005943298,
+      "learning_rate": 0.0007698706099815157,
+      "loss": 0.0375,
+      "step": 1300
+    },
+    {
+      "epoch": 2.43,
+      "grad_norm": 0.5120436549186707,
+      "learning_rate": 0.0007652495378927911,
+      "loss": 0.0303,
+      "step": 1325
+    },
+    {
+      "epoch": 2.47,
+      "grad_norm": 0.22715122997760773,
+      "learning_rate": 0.0007606284658040665,
+      "loss": 0.0324,
+      "step": 1350
+    },
+    {
+      "epoch": 2.52,
+      "grad_norm": 0.235545352101326,
+      "learning_rate": 0.000756007393715342,
+      "loss": 0.0303,
+      "step": 1375
+    },
+    {
+      "epoch": 2.56,
+      "grad_norm": 0.29885298013687134,
+      "learning_rate": 0.0007513863216266173,
+      "loss": 0.0302,
+      "step": 1400
+    },
+    {
+      "epoch": 2.61,
+      "grad_norm": 0.30983462929725647,
+      "learning_rate": 0.0007467652495378928,
+      "loss": 0.0306,
+      "step": 1425
+    },
+    {
+      "epoch": 2.66,
+      "grad_norm": 0.3211610019207001,
+      "learning_rate": 0.0007421441774491681,
+      "loss": 0.0223,
+      "step": 1450
+    },
+    {
+      "epoch": 2.7,
+      "grad_norm": 0.020555827766656876,
+      "learning_rate": 0.0007375231053604437,
+      "loss": 0.0236,
+      "step": 1475
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 0.3003218472003937,
+      "learning_rate": 0.000732902033271719,
+      "loss": 0.0292,
+      "step": 1500
+    },
+    {
+      "epoch": 2.79,
+      "grad_norm": 0.23368410766124725,
+      "learning_rate": 0.0007282809611829945,
+      "loss": 0.035,
+      "step": 1525
+    },
+    {
+      "epoch": 2.84,
+      "grad_norm": 0.24805304408073425,
+      "learning_rate": 0.00072365988909427,
+      "loss": 0.0168,
+      "step": 1550
+    },
+    {
+      "epoch": 2.88,
+      "grad_norm": 0.1878635734319687,
+      "learning_rate": 0.0007190388170055453,
+      "loss": 0.0277,
+      "step": 1575
+    },
+    {
+      "epoch": 2.93,
+      "grad_norm": 0.3910519480705261,
+      "learning_rate": 0.0007144177449168208,
+      "loss": 0.0317,
+      "step": 1600
+    },
+    {
+      "epoch": 2.98,
+      "grad_norm": 0.12426433712244034,
+      "learning_rate": 0.0007097966728280961,
+      "loss": 0.0287,
+      "step": 1625
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.16812074184417725,
+      "eval_runtime": 461.1656,
+      "eval_samples_per_second": 1.763,
+      "eval_steps_per_second": 0.295,
+      "step": 1638
+    },
+    {
+      "epoch": 3.02,
+      "grad_norm": 0.009334838949143887,
+      "learning_rate": 0.0007051756007393716,
+      "loss": 0.0203,
+      "step": 1650
+    },
+    {
+      "epoch": 3.07,
+      "grad_norm": 0.15624983608722687,
+      "learning_rate": 0.000700554528650647,
+      "loss": 0.0222,
+      "step": 1675
+    },
+    {
+      "epoch": 3.11,
+      "grad_norm": 0.016200415790081024,
+      "learning_rate": 0.0006959334565619224,
+      "loss": 0.0157,
+      "step": 1700
+    },
+    {
+      "epoch": 3.16,
+      "grad_norm": 0.23733393847942352,
+      "learning_rate": 0.0006913123844731978,
+      "loss": 0.0291,
+      "step": 1725
+    },
+    {
+      "epoch": 3.21,
+      "grad_norm": 0.3168778419494629,
+      "learning_rate": 0.0006866913123844732,
+      "loss": 0.0256,
+      "step": 1750
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 0.22910478711128235,
+      "learning_rate": 0.0006820702402957486,
+      "loss": 0.0211,
+      "step": 1775
+    },
+    {
+      "epoch": 3.3,
+      "grad_norm": 0.17075732350349426,
+      "learning_rate": 0.000677449168207024,
+      "loss": 0.0267,
+      "step": 1800
+    },
+    {
+      "epoch": 3.34,
+      "grad_norm": 0.17666079103946686,
+      "learning_rate": 0.0006728280961182994,
+      "loss": 0.0187,
+      "step": 1825
+    },
+    {
+      "epoch": 3.39,
+      "grad_norm": 0.020759843289852142,
+      "learning_rate": 0.0006682070240295749,
+      "loss": 0.0265,
+      "step": 1850
+    },
+    {
+      "epoch": 3.43,
+      "grad_norm": 0.04912843555212021,
+      "learning_rate": 0.0006635859519408502,
+      "loss": 0.0359,
+      "step": 1875
+    },
+    {
+      "epoch": 3.48,
+      "grad_norm": 0.32245051860809326,
+      "learning_rate": 0.0006589648798521257,
+      "loss": 0.0175,
+      "step": 1900
+    },
+    {
+      "epoch": 3.53,
+      "grad_norm": 0.11074571311473846,
+      "learning_rate": 0.000654343807763401,
+      "loss": 0.0178,
+      "step": 1925
+    },
+    {
+      "epoch": 3.57,
+      "grad_norm": 0.03214950114488602,
+      "learning_rate": 0.0006497227356746766,
+      "loss": 0.0192,
+      "step": 1950
+    },
+    {
+      "epoch": 3.62,
+      "grad_norm": 0.008081772364675999,
+      "learning_rate": 0.000645101663585952,
+      "loss": 0.0317,
+      "step": 1975
+    },
+    {
+      "epoch": 3.66,
+      "grad_norm": 0.1419508308172226,
+      "learning_rate": 0.0006404805914972274,
+      "loss": 0.0297,
+      "step": 2000
+    },
+    {
+      "epoch": 3.71,
+      "grad_norm": 0.01866302080452442,
+      "learning_rate": 0.0006358595194085029,
+      "loss": 0.0264,
+      "step": 2025
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 0.0233779214322567,
+      "learning_rate": 0.0006312384473197782,
+      "loss": 0.0285,
+      "step": 2050
+    },
+    {
+      "epoch": 3.8,
+      "grad_norm": 0.023354342207312584,
+      "learning_rate": 0.0006266173752310537,
+      "loss": 0.021,
+      "step": 2075
+    },
+    {
+      "epoch": 3.85,
+      "grad_norm": 0.2687942087650299,
+      "learning_rate": 0.000621996303142329,
+      "loss": 0.0201,
+      "step": 2100
+    },
+    {
+      "epoch": 3.89,
+      "grad_norm": 0.15729880332946777,
+      "learning_rate": 0.0006173752310536045,
+      "loss": 0.0433,
+      "step": 2125
+    },
+    {
+      "epoch": 3.94,
+      "grad_norm": 0.13736210763454437,
+      "learning_rate": 0.0006127541589648799,
+      "loss": 0.0269,
+      "step": 2150
+    },
+    {
+      "epoch": 3.98,
+      "grad_norm": 0.17346683144569397,
+      "learning_rate": 0.0006081330868761553,
+      "loss": 0.0148,
+      "step": 2175
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.1938144713640213,
+      "eval_runtime": 460.937,
+      "eval_samples_per_second": 1.764,
+      "eval_steps_per_second": 0.295,
+      "step": 2184
+    },
+    {
+      "epoch": 4.03,
+      "grad_norm": 0.1308988779783249,
+      "learning_rate": 0.0006035120147874307,
+      "loss": 0.0195,
+      "step": 2200
+    },
+    {
+      "epoch": 4.08,
+      "grad_norm": 0.21798363327980042,
+      "learning_rate": 0.0005988909426987061,
+      "loss": 0.018,
+      "step": 2225
+    },
+    {
+      "epoch": 4.12,
+      "grad_norm": 0.059930045157670975,
+      "learning_rate": 0.0005942698706099815,
+      "loss": 0.0157,
+      "step": 2250
+    },
+    {
+      "epoch": 4.17,
+      "grad_norm": 1.1142582893371582,
+      "learning_rate": 0.000589648798521257,
+      "loss": 0.0252,
+      "step": 2275
+    },
+    {
+      "epoch": 4.21,
+      "grad_norm": 0.17724983394145966,
+      "learning_rate": 0.0005850277264325323,
+      "loss": 0.0251,
+      "step": 2300
+    },
+    {
+      "epoch": 4.26,
+      "grad_norm": 0.7539493441581726,
+      "learning_rate": 0.0005804066543438078,
+      "loss": 0.0226,
+      "step": 2325
+    },
+    {
+      "epoch": 4.3,
+      "grad_norm": 0.15379472076892853,
+      "learning_rate": 0.0005757855822550831,
+      "loss": 0.0154,
+      "step": 2350
+    },
+    {
+      "epoch": 4.35,
+      "grad_norm": 0.11480142921209335,
+      "learning_rate": 0.0005711645101663586,
+      "loss": 0.0302,
+      "step": 2375
+    },
+    {
+      "epoch": 4.4,
+      "grad_norm": 0.29920852184295654,
+      "learning_rate": 0.0005665434380776339,
+      "loss": 0.0223,
+      "step": 2400
+    },
+    {
+      "epoch": 4.44,
+      "grad_norm": 0.2625471353530884,
+      "learning_rate": 0.0005619223659889095,
+      "loss": 0.0126,
+      "step": 2425
+    },
+    {
+      "epoch": 4.49,
+      "grad_norm": 0.2014468014240265,
+      "learning_rate": 0.0005573012939001849,
+      "loss": 0.0262,
+      "step": 2450
+    },
+    {
+      "epoch": 4.53,
+      "grad_norm": 0.20631028711795807,
+      "learning_rate": 0.0005526802218114603,
+      "loss": 0.0128,
+      "step": 2475
+    },
+    {
+      "epoch": 4.58,
+      "grad_norm": 0.1370575875043869,
+      "learning_rate": 0.0005480591497227358,
+      "loss": 0.0156,
+      "step": 2500
+    },
+    {
+      "epoch": 4.62,
+      "grad_norm": 0.058717742562294006,
+      "learning_rate": 0.0005434380776340111,
+      "loss": 0.0162,
+      "step": 2525
+    },
+    {
+      "epoch": 4.67,
+      "grad_norm": 0.010219058953225613,
+      "learning_rate": 0.0005388170055452866,
+      "loss": 0.0169,
+      "step": 2550
+    },
+    {
+      "epoch": 4.72,
+      "grad_norm": 0.1966046839952469,
+      "learning_rate": 0.0005341959334565619,
+      "loss": 0.0138,
+      "step": 2575
+    },
+    {
+      "epoch": 4.76,
+      "grad_norm": 0.4009633958339691,
+      "learning_rate": 0.0005295748613678374,
+      "loss": 0.0219,
+      "step": 2600
+    },
+    {
+      "epoch": 4.81,
+      "grad_norm": 0.002215989399701357,
+      "learning_rate": 0.0005249537892791128,
+      "loss": 0.0141,
+      "step": 2625
+    },
+    {
+      "epoch": 4.85,
+      "grad_norm": 0.31632617115974426,
+      "learning_rate": 0.0005203327171903882,
+      "loss": 0.0121,
+      "step": 2650
+    },
+    {
+      "epoch": 4.9,
+      "grad_norm": 0.003695544321089983,
+      "learning_rate": 0.0005157116451016636,
+      "loss": 0.0152,
+      "step": 2675
+    },
+    {
+      "epoch": 4.95,
+      "grad_norm": 0.03792522847652435,
+      "learning_rate": 0.000511090573012939,
+      "loss": 0.0248,
+      "step": 2700
+    },
+    {
+      "epoch": 4.99,
+      "grad_norm": 0.22871936857700348,
+      "learning_rate": 0.0005064695009242144,
+      "loss": 0.0263,
+      "step": 2725
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 0.1845603585243225,
+      "eval_runtime": 463.3179,
+      "eval_samples_per_second": 1.755,
+      "eval_steps_per_second": 0.294,
+      "step": 2730
+    },
+    {
+      "epoch": 5.04,
+      "grad_norm": 0.2171017825603485,
+      "learning_rate": 0.0005018484288354898,
+      "loss": 0.0141,
+      "step": 2750
+    },
+    {
+      "epoch": 5.08,
+      "grad_norm": 0.0038620266132056713,
+      "learning_rate": 0.0004972273567467653,
+      "loss": 0.0058,
+      "step": 2775
+    },
+    {
+      "epoch": 5.13,
+      "grad_norm": 0.2298993319272995,
+      "learning_rate": 0.0004926062846580407,
+      "loss": 0.0109,
+      "step": 2800
+    },
+    {
+      "epoch": 5.17,
+      "grad_norm": 0.11024118214845657,
+      "learning_rate": 0.0004879852125693161,
+      "loss": 0.0189,
+      "step": 2825
+    },
+    {
+      "epoch": 5.22,
+      "grad_norm": 0.0567074678838253,
+      "learning_rate": 0.00048336414048059153,
+      "loss": 0.0089,
+      "step": 2850
+    },
+    {
+      "epoch": 5.27,
+      "grad_norm": 0.004749275743961334,
+      "learning_rate": 0.00047874306839186694,
+      "loss": 0.0114,
+      "step": 2875
+    },
+    {
+      "epoch": 5.31,
+      "grad_norm": 0.14577604830265045,
+      "learning_rate": 0.00047412199630314235,
+      "loss": 0.0129,
+      "step": 2900
+    },
+    {
+      "epoch": 5.36,
+      "grad_norm": 0.11412041634321213,
+      "learning_rate": 0.00046950092421441775,
+      "loss": 0.0127,
+      "step": 2925
+    },
+    {
+      "epoch": 5.4,
+      "grad_norm": 0.29708778858184814,
+      "learning_rate": 0.00046487985212569316,
+      "loss": 0.0334,
+      "step": 2950
+    },
+    {
+      "epoch": 5.45,
+      "grad_norm": 0.12024960666894913,
+      "learning_rate": 0.00046025878003696857,
+      "loss": 0.0153,
+      "step": 2975
+    },
+    {
+      "epoch": 5.49,
+      "grad_norm": 0.12928707897663116,
+      "learning_rate": 0.000455637707948244,
+      "loss": 0.0153,
+      "step": 3000
+    },
+    {
+      "epoch": 5.54,
+      "grad_norm": 0.1562725156545639,
+      "learning_rate": 0.0004510166358595194,
+      "loss": 0.0174,
+      "step": 3025
+    },
+    {
+      "epoch": 5.59,
+      "grad_norm": 0.270773321390152,
+      "learning_rate": 0.00044639556377079484,
+      "loss": 0.0221,
+      "step": 3050
+    },
+    {
+      "epoch": 5.63,
+      "grad_norm": 0.36300143599510193,
+      "learning_rate": 0.00044177449168207025,
+      "loss": 0.0219,
+      "step": 3075
+    },
+    {
+      "epoch": 5.68,
+      "grad_norm": 0.1869664192199707,
+      "learning_rate": 0.00043715341959334566,
+      "loss": 0.0127,
+      "step": 3100
+    },
+    {
+      "epoch": 5.72,
+      "grad_norm": 0.23992718756198883,
+      "learning_rate": 0.00043253234750462107,
+      "loss": 0.0144,
+      "step": 3125
+    },
+    {
+      "epoch": 5.77,
+      "grad_norm": 0.0021614329889416695,
+      "learning_rate": 0.00042791127541589647,
+      "loss": 0.0115,
+      "step": 3150
+    },
+    {
+      "epoch": 5.82,
+      "grad_norm": 0.10629579424858093,
+      "learning_rate": 0.0004232902033271719,
+      "loss": 0.0097,
+      "step": 3175
+    },
+    {
+      "epoch": 5.86,
+      "grad_norm": 0.15990929305553436,
+      "learning_rate": 0.00041866913123844734,
+      "loss": 0.0118,
+      "step": 3200
+    },
+    {
+      "epoch": 5.91,
+      "grad_norm": 0.19984115660190582,
+      "learning_rate": 0.00041404805914972275,
+      "loss": 0.014,
+      "step": 3225
+    },
+    {
+      "epoch": 5.95,
+      "grad_norm": 0.06720598042011261,
+      "learning_rate": 0.00040942698706099816,
+      "loss": 0.0182,
+      "step": 3250
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 0.006452410947531462,
+      "learning_rate": 0.0004048059149722736,
+      "loss": 0.0168,
+      "step": 3275
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 0.18991737067699432,
+      "eval_runtime": 463.6922,
+      "eval_samples_per_second": 1.753,
+      "eval_steps_per_second": 0.293,
+      "step": 3276
+    },
+    {
+      "epoch": 6.04,
+      "grad_norm": 0.003035791451111436,
+      "learning_rate": 0.000400184842883549,
+      "loss": 0.0103,
+      "step": 3300
+    },
+    {
+      "epoch": 6.09,
+      "grad_norm": 0.20400433242321014,
+      "learning_rate": 0.00039556377079482443,
+      "loss": 0.0094,
+      "step": 3325
+    },
+    {
+      "epoch": 6.14,
+      "grad_norm": 0.05333884805440903,
+      "learning_rate": 0.00039094269870609984,
+      "loss": 0.0135,
+      "step": 3350
+    },
+    {
+      "epoch": 6.18,
+      "grad_norm": 0.0005341291544027627,
+      "learning_rate": 0.00038632162661737525,
+      "loss": 0.0042,
+      "step": 3375
+    },
+    {
+      "epoch": 6.23,
+      "grad_norm": 0.008850090205669403,
+      "learning_rate": 0.00038170055452865065,
+      "loss": 0.012,
+      "step": 3400
+    },
+    {
+      "epoch": 6.27,
+      "grad_norm": 0.013096541166305542,
+      "learning_rate": 0.00037707948243992606,
+      "loss": 0.0121,
+      "step": 3425
+    },
+    {
+      "epoch": 6.32,
+      "grad_norm": 0.04941894859075546,
+      "learning_rate": 0.00037245841035120147,
+      "loss": 0.0086,
+      "step": 3450
+    },
+    {
+      "epoch": 6.36,
+      "grad_norm": 0.02113133668899536,
+      "learning_rate": 0.0003678373382624769,
+      "loss": 0.0109,
+      "step": 3475
+    },
+    {
+      "epoch": 6.41,
+      "grad_norm": 0.03568890690803528,
+      "learning_rate": 0.0003632162661737523,
+      "loss": 0.0072,
+      "step": 3500
+    },
+    {
+      "epoch": 6.46,
+      "grad_norm": 0.0471993163228035,
+      "learning_rate": 0.00035859519408502774,
+      "loss": 0.0074,
+      "step": 3525
+    },
+    {
+      "epoch": 6.5,
+      "grad_norm": 0.13575506210327148,
+      "learning_rate": 0.00035397412199630315,
+      "loss": 0.0084,
+      "step": 3550
+    },
+    {
+      "epoch": 6.55,
+      "grad_norm": 0.14595919847488403,
+      "learning_rate": 0.00034935304990757856,
+      "loss": 0.0063,
+      "step": 3575
+    },
+    {
+      "epoch": 6.59,
+      "grad_norm": 0.012155482545495033,
+      "learning_rate": 0.00034473197781885397,
+      "loss": 0.0092,
+      "step": 3600
+    },
+    {
+      "epoch": 6.64,
+      "grad_norm": 0.010629130527377129,
+      "learning_rate": 0.0003401109057301294,
+      "loss": 0.0134,
+      "step": 3625
+    },
+    {
+      "epoch": 6.68,
+      "grad_norm": 0.19480323791503906,
+      "learning_rate": 0.00033548983364140483,
+      "loss": 0.0141,
+      "step": 3650
+    },
+    {
+      "epoch": 6.73,
+      "grad_norm": 0.0660039409995079,
+      "learning_rate": 0.00033086876155268024,
+      "loss": 0.0077,
+      "step": 3675
+    },
+    {
+      "epoch": 6.78,
+      "grad_norm": 0.0346703939139843,
+      "learning_rate": 0.00032624768946395565,
+      "loss": 0.0117,
+      "step": 3700
+    },
+    {
+      "epoch": 6.82,
+      "grad_norm": 0.22081167995929718,
+      "learning_rate": 0.00032162661737523106,
+      "loss": 0.0099,
+      "step": 3725
+    },
+    {
+      "epoch": 6.87,
+      "grad_norm": 0.06806311756372452,
+      "learning_rate": 0.0003170055452865065,
+      "loss": 0.0115,
+      "step": 3750
+    },
+    {
+      "epoch": 6.91,
+      "grad_norm": 0.0030449284240603447,
+      "learning_rate": 0.0003123844731977819,
+      "loss": 0.0136,
+      "step": 3775
+    },
+    {
+      "epoch": 6.96,
+      "grad_norm": 0.07848404347896576,
+      "learning_rate": 0.00030776340110905733,
+      "loss": 0.0086,
+      "step": 3800
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 0.19748586416244507,
+      "eval_runtime": 457.6513,
+      "eval_samples_per_second": 1.776,
+      "eval_steps_per_second": 0.297,
+      "step": 3822
+    },
+    {
+      "epoch": 7.01,
+      "grad_norm": 0.04702220484614372,
+      "learning_rate": 0.00030314232902033274,
+      "loss": 0.009,
+      "step": 3825
+    },
+    {
+      "epoch": 7.05,
+      "grad_norm": 0.026063207536935806,
+      "learning_rate": 0.00029852125693160815,
+      "loss": 0.0079,
+      "step": 3850
+    },
+    {
+      "epoch": 7.1,
+      "grad_norm": 0.021074611693620682,
+      "learning_rate": 0.00029390018484288355,
+      "loss": 0.0042,
+      "step": 3875
+    },
+    {
+      "epoch": 7.14,
+      "grad_norm": 0.15062950551509857,
+      "learning_rate": 0.00028927911275415896,
+      "loss": 0.0073,
+      "step": 3900
+    },
+    {
+      "epoch": 7.19,
+      "grad_norm": 0.12703749537467957,
+      "learning_rate": 0.00028465804066543437,
+      "loss": 0.0047,
+      "step": 3925
+    },
+    {
+      "epoch": 7.23,
+      "grad_norm": 0.0032193493098020554,
+      "learning_rate": 0.0002800369685767098,
+      "loss": 0.0071,
+      "step": 3950
+    },
+    {
+      "epoch": 7.28,
+      "grad_norm": 0.046458516269922256,
+      "learning_rate": 0.0002754158964879852,
+      "loss": 0.0074,
+      "step": 3975
+    },
+    {
+      "epoch": 7.33,
+      "grad_norm": 0.0037984629161655903,
+      "learning_rate": 0.00027079482439926065,
+      "loss": 0.0059,
+      "step": 4000
+    },
+    {
+      "epoch": 7.37,
+      "grad_norm": 0.14948821067810059,
+      "learning_rate": 0.00026617375231053605,
+      "loss": 0.0058,
+      "step": 4025
+    },
+    {
+      "epoch": 7.42,
+      "grad_norm": 0.07740973681211472,
+      "learning_rate": 0.00026155268022181146,
+      "loss": 0.0069,
+      "step": 4050
+    },
+    {
+      "epoch": 7.46,
+      "grad_norm": 0.0008731162524782121,
+      "learning_rate": 0.00025693160813308687,
+      "loss": 0.0077,
+      "step": 4075
+    },
+    {
+      "epoch": 7.51,
+      "grad_norm": 0.001257123309187591,
+      "learning_rate": 0.0002523105360443623,
+      "loss": 0.0046,
+      "step": 4100
+    },
+    {
+      "epoch": 7.55,
+      "grad_norm": 0.042853593826293945,
+      "learning_rate": 0.00024768946395563774,
+      "loss": 0.0065,
+      "step": 4125
+    },
+    {
+      "epoch": 7.6,
+      "grad_norm": 0.0009361489792354405,
+      "learning_rate": 0.00024306839186691312,
+      "loss": 0.0073,
+      "step": 4150
+    },
+    {
+      "epoch": 7.65,
+      "grad_norm": 0.04179251566529274,
+      "learning_rate": 0.00023844731977818855,
+      "loss": 0.0043,
+      "step": 4175
+    },
+    {
+      "epoch": 7.69,
+      "grad_norm": 0.008879727683961391,
+      "learning_rate": 0.00023382624768946396,
+      "loss": 0.0095,
+      "step": 4200
+    },
+    {
+      "epoch": 7.74,
+      "grad_norm": 0.12861858308315277,
+      "learning_rate": 0.00022920517560073937,
+      "loss": 0.008,
+      "step": 4225
+    },
+    {
+      "epoch": 7.78,
+      "grad_norm": 0.01530044712126255,
+      "learning_rate": 0.0002245841035120148,
+      "loss": 0.0032,
+      "step": 4250
+    },
+    {
+      "epoch": 7.83,
+      "grad_norm": 0.02794441021978855,
+      "learning_rate": 0.0002199630314232902,
+      "loss": 0.0087,
+      "step": 4275
+    },
+    {
+      "epoch": 7.88,
+      "grad_norm": 0.16127441823482513,
+      "learning_rate": 0.00021534195933456564,
+      "loss": 0.0063,
+      "step": 4300
+    },
+    {
+      "epoch": 7.92,
+      "grad_norm": 0.09033193439245224,
+      "learning_rate": 0.00021072088724584105,
+      "loss": 0.0086,
+      "step": 4325
+    },
+    {
+      "epoch": 7.97,
+      "grad_norm": 0.016767054796218872,
+      "learning_rate": 0.00020609981515711646,
+      "loss": 0.0102,
+      "step": 4350
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 0.21702823042869568,
+      "eval_runtime": 460.637,
+      "eval_samples_per_second": 1.765,
+      "eval_steps_per_second": 0.295,
+      "step": 4368
+    },
+    {
+      "epoch": 8.01,
+      "grad_norm": 0.10574544966220856,
+      "learning_rate": 0.00020147874306839186,
+      "loss": 0.0047,
+      "step": 4375
+    },
+    {
+      "epoch": 8.06,
+      "grad_norm": 0.0006509744562208652,
+      "learning_rate": 0.00019685767097966727,
+      "loss": 0.005,
+      "step": 4400
+    },
+    {
+      "epoch": 8.1,
+      "grad_norm": 0.0009672873420640826,
+      "learning_rate": 0.0001922365988909427,
+      "loss": 0.0046,
+      "step": 4425
+    },
+    {
+      "epoch": 8.15,
+      "grad_norm": 0.07224971055984497,
+      "learning_rate": 0.0001876155268022181,
+      "loss": 0.0043,
+      "step": 4450
+    },
+    {
+      "epoch": 8.2,
+      "grad_norm": 0.12703950703144073,
+      "learning_rate": 0.00018299445471349355,
+      "loss": 0.0058,
+      "step": 4475
+    },
+    {
+      "epoch": 8.24,
+      "grad_norm": 0.0013393750414252281,
+      "learning_rate": 0.00017837338262476895,
+      "loss": 0.0041,
+      "step": 4500
+    },
+    {
+      "epoch": 8.29,
+      "grad_norm": 0.03772176802158356,
+      "learning_rate": 0.00017375231053604436,
+      "loss": 0.0043,
+      "step": 4525
+    },
+    {
+      "epoch": 8.33,
+      "grad_norm": 0.0009993729181587696,
+      "learning_rate": 0.0001691312384473198,
+      "loss": 0.0051,
+      "step": 4550
+    },
+    {
+      "epoch": 8.38,
+      "grad_norm": 0.0068801455199718475,
+      "learning_rate": 0.0001645101663585952,
+      "loss": 0.0042,
+      "step": 4575
+    },
+    {
+      "epoch": 8.42,
+      "grad_norm": 0.05337873101234436,
+      "learning_rate": 0.0001598890942698706,
+      "loss": 0.0048,
+      "step": 4600
+    },
+    {
+      "epoch": 8.47,
+      "grad_norm": 0.022360146045684814,
+      "learning_rate": 0.00015526802218114602,
+      "loss": 0.0058,
+      "step": 4625
+    },
+    {
+      "epoch": 8.52,
+      "grad_norm": 0.13477857410907745,
+      "learning_rate": 0.00015064695009242142,
+      "loss": 0.0029,
+      "step": 4650
+    },
+    {
+      "epoch": 8.56,
+      "grad_norm": 0.23147088289260864,
+      "learning_rate": 0.00014602587800369686,
+      "loss": 0.0057,
+      "step": 4675
+    },
+    {
+      "epoch": 8.61,
+      "grad_norm": 0.0034095763694494963,
+      "learning_rate": 0.0001414048059149723,
+      "loss": 0.0043,
+      "step": 4700
+    },
+    {
+      "epoch": 8.65,
+      "grad_norm": 0.024832753464579582,
+      "learning_rate": 0.0001367837338262477,
+      "loss": 0.0043,
+      "step": 4725
+    },
+    {
+      "epoch": 8.7,
+      "grad_norm": 0.0007142982794903219,
+      "learning_rate": 0.0001321626617375231,
+      "loss": 0.0052,
+      "step": 4750
+    },
+    {
+      "epoch": 8.75,
+      "grad_norm": 0.02869781292974949,
+      "learning_rate": 0.00012754158964879852,
+      "loss": 0.0027,
+      "step": 4775
+    },
+    {
+      "epoch": 8.79,
+      "grad_norm": 0.0004513516614679247,
+      "learning_rate": 0.00012292051756007395,
+      "loss": 0.0036,
+      "step": 4800
+    },
+    {
+      "epoch": 8.84,
+      "grad_norm": 0.0008141061407513916,
+      "learning_rate": 0.00011829944547134936,
+      "loss": 0.0063,
+      "step": 4825
+    },
+    {
+      "epoch": 8.88,
+      "grad_norm": 0.054795410484075546,
+      "learning_rate": 0.00011367837338262476,
+      "loss": 0.0021,
+      "step": 4850
+    },
+    {
+      "epoch": 8.93,
+      "grad_norm": 0.02664073184132576,
+      "learning_rate": 0.0001090573012939002,
+      "loss": 0.0047,
+      "step": 4875
+    },
+    {
+      "epoch": 8.97,
+      "grad_norm": 0.0008485654252581298,
+      "learning_rate": 0.0001044362292051756,
+      "loss": 0.0023,
+      "step": 4900
+    },
+    {
+      "epoch": 9.0,
+      "eval_loss": 0.22940348088741302,
+      "eval_runtime": 460.0411,
+      "eval_samples_per_second": 1.767,
+      "eval_steps_per_second": 0.296,
+      "step": 4914
+    },
+    {
+      "epoch": 9.02,
+      "grad_norm": 0.03195321932435036,
+      "learning_rate": 9.981515711645101e-05,
+      "loss": 0.0034,
+      "step": 4925
+    },
+    {
+      "epoch": 9.07,
+      "grad_norm": 0.0012715512420982122,
+      "learning_rate": 9.519408502772643e-05,
+      "loss": 0.0022,
+      "step": 4950
+    },
+    {
+      "epoch": 9.11,
+      "grad_norm": 0.010319654829800129,
+      "learning_rate": 9.057301293900184e-05,
+      "loss": 0.0024,
+      "step": 4975
+    },
+    {
+      "epoch": 9.16,
+      "grad_norm": 0.003004108089953661,
+      "learning_rate": 8.595194085027728e-05,
+      "loss": 0.0043,
+      "step": 5000
+    },
+    {
+      "epoch": 9.2,
+      "grad_norm": 0.0019885245710611343,
+      "learning_rate": 8.133086876155268e-05,
+      "loss": 0.0022,
+      "step": 5025
+    },
+    {
+      "epoch": 9.25,
+      "grad_norm": 0.06616940349340439,
+      "learning_rate": 7.67097966728281e-05,
+      "loss": 0.0022,
+      "step": 5050
+    },
+    {
+      "epoch": 9.29,
+      "grad_norm": 0.044188376516103745,
+      "learning_rate": 7.208872458410351e-05,
+      "loss": 0.0021,
+      "step": 5075
+    },
+    {
+      "epoch": 9.34,
+      "grad_norm": 0.07102042436599731,
+      "learning_rate": 6.746765249537892e-05,
+      "loss": 0.0023,
+      "step": 5100
+    },
+    {
+      "epoch": 9.39,
+      "grad_norm": 0.018956031650304794,
+      "learning_rate": 6.284658040665435e-05,
+      "loss": 0.002,
+      "step": 5125
+    },
+    {
+      "epoch": 9.43,
+      "grad_norm": 0.007081813644617796,
+      "learning_rate": 5.822550831792976e-05,
+      "loss": 0.0012,
+      "step": 5150
+    },
+    {
+      "epoch": 9.48,
+      "grad_norm": 0.004433480557054281,
+      "learning_rate": 5.3604436229205174e-05,
+      "loss": 0.002,
+      "step": 5175
+    },
+    {
+      "epoch": 9.52,
+      "grad_norm": 0.0015681196236982942,
+      "learning_rate": 4.8983364140480595e-05,
+      "loss": 0.002,
+      "step": 5200
+    },
+    {
+      "epoch": 9.57,
+      "grad_norm": 0.003421030705794692,
+      "learning_rate": 4.436229205175601e-05,
+      "loss": 0.0018,
+      "step": 5225
+    },
+    {
+      "epoch": 9.62,
+      "grad_norm": 0.13036024570465088,
+      "learning_rate": 3.974121996303143e-05,
+      "loss": 0.0016,
+      "step": 5250
+    },
+    {
+      "epoch": 9.66,
+      "grad_norm": 0.05646170675754547,
+      "learning_rate": 3.5120147874306844e-05,
+      "loss": 0.0017,
+      "step": 5275
+    },
+    {
+      "epoch": 9.71,
+      "grad_norm": 0.11519595235586166,
+      "learning_rate": 3.0499075785582258e-05,
+      "loss": 0.0041,
+      "step": 5300
+    },
+    {
+      "epoch": 9.75,
+      "grad_norm": 0.08969979733228683,
+      "learning_rate": 2.5878003696857672e-05,
+      "loss": 0.0022,
+      "step": 5325
+    },
+    {
+      "epoch": 9.8,
+      "grad_norm": 0.13669085502624512,
+      "learning_rate": 2.1256931608133086e-05,
+      "loss": 0.0014,
+      "step": 5350
+    },
+    {
+      "epoch": 9.84,
+      "grad_norm": 0.13668496906757355,
+      "learning_rate": 1.6635859519408503e-05,
+      "loss": 0.0021,
+      "step": 5375
+    },
+    {
+      "epoch": 9.89,
+      "grad_norm": 0.018298327922821045,
+      "learning_rate": 1.2014787430683919e-05,
+      "loss": 0.0023,
+      "step": 5400
+    },
+    {
+      "epoch": 9.94,
+      "grad_norm": 0.007483182940632105,
+      "learning_rate": 7.393715341959335e-06,
+      "loss": 0.0024,
+      "step": 5425
+    },
+    {
+      "epoch": 9.98,
+      "grad_norm": 0.03663533180952072,
+      "learning_rate": 2.7726432532347505e-06,
+      "loss": 0.0024,
+      "step": 5450
+    },
+    {
+      "epoch": 10.0,
+      "eval_loss": 0.23311161994934082,
+      "eval_runtime": 467.6697,
+      "eval_samples_per_second": 1.738,
+      "eval_steps_per_second": 0.291,
+      "step": 5460
+    },
+    {
+      "epoch": 10.0,
+      "step": 5460,
+      "total_flos": 1.135723105419264e+20,
+      "train_loss": 0.04421832180674096,
+      "train_runtime": 28971.5279,
+      "train_samples_per_second": 1.13,
+      "train_steps_per_second": 0.188
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 5460,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "total_flos": 1.135723105419264e+20,
+  "train_batch_size": 6,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fe84d81133dec319b7abc4166cb1b6304581fa18cff39fdc63458eef1f671fbf
-size 4792

 version https://git-lfs.github.com/spec/v1
+oid sha256:44d94fd419ab4b304255d4c0a82980578e4fa72b83b3f051c5ca0af108a4e5d1
+size 5112