Upload 8 files

Browse files

Files changed (8) hide show

README.md +202 -3
adapter_config.json +29 -0
adapter_model.bin +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +1533 -0
training_args.bin +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,202 @@
----
-license: apache-2.0
----

+---
+library_name: peft
+base_model: meta-llama/Llama-2-7b-hf
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT; rank 8, alpha 16, applied to `q_proj` and `v_proj`) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** meta-llama/Llama-2-7b-hf
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app. -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!-- fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.10.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e211554d2d2499de5ff09fc119f5a602c4e7c5adb4f41cb62700940735a0ef9a
+size 16823434

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c2cd740efb0df9f099d25d11aa3aeb6ff75e7bd3df0bf04dcd481b6c226378d7
+size 33662074

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:804f20381c37b46c96243c17cabb9b983b23be6ab9d6205d2ecfa2104981c41c
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f486c885567a0baec6e9923fa8524bdf6029a8792e5a0054f492153f7cdbf0a
+size 1064

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1533 @@

+{
+  "best_metric": 0.28356996178627014,
+  "best_model_checkpoint": "/scratch/czm5kz/llama2-7b_8_50_0.0003_sg_finetuned_combined_with_output/checkpoint-840",
+  "epoch": 49.411764705882355,
+  "eval_steps": 20,
+  "global_step": 840,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.3730320930480957,
+      "learning_rate": 0.0002985882352941176,
+      "loss": 5.2567,
+      "step": 5
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 1.691916584968567,
+      "learning_rate": 0.0002968235294117647,
+      "loss": 4.6315,
+      "step": 10
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.246384859085083,
+      "learning_rate": 0.0002954117647058823,
+      "loss": 4.0725,
+      "step": 15
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 1.5116314888000488,
+      "learning_rate": 0.000294,
+      "loss": 3.5847,
+      "step": 20
+    },
+    {
+      "epoch": 1.18,
+      "eval_loss": 3.544229507446289,
+      "eval_runtime": 1.9069,
+      "eval_samples_per_second": 70.27,
+      "eval_steps_per_second": 8.915,
+      "step": 20
+    },
+    {
+      "epoch": 1.47,
+      "grad_norm": 10.177547454833984,
+      "learning_rate": 0.000292235294117647,
+      "loss": 3.3666,
+      "step": 25
+    },
+    {
+      "epoch": 1.76,
+      "grad_norm": 2.3323142528533936,
+      "learning_rate": 0.0002904705882352941,
+      "loss": 3.0209,
+      "step": 30
+    },
+    {
+      "epoch": 2.06,
+      "grad_norm": 2.621335744857788,
+      "learning_rate": 0.00028870588235294114,
+      "loss": 2.727,
+      "step": 35
+    },
+    {
+      "epoch": 2.35,
+      "grad_norm": 2.8985095024108887,
+      "learning_rate": 0.00028694117647058817,
+      "loss": 2.3286,
+      "step": 40
+    },
+    {
+      "epoch": 2.35,
+      "eval_loss": 2.301132917404175,
+      "eval_runtime": 1.9092,
+      "eval_samples_per_second": 70.187,
+      "eval_steps_per_second": 8.904,
+      "step": 40
+    },
+    {
+      "epoch": 2.65,
+      "grad_norm": 2.4989898204803467,
+      "learning_rate": 0.00028517647058823526,
+      "loss": 2.2305,
+      "step": 45
+    },
+    {
+      "epoch": 2.94,
+      "grad_norm": 3.2718331813812256,
+      "learning_rate": 0.00028341176470588234,
+      "loss": 2.1788,
+      "step": 50
+    },
+    {
+      "epoch": 3.24,
+      "grad_norm": 5.42415714263916,
+      "learning_rate": 0.0002816470588235294,
+      "loss": 1.7836,
+      "step": 55
+    },
+    {
+      "epoch": 3.53,
+      "grad_norm": 4.854240417480469,
+      "learning_rate": 0.00027988235294117646,
+      "loss": 1.5169,
+      "step": 60
+    },
+    {
+      "epoch": 3.53,
+      "eval_loss": 1.3232438564300537,
+      "eval_runtime": 1.9048,
+      "eval_samples_per_second": 70.347,
+      "eval_steps_per_second": 8.925,
+      "step": 60
+    },
+    {
+      "epoch": 3.82,
+      "grad_norm": 3.4465317726135254,
+      "learning_rate": 0.0002781176470588235,
+      "loss": 1.5221,
+      "step": 65
+    },
+    {
+      "epoch": 4.12,
+      "grad_norm": 3.842264175415039,
+      "learning_rate": 0.0002763529411764706,
+      "loss": 1.1387,
+      "step": 70
+    },
+    {
+      "epoch": 4.41,
+      "grad_norm": 5.933740139007568,
+      "learning_rate": 0.0002745882352941176,
+      "loss": 0.9832,
+      "step": 75
+    },
+    {
+      "epoch": 4.71,
+      "grad_norm": 3.646710157394409,
+      "learning_rate": 0.0002728235294117647,
+      "loss": 0.8744,
+      "step": 80
+    },
+    {
+      "epoch": 4.71,
+      "eval_loss": 0.7472816705703735,
+      "eval_runtime": 1.908,
+      "eval_samples_per_second": 70.232,
+      "eval_steps_per_second": 8.91,
+      "step": 80
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 6.984489440917969,
+      "learning_rate": 0.00027105882352941173,
+      "loss": 0.9865,
+      "step": 85
+    },
+    {
+      "epoch": 5.29,
+      "grad_norm": 3.219074010848999,
+      "learning_rate": 0.0002692941176470588,
+      "loss": 0.7145,
+      "step": 90
+    },
+    {
+      "epoch": 5.59,
+      "grad_norm": 4.300885200500488,
+      "learning_rate": 0.00026752941176470585,
+      "loss": 0.6537,
+      "step": 95
+    },
+    {
+      "epoch": 5.88,
+      "grad_norm": 4.009711742401123,
+      "learning_rate": 0.00026576470588235293,
+      "loss": 0.6585,
+      "step": 100
+    },
+    {
+      "epoch": 5.88,
+      "eval_loss": 0.5125073194503784,
+      "eval_runtime": 1.9095,
+      "eval_samples_per_second": 70.175,
+      "eval_steps_per_second": 8.903,
+      "step": 100
+    },
+    {
+      "epoch": 6.18,
+      "grad_norm": 3.3402371406555176,
+      "learning_rate": 0.00026399999999999997,
+      "loss": 0.5481,
+      "step": 105
+    },
+    {
+      "epoch": 6.47,
+      "grad_norm": 5.765257835388184,
+      "learning_rate": 0.00026223529411764705,
+      "loss": 0.5487,
+      "step": 110
+    },
+    {
+      "epoch": 6.76,
+      "grad_norm": 5.511760234832764,
+      "learning_rate": 0.0002604705882352941,
+      "loss": 0.5538,
+      "step": 115
+    },
+    {
+      "epoch": 7.06,
+      "grad_norm": 2.6698474884033203,
+      "learning_rate": 0.0002587058823529411,
+      "loss": 0.5079,
+      "step": 120
+    },
+    {
+      "epoch": 7.06,
+      "eval_loss": 0.42265063524246216,
+      "eval_runtime": 1.9148,
+      "eval_samples_per_second": 69.982,
+      "eval_steps_per_second": 8.878,
+      "step": 120
+    },
+    {
+      "epoch": 7.35,
+      "grad_norm": 2.159120559692383,
+      "learning_rate": 0.0002569411764705882,
+      "loss": 0.4498,
+      "step": 125
+    },
+    {
+      "epoch": 7.65,
+      "grad_norm": 4.337136268615723,
+      "learning_rate": 0.0002551764705882353,
+      "loss": 0.4972,
+      "step": 130
+    },
+    {
+      "epoch": 7.94,
+      "grad_norm": 3.299618721008301,
+      "learning_rate": 0.0002534117647058823,
+      "loss": 0.4687,
+      "step": 135
+    },
+    {
+      "epoch": 8.24,
+      "grad_norm": 2.5050439834594727,
+      "learning_rate": 0.0002516470588235294,
+      "loss": 0.4272,
+      "step": 140
+    },
+    {
+      "epoch": 8.24,
+      "eval_loss": 0.38163435459136963,
+      "eval_runtime": 1.9145,
+      "eval_samples_per_second": 69.992,
+      "eval_steps_per_second": 8.88,
+      "step": 140
+    },
+    {
+      "epoch": 8.53,
+      "grad_norm": 2.425208330154419,
+      "learning_rate": 0.00024988235294117644,
+      "loss": 0.4338,
+      "step": 145
+    },
+    {
+      "epoch": 8.82,
+      "grad_norm": 2.357813835144043,
+      "learning_rate": 0.0002481176470588235,
+      "loss": 0.446,
+      "step": 150
+    },
+    {
+      "epoch": 9.12,
+      "grad_norm": 2.171400547027588,
+      "learning_rate": 0.00024635294117647056,
+      "loss": 0.4526,
+      "step": 155
+    },
+    {
+      "epoch": 9.41,
+      "grad_norm": 2.656935691833496,
+      "learning_rate": 0.00024458823529411764,
+      "loss": 0.3549,
+      "step": 160
+    },
+    {
+      "epoch": 9.41,
+      "eval_loss": 0.3644285202026367,
+      "eval_runtime": 1.9129,
+      "eval_samples_per_second": 70.053,
+      "eval_steps_per_second": 8.887,
+      "step": 160
+    },
+    {
+      "epoch": 9.71,
+      "grad_norm": 1.6828022003173828,
+      "learning_rate": 0.0002428235294117647,
+      "loss": 0.4212,
+      "step": 165
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 2.211177110671997,
+      "learning_rate": 0.00024105882352941176,
+      "loss": 0.4538,
+      "step": 170
+    },
+    {
+      "epoch": 10.29,
+      "grad_norm": 3.526498317718506,
+      "learning_rate": 0.0002392941176470588,
+      "loss": 0.3516,
+      "step": 175
+    },
+    {
+      "epoch": 10.59,
+      "grad_norm": 1.7221890687942505,
+      "learning_rate": 0.00023752941176470585,
+      "loss": 0.357,
+      "step": 180
+    },
+    {
+      "epoch": 10.59,
+      "eval_loss": 0.3521924316883087,
+      "eval_runtime": 1.9162,
+      "eval_samples_per_second": 69.93,
+      "eval_steps_per_second": 8.872,
+      "step": 180
+    },
+    {
+      "epoch": 10.88,
+      "grad_norm": 4.03507137298584,
+      "learning_rate": 0.0002357647058823529,
+      "loss": 0.4831,
+      "step": 185
+    },
+    {
+      "epoch": 11.18,
+      "grad_norm": 1.6729460954666138,
+      "learning_rate": 0.000234,
+      "loss": 0.3714,
+      "step": 190
+    },
+    {
+      "epoch": 11.47,
+      "grad_norm": 7.34691047668457,
+      "learning_rate": 0.00023223529411764705,
+      "loss": 0.3907,
+      "step": 195
+    },
+    {
+      "epoch": 11.76,
+      "grad_norm": 2.635348081588745,
+      "learning_rate": 0.0002304705882352941,
+      "loss": 0.3602,
+      "step": 200
+    },
+    {
+      "epoch": 11.76,
+      "eval_loss": 0.36221587657928467,
+      "eval_runtime": 1.9149,
+      "eval_samples_per_second": 69.977,
+      "eval_steps_per_second": 8.878,
+      "step": 200
+    },
+    {
+      "epoch": 12.06,
+      "grad_norm": 1.367008924484253,
+      "learning_rate": 0.00022870588235294115,
+      "loss": 0.4724,
+      "step": 205
+    },
+    {
+      "epoch": 12.35,
+      "grad_norm": 3.3520970344543457,
+      "learning_rate": 0.0002269411764705882,
+      "loss": 0.3446,
+      "step": 210
+    },
+    {
+      "epoch": 12.65,
+      "grad_norm": 1.5351004600524902,
+      "learning_rate": 0.0002251764705882353,
+      "loss": 0.3585,
+      "step": 215
+    },
+    {
+      "epoch": 12.94,
+      "grad_norm": 1.8775451183319092,
+      "learning_rate": 0.00022341176470588235,
+      "loss": 0.3948,
+      "step": 220
+    },
+    {
+      "epoch": 12.94,
+      "eval_loss": 0.32533711194992065,
+      "eval_runtime": 1.916,
+      "eval_samples_per_second": 69.939,
+      "eval_steps_per_second": 8.873,
+      "step": 220
+    },
+    {
+      "epoch": 13.24,
+      "grad_norm": 9.952605247497559,
+      "learning_rate": 0.00022164705882352938,
+      "loss": 0.3759,
+      "step": 225
+    },
+    {
+      "epoch": 13.53,
+      "grad_norm": 1.7639845609664917,
+      "learning_rate": 0.00021988235294117644,
+      "loss": 0.3728,
+      "step": 230
+    },
+    {
+      "epoch": 13.82,
+      "grad_norm": 4.684399127960205,
+      "learning_rate": 0.0002181176470588235,
+      "loss": 0.3876,
+      "step": 235
+    },
+    {
+      "epoch": 14.12,
+      "grad_norm": 1.1134511232376099,
+      "learning_rate": 0.00021635294117647059,
+      "loss": 0.3364,
+      "step": 240
+    },
+    {
+      "epoch": 14.12,
+      "eval_loss": 0.3308536410331726,
+      "eval_runtime": 1.9172,
+      "eval_samples_per_second": 69.894,
+      "eval_steps_per_second": 8.867,
+      "step": 240
+    },
+    {
+      "epoch": 14.41,
+      "grad_norm": 1.279011607170105,
+      "learning_rate": 0.00021458823529411764,
+      "loss": 0.3951,
+      "step": 245
+    },
+    {
+      "epoch": 14.71,
+      "grad_norm": 1.3024864196777344,
+      "learning_rate": 0.0002128235294117647,
+      "loss": 0.3397,
+      "step": 250
+    },
+    {
+      "epoch": 15.0,
+      "grad_norm": 1.8494690656661987,
+      "learning_rate": 0.00021105882352941174,
+      "loss": 0.3925,
+      "step": 255
+    },
+    {
+      "epoch": 15.29,
+      "grad_norm": 1.291985034942627,
+      "learning_rate": 0.0002092941176470588,
+      "loss": 0.3352,
+      "step": 260
+    },
+    {
+      "epoch": 15.29,
+      "eval_loss": 0.31582340598106384,
+      "eval_runtime": 1.9196,
+      "eval_samples_per_second": 69.805,
+      "eval_steps_per_second": 8.856,
+      "step": 260
+    },
+    {
+      "epoch": 15.59,
+      "grad_norm": 1.7603998184204102,
+      "learning_rate": 0.00020752941176470585,
+      "loss": 0.3316,
+      "step": 265
+    },
+    {
+      "epoch": 15.88,
+      "grad_norm": 1.2616525888442993,
+      "learning_rate": 0.00020576470588235294,
+      "loss": 0.3795,
+      "step": 270
+    },
+    {
+      "epoch": 16.18,
+      "grad_norm": 0.840260922908783,
+      "learning_rate": 0.000204,
+      "loss": 0.3323,
+      "step": 275
+    },
+    {
+      "epoch": 16.47,
+      "grad_norm": 1.1025527715682983,
+      "learning_rate": 0.00020223529411764703,
+      "loss": 0.3213,
+      "step": 280
+    },
+    {
+      "epoch": 16.47,
+      "eval_loss": 0.31221747398376465,
+      "eval_runtime": 1.9206,
+      "eval_samples_per_second": 69.771,
+      "eval_steps_per_second": 8.851,
+      "step": 280
+    },
+    {
+      "epoch": 16.76,
+      "grad_norm": 1.5651593208312988,
+      "learning_rate": 0.0002004705882352941,
+      "loss": 0.3623,
+      "step": 285
+    },
+    {
+      "epoch": 17.06,
+      "grad_norm": 1.450316309928894,
+      "learning_rate": 0.00019870588235294115,
+      "loss": 0.3603,
+      "step": 290
+    },
+    {
+      "epoch": 17.35,
+      "grad_norm": 1.2301772832870483,
+      "learning_rate": 0.00019694117647058823,
+      "loss": 0.3284,
+      "step": 295
+    },
+    {
+      "epoch": 17.65,
+      "grad_norm": 1.1196820735931396,
+      "learning_rate": 0.0001951764705882353,
+      "loss": 0.3634,
+      "step": 300
+    },
+    {
+      "epoch": 17.65,
+      "eval_loss": 0.3087502419948578,
+      "eval_runtime": 1.9196,
+      "eval_samples_per_second": 69.805,
+      "eval_steps_per_second": 8.856,
+      "step": 300
+    },
+    {
+      "epoch": 17.94,
+      "grad_norm": 0.9788000583648682,
+      "learning_rate": 0.00019341176470588233,
+      "loss": 0.3166,
+      "step": 305
+    },
+    {
+      "epoch": 18.24,
+      "grad_norm": 1.0924116373062134,
+      "learning_rate": 0.00019164705882352938,
+      "loss": 0.3131,
+      "step": 310
+    },
+    {
+      "epoch": 18.53,
+      "grad_norm": 0.8083896040916443,
+      "learning_rate": 0.00018988235294117644,
+      "loss": 0.2831,
+      "step": 315
+    },
+    {
+      "epoch": 18.82,
+      "grad_norm": 1.431838870048523,
+      "learning_rate": 0.00018811764705882353,
+      "loss": 0.391,
+      "step": 320
+    },
+    {
+      "epoch": 18.82,
+      "eval_loss": 0.3061848282814026,
+      "eval_runtime": 1.9274,
+      "eval_samples_per_second": 69.525,
+      "eval_steps_per_second": 8.82,
+      "step": 320
+    },
+    {
+      "epoch": 19.12,
+      "grad_norm": 1.0592421293258667,
+      "learning_rate": 0.0001863529411764706,
+      "loss": 0.3602,
+      "step": 325
+    },
+    {
+      "epoch": 19.41,
+      "grad_norm": 1.2424482107162476,
+      "learning_rate": 0.00018458823529411762,
+      "loss": 0.3267,
+      "step": 330
+    },
+    {
+      "epoch": 19.71,
+      "grad_norm": 0.9783121347427368,
+      "learning_rate": 0.00018282352941176468,
+      "loss": 0.3244,
+      "step": 335
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 1.1153055429458618,
+      "learning_rate": 0.00018105882352941174,
+      "loss": 0.3375,
+      "step": 340
+    },
+    {
+      "epoch": 20.0,
+      "eval_loss": 0.296856552362442,
+      "eval_runtime": 1.9197,
+      "eval_samples_per_second": 69.802,
+      "eval_steps_per_second": 8.855,
+      "step": 340
+    },
+    {
+      "epoch": 20.29,
+      "grad_norm": 0.8358296751976013,
+      "learning_rate": 0.0001792941176470588,
+      "loss": 0.2988,
+      "step": 345
+    },
+    {
+      "epoch": 20.59,
+      "grad_norm": 0.925070583820343,
+      "learning_rate": 0.00017752941176470588,
+      "loss": 0.3155,
+      "step": 350
+    },
+    {
+      "epoch": 20.88,
+      "grad_norm": 1.3069535493850708,
+      "learning_rate": 0.00017576470588235294,
+      "loss": 0.3569,
+      "step": 355
+    },
+    {
+      "epoch": 21.18,
+      "grad_norm": 0.7938595414161682,
+      "learning_rate": 0.00017399999999999997,
+      "loss": 0.3207,
+      "step": 360
+    },
+    {
+      "epoch": 21.18,
+      "eval_loss": 0.2990414798259735,
+      "eval_runtime": 1.9221,
+      "eval_samples_per_second": 69.715,
+      "eval_steps_per_second": 8.844,
+      "step": 360
+    },
+    {
+      "epoch": 21.47,
+      "grad_norm": 1.0098820924758911,
+      "learning_rate": 0.00017223529411764703,
+      "loss": 0.3155,
+      "step": 365
+    },
+    {
+      "epoch": 21.76,
+      "grad_norm": 0.9777490496635437,
+      "learning_rate": 0.0001704705882352941,
+      "loss": 0.3352,
+      "step": 370
+    },
+    {
+      "epoch": 22.06,
+      "grad_norm": 0.9189445972442627,
+      "learning_rate": 0.00016870588235294118,
+      "loss": 0.3605,
+      "step": 375
+    },
+    {
+      "epoch": 22.35,
+      "grad_norm": 0.9985356330871582,
+      "learning_rate": 0.00016694117647058824,
+      "loss": 0.3043,
+      "step": 380
+    },
+    {
+      "epoch": 22.35,
+      "eval_loss": 0.29840487241744995,
+      "eval_runtime": 1.9285,
+      "eval_samples_per_second": 69.485,
+      "eval_steps_per_second": 8.815,
+      "step": 380
+    },
+    {
+      "epoch": 22.65,
+      "grad_norm": 1.288031816482544,
+      "learning_rate": 0.00016517647058823527,
+      "loss": 0.325,
+      "step": 385
+    },
+    {
+      "epoch": 22.94,
+      "grad_norm": 1.1529875993728638,
+      "learning_rate": 0.00016341176470588233,
+      "loss": 0.3246,
+      "step": 390
+    },
+    {
+      "epoch": 23.24,
+      "grad_norm": 1.0920621156692505,
+      "learning_rate": 0.0001616470588235294,
+      "loss": 0.2981,
+      "step": 395
+    },
+    {
+      "epoch": 23.53,
+      "grad_norm": 1.149031639099121,
+      "learning_rate": 0.00015988235294117647,
+      "loss": 0.3224,
+      "step": 400
+    },
+    {
+      "epoch": 23.53,
+      "eval_loss": 0.30002671480178833,
+      "eval_runtime": 1.9209,
+      "eval_samples_per_second": 69.759,
+      "eval_steps_per_second": 8.85,
+      "step": 400
+    },
+    {
+      "epoch": 23.82,
+      "grad_norm": 1.0282126665115356,
+      "learning_rate": 0.00015811764705882353,
+      "loss": 0.3327,
+      "step": 405
+    },
+    {
+      "epoch": 24.12,
+      "grad_norm": 0.7551491856575012,
+      "learning_rate": 0.00015635294117647056,
+      "loss": 0.3043,
+      "step": 410
+    },
+    {
+      "epoch": 24.41,
+      "grad_norm": 0.8478440642356873,
+      "learning_rate": 0.00015458823529411762,
+      "loss": 0.3008,
+      "step": 415
+    },
+    {
+      "epoch": 24.71,
+      "grad_norm": 1.0422290563583374,
+      "learning_rate": 0.00015282352941176468,
+      "loss": 0.3192,
+      "step": 420
+    },
+    {
+      "epoch": 24.71,
+      "eval_loss": 0.30060964822769165,
+      "eval_runtime": 1.921,
+      "eval_samples_per_second": 69.755,
+      "eval_steps_per_second": 8.849,
+      "step": 420
+    },
+    {
+      "epoch": 25.0,
+      "grad_norm": 1.0847035646438599,
+      "learning_rate": 0.00015105882352941177,
+      "loss": 0.3513,
+      "step": 425
+    },
+    {
+      "epoch": 25.29,
+      "grad_norm": 1.3947087526321411,
+      "learning_rate": 0.0001492941176470588,
+      "loss": 0.3211,
+      "step": 430
+    },
+    {
+      "epoch": 25.59,
+      "grad_norm": 0.791212797164917,
+      "learning_rate": 0.00014752941176470586,
+      "loss": 0.3131,
+      "step": 435
+    },
+    {
+      "epoch": 25.88,
+      "grad_norm": 0.8528968691825867,
+      "learning_rate": 0.00014576470588235294,
+      "loss": 0.3181,
+      "step": 440
+    },
+    {
+      "epoch": 25.88,
+      "eval_loss": 0.2958623468875885,
+      "eval_runtime": 1.9218,
+      "eval_samples_per_second": 69.728,
+      "eval_steps_per_second": 8.846,
+      "step": 440
+    },
+    {
+      "epoch": 26.18,
+      "grad_norm": 0.97452712059021,
+      "learning_rate": 0.00014399999999999998,
+      "loss": 0.296,
+      "step": 445
+    },
+    {
+      "epoch": 26.47,
+      "grad_norm": 0.8431199193000793,
+      "learning_rate": 0.00014223529411764704,
+      "loss": 0.3248,
+      "step": 450
+    },
+    {
+      "epoch": 26.76,
+      "grad_norm": 0.9948679804801941,
+      "learning_rate": 0.00014047058823529412,
+      "loss": 0.3144,
+      "step": 455
+    },
+    {
+      "epoch": 27.06,
+      "grad_norm": 0.8486154079437256,
+      "learning_rate": 0.00013870588235294115,
+      "loss": 0.3112,
+      "step": 460
+    },
+    {
+      "epoch": 27.06,
+      "eval_loss": 0.29038235545158386,
+      "eval_runtime": 1.9379,
+      "eval_samples_per_second": 69.149,
+      "eval_steps_per_second": 8.773,
+      "step": 460
+    },
+    {
+      "epoch": 27.35,
+      "grad_norm": 1.0578465461730957,
+      "learning_rate": 0.00013694117647058824,
+      "loss": 0.293,
+      "step": 465
+    },
+    {
+      "epoch": 27.65,
+      "grad_norm": 1.0739288330078125,
+      "learning_rate": 0.00013517647058823527,
+      "loss": 0.3178,
+      "step": 470
+    },
+    {
+      "epoch": 27.94,
+      "grad_norm": 0.979906439781189,
+      "learning_rate": 0.00013341176470588233,
+      "loss": 0.3319,
+      "step": 475
+    },
+    {
+      "epoch": 28.24,
+      "grad_norm": 0.7465171813964844,
+      "learning_rate": 0.00013164705882352942,
+      "loss": 0.2888,
+      "step": 480
+    },
+    {
+      "epoch": 28.24,
+      "eval_loss": 0.2925453782081604,
+      "eval_runtime": 1.9228,
+      "eval_samples_per_second": 69.69,
+      "eval_steps_per_second": 8.841,
+      "step": 480
+    },
+    {
+      "epoch": 28.53,
+      "grad_norm": 1.0128196477890015,
+      "learning_rate": 0.00012988235294117645,
+      "loss": 0.3161,
+      "step": 485
+    },
+    {
+      "epoch": 28.82,
+      "grad_norm": 0.9486992359161377,
+      "learning_rate": 0.0001281176470588235,
+      "loss": 0.3116,
+      "step": 490
+    },
+    {
+      "epoch": 29.12,
+      "grad_norm": 0.8076702356338501,
+      "learning_rate": 0.0001263529411764706,
+      "loss": 0.3181,
+      "step": 495
+    },
+    {
+      "epoch": 29.41,
+      "grad_norm": 1.0319690704345703,
+      "learning_rate": 0.00012458823529411763,
+      "loss": 0.294,
+      "step": 500
+    },
+    {
+      "epoch": 29.41,
+      "eval_loss": 0.29097291827201843,
+      "eval_runtime": 1.9206,
+      "eval_samples_per_second": 69.769,
+      "eval_steps_per_second": 8.851,
+      "step": 500
+    },
+    {
+      "epoch": 29.71,
+      "grad_norm": 0.86397784948349,
+      "learning_rate": 0.0001228235294117647,
+      "loss": 0.3109,
+      "step": 505
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 1.0455472469329834,
+      "learning_rate": 0.00012105882352941174,
+      "loss": 0.3299,
+      "step": 510
+    },
+    {
+      "epoch": 30.29,
+      "grad_norm": 1.1065205335617065,
+      "learning_rate": 0.00011929411764705882,
+      "loss": 0.2855,
+      "step": 515
+    },
+    {
+      "epoch": 30.59,
+      "grad_norm": 0.8805180788040161,
+      "learning_rate": 0.00011752941176470587,
+      "loss": 0.3111,
+      "step": 520
+    },
+    {
+      "epoch": 30.59,
+      "eval_loss": 0.2902802526950836,
+      "eval_runtime": 1.9204,
+      "eval_samples_per_second": 69.777,
+      "eval_steps_per_second": 8.852,
+      "step": 520
+    },
+    {
+      "epoch": 30.88,
+      "grad_norm": 0.9637109637260437,
+      "learning_rate": 0.00011576470588235292,
+      "loss": 0.3211,
+      "step": 525
+    },
+    {
+      "epoch": 31.18,
+      "grad_norm": 1.0489997863769531,
+      "learning_rate": 0.00011399999999999999,
+      "loss": 0.3081,
+      "step": 530
+    },
+    {
+      "epoch": 31.47,
+      "grad_norm": 0.8896048665046692,
+      "learning_rate": 0.00011223529411764705,
+      "loss": 0.2956,
+      "step": 535
+    },
+    {
+      "epoch": 31.76,
+      "grad_norm": 1.0869696140289307,
+      "learning_rate": 0.00011047058823529411,
+      "loss": 0.3126,
+      "step": 540
+    },
+    {
+      "epoch": 31.76,
+      "eval_loss": 0.29041385650634766,
+      "eval_runtime": 1.9231,
+      "eval_samples_per_second": 69.679,
+      "eval_steps_per_second": 8.84,
+      "step": 540
+    },
+    {
+      "epoch": 32.06,
+      "grad_norm": 0.8613612651824951,
+      "learning_rate": 0.00010870588235294117,
+      "loss": 0.301,
+      "step": 545
+    },
+    {
+      "epoch": 32.35,
+      "grad_norm": 0.9187039732933044,
+      "learning_rate": 0.00010694117647058822,
+      "loss": 0.2988,
+      "step": 550
+    },
+    {
+      "epoch": 32.65,
+      "grad_norm": 0.8785188794136047,
+      "learning_rate": 0.00010517647058823529,
+      "loss": 0.2911,
+      "step": 555
+    },
+    {
+      "epoch": 32.94,
+      "grad_norm": 0.9897841811180115,
+      "learning_rate": 0.00010341176470588235,
+      "loss": 0.3146,
+      "step": 560
+    },
+    {
+      "epoch": 32.94,
+      "eval_loss": 0.2882142663002014,
+      "eval_runtime": 1.9204,
+      "eval_samples_per_second": 69.779,
+      "eval_steps_per_second": 8.853,
+      "step": 560
+    },
+    {
+      "epoch": 33.24,
+      "grad_norm": 1.1079126596450806,
+      "learning_rate": 0.00010164705882352939,
+      "loss": 0.3177,
+      "step": 565
+    },
+    {
+      "epoch": 33.53,
+      "grad_norm": 0.7854738235473633,
+      "learning_rate": 9.988235294117646e-05,
+      "loss": 0.2869,
+      "step": 570
+    },
+    {
+      "epoch": 33.82,
+      "grad_norm": 0.8364090919494629,
+      "learning_rate": 9.811764705882352e-05,
+      "loss": 0.2907,
+      "step": 575
+    },
+    {
+      "epoch": 34.12,
+      "grad_norm": 0.8299133777618408,
+      "learning_rate": 9.635294117647058e-05,
+      "loss": 0.3208,
+      "step": 580
+    },
+    {
+      "epoch": 34.12,
+      "eval_loss": 0.2867657542228699,
+      "eval_runtime": 1.9208,
+      "eval_samples_per_second": 69.762,
+      "eval_steps_per_second": 8.85,
+      "step": 580
+    },
+    {
+      "epoch": 34.41,
+      "grad_norm": 0.9654828906059265,
+      "learning_rate": 9.458823529411764e-05,
+      "loss": 0.2868,
+      "step": 585
+    },
+    {
+      "epoch": 34.71,
+      "grad_norm": 0.8983874917030334,
+      "learning_rate": 9.282352941176469e-05,
+      "loss": 0.2827,
+      "step": 590
+    },
+    {
+      "epoch": 35.0,
+      "grad_norm": 1.3956698179244995,
+      "learning_rate": 9.105882352941176e-05,
+      "loss": 0.3394,
+      "step": 595
+    },
+    {
+      "epoch": 35.29,
+      "grad_norm": 0.8269518613815308,
+      "learning_rate": 8.929411764705882e-05,
+      "loss": 0.3004,
+      "step": 600
+    },
+    {
+      "epoch": 35.29,
+      "eval_loss": 0.2868058979511261,
+      "eval_runtime": 1.922,
+      "eval_samples_per_second": 69.718,
+      "eval_steps_per_second": 8.845,
+      "step": 600
+    },
+    {
+      "epoch": 35.59,
+      "grad_norm": 0.7342262864112854,
+      "learning_rate": 8.752941176470586e-05,
+      "loss": 0.3018,
+      "step": 605
+    },
+    {
+      "epoch": 35.88,
+      "grad_norm": 0.8472093939781189,
+      "learning_rate": 8.576470588235294e-05,
+      "loss": 0.298,
+      "step": 610
+    },
+    {
+      "epoch": 36.18,
+      "grad_norm": 0.8700847625732422,
+      "learning_rate": 8.4e-05,
+      "loss": 0.3138,
+      "step": 615
+    },
+    {
+      "epoch": 36.47,
+      "grad_norm": 0.6622535586357117,
+      "learning_rate": 8.223529411764705e-05,
+      "loss": 0.2798,
+      "step": 620
+    },
+    {
+      "epoch": 36.47,
+      "eval_loss": 0.286550372838974,
+      "eval_runtime": 1.9225,
+      "eval_samples_per_second": 69.7,
+      "eval_steps_per_second": 8.843,
+      "step": 620
+    },
+    {
+      "epoch": 36.76,
+      "grad_norm": 0.8359465003013611,
+      "learning_rate": 8.047058823529411e-05,
+      "loss": 0.2864,
+      "step": 625
+    },
+    {
+      "epoch": 37.06,
+      "grad_norm": 0.9133177399635315,
+      "learning_rate": 7.870588235294116e-05,
+      "loss": 0.3314,
+      "step": 630
+    },
+    {
+      "epoch": 37.35,
+      "grad_norm": 0.9836115837097168,
+      "learning_rate": 7.694117647058823e-05,
+      "loss": 0.3159,
+      "step": 635
+    },
+    {
+      "epoch": 37.65,
+      "grad_norm": 0.7992879748344421,
+      "learning_rate": 7.517647058823529e-05,
+      "loss": 0.2773,
+      "step": 640
+    },
+    {
+      "epoch": 37.65,
+      "eval_loss": 0.28679487109184265,
+      "eval_runtime": 1.9224,
+      "eval_samples_per_second": 69.705,
+      "eval_steps_per_second": 8.843,
+      "step": 640
+    },
+    {
+      "epoch": 37.94,
+      "grad_norm": 0.9645748138427734,
+      "learning_rate": 7.341176470588235e-05,
+      "loss": 0.3025,
+      "step": 645
+    },
+    {
+      "epoch": 38.24,
+      "grad_norm": 0.7317566275596619,
+      "learning_rate": 7.164705882352941e-05,
+      "loss": 0.2833,
+      "step": 650
+    },
+    {
+      "epoch": 38.53,
+      "grad_norm": 1.154581069946289,
+      "learning_rate": 6.988235294117647e-05,
+      "loss": 0.3224,
+      "step": 655
+    },
+    {
+      "epoch": 38.82,
+      "grad_norm": 0.853617250919342,
+      "learning_rate": 6.811764705882353e-05,
+      "loss": 0.2803,
+      "step": 660
+    },
+    {
+      "epoch": 38.82,
+      "eval_loss": 0.2860107123851776,
+      "eval_runtime": 1.921,
+      "eval_samples_per_second": 69.754,
+      "eval_steps_per_second": 8.849,
+      "step": 660
+    },
+    {
+      "epoch": 39.12,
+      "grad_norm": 0.5558089017868042,
+      "learning_rate": 6.635294117647059e-05,
+      "loss": 0.2774,
+      "step": 665
+    },
+    {
+      "epoch": 39.41,
+      "grad_norm": 0.9582533240318298,
+      "learning_rate": 6.458823529411764e-05,
+      "loss": 0.2729,
+      "step": 670
+    },
+    {
+      "epoch": 39.71,
+      "grad_norm": 0.84820556640625,
+      "learning_rate": 6.28235294117647e-05,
+      "loss": 0.3111,
+      "step": 675
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 1.6696014404296875,
+      "learning_rate": 6.105882352941176e-05,
+      "loss": 0.325,
+      "step": 680
+    },
+    {
+      "epoch": 40.0,
+      "eval_loss": 0.2854286730289459,
+      "eval_runtime": 1.9213,
+      "eval_samples_per_second": 69.744,
+      "eval_steps_per_second": 8.848,
+      "step": 680
+    },
+    {
+      "epoch": 40.29,
+      "grad_norm": 0.9443490505218506,
+      "learning_rate": 5.9294117647058814e-05,
+      "loss": 0.2928,
+      "step": 685
+    },
+    {
+      "epoch": 40.59,
+      "grad_norm": 0.8773037195205688,
+      "learning_rate": 5.752941176470588e-05,
+      "loss": 0.2932,
+      "step": 690
+    },
+    {
+      "epoch": 40.88,
+      "grad_norm": 0.7517346739768982,
+      "learning_rate": 5.576470588235294e-05,
+      "loss": 0.2881,
+      "step": 695
+    },
+    {
+      "epoch": 41.18,
+      "grad_norm": 0.6551481485366821,
+      "learning_rate": 5.399999999999999e-05,
+      "loss": 0.2936,
+      "step": 700
+    },
+    {
+      "epoch": 41.18,
+      "eval_loss": 0.28510692715644836,
+      "eval_runtime": 1.9207,
+      "eval_samples_per_second": 69.767,
+      "eval_steps_per_second": 8.851,
+      "step": 700
+    },
+    {
+      "epoch": 41.47,
+      "grad_norm": 1.038570523262024,
+      "learning_rate": 5.223529411764705e-05,
+      "loss": 0.3111,
+      "step": 705
+    },
+    {
+      "epoch": 41.76,
+      "grad_norm": 0.9088438749313354,
+      "learning_rate": 5.0470588235294116e-05,
+      "loss": 0.3072,
+      "step": 710
+    },
+    {
+      "epoch": 42.06,
+      "grad_norm": 1.004533290863037,
+      "learning_rate": 4.8705882352941175e-05,
+      "loss": 0.2824,
+      "step": 715
+    },
+    {
+      "epoch": 42.35,
+      "grad_norm": 0.8459485173225403,
+      "learning_rate": 4.694117647058823e-05,
+      "loss": 0.3218,
+      "step": 720
+    },
+    {
+      "epoch": 42.35,
+      "eval_loss": 0.28572869300842285,
+      "eval_runtime": 1.9201,
+      "eval_samples_per_second": 69.788,
+      "eval_steps_per_second": 8.854,
+      "step": 720
+    },
+    {
+      "epoch": 42.65,
+      "grad_norm": 0.9539296627044678,
+      "learning_rate": 4.5176470588235286e-05,
+      "loss": 0.2714,
+      "step": 725
+    },
+    {
+      "epoch": 42.94,
+      "grad_norm": 1.0208020210266113,
+      "learning_rate": 4.341176470588235e-05,
+      "loss": 0.2865,
+      "step": 730
+    },
+    {
+      "epoch": 43.24,
+      "grad_norm": 0.8807281851768494,
+      "learning_rate": 4.164705882352941e-05,
+      "loss": 0.29,
+      "step": 735
+    },
+    {
+      "epoch": 43.53,
+      "grad_norm": 0.983396589756012,
+      "learning_rate": 3.988235294117646e-05,
+      "loss": 0.2992,
+      "step": 740
+    },
+    {
+      "epoch": 43.53,
+      "eval_loss": 0.28524819016456604,
+      "eval_runtime": 1.9225,
+      "eval_samples_per_second": 69.7,
+      "eval_steps_per_second": 8.843,
+      "step": 740
+    },
+    {
+      "epoch": 43.82,
+      "grad_norm": 0.9107353687286377,
+      "learning_rate": 3.811764705882352e-05,
+      "loss": 0.2823,
+      "step": 745
+    },
+    {
+      "epoch": 44.12,
+      "grad_norm": 0.8981562256813049,
+      "learning_rate": 3.635294117647058e-05,
+      "loss": 0.3103,
+      "step": 750
+    },
+    {
+      "epoch": 44.41,
+      "grad_norm": 0.8787701725959778,
+      "learning_rate": 3.458823529411765e-05,
+      "loss": 0.2844,
+      "step": 755
+    },
+    {
+      "epoch": 44.71,
+      "grad_norm": 0.7710863351821899,
+      "learning_rate": 3.28235294117647e-05,
+      "loss": 0.2864,
+      "step": 760
+    },
+    {
+      "epoch": 44.71,
+      "eval_loss": 0.28503838181495667,
+      "eval_runtime": 1.92,
+      "eval_samples_per_second": 69.791,
+      "eval_steps_per_second": 8.854,
+      "step": 760
+    },
+    {
+      "epoch": 45.0,
+      "grad_norm": 1.724103331565857,
+      "learning_rate": 3.1058823529411765e-05,
+      "loss": 0.3147,
+      "step": 765
+    },
+    {
+      "epoch": 45.29,
+      "grad_norm": 0.855762779712677,
+      "learning_rate": 2.929411764705882e-05,
+      "loss": 0.2925,
+      "step": 770
+    },
+    {
+      "epoch": 45.59,
+      "grad_norm": 0.9360259771347046,
+      "learning_rate": 2.752941176470588e-05,
+      "loss": 0.2921,
+      "step": 775
+    },
+    {
+      "epoch": 45.88,
+      "grad_norm": 0.8693228960037231,
+      "learning_rate": 2.5764705882352938e-05,
+      "loss": 0.298,
+      "step": 780
+    },
+    {
+      "epoch": 45.88,
+      "eval_loss": 0.2839866876602173,
+      "eval_runtime": 1.9241,
+      "eval_samples_per_second": 69.644,
+      "eval_steps_per_second": 8.835,
+      "step": 780
+    },
+    {
+      "epoch": 46.18,
+      "grad_norm": 0.775013267993927,
+      "learning_rate": 2.3999999999999997e-05,
+      "loss": 0.2635,
+      "step": 785
+    },
+    {
+      "epoch": 46.47,
+      "grad_norm": 0.8748236298561096,
+      "learning_rate": 2.2235294117647056e-05,
+      "loss": 0.2809,
+      "step": 790
+    },
+    {
+      "epoch": 46.76,
+      "grad_norm": 0.9206897616386414,
+      "learning_rate": 2.0470588235294115e-05,
+      "loss": 0.2908,
+      "step": 795
+    },
+    {
+      "epoch": 47.06,
+      "grad_norm": 0.9479169845581055,
+      "learning_rate": 1.8705882352941174e-05,
+      "loss": 0.3173,
+      "step": 800
+    },
+    {
+      "epoch": 47.06,
+      "eval_loss": 0.2837767004966736,
+      "eval_runtime": 1.9213,
+      "eval_samples_per_second": 69.745,
+      "eval_steps_per_second": 8.848,
+      "step": 800
+    },
+    {
+      "epoch": 47.35,
+      "grad_norm": 0.8086124658584595,
+      "learning_rate": 1.6941176470588233e-05,
+      "loss": 0.269,
+      "step": 805
+    },
+    {
+      "epoch": 47.65,
+      "grad_norm": 0.9204294681549072,
+      "learning_rate": 1.5176470588235294e-05,
+      "loss": 0.2863,
+      "step": 810
+    },
+    {
+      "epoch": 47.94,
+      "grad_norm": 0.8865261673927307,
+      "learning_rate": 1.3411764705882353e-05,
+      "loss": 0.2973,
+      "step": 815
+    },
+    {
+      "epoch": 48.24,
+      "grad_norm": 0.7301666736602783,
+      "learning_rate": 1.1647058823529412e-05,
+      "loss": 0.2938,
+      "step": 820
+    },
+    {
+      "epoch": 48.24,
+      "eval_loss": 0.2837049663066864,
+      "eval_runtime": 1.92,
+      "eval_samples_per_second": 69.79,
+      "eval_steps_per_second": 8.854,
+      "step": 820
+    },
+    {
+      "epoch": 48.53,
+      "grad_norm": 0.8452021479606628,
+      "learning_rate": 9.88235294117647e-06,
+      "loss": 0.2715,
+      "step": 825
+    },
+    {
+      "epoch": 48.82,
+      "grad_norm": 0.9324547648429871,
+      "learning_rate": 8.117647058823528e-06,
+      "loss": 0.2914,
+      "step": 830
+    },
+    {
+      "epoch": 49.12,
+      "grad_norm": 0.7236127257347107,
+      "learning_rate": 6.352941176470587e-06,
+      "loss": 0.2979,
+      "step": 835
+    },
+    {
+      "epoch": 49.41,
+      "grad_norm": 0.8993136286735535,
+      "learning_rate": 4.588235294117647e-06,
+      "loss": 0.3072,
+      "step": 840
+    },
+    {
+      "epoch": 49.41,
+      "eval_loss": 0.28356996178627014,
+      "eval_runtime": 1.921,
+      "eval_samples_per_second": 69.754,
+      "eval_steps_per_second": 8.849,
+      "step": 840
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 850,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 20,
+  "total_flos": 7164579843932160.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89310ca0fa296cfd3190a1f966d6c5915fb9ebe5f1f4ad421e2c3b10b8cdbd9d
+size 5048