ayjays132 commited on
Commit
5074731
1 Parent(s): 84dd748

Upload 12 files

Browse files
config.json CHANGED
@@ -27,6 +27,7 @@
27
  "num_hidden_layers": 12,
28
  "pad_token_id": 0,
29
  "position_embedding_type": "absolute",
 
30
  "torch_dtype": "float32",
31
  "transformers_version": "4.37.2",
32
  "type_vocab_size": 2,
 
27
  "num_hidden_layers": 12,
28
  "pad_token_id": 0,
29
  "position_embedding_type": "absolute",
30
+ "problem_type": "single_label_classification",
31
  "torch_dtype": "float32",
32
  "transformers_version": "4.37.2",
33
  "type_vocab_size": 2,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f799cdcc085774c23ccf5f4120f54a336a78a9c4ebe728a2eff8de48967d7598
3
  size 437961724
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3029633edda1e7c958a7a98a95df28eebc4700a7aada1736a5fcbce95f350f74
3
  size 437961724
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:463878b7a315533895202600e43838bfdd00a9594cbe4fa1389833d9b1cee09a
3
+ size 876044538
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59614382a5848cd194b6135dba711609088a9a1719669badea77f8e779f27f2c
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80b6323e3afe4eb24b96b5717a14f206b672405c6c570ae57429c71de8d10624
3
+ size 1064
special_tokens_map.json CHANGED
@@ -101,7 +101,13 @@
101
  "<extra_id_98>",
102
  "<extra_id_99>"
103
  ],
104
- "cls_token": "[CLS]",
 
 
 
 
 
 
105
  "eos_token": {
106
  "content": "</s>",
107
  "lstrip": false,
@@ -109,7 +115,13 @@
109
  "rstrip": false,
110
  "single_word": false
111
  },
112
- "mask_token": "[MASK]",
 
 
 
 
 
 
113
  "pad_token": {
114
  "content": "<pad>",
115
  "lstrip": false,
@@ -117,7 +129,13 @@
117
  "rstrip": false,
118
  "single_word": false
119
  },
120
- "sep_token": "[SEP]",
 
 
 
 
 
 
121
  "unk_token": {
122
  "content": "<unk>",
123
  "lstrip": false,
 
101
  "<extra_id_98>",
102
  "<extra_id_99>"
103
  ],
104
+ "cls_token": {
105
+ "content": "[CLS]",
106
+ "lstrip": false,
107
+ "normalized": false,
108
+ "rstrip": false,
109
+ "single_word": false
110
+ },
111
  "eos_token": {
112
  "content": "</s>",
113
  "lstrip": false,
 
115
  "rstrip": false,
116
  "single_word": false
117
  },
118
+ "mask_token": {
119
+ "content": "[MASK]",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false
124
+ },
125
  "pad_token": {
126
  "content": "<pad>",
127
  "lstrip": false,
 
129
  "rstrip": false,
130
  "single_word": false
131
  },
132
+ "sep_token": {
133
+ "content": "[SEP]",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false
138
+ },
139
  "unk_token": {
140
  "content": "<unk>",
141
  "lstrip": false,
tokenizer.json CHANGED
@@ -1,7 +1,21 @@
1
  {
2
  "version": "1.0",
3
- "truncation": null,
4
- "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
 
1
  {
2
  "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 512,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": {
10
+ "strategy": {
11
+ "Fixed": 512
12
+ },
13
+ "direction": "Right",
14
+ "pad_to_multiple_of": null,
15
+ "pad_id": 30534,
16
+ "pad_type_id": 0,
17
+ "pad_token": "<pad>"
18
+ },
19
  "added_tokens": [
20
  {
21
  "id": 0,
trainer_state.json ADDED
@@ -0,0 +1,3141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7981651376146789,
3
+ "best_model_checkpoint": "./results\\checkpoint-3000",
4
+ "epoch": 0.9501187648456056,
5
+ "eval_steps": 50,
6
+ "global_step": 4000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 2.3752969121140145e-07,
14
+ "loss": 1.1402,
15
+ "step": 10
16
+ },
17
+ {
18
+ "epoch": 0.0,
19
+ "learning_rate": 4.750593824228029e-07,
20
+ "loss": 1.0834,
21
+ "step": 20
22
+ },
23
+ {
24
+ "epoch": 0.01,
25
+ "learning_rate": 7.125890736342043e-07,
26
+ "loss": 0.9403,
27
+ "step": 30
28
+ },
29
+ {
30
+ "epoch": 0.01,
31
+ "learning_rate": 9.501187648456058e-07,
32
+ "loss": 0.8238,
33
+ "step": 40
34
+ },
35
+ {
36
+ "epoch": 0.01,
37
+ "learning_rate": 1.187648456057007e-06,
38
+ "loss": 0.7737,
39
+ "step": 50
40
+ },
41
+ {
42
+ "epoch": 0.01,
43
+ "eval_accuracy": 0.5091743119266054,
44
+ "eval_loss": 0.7308847904205322,
45
+ "eval_runtime": 10.532,
46
+ "eval_samples_per_second": 82.796,
47
+ "eval_steps_per_second": 5.222,
48
+ "step": 50
49
+ },
50
+ {
51
+ "epoch": 0.01,
52
+ "learning_rate": 1.4251781472684086e-06,
53
+ "loss": 0.7321,
54
+ "step": 60
55
+ },
56
+ {
57
+ "epoch": 0.02,
58
+ "learning_rate": 1.6627078384798101e-06,
59
+ "loss": 0.7221,
60
+ "step": 70
61
+ },
62
+ {
63
+ "epoch": 0.02,
64
+ "learning_rate": 1.9002375296912116e-06,
65
+ "loss": 0.7081,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.02,
70
+ "learning_rate": 2.137767220902613e-06,
71
+ "loss": 0.7255,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.02,
76
+ "learning_rate": 2.375296912114014e-06,
77
+ "loss": 0.7094,
78
+ "step": 100
79
+ },
80
+ {
81
+ "epoch": 0.02,
82
+ "eval_accuracy": 0.5091743119266054,
83
+ "eval_loss": 0.7017449736595154,
84
+ "eval_runtime": 10.4451,
85
+ "eval_samples_per_second": 83.484,
86
+ "eval_steps_per_second": 5.266,
87
+ "step": 100
88
+ },
89
+ {
90
+ "epoch": 0.03,
91
+ "learning_rate": 2.612826603325416e-06,
92
+ "loss": 0.6745,
93
+ "step": 110
94
+ },
95
+ {
96
+ "epoch": 0.03,
97
+ "learning_rate": 2.850356294536817e-06,
98
+ "loss": 0.674,
99
+ "step": 120
100
+ },
101
+ {
102
+ "epoch": 0.03,
103
+ "learning_rate": 3.0878859857482185e-06,
104
+ "loss": 0.7235,
105
+ "step": 130
106
+ },
107
+ {
108
+ "epoch": 0.03,
109
+ "learning_rate": 3.3254156769596202e-06,
110
+ "loss": 0.7192,
111
+ "step": 140
112
+ },
113
+ {
114
+ "epoch": 0.04,
115
+ "learning_rate": 3.5629453681710215e-06,
116
+ "loss": 0.7033,
117
+ "step": 150
118
+ },
119
+ {
120
+ "epoch": 0.04,
121
+ "eval_accuracy": 0.4908256880733945,
122
+ "eval_loss": 0.7025471329689026,
123
+ "eval_runtime": 10.4492,
124
+ "eval_samples_per_second": 83.451,
125
+ "eval_steps_per_second": 5.264,
126
+ "step": 150
127
+ },
128
+ {
129
+ "epoch": 0.04,
130
+ "learning_rate": 3.8004750593824232e-06,
131
+ "loss": 0.7037,
132
+ "step": 160
133
+ },
134
+ {
135
+ "epoch": 0.04,
136
+ "learning_rate": 4.038004750593825e-06,
137
+ "loss": 0.6754,
138
+ "step": 170
139
+ },
140
+ {
141
+ "epoch": 0.04,
142
+ "learning_rate": 4.275534441805226e-06,
143
+ "loss": 0.7119,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 0.05,
148
+ "learning_rate": 4.513064133016627e-06,
149
+ "loss": 0.7077,
150
+ "step": 190
151
+ },
152
+ {
153
+ "epoch": 0.05,
154
+ "learning_rate": 4.750593824228028e-06,
155
+ "loss": 0.698,
156
+ "step": 200
157
+ },
158
+ {
159
+ "epoch": 0.05,
160
+ "eval_accuracy": 0.4873853211009174,
161
+ "eval_loss": 0.7025485038757324,
162
+ "eval_runtime": 10.6349,
163
+ "eval_samples_per_second": 81.994,
164
+ "eval_steps_per_second": 5.172,
165
+ "step": 200
166
+ },
167
+ {
168
+ "epoch": 0.05,
169
+ "learning_rate": 4.98812351543943e-06,
170
+ "loss": 0.6954,
171
+ "step": 210
172
+ },
173
+ {
174
+ "epoch": 0.05,
175
+ "learning_rate": 5.225653206650832e-06,
176
+ "loss": 0.7078,
177
+ "step": 220
178
+ },
179
+ {
180
+ "epoch": 0.05,
181
+ "learning_rate": 5.4631828978622335e-06,
182
+ "loss": 0.6521,
183
+ "step": 230
184
+ },
185
+ {
186
+ "epoch": 0.06,
187
+ "learning_rate": 5.700712589073634e-06,
188
+ "loss": 0.6937,
189
+ "step": 240
190
+ },
191
+ {
192
+ "epoch": 0.06,
193
+ "learning_rate": 5.938242280285035e-06,
194
+ "loss": 0.6965,
195
+ "step": 250
196
+ },
197
+ {
198
+ "epoch": 0.06,
199
+ "eval_accuracy": 0.5229357798165137,
200
+ "eval_loss": 0.6914423704147339,
201
+ "eval_runtime": 10.7015,
202
+ "eval_samples_per_second": 81.484,
203
+ "eval_steps_per_second": 5.139,
204
+ "step": 250
205
+ },
206
+ {
207
+ "epoch": 0.06,
208
+ "learning_rate": 6.175771971496437e-06,
209
+ "loss": 0.679,
210
+ "step": 260
211
+ },
212
+ {
213
+ "epoch": 0.06,
214
+ "learning_rate": 6.4133016627078396e-06,
215
+ "loss": 0.68,
216
+ "step": 270
217
+ },
218
+ {
219
+ "epoch": 0.07,
220
+ "learning_rate": 6.6508313539192404e-06,
221
+ "loss": 0.679,
222
+ "step": 280
223
+ },
224
+ {
225
+ "epoch": 0.07,
226
+ "learning_rate": 6.888361045130641e-06,
227
+ "loss": 0.7055,
228
+ "step": 290
229
+ },
230
+ {
231
+ "epoch": 0.07,
232
+ "learning_rate": 7.125890736342043e-06,
233
+ "loss": 0.6866,
234
+ "step": 300
235
+ },
236
+ {
237
+ "epoch": 0.07,
238
+ "eval_accuracy": 0.5091743119266054,
239
+ "eval_loss": 0.7029032707214355,
240
+ "eval_runtime": 10.4489,
241
+ "eval_samples_per_second": 83.453,
242
+ "eval_steps_per_second": 5.264,
243
+ "step": 300
244
+ },
245
+ {
246
+ "epoch": 0.07,
247
+ "learning_rate": 7.363420427553444e-06,
248
+ "loss": 0.6868,
249
+ "step": 310
250
+ },
251
+ {
252
+ "epoch": 0.08,
253
+ "learning_rate": 7.6009501187648464e-06,
254
+ "loss": 0.6827,
255
+ "step": 320
256
+ },
257
+ {
258
+ "epoch": 0.08,
259
+ "learning_rate": 7.838479809976247e-06,
260
+ "loss": 0.6765,
261
+ "step": 330
262
+ },
263
+ {
264
+ "epoch": 0.08,
265
+ "learning_rate": 8.07600950118765e-06,
266
+ "loss": 0.6836,
267
+ "step": 340
268
+ },
269
+ {
270
+ "epoch": 0.08,
271
+ "learning_rate": 8.31353919239905e-06,
272
+ "loss": 0.6894,
273
+ "step": 350
274
+ },
275
+ {
276
+ "epoch": 0.08,
277
+ "eval_accuracy": 0.5068807339449541,
278
+ "eval_loss": 0.6934388875961304,
279
+ "eval_runtime": 9.7535,
280
+ "eval_samples_per_second": 89.404,
281
+ "eval_steps_per_second": 5.639,
282
+ "step": 350
283
+ },
284
+ {
285
+ "epoch": 0.09,
286
+ "learning_rate": 8.551068883610452e-06,
287
+ "loss": 0.6939,
288
+ "step": 360
289
+ },
290
+ {
291
+ "epoch": 0.09,
292
+ "learning_rate": 8.788598574821852e-06,
293
+ "loss": 0.6903,
294
+ "step": 370
295
+ },
296
+ {
297
+ "epoch": 0.09,
298
+ "learning_rate": 9.026128266033253e-06,
299
+ "loss": 0.6508,
300
+ "step": 380
301
+ },
302
+ {
303
+ "epoch": 0.09,
304
+ "learning_rate": 9.263657957244656e-06,
305
+ "loss": 0.7377,
306
+ "step": 390
307
+ },
308
+ {
309
+ "epoch": 0.1,
310
+ "learning_rate": 9.501187648456057e-06,
311
+ "loss": 0.7484,
312
+ "step": 400
313
+ },
314
+ {
315
+ "epoch": 0.1,
316
+ "eval_accuracy": 0.4908256880733945,
317
+ "eval_loss": 0.7315114140510559,
318
+ "eval_runtime": 11.129,
319
+ "eval_samples_per_second": 78.354,
320
+ "eval_steps_per_second": 4.942,
321
+ "step": 400
322
+ },
323
+ {
324
+ "epoch": 0.1,
325
+ "learning_rate": 9.73871733966746e-06,
326
+ "loss": 0.6887,
327
+ "step": 410
328
+ },
329
+ {
330
+ "epoch": 0.1,
331
+ "learning_rate": 9.97624703087886e-06,
332
+ "loss": 0.6562,
333
+ "step": 420
334
+ },
335
+ {
336
+ "epoch": 0.1,
337
+ "learning_rate": 1.0213776722090261e-05,
338
+ "loss": 0.6696,
339
+ "step": 430
340
+ },
341
+ {
342
+ "epoch": 0.1,
343
+ "learning_rate": 1.0451306413301664e-05,
344
+ "loss": 0.7045,
345
+ "step": 440
346
+ },
347
+ {
348
+ "epoch": 0.11,
349
+ "learning_rate": 1.0688836104513065e-05,
350
+ "loss": 0.6498,
351
+ "step": 450
352
+ },
353
+ {
354
+ "epoch": 0.11,
355
+ "eval_accuracy": 0.5997706422018348,
356
+ "eval_loss": 0.6686387658119202,
357
+ "eval_runtime": 18.4439,
358
+ "eval_samples_per_second": 47.278,
359
+ "eval_steps_per_second": 2.982,
360
+ "step": 450
361
+ },
362
+ {
363
+ "epoch": 0.11,
364
+ "learning_rate": 1.0926365795724467e-05,
365
+ "loss": 0.67,
366
+ "step": 460
367
+ },
368
+ {
369
+ "epoch": 0.11,
370
+ "learning_rate": 1.1163895486935868e-05,
371
+ "loss": 0.6821,
372
+ "step": 470
373
+ },
374
+ {
375
+ "epoch": 0.11,
376
+ "learning_rate": 1.1401425178147269e-05,
377
+ "loss": 0.6477,
378
+ "step": 480
379
+ },
380
+ {
381
+ "epoch": 0.12,
382
+ "learning_rate": 1.163895486935867e-05,
383
+ "loss": 0.6935,
384
+ "step": 490
385
+ },
386
+ {
387
+ "epoch": 0.12,
388
+ "learning_rate": 1.187648456057007e-05,
389
+ "loss": 0.6687,
390
+ "step": 500
391
+ },
392
+ {
393
+ "epoch": 0.12,
394
+ "eval_accuracy": 0.5745412844036697,
395
+ "eval_loss": 0.6807542443275452,
396
+ "eval_runtime": 24.027,
397
+ "eval_samples_per_second": 36.293,
398
+ "eval_steps_per_second": 2.289,
399
+ "step": 500
400
+ },
401
+ {
402
+ "epoch": 0.12,
403
+ "learning_rate": 1.2114014251781473e-05,
404
+ "loss": 0.6542,
405
+ "step": 510
406
+ },
407
+ {
408
+ "epoch": 0.12,
409
+ "learning_rate": 1.2351543942992874e-05,
410
+ "loss": 0.7159,
411
+ "step": 520
412
+ },
413
+ {
414
+ "epoch": 0.13,
415
+ "learning_rate": 1.2589073634204277e-05,
416
+ "loss": 0.7332,
417
+ "step": 530
418
+ },
419
+ {
420
+ "epoch": 0.13,
421
+ "learning_rate": 1.2826603325415679e-05,
422
+ "loss": 0.6521,
423
+ "step": 540
424
+ },
425
+ {
426
+ "epoch": 0.13,
427
+ "learning_rate": 1.3064133016627078e-05,
428
+ "loss": 0.673,
429
+ "step": 550
430
+ },
431
+ {
432
+ "epoch": 0.13,
433
+ "eval_accuracy": 0.6594036697247706,
434
+ "eval_loss": 0.6414512395858765,
435
+ "eval_runtime": 24.4855,
436
+ "eval_samples_per_second": 35.613,
437
+ "eval_steps_per_second": 2.246,
438
+ "step": 550
439
+ },
440
+ {
441
+ "epoch": 0.13,
442
+ "learning_rate": 1.3301662707838481e-05,
443
+ "loss": 0.6498,
444
+ "step": 560
445
+ },
446
+ {
447
+ "epoch": 0.14,
448
+ "learning_rate": 1.3539192399049882e-05,
449
+ "loss": 0.6634,
450
+ "step": 570
451
+ },
452
+ {
453
+ "epoch": 0.14,
454
+ "learning_rate": 1.3776722090261283e-05,
455
+ "loss": 0.6557,
456
+ "step": 580
457
+ },
458
+ {
459
+ "epoch": 0.14,
460
+ "learning_rate": 1.4014251781472683e-05,
461
+ "loss": 0.6924,
462
+ "step": 590
463
+ },
464
+ {
465
+ "epoch": 0.14,
466
+ "learning_rate": 1.4251781472684086e-05,
467
+ "loss": 0.629,
468
+ "step": 600
469
+ },
470
+ {
471
+ "epoch": 0.14,
472
+ "eval_accuracy": 0.6192660550458715,
473
+ "eval_loss": 0.6346095204353333,
474
+ "eval_runtime": 21.937,
475
+ "eval_samples_per_second": 39.75,
476
+ "eval_steps_per_second": 2.507,
477
+ "step": 600
478
+ },
479
+ {
480
+ "epoch": 0.14,
481
+ "learning_rate": 1.4489311163895489e-05,
482
+ "loss": 0.6623,
483
+ "step": 610
484
+ },
485
+ {
486
+ "epoch": 0.15,
487
+ "learning_rate": 1.4726840855106888e-05,
488
+ "loss": 0.6252,
489
+ "step": 620
490
+ },
491
+ {
492
+ "epoch": 0.15,
493
+ "learning_rate": 1.496437054631829e-05,
494
+ "loss": 0.6662,
495
+ "step": 630
496
+ },
497
+ {
498
+ "epoch": 0.15,
499
+ "learning_rate": 1.5201900237529693e-05,
500
+ "loss": 0.6038,
501
+ "step": 640
502
+ },
503
+ {
504
+ "epoch": 0.15,
505
+ "learning_rate": 1.5439429928741092e-05,
506
+ "loss": 0.611,
507
+ "step": 650
508
+ },
509
+ {
510
+ "epoch": 0.15,
511
+ "eval_accuracy": 0.6915137614678899,
512
+ "eval_loss": 0.5941868424415588,
513
+ "eval_runtime": 23.4369,
514
+ "eval_samples_per_second": 37.206,
515
+ "eval_steps_per_second": 2.347,
516
+ "step": 650
517
+ },
518
+ {
519
+ "epoch": 0.16,
520
+ "learning_rate": 1.5676959619952495e-05,
521
+ "loss": 0.5919,
522
+ "step": 660
523
+ },
524
+ {
525
+ "epoch": 0.16,
526
+ "learning_rate": 1.5914489311163897e-05,
527
+ "loss": 0.6268,
528
+ "step": 670
529
+ },
530
+ {
531
+ "epoch": 0.16,
532
+ "learning_rate": 1.61520190023753e-05,
533
+ "loss": 0.5221,
534
+ "step": 680
535
+ },
536
+ {
537
+ "epoch": 0.16,
538
+ "learning_rate": 1.63895486935867e-05,
539
+ "loss": 0.7314,
540
+ "step": 690
541
+ },
542
+ {
543
+ "epoch": 0.17,
544
+ "learning_rate": 1.66270783847981e-05,
545
+ "loss": 0.7231,
546
+ "step": 700
547
+ },
548
+ {
549
+ "epoch": 0.17,
550
+ "eval_accuracy": 0.6410550458715596,
551
+ "eval_loss": 0.6234958171844482,
552
+ "eval_runtime": 17.243,
553
+ "eval_samples_per_second": 50.571,
554
+ "eval_steps_per_second": 3.19,
555
+ "step": 700
556
+ },
557
+ {
558
+ "epoch": 0.17,
559
+ "learning_rate": 1.6864608076009504e-05,
560
+ "loss": 0.6203,
561
+ "step": 710
562
+ },
563
+ {
564
+ "epoch": 0.17,
565
+ "learning_rate": 1.7102137767220903e-05,
566
+ "loss": 0.6168,
567
+ "step": 720
568
+ },
569
+ {
570
+ "epoch": 0.17,
571
+ "learning_rate": 1.7339667458432306e-05,
572
+ "loss": 0.6334,
573
+ "step": 730
574
+ },
575
+ {
576
+ "epoch": 0.18,
577
+ "learning_rate": 1.7577197149643705e-05,
578
+ "loss": 0.6718,
579
+ "step": 740
580
+ },
581
+ {
582
+ "epoch": 0.18,
583
+ "learning_rate": 1.7814726840855108e-05,
584
+ "loss": 0.6445,
585
+ "step": 750
586
+ },
587
+ {
588
+ "epoch": 0.18,
589
+ "eval_accuracy": 0.7075688073394495,
590
+ "eval_loss": 0.5777224898338318,
591
+ "eval_runtime": 24.0528,
592
+ "eval_samples_per_second": 36.254,
593
+ "eval_steps_per_second": 2.287,
594
+ "step": 750
595
+ },
596
+ {
597
+ "epoch": 0.18,
598
+ "learning_rate": 1.8052256532066507e-05,
599
+ "loss": 0.6013,
600
+ "step": 760
601
+ },
602
+ {
603
+ "epoch": 0.18,
604
+ "learning_rate": 1.828978622327791e-05,
605
+ "loss": 0.5739,
606
+ "step": 770
607
+ },
608
+ {
609
+ "epoch": 0.19,
610
+ "learning_rate": 1.8527315914489312e-05,
611
+ "loss": 0.6245,
612
+ "step": 780
613
+ },
614
+ {
615
+ "epoch": 0.19,
616
+ "learning_rate": 1.876484560570071e-05,
617
+ "loss": 0.5358,
618
+ "step": 790
619
+ },
620
+ {
621
+ "epoch": 0.19,
622
+ "learning_rate": 1.9002375296912114e-05,
623
+ "loss": 0.5986,
624
+ "step": 800
625
+ },
626
+ {
627
+ "epoch": 0.19,
628
+ "eval_accuracy": 0.6685779816513762,
629
+ "eval_loss": 0.6269965767860413,
630
+ "eval_runtime": 24.3364,
631
+ "eval_samples_per_second": 35.831,
632
+ "eval_steps_per_second": 2.26,
633
+ "step": 800
634
+ },
635
+ {
636
+ "epoch": 0.19,
637
+ "learning_rate": 1.9239904988123516e-05,
638
+ "loss": 0.6833,
639
+ "step": 810
640
+ },
641
+ {
642
+ "epoch": 0.19,
643
+ "learning_rate": 1.947743467933492e-05,
644
+ "loss": 0.6864,
645
+ "step": 820
646
+ },
647
+ {
648
+ "epoch": 0.2,
649
+ "learning_rate": 1.9714964370546318e-05,
650
+ "loss": 0.5795,
651
+ "step": 830
652
+ },
653
+ {
654
+ "epoch": 0.2,
655
+ "learning_rate": 1.995249406175772e-05,
656
+ "loss": 0.6274,
657
+ "step": 840
658
+ },
659
+ {
660
+ "epoch": 0.2,
661
+ "learning_rate": 2.0190023752969123e-05,
662
+ "loss": 0.663,
663
+ "step": 850
664
+ },
665
+ {
666
+ "epoch": 0.2,
667
+ "eval_accuracy": 0.6972477064220184,
668
+ "eval_loss": 0.5657657384872437,
669
+ "eval_runtime": 26.7355,
670
+ "eval_samples_per_second": 32.616,
671
+ "eval_steps_per_second": 2.057,
672
+ "step": 850
673
+ },
674
+ {
675
+ "epoch": 0.2,
676
+ "learning_rate": 2.0427553444180522e-05,
677
+ "loss": 0.5579,
678
+ "step": 860
679
+ },
680
+ {
681
+ "epoch": 0.21,
682
+ "learning_rate": 2.0665083135391925e-05,
683
+ "loss": 0.5483,
684
+ "step": 870
685
+ },
686
+ {
687
+ "epoch": 0.21,
688
+ "learning_rate": 2.0902612826603327e-05,
689
+ "loss": 0.5827,
690
+ "step": 880
691
+ },
692
+ {
693
+ "epoch": 0.21,
694
+ "learning_rate": 2.114014251781473e-05,
695
+ "loss": 0.5757,
696
+ "step": 890
697
+ },
698
+ {
699
+ "epoch": 0.21,
700
+ "learning_rate": 2.137767220902613e-05,
701
+ "loss": 0.6553,
702
+ "step": 900
703
+ },
704
+ {
705
+ "epoch": 0.21,
706
+ "eval_accuracy": 0.6112385321100917,
707
+ "eval_loss": 0.639819860458374,
708
+ "eval_runtime": 23.3332,
709
+ "eval_samples_per_second": 37.372,
710
+ "eval_steps_per_second": 2.357,
711
+ "step": 900
712
+ },
713
+ {
714
+ "epoch": 0.22,
715
+ "learning_rate": 2.161520190023753e-05,
716
+ "loss": 0.5793,
717
+ "step": 910
718
+ },
719
+ {
720
+ "epoch": 0.22,
721
+ "learning_rate": 2.1852731591448934e-05,
722
+ "loss": 0.5839,
723
+ "step": 920
724
+ },
725
+ {
726
+ "epoch": 0.22,
727
+ "learning_rate": 2.2090261282660333e-05,
728
+ "loss": 0.5909,
729
+ "step": 930
730
+ },
731
+ {
732
+ "epoch": 0.22,
733
+ "learning_rate": 2.2327790973871736e-05,
734
+ "loss": 0.5239,
735
+ "step": 940
736
+ },
737
+ {
738
+ "epoch": 0.23,
739
+ "learning_rate": 2.2565320665083135e-05,
740
+ "loss": 0.5402,
741
+ "step": 950
742
+ },
743
+ {
744
+ "epoch": 0.23,
745
+ "eval_accuracy": 0.6915137614678899,
746
+ "eval_loss": 0.5853348970413208,
747
+ "eval_runtime": 28.0695,
748
+ "eval_samples_per_second": 31.066,
749
+ "eval_steps_per_second": 1.959,
750
+ "step": 950
751
+ },
752
+ {
753
+ "epoch": 0.23,
754
+ "learning_rate": 2.2802850356294538e-05,
755
+ "loss": 0.5396,
756
+ "step": 960
757
+ },
758
+ {
759
+ "epoch": 0.23,
760
+ "learning_rate": 2.3040380047505937e-05,
761
+ "loss": 0.5771,
762
+ "step": 970
763
+ },
764
+ {
765
+ "epoch": 0.23,
766
+ "learning_rate": 2.327790973871734e-05,
767
+ "loss": 0.5087,
768
+ "step": 980
769
+ },
770
+ {
771
+ "epoch": 0.24,
772
+ "learning_rate": 2.3515439429928742e-05,
773
+ "loss": 0.5752,
774
+ "step": 990
775
+ },
776
+ {
777
+ "epoch": 0.24,
778
+ "learning_rate": 2.375296912114014e-05,
779
+ "loss": 0.7053,
780
+ "step": 1000
781
+ },
782
+ {
783
+ "epoch": 0.24,
784
+ "eval_accuracy": 0.5779816513761468,
785
+ "eval_loss": 0.7715405225753784,
786
+ "eval_runtime": 22.4698,
787
+ "eval_samples_per_second": 38.808,
788
+ "eval_steps_per_second": 2.448,
789
+ "step": 1000
790
+ },
791
+ {
792
+ "epoch": 0.24,
793
+ "learning_rate": 2.3990498812351544e-05,
794
+ "loss": 0.5453,
795
+ "step": 1010
796
+ },
797
+ {
798
+ "epoch": 0.24,
799
+ "learning_rate": 2.4228028503562946e-05,
800
+ "loss": 0.5469,
801
+ "step": 1020
802
+ },
803
+ {
804
+ "epoch": 0.24,
805
+ "learning_rate": 2.446555819477435e-05,
806
+ "loss": 0.5702,
807
+ "step": 1030
808
+ },
809
+ {
810
+ "epoch": 0.25,
811
+ "learning_rate": 2.4703087885985748e-05,
812
+ "loss": 0.519,
813
+ "step": 1040
814
+ },
815
+ {
816
+ "epoch": 0.25,
817
+ "learning_rate": 2.494061757719715e-05,
818
+ "loss": 0.613,
819
+ "step": 1050
820
+ },
821
+ {
822
+ "epoch": 0.25,
823
+ "eval_accuracy": 0.6112385321100917,
824
+ "eval_loss": 0.713302493095398,
825
+ "eval_runtime": 22.1389,
826
+ "eval_samples_per_second": 39.388,
827
+ "eval_steps_per_second": 2.484,
828
+ "step": 1050
829
+ },
830
+ {
831
+ "epoch": 0.25,
832
+ "learning_rate": 2.5178147268408553e-05,
833
+ "loss": 0.5345,
834
+ "step": 1060
835
+ },
836
+ {
837
+ "epoch": 0.25,
838
+ "learning_rate": 2.5415676959619956e-05,
839
+ "loss": 0.5408,
840
+ "step": 1070
841
+ },
842
+ {
843
+ "epoch": 0.26,
844
+ "learning_rate": 2.5653206650831358e-05,
845
+ "loss": 0.5472,
846
+ "step": 1080
847
+ },
848
+ {
849
+ "epoch": 0.26,
850
+ "learning_rate": 2.5890736342042754e-05,
851
+ "loss": 0.6234,
852
+ "step": 1090
853
+ },
854
+ {
855
+ "epoch": 0.26,
856
+ "learning_rate": 2.6128266033254157e-05,
857
+ "loss": 0.4965,
858
+ "step": 1100
859
+ },
860
+ {
861
+ "epoch": 0.26,
862
+ "eval_accuracy": 0.7385321100917431,
863
+ "eval_loss": 0.5148417353630066,
864
+ "eval_runtime": 11.4511,
865
+ "eval_samples_per_second": 76.15,
866
+ "eval_steps_per_second": 4.803,
867
+ "step": 1100
868
+ },
869
+ {
870
+ "epoch": 0.26,
871
+ "learning_rate": 2.636579572446556e-05,
872
+ "loss": 0.5747,
873
+ "step": 1110
874
+ },
875
+ {
876
+ "epoch": 0.27,
877
+ "learning_rate": 2.6603325415676962e-05,
878
+ "loss": 0.8352,
879
+ "step": 1120
880
+ },
881
+ {
882
+ "epoch": 0.27,
883
+ "learning_rate": 2.6840855106888364e-05,
884
+ "loss": 0.6406,
885
+ "step": 1130
886
+ },
887
+ {
888
+ "epoch": 0.27,
889
+ "learning_rate": 2.7078384798099763e-05,
890
+ "loss": 0.5701,
891
+ "step": 1140
892
+ },
893
+ {
894
+ "epoch": 0.27,
895
+ "learning_rate": 2.7315914489311166e-05,
896
+ "loss": 0.5114,
897
+ "step": 1150
898
+ },
899
+ {
900
+ "epoch": 0.27,
901
+ "eval_accuracy": 0.6467889908256881,
902
+ "eval_loss": 0.6622269749641418,
903
+ "eval_runtime": 11.4464,
904
+ "eval_samples_per_second": 76.181,
905
+ "eval_steps_per_second": 4.805,
906
+ "step": 1150
907
+ },
908
+ {
909
+ "epoch": 0.28,
910
+ "learning_rate": 2.7553444180522565e-05,
911
+ "loss": 0.6769,
912
+ "step": 1160
913
+ },
914
+ {
915
+ "epoch": 0.28,
916
+ "learning_rate": 2.7790973871733968e-05,
917
+ "loss": 0.5533,
918
+ "step": 1170
919
+ },
920
+ {
921
+ "epoch": 0.28,
922
+ "learning_rate": 2.8028503562945367e-05,
923
+ "loss": 0.4968,
924
+ "step": 1180
925
+ },
926
+ {
927
+ "epoch": 0.28,
928
+ "learning_rate": 2.826603325415677e-05,
929
+ "loss": 0.4944,
930
+ "step": 1190
931
+ },
932
+ {
933
+ "epoch": 0.29,
934
+ "learning_rate": 2.8503562945368172e-05,
935
+ "loss": 0.4682,
936
+ "step": 1200
937
+ },
938
+ {
939
+ "epoch": 0.29,
940
+ "eval_accuracy": 0.7213302752293578,
941
+ "eval_loss": 0.5471405386924744,
942
+ "eval_runtime": 11.4757,
943
+ "eval_samples_per_second": 75.987,
944
+ "eval_steps_per_second": 4.793,
945
+ "step": 1200
946
+ },
947
+ {
948
+ "epoch": 0.29,
949
+ "learning_rate": 2.8741092636579575e-05,
950
+ "loss": 0.5635,
951
+ "step": 1210
952
+ },
953
+ {
954
+ "epoch": 0.29,
955
+ "learning_rate": 2.8978622327790977e-05,
956
+ "loss": 0.4861,
957
+ "step": 1220
958
+ },
959
+ {
960
+ "epoch": 0.29,
961
+ "learning_rate": 2.9216152019002373e-05,
962
+ "loss": 0.5109,
963
+ "step": 1230
964
+ },
965
+ {
966
+ "epoch": 0.29,
967
+ "learning_rate": 2.9453681710213776e-05,
968
+ "loss": 0.5228,
969
+ "step": 1240
970
+ },
971
+ {
972
+ "epoch": 0.3,
973
+ "learning_rate": 2.9691211401425178e-05,
974
+ "loss": 0.4801,
975
+ "step": 1250
976
+ },
977
+ {
978
+ "epoch": 0.3,
979
+ "eval_accuracy": 0.7545871559633027,
980
+ "eval_loss": 0.4974513649940491,
981
+ "eval_runtime": 11.4419,
982
+ "eval_samples_per_second": 76.211,
983
+ "eval_steps_per_second": 4.807,
984
+ "step": 1250
985
+ },
986
+ {
987
+ "epoch": 0.3,
988
+ "learning_rate": 2.992874109263658e-05,
989
+ "loss": 0.3829,
990
+ "step": 1260
991
+ },
992
+ {
993
+ "epoch": 0.3,
994
+ "learning_rate": 3.0166270783847983e-05,
995
+ "loss": 0.527,
996
+ "step": 1270
997
+ },
998
+ {
999
+ "epoch": 0.3,
1000
+ "learning_rate": 3.0403800475059386e-05,
1001
+ "loss": 0.5883,
1002
+ "step": 1280
1003
+ },
1004
+ {
1005
+ "epoch": 0.31,
1006
+ "learning_rate": 3.064133016627079e-05,
1007
+ "loss": 0.5133,
1008
+ "step": 1290
1009
+ },
1010
+ {
1011
+ "epoch": 0.31,
1012
+ "learning_rate": 3.0878859857482184e-05,
1013
+ "loss": 0.5178,
1014
+ "step": 1300
1015
+ },
1016
+ {
1017
+ "epoch": 0.31,
1018
+ "eval_accuracy": 0.7236238532110092,
1019
+ "eval_loss": 0.5414833426475525,
1020
+ "eval_runtime": 11.5239,
1021
+ "eval_samples_per_second": 75.669,
1022
+ "eval_steps_per_second": 4.773,
1023
+ "step": 1300
1024
+ },
1025
+ {
1026
+ "epoch": 0.31,
1027
+ "learning_rate": 3.111638954869359e-05,
1028
+ "loss": 0.4607,
1029
+ "step": 1310
1030
+ },
1031
+ {
1032
+ "epoch": 0.31,
1033
+ "learning_rate": 3.135391923990499e-05,
1034
+ "loss": 0.4809,
1035
+ "step": 1320
1036
+ },
1037
+ {
1038
+ "epoch": 0.32,
1039
+ "learning_rate": 3.159144893111639e-05,
1040
+ "loss": 0.5025,
1041
+ "step": 1330
1042
+ },
1043
+ {
1044
+ "epoch": 0.32,
1045
+ "learning_rate": 3.1828978622327794e-05,
1046
+ "loss": 0.514,
1047
+ "step": 1340
1048
+ },
1049
+ {
1050
+ "epoch": 0.32,
1051
+ "learning_rate": 3.20665083135392e-05,
1052
+ "loss": 0.581,
1053
+ "step": 1350
1054
+ },
1055
+ {
1056
+ "epoch": 0.32,
1057
+ "eval_accuracy": 0.7614678899082569,
1058
+ "eval_loss": 0.5108276009559631,
1059
+ "eval_runtime": 11.4369,
1060
+ "eval_samples_per_second": 76.245,
1061
+ "eval_steps_per_second": 4.809,
1062
+ "step": 1350
1063
+ },
1064
+ {
1065
+ "epoch": 0.32,
1066
+ "learning_rate": 3.23040380047506e-05,
1067
+ "loss": 0.5912,
1068
+ "step": 1360
1069
+ },
1070
+ {
1071
+ "epoch": 0.33,
1072
+ "learning_rate": 3.2541567695961995e-05,
1073
+ "loss": 0.5702,
1074
+ "step": 1370
1075
+ },
1076
+ {
1077
+ "epoch": 0.33,
1078
+ "learning_rate": 3.27790973871734e-05,
1079
+ "loss": 0.5757,
1080
+ "step": 1380
1081
+ },
1082
+ {
1083
+ "epoch": 0.33,
1084
+ "learning_rate": 3.30166270783848e-05,
1085
+ "loss": 0.5328,
1086
+ "step": 1390
1087
+ },
1088
+ {
1089
+ "epoch": 0.33,
1090
+ "learning_rate": 3.32541567695962e-05,
1091
+ "loss": 0.5175,
1092
+ "step": 1400
1093
+ },
1094
+ {
1095
+ "epoch": 0.33,
1096
+ "eval_accuracy": 0.6651376146788991,
1097
+ "eval_loss": 0.5825160145759583,
1098
+ "eval_runtime": 12.4677,
1099
+ "eval_samples_per_second": 69.941,
1100
+ "eval_steps_per_second": 4.411,
1101
+ "step": 1400
1102
+ },
1103
+ {
1104
+ "epoch": 0.33,
1105
+ "learning_rate": 3.3491686460807606e-05,
1106
+ "loss": 0.4831,
1107
+ "step": 1410
1108
+ },
1109
+ {
1110
+ "epoch": 0.34,
1111
+ "learning_rate": 3.372921615201901e-05,
1112
+ "loss": 0.3822,
1113
+ "step": 1420
1114
+ },
1115
+ {
1116
+ "epoch": 0.34,
1117
+ "learning_rate": 3.396674584323041e-05,
1118
+ "loss": 0.4679,
1119
+ "step": 1430
1120
+ },
1121
+ {
1122
+ "epoch": 0.34,
1123
+ "learning_rate": 3.4204275534441806e-05,
1124
+ "loss": 0.4313,
1125
+ "step": 1440
1126
+ },
1127
+ {
1128
+ "epoch": 0.34,
1129
+ "learning_rate": 3.444180522565321e-05,
1130
+ "loss": 0.4897,
1131
+ "step": 1450
1132
+ },
1133
+ {
1134
+ "epoch": 0.34,
1135
+ "eval_accuracy": 0.7362385321100917,
1136
+ "eval_loss": 0.5593736171722412,
1137
+ "eval_runtime": 26.9471,
1138
+ "eval_samples_per_second": 32.36,
1139
+ "eval_steps_per_second": 2.041,
1140
+ "step": 1450
1141
+ },
1142
+ {
1143
+ "epoch": 0.35,
1144
+ "learning_rate": 3.467933491686461e-05,
1145
+ "loss": 0.4222,
1146
+ "step": 1460
1147
+ },
1148
+ {
1149
+ "epoch": 0.35,
1150
+ "learning_rate": 3.4916864608076014e-05,
1151
+ "loss": 0.4754,
1152
+ "step": 1470
1153
+ },
1154
+ {
1155
+ "epoch": 0.35,
1156
+ "learning_rate": 3.515439429928741e-05,
1157
+ "loss": 0.5092,
1158
+ "step": 1480
1159
+ },
1160
+ {
1161
+ "epoch": 0.35,
1162
+ "learning_rate": 3.539192399049881e-05,
1163
+ "loss": 0.4992,
1164
+ "step": 1490
1165
+ },
1166
+ {
1167
+ "epoch": 0.36,
1168
+ "learning_rate": 3.5629453681710215e-05,
1169
+ "loss": 0.5653,
1170
+ "step": 1500
1171
+ },
1172
+ {
1173
+ "epoch": 0.36,
1174
+ "eval_accuracy": 0.7672018348623854,
1175
+ "eval_loss": 0.48585110902786255,
1176
+ "eval_runtime": 24.8178,
1177
+ "eval_samples_per_second": 35.136,
1178
+ "eval_steps_per_second": 2.216,
1179
+ "step": 1500
1180
+ },
1181
+ {
1182
+ "epoch": 0.36,
1183
+ "learning_rate": 3.586698337292162e-05,
1184
+ "loss": 0.4569,
1185
+ "step": 1510
1186
+ },
1187
+ {
1188
+ "epoch": 0.36,
1189
+ "learning_rate": 3.6104513064133013e-05,
1190
+ "loss": 0.4708,
1191
+ "step": 1520
1192
+ },
1193
+ {
1194
+ "epoch": 0.36,
1195
+ "learning_rate": 3.6342042755344416e-05,
1196
+ "loss": 0.5629,
1197
+ "step": 1530
1198
+ },
1199
+ {
1200
+ "epoch": 0.37,
1201
+ "learning_rate": 3.657957244655582e-05,
1202
+ "loss": 0.4748,
1203
+ "step": 1540
1204
+ },
1205
+ {
1206
+ "epoch": 0.37,
1207
+ "learning_rate": 3.681710213776722e-05,
1208
+ "loss": 0.4647,
1209
+ "step": 1550
1210
+ },
1211
+ {
1212
+ "epoch": 0.37,
1213
+ "eval_accuracy": 0.7511467889908257,
1214
+ "eval_loss": 0.5034508109092712,
1215
+ "eval_runtime": 25.1309,
1216
+ "eval_samples_per_second": 34.698,
1217
+ "eval_steps_per_second": 2.189,
1218
+ "step": 1550
1219
+ },
1220
+ {
1221
+ "epoch": 0.37,
1222
+ "learning_rate": 3.7054631828978624e-05,
1223
+ "loss": 0.3937,
1224
+ "step": 1560
1225
+ },
1226
+ {
1227
+ "epoch": 0.37,
1228
+ "learning_rate": 3.7292161520190026e-05,
1229
+ "loss": 0.4446,
1230
+ "step": 1570
1231
+ },
1232
+ {
1233
+ "epoch": 0.38,
1234
+ "learning_rate": 3.752969121140142e-05,
1235
+ "loss": 0.4019,
1236
+ "step": 1580
1237
+ },
1238
+ {
1239
+ "epoch": 0.38,
1240
+ "learning_rate": 3.7767220902612825e-05,
1241
+ "loss": 0.4452,
1242
+ "step": 1590
1243
+ },
1244
+ {
1245
+ "epoch": 0.38,
1246
+ "learning_rate": 3.800475059382423e-05,
1247
+ "loss": 0.5062,
1248
+ "step": 1600
1249
+ },
1250
+ {
1251
+ "epoch": 0.38,
1252
+ "eval_accuracy": 0.7775229357798165,
1253
+ "eval_loss": 0.4737890362739563,
1254
+ "eval_runtime": 28.0162,
1255
+ "eval_samples_per_second": 31.125,
1256
+ "eval_steps_per_second": 1.963,
1257
+ "step": 1600
1258
+ },
1259
+ {
1260
+ "epoch": 0.38,
1261
+ "learning_rate": 3.824228028503563e-05,
1262
+ "loss": 0.5397,
1263
+ "step": 1610
1264
+ },
1265
+ {
1266
+ "epoch": 0.38,
1267
+ "learning_rate": 3.847980997624703e-05,
1268
+ "loss": 0.3397,
1269
+ "step": 1620
1270
+ },
1271
+ {
1272
+ "epoch": 0.39,
1273
+ "learning_rate": 3.8717339667458435e-05,
1274
+ "loss": 0.474,
1275
+ "step": 1630
1276
+ },
1277
+ {
1278
+ "epoch": 0.39,
1279
+ "learning_rate": 3.895486935866984e-05,
1280
+ "loss": 0.4831,
1281
+ "step": 1640
1282
+ },
1283
+ {
1284
+ "epoch": 0.39,
1285
+ "learning_rate": 3.919239904988123e-05,
1286
+ "loss": 0.4812,
1287
+ "step": 1650
1288
+ },
1289
+ {
1290
+ "epoch": 0.39,
1291
+ "eval_accuracy": 0.7362385321100917,
1292
+ "eval_loss": 0.5663760304450989,
1293
+ "eval_runtime": 23.8733,
1294
+ "eval_samples_per_second": 36.526,
1295
+ "eval_steps_per_second": 2.304,
1296
+ "step": 1650
1297
+ },
1298
+ {
1299
+ "epoch": 0.39,
1300
+ "learning_rate": 3.9429928741092636e-05,
1301
+ "loss": 0.4661,
1302
+ "step": 1660
1303
+ },
1304
+ {
1305
+ "epoch": 0.4,
1306
+ "learning_rate": 3.966745843230404e-05,
1307
+ "loss": 0.5724,
1308
+ "step": 1670
1309
+ },
1310
+ {
1311
+ "epoch": 0.4,
1312
+ "learning_rate": 3.990498812351544e-05,
1313
+ "loss": 0.5851,
1314
+ "step": 1680
1315
+ },
1316
+ {
1317
+ "epoch": 0.4,
1318
+ "learning_rate": 4.0142517814726843e-05,
1319
+ "loss": 0.5317,
1320
+ "step": 1690
1321
+ },
1322
+ {
1323
+ "epoch": 0.4,
1324
+ "learning_rate": 4.0380047505938246e-05,
1325
+ "loss": 0.4891,
1326
+ "step": 1700
1327
+ },
1328
+ {
1329
+ "epoch": 0.4,
1330
+ "eval_accuracy": 0.6123853211009175,
1331
+ "eval_loss": 0.7159540057182312,
1332
+ "eval_runtime": 21.4544,
1333
+ "eval_samples_per_second": 40.644,
1334
+ "eval_steps_per_second": 2.564,
1335
+ "step": 1700
1336
+ },
1337
+ {
1338
+ "epoch": 0.41,
1339
+ "learning_rate": 4.061757719714965e-05,
1340
+ "loss": 0.5505,
1341
+ "step": 1710
1342
+ },
1343
+ {
1344
+ "epoch": 0.41,
1345
+ "learning_rate": 4.0855106888361044e-05,
1346
+ "loss": 0.4684,
1347
+ "step": 1720
1348
+ },
1349
+ {
1350
+ "epoch": 0.41,
1351
+ "learning_rate": 4.109263657957245e-05,
1352
+ "loss": 0.3916,
1353
+ "step": 1730
1354
+ },
1355
+ {
1356
+ "epoch": 0.41,
1357
+ "learning_rate": 4.133016627078385e-05,
1358
+ "loss": 0.48,
1359
+ "step": 1740
1360
+ },
1361
+ {
1362
+ "epoch": 0.42,
1363
+ "learning_rate": 4.156769596199525e-05,
1364
+ "loss": 0.494,
1365
+ "step": 1750
1366
+ },
1367
+ {
1368
+ "epoch": 0.42,
1369
+ "eval_accuracy": 0.6961009174311926,
1370
+ "eval_loss": 0.6904969811439514,
1371
+ "eval_runtime": 12.0521,
1372
+ "eval_samples_per_second": 72.352,
1373
+ "eval_steps_per_second": 4.564,
1374
+ "step": 1750
1375
+ },
1376
+ {
1377
+ "epoch": 0.42,
1378
+ "learning_rate": 4.1805225653206655e-05,
1379
+ "loss": 0.4914,
1380
+ "step": 1760
1381
+ },
1382
+ {
1383
+ "epoch": 0.42,
1384
+ "learning_rate": 4.204275534441806e-05,
1385
+ "loss": 0.4142,
1386
+ "step": 1770
1387
+ },
1388
+ {
1389
+ "epoch": 0.42,
1390
+ "learning_rate": 4.228028503562946e-05,
1391
+ "loss": 0.3954,
1392
+ "step": 1780
1393
+ },
1394
+ {
1395
+ "epoch": 0.43,
1396
+ "learning_rate": 4.2517814726840856e-05,
1397
+ "loss": 0.4304,
1398
+ "step": 1790
1399
+ },
1400
+ {
1401
+ "epoch": 0.43,
1402
+ "learning_rate": 4.275534441805226e-05,
1403
+ "loss": 0.4446,
1404
+ "step": 1800
1405
+ },
1406
+ {
1407
+ "epoch": 0.43,
1408
+ "eval_accuracy": 0.7603211009174312,
1409
+ "eval_loss": 0.470760315656662,
1410
+ "eval_runtime": 11.4415,
1411
+ "eval_samples_per_second": 76.214,
1412
+ "eval_steps_per_second": 4.807,
1413
+ "step": 1800
1414
+ },
1415
+ {
1416
+ "epoch": 0.43,
1417
+ "learning_rate": 4.299287410926366e-05,
1418
+ "loss": 0.4779,
1419
+ "step": 1810
1420
+ },
1421
+ {
1422
+ "epoch": 0.43,
1423
+ "learning_rate": 4.323040380047506e-05,
1424
+ "loss": 0.4142,
1425
+ "step": 1820
1426
+ },
1427
+ {
1428
+ "epoch": 0.43,
1429
+ "learning_rate": 4.3467933491686466e-05,
1430
+ "loss": 0.4997,
1431
+ "step": 1830
1432
+ },
1433
+ {
1434
+ "epoch": 0.44,
1435
+ "learning_rate": 4.370546318289787e-05,
1436
+ "loss": 0.456,
1437
+ "step": 1840
1438
+ },
1439
+ {
1440
+ "epoch": 0.44,
1441
+ "learning_rate": 4.394299287410927e-05,
1442
+ "loss": 0.3551,
1443
+ "step": 1850
1444
+ },
1445
+ {
1446
+ "epoch": 0.44,
1447
+ "eval_accuracy": 0.6720183486238532,
1448
+ "eval_loss": 0.9761282801628113,
1449
+ "eval_runtime": 11.4416,
1450
+ "eval_samples_per_second": 76.213,
1451
+ "eval_steps_per_second": 4.807,
1452
+ "step": 1850
1453
+ },
1454
+ {
1455
+ "epoch": 0.44,
1456
+ "learning_rate": 4.418052256532067e-05,
1457
+ "loss": 0.5181,
1458
+ "step": 1860
1459
+ },
1460
+ {
1461
+ "epoch": 0.44,
1462
+ "learning_rate": 4.441805225653207e-05,
1463
+ "loss": 0.4539,
1464
+ "step": 1870
1465
+ },
1466
+ {
1467
+ "epoch": 0.45,
1468
+ "learning_rate": 4.465558194774347e-05,
1469
+ "loss": 0.5408,
1470
+ "step": 1880
1471
+ },
1472
+ {
1473
+ "epoch": 0.45,
1474
+ "learning_rate": 4.4893111638954874e-05,
1475
+ "loss": 0.4418,
1476
+ "step": 1890
1477
+ },
1478
+ {
1479
+ "epoch": 0.45,
1480
+ "learning_rate": 4.513064133016627e-05,
1481
+ "loss": 0.4393,
1482
+ "step": 1900
1483
+ },
1484
+ {
1485
+ "epoch": 0.45,
1486
+ "eval_accuracy": 0.7488532110091743,
1487
+ "eval_loss": 0.5114963054656982,
1488
+ "eval_runtime": 11.4593,
1489
+ "eval_samples_per_second": 76.095,
1490
+ "eval_steps_per_second": 4.8,
1491
+ "step": 1900
1492
+ },
1493
+ {
1494
+ "epoch": 0.45,
1495
+ "learning_rate": 4.536817102137767e-05,
1496
+ "loss": 0.3827,
1497
+ "step": 1910
1498
+ },
1499
+ {
1500
+ "epoch": 0.46,
1501
+ "learning_rate": 4.5605700712589075e-05,
1502
+ "loss": 0.3863,
1503
+ "step": 1920
1504
+ },
1505
+ {
1506
+ "epoch": 0.46,
1507
+ "learning_rate": 4.584323040380048e-05,
1508
+ "loss": 0.3994,
1509
+ "step": 1930
1510
+ },
1511
+ {
1512
+ "epoch": 0.46,
1513
+ "learning_rate": 4.6080760095011874e-05,
1514
+ "loss": 0.498,
1515
+ "step": 1940
1516
+ },
1517
+ {
1518
+ "epoch": 0.46,
1519
+ "learning_rate": 4.6318289786223276e-05,
1520
+ "loss": 0.4129,
1521
+ "step": 1950
1522
+ },
1523
+ {
1524
+ "epoch": 0.46,
1525
+ "eval_accuracy": 0.7924311926605505,
1526
+ "eval_loss": 0.4416314661502838,
1527
+ "eval_runtime": 27.8567,
1528
+ "eval_samples_per_second": 31.303,
1529
+ "eval_steps_per_second": 1.974,
1530
+ "step": 1950
1531
+ },
1532
+ {
1533
+ "epoch": 0.47,
1534
+ "learning_rate": 4.655581947743468e-05,
1535
+ "loss": 0.305,
1536
+ "step": 1960
1537
+ },
1538
+ {
1539
+ "epoch": 0.47,
1540
+ "learning_rate": 4.679334916864608e-05,
1541
+ "loss": 0.3489,
1542
+ "step": 1970
1543
+ },
1544
+ {
1545
+ "epoch": 0.47,
1546
+ "learning_rate": 4.7030878859857484e-05,
1547
+ "loss": 0.5725,
1548
+ "step": 1980
1549
+ },
1550
+ {
1551
+ "epoch": 0.47,
1552
+ "learning_rate": 4.7268408551068886e-05,
1553
+ "loss": 0.3962,
1554
+ "step": 1990
1555
+ },
1556
+ {
1557
+ "epoch": 0.48,
1558
+ "learning_rate": 4.750593824228028e-05,
1559
+ "loss": 0.428,
1560
+ "step": 2000
1561
+ },
1562
+ {
1563
+ "epoch": 0.48,
1564
+ "eval_accuracy": 0.713302752293578,
1565
+ "eval_loss": 0.5976735949516296,
1566
+ "eval_runtime": 26.9577,
1567
+ "eval_samples_per_second": 32.347,
1568
+ "eval_steps_per_second": 2.04,
1569
+ "step": 2000
1570
+ },
1571
+ {
1572
+ "epoch": 0.48,
1573
+ "learning_rate": 4.7743467933491685e-05,
1574
+ "loss": 0.4047,
1575
+ "step": 2010
1576
+ },
1577
+ {
1578
+ "epoch": 0.48,
1579
+ "learning_rate": 4.798099762470309e-05,
1580
+ "loss": 0.4644,
1581
+ "step": 2020
1582
+ },
1583
+ {
1584
+ "epoch": 0.48,
1585
+ "learning_rate": 4.821852731591449e-05,
1586
+ "loss": 0.5173,
1587
+ "step": 2030
1588
+ },
1589
+ {
1590
+ "epoch": 0.48,
1591
+ "learning_rate": 4.845605700712589e-05,
1592
+ "loss": 0.4207,
1593
+ "step": 2040
1594
+ },
1595
+ {
1596
+ "epoch": 0.49,
1597
+ "learning_rate": 4.8693586698337295e-05,
1598
+ "loss": 0.6847,
1599
+ "step": 2050
1600
+ },
1601
+ {
1602
+ "epoch": 0.49,
1603
+ "eval_accuracy": 0.7522935779816514,
1604
+ "eval_loss": 0.4740794599056244,
1605
+ "eval_runtime": 26.5007,
1606
+ "eval_samples_per_second": 32.905,
1607
+ "eval_steps_per_second": 2.075,
1608
+ "step": 2050
1609
+ },
1610
+ {
1611
+ "epoch": 0.49,
1612
+ "learning_rate": 4.89311163895487e-05,
1613
+ "loss": 0.4262,
1614
+ "step": 2060
1615
+ },
1616
+ {
1617
+ "epoch": 0.49,
1618
+ "learning_rate": 4.9168646080760093e-05,
1619
+ "loss": 0.3127,
1620
+ "step": 2070
1621
+ },
1622
+ {
1623
+ "epoch": 0.49,
1624
+ "learning_rate": 4.9406175771971496e-05,
1625
+ "loss": 0.4341,
1626
+ "step": 2080
1627
+ },
1628
+ {
1629
+ "epoch": 0.5,
1630
+ "learning_rate": 4.96437054631829e-05,
1631
+ "loss": 0.3944,
1632
+ "step": 2090
1633
+ },
1634
+ {
1635
+ "epoch": 0.5,
1636
+ "learning_rate": 4.98812351543943e-05,
1637
+ "loss": 0.4921,
1638
+ "step": 2100
1639
+ },
1640
+ {
1641
+ "epoch": 0.5,
1642
+ "eval_accuracy": 0.731651376146789,
1643
+ "eval_loss": 0.5092917680740356,
1644
+ "eval_runtime": 29.3929,
1645
+ "eval_samples_per_second": 29.667,
1646
+ "eval_steps_per_second": 1.871,
1647
+ "step": 2100
1648
+ },
1649
+ {
1650
+ "epoch": 0.5,
1651
+ "learning_rate": 4.998680390604381e-05,
1652
+ "loss": 0.4166,
1653
+ "step": 2110
1654
+ },
1655
+ {
1656
+ "epoch": 0.5,
1657
+ "learning_rate": 4.996041171813143e-05,
1658
+ "loss": 0.3623,
1659
+ "step": 2120
1660
+ },
1661
+ {
1662
+ "epoch": 0.51,
1663
+ "learning_rate": 4.9934019530219056e-05,
1664
+ "loss": 0.4034,
1665
+ "step": 2130
1666
+ },
1667
+ {
1668
+ "epoch": 0.51,
1669
+ "learning_rate": 4.990762734230668e-05,
1670
+ "loss": 0.4566,
1671
+ "step": 2140
1672
+ },
1673
+ {
1674
+ "epoch": 0.51,
1675
+ "learning_rate": 4.98812351543943e-05,
1676
+ "loss": 0.4414,
1677
+ "step": 2150
1678
+ },
1679
+ {
1680
+ "epoch": 0.51,
1681
+ "eval_accuracy": 0.713302752293578,
1682
+ "eval_loss": 0.6652459502220154,
1683
+ "eval_runtime": 25.6161,
1684
+ "eval_samples_per_second": 34.041,
1685
+ "eval_steps_per_second": 2.147,
1686
+ "step": 2150
1687
+ },
1688
+ {
1689
+ "epoch": 0.51,
1690
+ "learning_rate": 4.9854842966481924e-05,
1691
+ "loss": 0.4862,
1692
+ "step": 2160
1693
+ },
1694
+ {
1695
+ "epoch": 0.52,
1696
+ "learning_rate": 4.9831089997360785e-05,
1697
+ "loss": 0.5651,
1698
+ "step": 2170
1699
+ },
1700
+ {
1701
+ "epoch": 0.52,
1702
+ "learning_rate": 4.98046978094484e-05,
1703
+ "loss": 0.4251,
1704
+ "step": 2180
1705
+ },
1706
+ {
1707
+ "epoch": 0.52,
1708
+ "learning_rate": 4.9778305621536024e-05,
1709
+ "loss": 0.514,
1710
+ "step": 2190
1711
+ },
1712
+ {
1713
+ "epoch": 0.52,
1714
+ "learning_rate": 4.9751913433623646e-05,
1715
+ "loss": 0.3697,
1716
+ "step": 2200
1717
+ },
1718
+ {
1719
+ "epoch": 0.52,
1720
+ "eval_accuracy": 0.7350917431192661,
1721
+ "eval_loss": 0.5495473146438599,
1722
+ "eval_runtime": 27.2319,
1723
+ "eval_samples_per_second": 32.021,
1724
+ "eval_steps_per_second": 2.02,
1725
+ "step": 2200
1726
+ },
1727
+ {
1728
+ "epoch": 0.52,
1729
+ "learning_rate": 4.972552124571127e-05,
1730
+ "loss": 0.3555,
1731
+ "step": 2210
1732
+ },
1733
+ {
1734
+ "epoch": 0.53,
1735
+ "learning_rate": 4.969912905779889e-05,
1736
+ "loss": 0.3627,
1737
+ "step": 2220
1738
+ },
1739
+ {
1740
+ "epoch": 0.53,
1741
+ "learning_rate": 4.9672736869886514e-05,
1742
+ "loss": 0.353,
1743
+ "step": 2230
1744
+ },
1745
+ {
1746
+ "epoch": 0.53,
1747
+ "learning_rate": 4.964634468197414e-05,
1748
+ "loss": 0.3561,
1749
+ "step": 2240
1750
+ },
1751
+ {
1752
+ "epoch": 0.53,
1753
+ "learning_rate": 4.961995249406176e-05,
1754
+ "loss": 0.3599,
1755
+ "step": 2250
1756
+ },
1757
+ {
1758
+ "epoch": 0.53,
1759
+ "eval_accuracy": 0.783256880733945,
1760
+ "eval_loss": 0.4456700384616852,
1761
+ "eval_runtime": 27.007,
1762
+ "eval_samples_per_second": 32.288,
1763
+ "eval_steps_per_second": 2.037,
1764
+ "step": 2250
1765
+ },
1766
+ {
1767
+ "epoch": 0.54,
1768
+ "learning_rate": 4.959356030614938e-05,
1769
+ "loss": 0.3917,
1770
+ "step": 2260
1771
+ },
1772
+ {
1773
+ "epoch": 0.54,
1774
+ "learning_rate": 4.9567168118237005e-05,
1775
+ "loss": 0.3342,
1776
+ "step": 2270
1777
+ },
1778
+ {
1779
+ "epoch": 0.54,
1780
+ "learning_rate": 4.954077593032463e-05,
1781
+ "loss": 0.3964,
1782
+ "step": 2280
1783
+ },
1784
+ {
1785
+ "epoch": 0.54,
1786
+ "learning_rate": 4.9514383742412244e-05,
1787
+ "loss": 0.4588,
1788
+ "step": 2290
1789
+ },
1790
+ {
1791
+ "epoch": 0.55,
1792
+ "learning_rate": 4.9487991554499866e-05,
1793
+ "loss": 0.4021,
1794
+ "step": 2300
1795
+ },
1796
+ {
1797
+ "epoch": 0.55,
1798
+ "eval_accuracy": 0.7924311926605505,
1799
+ "eval_loss": 0.43415939807891846,
1800
+ "eval_runtime": 28.1746,
1801
+ "eval_samples_per_second": 30.95,
1802
+ "eval_steps_per_second": 1.952,
1803
+ "step": 2300
1804
+ },
1805
+ {
1806
+ "epoch": 0.55,
1807
+ "learning_rate": 4.946159936658749e-05,
1808
+ "loss": 0.3698,
1809
+ "step": 2310
1810
+ },
1811
+ {
1812
+ "epoch": 0.55,
1813
+ "learning_rate": 4.943520717867511e-05,
1814
+ "loss": 0.4352,
1815
+ "step": 2320
1816
+ },
1817
+ {
1818
+ "epoch": 0.55,
1819
+ "learning_rate": 4.9408814990762734e-05,
1820
+ "loss": 0.4253,
1821
+ "step": 2330
1822
+ },
1823
+ {
1824
+ "epoch": 0.56,
1825
+ "learning_rate": 4.938242280285036e-05,
1826
+ "loss": 0.3442,
1827
+ "step": 2340
1828
+ },
1829
+ {
1830
+ "epoch": 0.56,
1831
+ "learning_rate": 4.935603061493798e-05,
1832
+ "loss": 0.4341,
1833
+ "step": 2350
1834
+ },
1835
+ {
1836
+ "epoch": 0.56,
1837
+ "eval_accuracy": 0.7626146788990825,
1838
+ "eval_loss": 0.4870525598526001,
1839
+ "eval_runtime": 27.9701,
1840
+ "eval_samples_per_second": 31.176,
1841
+ "eval_steps_per_second": 1.966,
1842
+ "step": 2350
1843
+ },
1844
+ {
1845
+ "epoch": 0.56,
1846
+ "learning_rate": 4.93296384270256e-05,
1847
+ "loss": 0.3156,
1848
+ "step": 2360
1849
+ },
1850
+ {
1851
+ "epoch": 0.56,
1852
+ "learning_rate": 4.9303246239113225e-05,
1853
+ "loss": 0.3465,
1854
+ "step": 2370
1855
+ },
1856
+ {
1857
+ "epoch": 0.57,
1858
+ "learning_rate": 4.927685405120085e-05,
1859
+ "loss": 0.3555,
1860
+ "step": 2380
1861
+ },
1862
+ {
1863
+ "epoch": 0.57,
1864
+ "learning_rate": 4.9250461863288464e-05,
1865
+ "loss": 0.4034,
1866
+ "step": 2390
1867
+ },
1868
+ {
1869
+ "epoch": 0.57,
1870
+ "learning_rate": 4.9224069675376086e-05,
1871
+ "loss": 0.4811,
1872
+ "step": 2400
1873
+ },
1874
+ {
1875
+ "epoch": 0.57,
1876
+ "eval_accuracy": 0.694954128440367,
1877
+ "eval_loss": 1.0977351665496826,
1878
+ "eval_runtime": 25.7159,
1879
+ "eval_samples_per_second": 33.909,
1880
+ "eval_steps_per_second": 2.139,
1881
+ "step": 2400
1882
+ },
1883
+ {
1884
+ "epoch": 0.57,
1885
+ "learning_rate": 4.919767748746371e-05,
1886
+ "loss": 0.7109,
1887
+ "step": 2410
1888
+ },
1889
+ {
1890
+ "epoch": 0.57,
1891
+ "learning_rate": 4.917128529955133e-05,
1892
+ "loss": 0.4311,
1893
+ "step": 2420
1894
+ },
1895
+ {
1896
+ "epoch": 0.58,
1897
+ "learning_rate": 4.9144893111638955e-05,
1898
+ "loss": 0.4666,
1899
+ "step": 2430
1900
+ },
1901
+ {
1902
+ "epoch": 0.58,
1903
+ "learning_rate": 4.911850092372658e-05,
1904
+ "loss": 0.6718,
1905
+ "step": 2440
1906
+ },
1907
+ {
1908
+ "epoch": 0.58,
1909
+ "learning_rate": 4.90921087358142e-05,
1910
+ "loss": 0.417,
1911
+ "step": 2450
1912
+ },
1913
+ {
1914
+ "epoch": 0.58,
1915
+ "eval_accuracy": 0.7637614678899083,
1916
+ "eval_loss": 0.4990720748901367,
1917
+ "eval_runtime": 32.7311,
1918
+ "eval_samples_per_second": 26.641,
1919
+ "eval_steps_per_second": 1.68,
1920
+ "step": 2450
1921
+ },
1922
+ {
1923
+ "epoch": 0.58,
1924
+ "learning_rate": 4.906571654790182e-05,
1925
+ "loss": 0.4022,
1926
+ "step": 2460
1927
+ },
1928
+ {
1929
+ "epoch": 0.59,
1930
+ "learning_rate": 4.9039324359989445e-05,
1931
+ "loss": 0.3948,
1932
+ "step": 2470
1933
+ },
1934
+ {
1935
+ "epoch": 0.59,
1936
+ "learning_rate": 4.901293217207707e-05,
1937
+ "loss": 0.4361,
1938
+ "step": 2480
1939
+ },
1940
+ {
1941
+ "epoch": 0.59,
1942
+ "learning_rate": 4.898653998416469e-05,
1943
+ "loss": 0.3763,
1944
+ "step": 2490
1945
+ },
1946
+ {
1947
+ "epoch": 0.59,
1948
+ "learning_rate": 4.8960147796252307e-05,
1949
+ "loss": 0.4257,
1950
+ "step": 2500
1951
+ },
1952
+ {
1953
+ "epoch": 0.59,
1954
+ "eval_accuracy": 0.7626146788990825,
1955
+ "eval_loss": 0.6092020869255066,
1956
+ "eval_runtime": 17.7696,
1957
+ "eval_samples_per_second": 49.073,
1958
+ "eval_steps_per_second": 3.095,
1959
+ "step": 2500
1960
+ },
1961
+ {
1962
+ "epoch": 0.6,
1963
+ "learning_rate": 4.893375560833993e-05,
1964
+ "loss": 0.3702,
1965
+ "step": 2510
1966
+ },
1967
+ {
1968
+ "epoch": 0.6,
1969
+ "learning_rate": 4.890736342042755e-05,
1970
+ "loss": 0.3374,
1971
+ "step": 2520
1972
+ },
1973
+ {
1974
+ "epoch": 0.6,
1975
+ "learning_rate": 4.8880971232515175e-05,
1976
+ "loss": 0.4812,
1977
+ "step": 2530
1978
+ },
1979
+ {
1980
+ "epoch": 0.6,
1981
+ "learning_rate": 4.88545790446028e-05,
1982
+ "loss": 0.3349,
1983
+ "step": 2540
1984
+ },
1985
+ {
1986
+ "epoch": 0.61,
1987
+ "learning_rate": 4.882818685669042e-05,
1988
+ "loss": 0.4071,
1989
+ "step": 2550
1990
+ },
1991
+ {
1992
+ "epoch": 0.61,
1993
+ "eval_accuracy": 0.8084862385321101,
1994
+ "eval_loss": 0.44936081767082214,
1995
+ "eval_runtime": 11.5824,
1996
+ "eval_samples_per_second": 75.287,
1997
+ "eval_steps_per_second": 4.749,
1998
+ "step": 2550
1999
+ },
2000
+ {
2001
+ "epoch": 0.61,
2002
+ "learning_rate": 4.880179466877804e-05,
2003
+ "loss": 0.4774,
2004
+ "step": 2560
2005
+ },
2006
+ {
2007
+ "epoch": 0.61,
2008
+ "learning_rate": 4.8775402480865665e-05,
2009
+ "loss": 0.3806,
2010
+ "step": 2570
2011
+ },
2012
+ {
2013
+ "epoch": 0.61,
2014
+ "learning_rate": 4.874901029295329e-05,
2015
+ "loss": 0.38,
2016
+ "step": 2580
2017
+ },
2018
+ {
2019
+ "epoch": 0.62,
2020
+ "learning_rate": 4.872261810504091e-05,
2021
+ "loss": 0.4337,
2022
+ "step": 2590
2023
+ },
2024
+ {
2025
+ "epoch": 0.62,
2026
+ "learning_rate": 4.869622591712853e-05,
2027
+ "loss": 0.3033,
2028
+ "step": 2600
2029
+ },
2030
+ {
2031
+ "epoch": 0.62,
2032
+ "eval_accuracy": 0.783256880733945,
2033
+ "eval_loss": 0.4898684322834015,
2034
+ "eval_runtime": 28.0155,
2035
+ "eval_samples_per_second": 31.126,
2036
+ "eval_steps_per_second": 1.963,
2037
+ "step": 2600
2038
+ },
2039
+ {
2040
+ "epoch": 0.62,
2041
+ "learning_rate": 4.866983372921615e-05,
2042
+ "loss": 0.3711,
2043
+ "step": 2610
2044
+ },
2045
+ {
2046
+ "epoch": 0.62,
2047
+ "learning_rate": 4.864344154130377e-05,
2048
+ "loss": 0.3867,
2049
+ "step": 2620
2050
+ },
2051
+ {
2052
+ "epoch": 0.62,
2053
+ "learning_rate": 4.8617049353391395e-05,
2054
+ "loss": 0.4458,
2055
+ "step": 2630
2056
+ },
2057
+ {
2058
+ "epoch": 0.63,
2059
+ "learning_rate": 4.859065716547902e-05,
2060
+ "loss": 0.3669,
2061
+ "step": 2640
2062
+ },
2063
+ {
2064
+ "epoch": 0.63,
2065
+ "learning_rate": 4.856426497756664e-05,
2066
+ "loss": 0.4616,
2067
+ "step": 2650
2068
+ },
2069
+ {
2070
+ "epoch": 0.63,
2071
+ "eval_accuracy": 0.7844036697247706,
2072
+ "eval_loss": 0.46433430910110474,
2073
+ "eval_runtime": 21.9181,
2074
+ "eval_samples_per_second": 39.784,
2075
+ "eval_steps_per_second": 2.509,
2076
+ "step": 2650
2077
+ },
2078
+ {
2079
+ "epoch": 0.63,
2080
+ "learning_rate": 4.853787278965426e-05,
2081
+ "loss": 0.5175,
2082
+ "step": 2660
2083
+ },
2084
+ {
2085
+ "epoch": 0.63,
2086
+ "learning_rate": 4.8511480601741886e-05,
2087
+ "loss": 0.4101,
2088
+ "step": 2670
2089
+ },
2090
+ {
2091
+ "epoch": 0.64,
2092
+ "learning_rate": 4.848508841382951e-05,
2093
+ "loss": 0.3916,
2094
+ "step": 2680
2095
+ },
2096
+ {
2097
+ "epoch": 0.64,
2098
+ "learning_rate": 4.845869622591713e-05,
2099
+ "loss": 0.3209,
2100
+ "step": 2690
2101
+ },
2102
+ {
2103
+ "epoch": 0.64,
2104
+ "learning_rate": 4.8432304038004754e-05,
2105
+ "loss": 0.4432,
2106
+ "step": 2700
2107
+ },
2108
+ {
2109
+ "epoch": 0.64,
2110
+ "eval_accuracy": 0.7981651376146789,
2111
+ "eval_loss": 0.46843382716178894,
2112
+ "eval_runtime": 21.7437,
2113
+ "eval_samples_per_second": 40.104,
2114
+ "eval_steps_per_second": 2.529,
2115
+ "step": 2700
2116
+ },
2117
+ {
2118
+ "epoch": 0.64,
2119
+ "learning_rate": 4.840591185009237e-05,
2120
+ "loss": 0.33,
2121
+ "step": 2710
2122
+ },
2123
+ {
2124
+ "epoch": 0.65,
2125
+ "learning_rate": 4.837951966217999e-05,
2126
+ "loss": 0.3966,
2127
+ "step": 2720
2128
+ },
2129
+ {
2130
+ "epoch": 0.65,
2131
+ "learning_rate": 4.8353127474267615e-05,
2132
+ "loss": 0.312,
2133
+ "step": 2730
2134
+ },
2135
+ {
2136
+ "epoch": 0.65,
2137
+ "learning_rate": 4.832673528635524e-05,
2138
+ "loss": 0.3508,
2139
+ "step": 2740
2140
+ },
2141
+ {
2142
+ "epoch": 0.65,
2143
+ "learning_rate": 4.830034309844286e-05,
2144
+ "loss": 0.3636,
2145
+ "step": 2750
2146
+ },
2147
+ {
2148
+ "epoch": 0.65,
2149
+ "eval_accuracy": 0.7694954128440367,
2150
+ "eval_loss": 0.6283801198005676,
2151
+ "eval_runtime": 20.1708,
2152
+ "eval_samples_per_second": 43.231,
2153
+ "eval_steps_per_second": 2.727,
2154
+ "step": 2750
2155
+ },
2156
+ {
2157
+ "epoch": 0.66,
2158
+ "learning_rate": 4.827395091053048e-05,
2159
+ "loss": 0.5102,
2160
+ "step": 2760
2161
+ },
2162
+ {
2163
+ "epoch": 0.66,
2164
+ "learning_rate": 4.8247558722618106e-05,
2165
+ "loss": 0.4305,
2166
+ "step": 2770
2167
+ },
2168
+ {
2169
+ "epoch": 0.66,
2170
+ "learning_rate": 4.822116653470573e-05,
2171
+ "loss": 0.3684,
2172
+ "step": 2780
2173
+ },
2174
+ {
2175
+ "epoch": 0.66,
2176
+ "learning_rate": 4.819477434679335e-05,
2177
+ "loss": 0.3314,
2178
+ "step": 2790
2179
+ },
2180
+ {
2181
+ "epoch": 0.67,
2182
+ "learning_rate": 4.8168382158880974e-05,
2183
+ "loss": 0.4871,
2184
+ "step": 2800
2185
+ },
2186
+ {
2187
+ "epoch": 0.67,
2188
+ "eval_accuracy": 0.7729357798165137,
2189
+ "eval_loss": 0.5208825469017029,
2190
+ "eval_runtime": 21.7303,
2191
+ "eval_samples_per_second": 40.128,
2192
+ "eval_steps_per_second": 2.531,
2193
+ "step": 2800
2194
+ },
2195
+ {
2196
+ "epoch": 0.67,
2197
+ "learning_rate": 4.814198997096859e-05,
2198
+ "loss": 0.3994,
2199
+ "step": 2810
2200
+ },
2201
+ {
2202
+ "epoch": 0.67,
2203
+ "learning_rate": 4.811559778305621e-05,
2204
+ "loss": 0.3938,
2205
+ "step": 2820
2206
+ },
2207
+ {
2208
+ "epoch": 0.67,
2209
+ "learning_rate": 4.8089205595143835e-05,
2210
+ "loss": 0.361,
2211
+ "step": 2830
2212
+ },
2213
+ {
2214
+ "epoch": 0.67,
2215
+ "learning_rate": 4.806281340723146e-05,
2216
+ "loss": 0.3131,
2217
+ "step": 2840
2218
+ },
2219
+ {
2220
+ "epoch": 0.68,
2221
+ "learning_rate": 4.803642121931908e-05,
2222
+ "loss": 0.4091,
2223
+ "step": 2850
2224
+ },
2225
+ {
2226
+ "epoch": 0.68,
2227
+ "eval_accuracy": 0.8027522935779816,
2228
+ "eval_loss": 0.43396520614624023,
2229
+ "eval_runtime": 21.2308,
2230
+ "eval_samples_per_second": 41.072,
2231
+ "eval_steps_per_second": 2.591,
2232
+ "step": 2850
2233
+ },
2234
+ {
2235
+ "epoch": 0.68,
2236
+ "learning_rate": 4.80100290314067e-05,
2237
+ "loss": 0.386,
2238
+ "step": 2860
2239
+ },
2240
+ {
2241
+ "epoch": 0.68,
2242
+ "learning_rate": 4.7983636843494326e-05,
2243
+ "loss": 0.3953,
2244
+ "step": 2870
2245
+ },
2246
+ {
2247
+ "epoch": 0.68,
2248
+ "learning_rate": 4.795724465558195e-05,
2249
+ "loss": 0.4312,
2250
+ "step": 2880
2251
+ },
2252
+ {
2253
+ "epoch": 0.69,
2254
+ "learning_rate": 4.793085246766957e-05,
2255
+ "loss": 0.3507,
2256
+ "step": 2890
2257
+ },
2258
+ {
2259
+ "epoch": 0.69,
2260
+ "learning_rate": 4.7904460279757194e-05,
2261
+ "loss": 0.2085,
2262
+ "step": 2900
2263
+ },
2264
+ {
2265
+ "epoch": 0.69,
2266
+ "eval_accuracy": 0.8004587155963303,
2267
+ "eval_loss": 0.5883902311325073,
2268
+ "eval_runtime": 21.6546,
2269
+ "eval_samples_per_second": 40.269,
2270
+ "eval_steps_per_second": 2.54,
2271
+ "step": 2900
2272
+ },
2273
+ {
2274
+ "epoch": 0.69,
2275
+ "learning_rate": 4.7878068091844817e-05,
2276
+ "loss": 0.5439,
2277
+ "step": 2910
2278
+ },
2279
+ {
2280
+ "epoch": 0.69,
2281
+ "learning_rate": 4.785167590393243e-05,
2282
+ "loss": 0.4659,
2283
+ "step": 2920
2284
+ },
2285
+ {
2286
+ "epoch": 0.7,
2287
+ "learning_rate": 4.7825283716020055e-05,
2288
+ "loss": 0.495,
2289
+ "step": 2930
2290
+ },
2291
+ {
2292
+ "epoch": 0.7,
2293
+ "learning_rate": 4.779889152810768e-05,
2294
+ "loss": 0.4629,
2295
+ "step": 2940
2296
+ },
2297
+ {
2298
+ "epoch": 0.7,
2299
+ "learning_rate": 4.77724993401953e-05,
2300
+ "loss": 0.3517,
2301
+ "step": 2950
2302
+ },
2303
+ {
2304
+ "epoch": 0.7,
2305
+ "eval_accuracy": 0.7844036697247706,
2306
+ "eval_loss": 0.5798487067222595,
2307
+ "eval_runtime": 22.0365,
2308
+ "eval_samples_per_second": 39.571,
2309
+ "eval_steps_per_second": 2.496,
2310
+ "step": 2950
2311
+ },
2312
+ {
2313
+ "epoch": 0.7,
2314
+ "learning_rate": 4.774610715228292e-05,
2315
+ "loss": 0.5166,
2316
+ "step": 2960
2317
+ },
2318
+ {
2319
+ "epoch": 0.71,
2320
+ "learning_rate": 4.7719714964370546e-05,
2321
+ "loss": 0.4298,
2322
+ "step": 2970
2323
+ },
2324
+ {
2325
+ "epoch": 0.71,
2326
+ "learning_rate": 4.769332277645817e-05,
2327
+ "loss": 0.3885,
2328
+ "step": 2980
2329
+ },
2330
+ {
2331
+ "epoch": 0.71,
2332
+ "learning_rate": 4.766693058854579e-05,
2333
+ "loss": 0.4401,
2334
+ "step": 2990
2335
+ },
2336
+ {
2337
+ "epoch": 0.71,
2338
+ "learning_rate": 4.7640538400633414e-05,
2339
+ "loss": 0.37,
2340
+ "step": 3000
2341
+ },
2342
+ {
2343
+ "epoch": 0.71,
2344
+ "eval_accuracy": 0.7981651376146789,
2345
+ "eval_loss": 0.5207229852676392,
2346
+ "eval_runtime": 11.4929,
2347
+ "eval_samples_per_second": 75.873,
2348
+ "eval_steps_per_second": 4.786,
2349
+ "step": 3000
2350
+ },
2351
+ {
2352
+ "epoch": 0.71,
2353
+ "learning_rate": 4.761414621272104e-05,
2354
+ "loss": 0.434,
2355
+ "step": 3010
2356
+ },
2357
+ {
2358
+ "epoch": 0.72,
2359
+ "learning_rate": 4.758775402480866e-05,
2360
+ "loss": 0.3609,
2361
+ "step": 3020
2362
+ },
2363
+ {
2364
+ "epoch": 0.72,
2365
+ "learning_rate": 4.756136183689628e-05,
2366
+ "loss": 0.4148,
2367
+ "step": 3030
2368
+ },
2369
+ {
2370
+ "epoch": 0.72,
2371
+ "learning_rate": 4.7534969648983905e-05,
2372
+ "loss": 0.3513,
2373
+ "step": 3040
2374
+ },
2375
+ {
2376
+ "epoch": 0.72,
2377
+ "learning_rate": 4.750857746107152e-05,
2378
+ "loss": 0.4267,
2379
+ "step": 3050
2380
+ },
2381
+ {
2382
+ "epoch": 0.72,
2383
+ "eval_accuracy": 0.7752293577981652,
2384
+ "eval_loss": 0.4631665349006653,
2385
+ "eval_runtime": 11.4748,
2386
+ "eval_samples_per_second": 75.993,
2387
+ "eval_steps_per_second": 4.793,
2388
+ "step": 3050
2389
+ },
2390
+ {
2391
+ "epoch": 0.73,
2392
+ "learning_rate": 4.748218527315914e-05,
2393
+ "loss": 0.3484,
2394
+ "step": 3060
2395
+ },
2396
+ {
2397
+ "epoch": 0.73,
2398
+ "learning_rate": 4.7455793085246766e-05,
2399
+ "loss": 0.2985,
2400
+ "step": 3070
2401
+ },
2402
+ {
2403
+ "epoch": 0.73,
2404
+ "learning_rate": 4.742940089733439e-05,
2405
+ "loss": 0.4244,
2406
+ "step": 3080
2407
+ },
2408
+ {
2409
+ "epoch": 0.73,
2410
+ "learning_rate": 4.740300870942201e-05,
2411
+ "loss": 0.3836,
2412
+ "step": 3090
2413
+ },
2414
+ {
2415
+ "epoch": 0.74,
2416
+ "learning_rate": 4.7376616521509634e-05,
2417
+ "loss": 0.4646,
2418
+ "step": 3100
2419
+ },
2420
+ {
2421
+ "epoch": 0.74,
2422
+ "eval_accuracy": 0.7591743119266054,
2423
+ "eval_loss": 0.5199323296546936,
2424
+ "eval_runtime": 11.5496,
2425
+ "eval_samples_per_second": 75.5,
2426
+ "eval_steps_per_second": 4.762,
2427
+ "step": 3100
2428
+ },
2429
+ {
2430
+ "epoch": 0.74,
2431
+ "learning_rate": 4.735022433359726e-05,
2432
+ "loss": 0.3064,
2433
+ "step": 3110
2434
+ },
2435
+ {
2436
+ "epoch": 0.74,
2437
+ "learning_rate": 4.732383214568488e-05,
2438
+ "loss": 0.3248,
2439
+ "step": 3120
2440
+ },
2441
+ {
2442
+ "epoch": 0.74,
2443
+ "learning_rate": 4.72974399577725e-05,
2444
+ "loss": 0.3718,
2445
+ "step": 3130
2446
+ },
2447
+ {
2448
+ "epoch": 0.75,
2449
+ "learning_rate": 4.7271047769860125e-05,
2450
+ "loss": 0.3535,
2451
+ "step": 3140
2452
+ },
2453
+ {
2454
+ "epoch": 0.75,
2455
+ "learning_rate": 4.724465558194775e-05,
2456
+ "loss": 0.3569,
2457
+ "step": 3150
2458
+ },
2459
+ {
2460
+ "epoch": 0.75,
2461
+ "eval_accuracy": 0.7672018348623854,
2462
+ "eval_loss": 0.4929494559764862,
2463
+ "eval_runtime": 12.339,
2464
+ "eval_samples_per_second": 70.67,
2465
+ "eval_steps_per_second": 4.457,
2466
+ "step": 3150
2467
+ },
2468
+ {
2469
+ "epoch": 0.75,
2470
+ "learning_rate": 4.721826339403537e-05,
2471
+ "loss": 0.5133,
2472
+ "step": 3160
2473
+ },
2474
+ {
2475
+ "epoch": 0.75,
2476
+ "learning_rate": 4.7191871206122986e-05,
2477
+ "loss": 0.3535,
2478
+ "step": 3170
2479
+ },
2480
+ {
2481
+ "epoch": 0.76,
2482
+ "learning_rate": 4.716547901821061e-05,
2483
+ "loss": 0.4051,
2484
+ "step": 3180
2485
+ },
2486
+ {
2487
+ "epoch": 0.76,
2488
+ "learning_rate": 4.713908683029823e-05,
2489
+ "loss": 0.341,
2490
+ "step": 3190
2491
+ },
2492
+ {
2493
+ "epoch": 0.76,
2494
+ "learning_rate": 4.7112694642385854e-05,
2495
+ "loss": 0.3356,
2496
+ "step": 3200
2497
+ },
2498
+ {
2499
+ "epoch": 0.76,
2500
+ "eval_accuracy": 0.7844036697247706,
2501
+ "eval_loss": 0.4769574701786041,
2502
+ "eval_runtime": 11.5756,
2503
+ "eval_samples_per_second": 75.331,
2504
+ "eval_steps_per_second": 4.751,
2505
+ "step": 3200
2506
+ },
2507
+ {
2508
+ "epoch": 0.76,
2509
+ "learning_rate": 4.708630245447348e-05,
2510
+ "loss": 0.3053,
2511
+ "step": 3210
2512
+ },
2513
+ {
2514
+ "epoch": 0.76,
2515
+ "learning_rate": 4.70599102665611e-05,
2516
+ "loss": 0.3905,
2517
+ "step": 3220
2518
+ },
2519
+ {
2520
+ "epoch": 0.77,
2521
+ "learning_rate": 4.703351807864872e-05,
2522
+ "loss": 0.4397,
2523
+ "step": 3230
2524
+ },
2525
+ {
2526
+ "epoch": 0.77,
2527
+ "learning_rate": 4.7007125890736345e-05,
2528
+ "loss": 0.4199,
2529
+ "step": 3240
2530
+ },
2531
+ {
2532
+ "epoch": 0.77,
2533
+ "learning_rate": 4.698073370282397e-05,
2534
+ "loss": 0.3777,
2535
+ "step": 3250
2536
+ },
2537
+ {
2538
+ "epoch": 0.77,
2539
+ "eval_accuracy": 0.786697247706422,
2540
+ "eval_loss": 0.4674142897129059,
2541
+ "eval_runtime": 25.3754,
2542
+ "eval_samples_per_second": 34.364,
2543
+ "eval_steps_per_second": 2.167,
2544
+ "step": 3250
2545
+ },
2546
+ {
2547
+ "epoch": 0.77,
2548
+ "learning_rate": 4.695434151491159e-05,
2549
+ "loss": 0.3039,
2550
+ "step": 3260
2551
+ },
2552
+ {
2553
+ "epoch": 0.78,
2554
+ "learning_rate": 4.692794932699921e-05,
2555
+ "loss": 0.3881,
2556
+ "step": 3270
2557
+ },
2558
+ {
2559
+ "epoch": 0.78,
2560
+ "learning_rate": 4.6901557139086836e-05,
2561
+ "loss": 0.2602,
2562
+ "step": 3280
2563
+ },
2564
+ {
2565
+ "epoch": 0.78,
2566
+ "learning_rate": 4.687516495117445e-05,
2567
+ "loss": 0.3553,
2568
+ "step": 3290
2569
+ },
2570
+ {
2571
+ "epoch": 0.78,
2572
+ "learning_rate": 4.6848772763262074e-05,
2573
+ "loss": 0.3472,
2574
+ "step": 3300
2575
+ },
2576
+ {
2577
+ "epoch": 0.78,
2578
+ "eval_accuracy": 0.7373853211009175,
2579
+ "eval_loss": 0.5634092688560486,
2580
+ "eval_runtime": 26.2811,
2581
+ "eval_samples_per_second": 33.18,
2582
+ "eval_steps_per_second": 2.093,
2583
+ "step": 3300
2584
+ },
2585
+ {
2586
+ "epoch": 0.79,
2587
+ "learning_rate": 4.68223805753497e-05,
2588
+ "loss": 0.4972,
2589
+ "step": 3310
2590
+ },
2591
+ {
2592
+ "epoch": 0.79,
2593
+ "learning_rate": 4.679598838743732e-05,
2594
+ "loss": 0.3577,
2595
+ "step": 3320
2596
+ },
2597
+ {
2598
+ "epoch": 0.79,
2599
+ "learning_rate": 4.676959619952494e-05,
2600
+ "loss": 0.3653,
2601
+ "step": 3330
2602
+ },
2603
+ {
2604
+ "epoch": 0.79,
2605
+ "learning_rate": 4.6743204011612565e-05,
2606
+ "loss": 0.2476,
2607
+ "step": 3340
2608
+ },
2609
+ {
2610
+ "epoch": 0.8,
2611
+ "learning_rate": 4.671681182370019e-05,
2612
+ "loss": 0.4177,
2613
+ "step": 3350
2614
+ },
2615
+ {
2616
+ "epoch": 0.8,
2617
+ "eval_accuracy": 0.783256880733945,
2618
+ "eval_loss": 0.5188720226287842,
2619
+ "eval_runtime": 24.6069,
2620
+ "eval_samples_per_second": 35.437,
2621
+ "eval_steps_per_second": 2.235,
2622
+ "step": 3350
2623
+ },
2624
+ {
2625
+ "epoch": 0.8,
2626
+ "learning_rate": 4.669041963578781e-05,
2627
+ "loss": 0.3181,
2628
+ "step": 3360
2629
+ },
2630
+ {
2631
+ "epoch": 0.8,
2632
+ "learning_rate": 4.666402744787543e-05,
2633
+ "loss": 0.3579,
2634
+ "step": 3370
2635
+ },
2636
+ {
2637
+ "epoch": 0.8,
2638
+ "learning_rate": 4.6637635259963056e-05,
2639
+ "loss": 0.5733,
2640
+ "step": 3380
2641
+ },
2642
+ {
2643
+ "epoch": 0.81,
2644
+ "learning_rate": 4.661124307205068e-05,
2645
+ "loss": 0.3851,
2646
+ "step": 3390
2647
+ },
2648
+ {
2649
+ "epoch": 0.81,
2650
+ "learning_rate": 4.65848508841383e-05,
2651
+ "loss": 0.4028,
2652
+ "step": 3400
2653
+ },
2654
+ {
2655
+ "epoch": 0.81,
2656
+ "eval_accuracy": 0.7844036697247706,
2657
+ "eval_loss": 0.4956331253051758,
2658
+ "eval_runtime": 24.5882,
2659
+ "eval_samples_per_second": 35.464,
2660
+ "eval_steps_per_second": 2.237,
2661
+ "step": 3400
2662
+ },
2663
+ {
2664
+ "epoch": 0.81,
2665
+ "learning_rate": 4.6558458696225924e-05,
2666
+ "loss": 0.3687,
2667
+ "step": 3410
2668
+ },
2669
+ {
2670
+ "epoch": 0.81,
2671
+ "learning_rate": 4.653206650831354e-05,
2672
+ "loss": 0.353,
2673
+ "step": 3420
2674
+ },
2675
+ {
2676
+ "epoch": 0.81,
2677
+ "learning_rate": 4.650567432040116e-05,
2678
+ "loss": 0.3628,
2679
+ "step": 3430
2680
+ },
2681
+ {
2682
+ "epoch": 0.82,
2683
+ "learning_rate": 4.6479282132488785e-05,
2684
+ "loss": 0.5389,
2685
+ "step": 3440
2686
+ },
2687
+ {
2688
+ "epoch": 0.82,
2689
+ "learning_rate": 4.645288994457641e-05,
2690
+ "loss": 0.483,
2691
+ "step": 3450
2692
+ },
2693
+ {
2694
+ "epoch": 0.82,
2695
+ "eval_accuracy": 0.8084862385321101,
2696
+ "eval_loss": 0.43807780742645264,
2697
+ "eval_runtime": 24.4347,
2698
+ "eval_samples_per_second": 35.687,
2699
+ "eval_steps_per_second": 2.251,
2700
+ "step": 3450
2701
+ },
2702
+ {
2703
+ "epoch": 0.82,
2704
+ "learning_rate": 4.642649775666403e-05,
2705
+ "loss": 0.3414,
2706
+ "step": 3460
2707
+ },
2708
+ {
2709
+ "epoch": 0.82,
2710
+ "learning_rate": 4.640010556875165e-05,
2711
+ "loss": 0.4314,
2712
+ "step": 3470
2713
+ },
2714
+ {
2715
+ "epoch": 0.83,
2716
+ "learning_rate": 4.6373713380839276e-05,
2717
+ "loss": 0.2962,
2718
+ "step": 3480
2719
+ },
2720
+ {
2721
+ "epoch": 0.83,
2722
+ "learning_rate": 4.63473211929269e-05,
2723
+ "loss": 0.2627,
2724
+ "step": 3490
2725
+ },
2726
+ {
2727
+ "epoch": 0.83,
2728
+ "learning_rate": 4.632092900501452e-05,
2729
+ "loss": 0.3413,
2730
+ "step": 3500
2731
+ },
2732
+ {
2733
+ "epoch": 0.83,
2734
+ "eval_accuracy": 0.7935779816513762,
2735
+ "eval_loss": 0.5697915554046631,
2736
+ "eval_runtime": 14.9148,
2737
+ "eval_samples_per_second": 58.465,
2738
+ "eval_steps_per_second": 3.688,
2739
+ "step": 3500
2740
+ },
2741
+ {
2742
+ "epoch": 0.83,
2743
+ "learning_rate": 4.6294536817102144e-05,
2744
+ "loss": 0.4393,
2745
+ "step": 3510
2746
+ },
2747
+ {
2748
+ "epoch": 0.84,
2749
+ "learning_rate": 4.626814462918977e-05,
2750
+ "loss": 0.4142,
2751
+ "step": 3520
2752
+ },
2753
+ {
2754
+ "epoch": 0.84,
2755
+ "learning_rate": 4.624175244127739e-05,
2756
+ "loss": 0.3803,
2757
+ "step": 3530
2758
+ },
2759
+ {
2760
+ "epoch": 0.84,
2761
+ "learning_rate": 4.6215360253365005e-05,
2762
+ "loss": 0.4087,
2763
+ "step": 3540
2764
+ },
2765
+ {
2766
+ "epoch": 0.84,
2767
+ "learning_rate": 4.618896806545263e-05,
2768
+ "loss": 0.3966,
2769
+ "step": 3550
2770
+ },
2771
+ {
2772
+ "epoch": 0.84,
2773
+ "eval_accuracy": 0.786697247706422,
2774
+ "eval_loss": 0.47139275074005127,
2775
+ "eval_runtime": 23.4839,
2776
+ "eval_samples_per_second": 37.132,
2777
+ "eval_steps_per_second": 2.342,
2778
+ "step": 3550
2779
+ },
2780
+ {
2781
+ "epoch": 0.85,
2782
+ "learning_rate": 4.616257587754025e-05,
2783
+ "loss": 0.3998,
2784
+ "step": 3560
2785
+ },
2786
+ {
2787
+ "epoch": 0.85,
2788
+ "learning_rate": 4.6136183689627873e-05,
2789
+ "loss": 0.3903,
2790
+ "step": 3570
2791
+ },
2792
+ {
2793
+ "epoch": 0.85,
2794
+ "learning_rate": 4.6109791501715496e-05,
2795
+ "loss": 0.3425,
2796
+ "step": 3580
2797
+ },
2798
+ {
2799
+ "epoch": 0.85,
2800
+ "learning_rate": 4.608339931380312e-05,
2801
+ "loss": 0.321,
2802
+ "step": 3590
2803
+ },
2804
+ {
2805
+ "epoch": 0.86,
2806
+ "learning_rate": 4.605700712589074e-05,
2807
+ "loss": 0.3299,
2808
+ "step": 3600
2809
+ },
2810
+ {
2811
+ "epoch": 0.86,
2812
+ "eval_accuracy": 0.783256880733945,
2813
+ "eval_loss": 0.46382883191108704,
2814
+ "eval_runtime": 24.48,
2815
+ "eval_samples_per_second": 35.621,
2816
+ "eval_steps_per_second": 2.247,
2817
+ "step": 3600
2818
+ },
2819
+ {
2820
+ "epoch": 0.86,
2821
+ "learning_rate": 4.6030614937978364e-05,
2822
+ "loss": 0.4465,
2823
+ "step": 3610
2824
+ },
2825
+ {
2826
+ "epoch": 0.86,
2827
+ "learning_rate": 4.600422275006599e-05,
2828
+ "loss": 0.3761,
2829
+ "step": 3620
2830
+ },
2831
+ {
2832
+ "epoch": 0.86,
2833
+ "learning_rate": 4.597783056215361e-05,
2834
+ "loss": 0.346,
2835
+ "step": 3630
2836
+ },
2837
+ {
2838
+ "epoch": 0.86,
2839
+ "learning_rate": 4.595143837424123e-05,
2840
+ "loss": 0.3839,
2841
+ "step": 3640
2842
+ },
2843
+ {
2844
+ "epoch": 0.87,
2845
+ "learning_rate": 4.592504618632885e-05,
2846
+ "loss": 0.4783,
2847
+ "step": 3650
2848
+ },
2849
+ {
2850
+ "epoch": 0.87,
2851
+ "eval_accuracy": 0.7844036697247706,
2852
+ "eval_loss": 0.49812304973602295,
2853
+ "eval_runtime": 24.7287,
2854
+ "eval_samples_per_second": 35.263,
2855
+ "eval_steps_per_second": 2.224,
2856
+ "step": 3650
2857
+ },
2858
+ {
2859
+ "epoch": 0.87,
2860
+ "learning_rate": 4.589865399841647e-05,
2861
+ "loss": 0.3789,
2862
+ "step": 3660
2863
+ },
2864
+ {
2865
+ "epoch": 0.87,
2866
+ "learning_rate": 4.5872261810504094e-05,
2867
+ "loss": 0.4411,
2868
+ "step": 3670
2869
+ },
2870
+ {
2871
+ "epoch": 0.87,
2872
+ "learning_rate": 4.5845869622591716e-05,
2873
+ "loss": 0.4694,
2874
+ "step": 3680
2875
+ },
2876
+ {
2877
+ "epoch": 0.88,
2878
+ "learning_rate": 4.581947743467934e-05,
2879
+ "loss": 0.2994,
2880
+ "step": 3690
2881
+ },
2882
+ {
2883
+ "epoch": 0.88,
2884
+ "learning_rate": 4.579308524676696e-05,
2885
+ "loss": 0.4475,
2886
+ "step": 3700
2887
+ },
2888
+ {
2889
+ "epoch": 0.88,
2890
+ "eval_accuracy": 0.8027522935779816,
2891
+ "eval_loss": 0.4598585367202759,
2892
+ "eval_runtime": 24.9853,
2893
+ "eval_samples_per_second": 34.901,
2894
+ "eval_steps_per_second": 2.201,
2895
+ "step": 3700
2896
+ },
2897
+ {
2898
+ "epoch": 0.88,
2899
+ "learning_rate": 4.5766693058854584e-05,
2900
+ "loss": 0.3204,
2901
+ "step": 3710
2902
+ },
2903
+ {
2904
+ "epoch": 0.88,
2905
+ "learning_rate": 4.574030087094221e-05,
2906
+ "loss": 0.3833,
2907
+ "step": 3720
2908
+ },
2909
+ {
2910
+ "epoch": 0.89,
2911
+ "learning_rate": 4.571390868302983e-05,
2912
+ "loss": 0.2844,
2913
+ "step": 3730
2914
+ },
2915
+ {
2916
+ "epoch": 0.89,
2917
+ "learning_rate": 4.568751649511745e-05,
2918
+ "loss": 0.3206,
2919
+ "step": 3740
2920
+ },
2921
+ {
2922
+ "epoch": 0.89,
2923
+ "learning_rate": 4.5661124307205075e-05,
2924
+ "loss": 0.3527,
2925
+ "step": 3750
2926
+ },
2927
+ {
2928
+ "epoch": 0.89,
2929
+ "eval_accuracy": 0.7981651376146789,
2930
+ "eval_loss": 0.5331198573112488,
2931
+ "eval_runtime": 24.8534,
2932
+ "eval_samples_per_second": 35.086,
2933
+ "eval_steps_per_second": 2.213,
2934
+ "step": 3750
2935
+ },
2936
+ {
2937
+ "epoch": 0.89,
2938
+ "learning_rate": 4.563473211929269e-05,
2939
+ "loss": 0.3891,
2940
+ "step": 3760
2941
+ },
2942
+ {
2943
+ "epoch": 0.9,
2944
+ "learning_rate": 4.5608339931380314e-05,
2945
+ "loss": 0.324,
2946
+ "step": 3770
2947
+ },
2948
+ {
2949
+ "epoch": 0.9,
2950
+ "learning_rate": 4.5581947743467936e-05,
2951
+ "loss": 0.3532,
2952
+ "step": 3780
2953
+ },
2954
+ {
2955
+ "epoch": 0.9,
2956
+ "learning_rate": 4.555555555555556e-05,
2957
+ "loss": 0.3021,
2958
+ "step": 3790
2959
+ },
2960
+ {
2961
+ "epoch": 0.9,
2962
+ "learning_rate": 4.552916336764318e-05,
2963
+ "loss": 0.4124,
2964
+ "step": 3800
2965
+ },
2966
+ {
2967
+ "epoch": 0.9,
2968
+ "eval_accuracy": 0.7626146788990825,
2969
+ "eval_loss": 0.5969462394714355,
2970
+ "eval_runtime": 19.8238,
2971
+ "eval_samples_per_second": 43.988,
2972
+ "eval_steps_per_second": 2.774,
2973
+ "step": 3800
2974
+ },
2975
+ {
2976
+ "epoch": 0.9,
2977
+ "learning_rate": 4.5502771179730804e-05,
2978
+ "loss": 0.5429,
2979
+ "step": 3810
2980
+ },
2981
+ {
2982
+ "epoch": 0.91,
2983
+ "learning_rate": 4.547637899181843e-05,
2984
+ "loss": 0.4458,
2985
+ "step": 3820
2986
+ },
2987
+ {
2988
+ "epoch": 0.91,
2989
+ "learning_rate": 4.544998680390605e-05,
2990
+ "loss": 0.3272,
2991
+ "step": 3830
2992
+ },
2993
+ {
2994
+ "epoch": 0.91,
2995
+ "learning_rate": 4.542359461599367e-05,
2996
+ "loss": 0.3482,
2997
+ "step": 3840
2998
+ },
2999
+ {
3000
+ "epoch": 0.91,
3001
+ "learning_rate": 4.5397202428081295e-05,
3002
+ "loss": 0.3683,
3003
+ "step": 3850
3004
+ },
3005
+ {
3006
+ "epoch": 0.91,
3007
+ "eval_accuracy": 0.7775229357798165,
3008
+ "eval_loss": 0.5118904113769531,
3009
+ "eval_runtime": 11.5129,
3010
+ "eval_samples_per_second": 75.741,
3011
+ "eval_steps_per_second": 4.777,
3012
+ "step": 3850
3013
+ },
3014
+ {
3015
+ "epoch": 0.92,
3016
+ "learning_rate": 4.537081024016891e-05,
3017
+ "loss": 0.2501,
3018
+ "step": 3860
3019
+ },
3020
+ {
3021
+ "epoch": 0.92,
3022
+ "learning_rate": 4.5344418052256534e-05,
3023
+ "loss": 0.3256,
3024
+ "step": 3870
3025
+ },
3026
+ {
3027
+ "epoch": 0.92,
3028
+ "learning_rate": 4.5318025864344157e-05,
3029
+ "loss": 0.4049,
3030
+ "step": 3880
3031
+ },
3032
+ {
3033
+ "epoch": 0.92,
3034
+ "learning_rate": 4.529163367643178e-05,
3035
+ "loss": 0.2541,
3036
+ "step": 3890
3037
+ },
3038
+ {
3039
+ "epoch": 0.93,
3040
+ "learning_rate": 4.52652414885194e-05,
3041
+ "loss": 0.3894,
3042
+ "step": 3900
3043
+ },
3044
+ {
3045
+ "epoch": 0.93,
3046
+ "eval_accuracy": 0.8084862385321101,
3047
+ "eval_loss": 0.5941323041915894,
3048
+ "eval_runtime": 25.3823,
3049
+ "eval_samples_per_second": 34.355,
3050
+ "eval_steps_per_second": 2.167,
3051
+ "step": 3900
3052
+ },
3053
+ {
3054
+ "epoch": 0.93,
3055
+ "learning_rate": 4.5238849300607025e-05,
3056
+ "loss": 0.4195,
3057
+ "step": 3910
3058
+ },
3059
+ {
3060
+ "epoch": 0.93,
3061
+ "learning_rate": 4.521245711269465e-05,
3062
+ "loss": 0.2915,
3063
+ "step": 3920
3064
+ },
3065
+ {
3066
+ "epoch": 0.93,
3067
+ "learning_rate": 4.518606492478227e-05,
3068
+ "loss": 0.3745,
3069
+ "step": 3930
3070
+ },
3071
+ {
3072
+ "epoch": 0.94,
3073
+ "learning_rate": 4.515967273686989e-05,
3074
+ "loss": 0.3915,
3075
+ "step": 3940
3076
+ },
3077
+ {
3078
+ "epoch": 0.94,
3079
+ "learning_rate": 4.5133280548957515e-05,
3080
+ "loss": 0.4001,
3081
+ "step": 3950
3082
+ },
3083
+ {
3084
+ "epoch": 0.94,
3085
+ "eval_accuracy": 0.7717889908256881,
3086
+ "eval_loss": 0.4977372884750366,
3087
+ "eval_runtime": 11.5193,
3088
+ "eval_samples_per_second": 75.699,
3089
+ "eval_steps_per_second": 4.775,
3090
+ "step": 3950
3091
+ },
3092
+ {
3093
+ "epoch": 0.94,
3094
+ "learning_rate": 4.510688836104514e-05,
3095
+ "loss": 0.2936,
3096
+ "step": 3960
3097
+ },
3098
+ {
3099
+ "epoch": 0.94,
3100
+ "learning_rate": 4.5080496173132754e-05,
3101
+ "loss": 0.3909,
3102
+ "step": 3970
3103
+ },
3104
+ {
3105
+ "epoch": 0.95,
3106
+ "learning_rate": 4.505410398522038e-05,
3107
+ "loss": 0.356,
3108
+ "step": 3980
3109
+ },
3110
+ {
3111
+ "epoch": 0.95,
3112
+ "learning_rate": 4.5027711797308e-05,
3113
+ "loss": 0.3305,
3114
+ "step": 3990
3115
+ },
3116
+ {
3117
+ "epoch": 0.95,
3118
+ "learning_rate": 4.500131960939562e-05,
3119
+ "loss": 0.3394,
3120
+ "step": 4000
3121
+ },
3122
+ {
3123
+ "epoch": 0.95,
3124
+ "eval_accuracy": 0.7981651376146789,
3125
+ "eval_loss": 0.5128748416900635,
3126
+ "eval_runtime": 11.5097,
3127
+ "eval_samples_per_second": 75.762,
3128
+ "eval_steps_per_second": 4.779,
3129
+ "step": 4000
3130
+ }
3131
+ ],
3132
+ "logging_steps": 10,
3133
+ "max_steps": 21050,
3134
+ "num_input_tokens_seen": 0,
3135
+ "num_train_epochs": 5,
3136
+ "save_steps": 500,
3137
+ "total_flos": 1.6839258734592e+16,
3138
+ "train_batch_size": 16,
3139
+ "trial_name": null,
3140
+ "trial_params": null
3141
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bc585ad17bd13281f69216d6547ff6f14216d5477c2b78f12c8d3899f2679b3
3
+ size 4728