Robin Ding committed on
Commit
b6d8b1f
1 Parent(s): 5ae00d2

update 1030 models

Browse files
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "deepseek-ai/deepseek-coder-6.7b-base",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 32013,
9
+ "eos_token_id": 32014,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 4096,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 11008,
14
+ "max_position_embeddings": 16384,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 32,
19
+ "num_key_value_heads": 32,
20
+ "pretraining_tp": 1,
21
+ "rms_norm_eps": 1e-06,
22
+ "rope_scaling": {
23
+ "factor": 4.0,
24
+ "rope_type": "linear",
25
+ "type": "linear"
26
+ },
27
+ "rope_theta": 100000,
28
+ "tie_word_embeddings": false,
29
+ "torch_dtype": "float16",
30
+ "transformers_version": "4.44.2",
31
+ "use_cache": true,
32
+ "vocab_size": 32256
33
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 32013,
4
+ "eos_token_id": 32014,
5
+ "transformers_version": "4.44.2"
6
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a16635bad922d2cf2eedc97088c6269934d3fe38b185e2791663432d0d307325
3
+ size 4941082400
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d62a0369c05e4081e0d938fcf240ff8c781f549ddc685236dd24bde0671da85
3
+ size 4947390768
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd50a3bdd97f74279fe736cafd4b008c7623671c645ca248aeae425958e92862
3
+ size 3592585888
model.safetensors.index.json ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 13481025536
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00003-of-00003.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
225
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
226
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
227
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
229
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
230
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
231
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
232
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
234
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
235
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
236
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
238
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
239
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
240
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
241
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
242
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
243
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
244
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
245
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
246
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
247
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
248
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
249
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
250
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
251
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
252
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
253
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
254
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
255
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
256
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
257
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
258
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
259
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
260
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
261
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
262
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
263
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
264
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
265
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
266
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
267
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
268
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
269
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
270
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
271
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
272
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
273
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
274
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
275
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
276
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
277
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
278
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
279
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
280
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
281
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
282
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
283
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
284
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
285
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
286
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
287
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
288
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
289
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
290
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
291
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
292
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
293
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
294
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
295
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
296
+ "model.norm.weight": "model-00003-of-00003.safetensors"
297
+ }
298
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|end▁of▁sentence|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|end▁of▁sentence|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "32000": {
7
+ "content": "õ",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": false
13
+ },
14
+ "32001": {
15
+ "content": "÷",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": false
21
+ },
22
+ "32002": {
23
+ "content": "Á",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "32003": {
31
+ "content": "ý",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": false
37
+ },
38
+ "32004": {
39
+ "content": "À",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": false
45
+ },
46
+ "32005": {
47
+ "content": "ÿ",
48
+ "lstrip": false,
49
+ "normalized": true,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": false
53
+ },
54
+ "32006": {
55
+ "content": "ø",
56
+ "lstrip": false,
57
+ "normalized": true,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": false
61
+ },
62
+ "32007": {
63
+ "content": "ú",
64
+ "lstrip": false,
65
+ "normalized": true,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": false
69
+ },
70
+ "32008": {
71
+ "content": "þ",
72
+ "lstrip": false,
73
+ "normalized": true,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": false
77
+ },
78
+ "32009": {
79
+ "content": "ü",
80
+ "lstrip": false,
81
+ "normalized": true,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": false
85
+ },
86
+ "32010": {
87
+ "content": "ù",
88
+ "lstrip": false,
89
+ "normalized": true,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": false
93
+ },
94
+ "32011": {
95
+ "content": "ö",
96
+ "lstrip": false,
97
+ "normalized": true,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": false
101
+ },
102
+ "32012": {
103
+ "content": "û",
104
+ "lstrip": false,
105
+ "normalized": true,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": false
109
+ },
110
+ "32013": {
111
+ "content": "<|begin▁of▁sentence|>",
112
+ "lstrip": false,
113
+ "normalized": true,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "32014": {
119
+ "content": "<|end▁of▁sentence|>",
120
+ "lstrip": false,
121
+ "normalized": true,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": true
125
+ },
126
+ "32015": {
127
+ "content": "<|fim▁hole|>",
128
+ "lstrip": false,
129
+ "normalized": true,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "32016": {
135
+ "content": "<|fim▁begin|>",
136
+ "lstrip": false,
137
+ "normalized": true,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "32017": {
143
+ "content": "<|fim▁end|>",
144
+ "lstrip": false,
145
+ "normalized": true,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "32018": {
151
+ "content": "<pad>",
152
+ "lstrip": false,
153
+ "normalized": true,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "32019": {
159
+ "content": "<|User|>",
160
+ "lstrip": false,
161
+ "normalized": true,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "32020": {
167
+ "content": "<|Assistant|>",
168
+ "lstrip": false,
169
+ "normalized": true,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "32021": {
175
+ "content": "<|EOT|>",
176
+ "lstrip": false,
177
+ "normalized": true,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ }
182
+ },
183
+ "bos_token": "<|begin▁of▁sentence|>",
184
+ "clean_up_tokenization_spaces": false,
185
+ "eos_token": "<|end▁of▁sentence|>",
186
+ "legacy": true,
187
+ "model_max_length": 16384,
188
+ "pad_token": "<|end▁of▁sentence|>",
189
+ "sp_model_kwargs": {},
190
+ "tokenizer_class": "LlamaTokenizer",
191
+ "unk_token": null,
192
+ "use_default_system_prompt": false
193
+ }
trainer_state.json ADDED
@@ -0,0 +1,3133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.9974842767295597,
5
+ "eval_steps": 20,
6
+ "global_step": 794,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.005031446540880503,
13
+ "grad_norm": 0.0,
14
+ "learning_rate": 0.0,
15
+ "loss": 0.8316,
16
+ "step": 2
17
+ },
18
+ {
19
+ "epoch": 0.010062893081761006,
20
+ "grad_norm": 0.0,
21
+ "learning_rate": 0.0,
22
+ "loss": 0.8423,
23
+ "step": 4
24
+ },
25
+ {
26
+ "epoch": 0.01509433962264151,
27
+ "grad_norm": 0.0,
28
+ "learning_rate": 0.0,
29
+ "loss": 0.8389,
30
+ "step": 6
31
+ },
32
+ {
33
+ "epoch": 0.02012578616352201,
34
+ "grad_norm": 0.0,
35
+ "learning_rate": 0.0,
36
+ "loss": 0.8302,
37
+ "step": 8
38
+ },
39
+ {
40
+ "epoch": 0.025157232704402517,
41
+ "grad_norm": 0.0,
42
+ "learning_rate": 0.0,
43
+ "loss": 0.8634,
44
+ "step": 10
45
+ },
46
+ {
47
+ "epoch": 0.03018867924528302,
48
+ "grad_norm": 0.0,
49
+ "learning_rate": 0.0,
50
+ "loss": 0.8553,
51
+ "step": 12
52
+ },
53
+ {
54
+ "epoch": 0.03522012578616352,
55
+ "grad_norm": 0.7739448547363281,
56
+ "learning_rate": 1.6666666666666667e-06,
57
+ "loss": 0.8423,
58
+ "step": 14
59
+ },
60
+ {
61
+ "epoch": 0.04025157232704402,
62
+ "grad_norm": 0.7387983202934265,
63
+ "learning_rate": 5e-06,
64
+ "loss": 0.8385,
65
+ "step": 16
66
+ },
67
+ {
68
+ "epoch": 0.045283018867924525,
69
+ "grad_norm": 0.34386318922042847,
70
+ "learning_rate": 8.333333333333334e-06,
71
+ "loss": 0.8163,
72
+ "step": 18
73
+ },
74
+ {
75
+ "epoch": 0.050314465408805034,
76
+ "grad_norm": 0.32477569580078125,
77
+ "learning_rate": 1e-05,
78
+ "loss": 0.7541,
79
+ "step": 20
80
+ },
81
+ {
82
+ "epoch": 0.050314465408805034,
83
+ "eval_loss": 0.7434237599372864,
84
+ "eval_runtime": 512.9045,
85
+ "eval_samples_per_second": 20.873,
86
+ "eval_steps_per_second": 0.164,
87
+ "step": 20
88
+ },
89
+ {
90
+ "epoch": 0.055345911949685536,
91
+ "grad_norm": 0.3248128294944763,
92
+ "learning_rate": 1.3333333333333333e-05,
93
+ "loss": 0.7494,
94
+ "step": 22
95
+ },
96
+ {
97
+ "epoch": 0.06037735849056604,
98
+ "grad_norm": 0.25117382407188416,
99
+ "learning_rate": 1.6666666666666667e-05,
100
+ "loss": 0.7204,
101
+ "step": 24
102
+ },
103
+ {
104
+ "epoch": 0.06540880503144654,
105
+ "grad_norm": 0.23494713008403778,
106
+ "learning_rate": 2e-05,
107
+ "loss": 0.6902,
108
+ "step": 26
109
+ },
110
+ {
111
+ "epoch": 0.07044025157232704,
112
+ "grad_norm": 0.19120900332927704,
113
+ "learning_rate": 2.3333333333333336e-05,
114
+ "loss": 0.6971,
115
+ "step": 28
116
+ },
117
+ {
118
+ "epoch": 0.07547169811320754,
119
+ "grad_norm": 0.16960154473781586,
120
+ "learning_rate": 2.6666666666666667e-05,
121
+ "loss": 0.677,
122
+ "step": 30
123
+ },
124
+ {
125
+ "epoch": 0.08050314465408805,
126
+ "grad_norm": 0.16359932720661163,
127
+ "learning_rate": 3e-05,
128
+ "loss": 0.6863,
129
+ "step": 32
130
+ },
131
+ {
132
+ "epoch": 0.08553459119496855,
133
+ "grad_norm": 0.1389608234167099,
134
+ "learning_rate": 3.3333333333333335e-05,
135
+ "loss": 0.6786,
136
+ "step": 34
137
+ },
138
+ {
139
+ "epoch": 0.09056603773584905,
140
+ "grad_norm": 0.14994372427463531,
141
+ "learning_rate": 3.6666666666666666e-05,
142
+ "loss": 0.6692,
143
+ "step": 36
144
+ },
145
+ {
146
+ "epoch": 0.09559748427672957,
147
+ "grad_norm": 0.13334757089614868,
148
+ "learning_rate": 4e-05,
149
+ "loss": 0.666,
150
+ "step": 38
151
+ },
152
+ {
153
+ "epoch": 0.10062893081761007,
154
+ "grad_norm": 0.14932414889335632,
155
+ "learning_rate": 4.3333333333333334e-05,
156
+ "loss": 0.6771,
157
+ "step": 40
158
+ },
159
+ {
160
+ "epoch": 0.10062893081761007,
161
+ "eval_loss": 0.6487388610839844,
162
+ "eval_runtime": 477.079,
163
+ "eval_samples_per_second": 22.441,
164
+ "eval_steps_per_second": 0.176,
165
+ "step": 40
166
+ },
167
+ {
168
+ "epoch": 0.10566037735849057,
169
+ "grad_norm": 0.1352750062942505,
170
+ "learning_rate": 4.666666666666667e-05,
171
+ "loss": 0.668,
172
+ "step": 42
173
+ },
174
+ {
175
+ "epoch": 0.11069182389937107,
176
+ "grad_norm": 0.11694569140672684,
177
+ "learning_rate": 5e-05,
178
+ "loss": 0.6517,
179
+ "step": 44
180
+ },
181
+ {
182
+ "epoch": 0.11572327044025157,
183
+ "grad_norm": 0.12432057410478592,
184
+ "learning_rate": 4.9999239107866414e-05,
185
+ "loss": 0.6353,
186
+ "step": 46
187
+ },
188
+ {
189
+ "epoch": 0.12075471698113208,
190
+ "grad_norm": 0.121449314057827,
191
+ "learning_rate": 4.9996956482928485e-05,
192
+ "loss": 0.6348,
193
+ "step": 48
194
+ },
195
+ {
196
+ "epoch": 0.12578616352201258,
197
+ "grad_norm": 0.119928739964962,
198
+ "learning_rate": 4.999315227957123e-05,
199
+ "loss": 0.6402,
200
+ "step": 50
201
+ },
202
+ {
203
+ "epoch": 0.13081761006289308,
204
+ "grad_norm": 0.12270842492580414,
205
+ "learning_rate": 4.998782675509138e-05,
206
+ "loss": 0.6294,
207
+ "step": 52
208
+ },
209
+ {
210
+ "epoch": 0.13584905660377358,
211
+ "grad_norm": 0.1132158562541008,
212
+ "learning_rate": 4.998098026968003e-05,
213
+ "loss": 0.6417,
214
+ "step": 54
215
+ },
216
+ {
217
+ "epoch": 0.14088050314465408,
218
+ "grad_norm": 0.13522745668888092,
219
+ "learning_rate": 4.997261328639824e-05,
220
+ "loss": 0.6366,
221
+ "step": 56
222
+ },
223
+ {
224
+ "epoch": 0.1459119496855346,
225
+ "grad_norm": 0.11879543960094452,
226
+ "learning_rate": 4.996272637114571e-05,
227
+ "loss": 0.6335,
228
+ "step": 58
229
+ },
230
+ {
231
+ "epoch": 0.1509433962264151,
232
+ "grad_norm": 0.1235610768198967,
233
+ "learning_rate": 4.995132019262254e-05,
234
+ "loss": 0.6483,
235
+ "step": 60
236
+ },
237
+ {
238
+ "epoch": 0.1509433962264151,
239
+ "eval_loss": 0.6241472363471985,
240
+ "eval_runtime": 445.3592,
241
+ "eval_samples_per_second": 24.039,
242
+ "eval_steps_per_second": 0.189,
243
+ "step": 60
244
+ },
245
+ {
246
+ "epoch": 0.1559748427672956,
247
+ "grad_norm": 0.11320329457521439,
248
+ "learning_rate": 4.993839552228398e-05,
249
+ "loss": 0.6242,
250
+ "step": 62
251
+ },
252
+ {
253
+ "epoch": 0.1610062893081761,
254
+ "grad_norm": 0.12192777544260025,
255
+ "learning_rate": 4.992395323428824e-05,
256
+ "loss": 0.6214,
257
+ "step": 64
258
+ },
259
+ {
260
+ "epoch": 0.1660377358490566,
261
+ "grad_norm": 0.12313053756952286,
262
+ "learning_rate": 4.9907994305437405e-05,
263
+ "loss": 0.6331,
264
+ "step": 66
265
+ },
266
+ {
267
+ "epoch": 0.1710691823899371,
268
+ "grad_norm": 0.12243402749300003,
269
+ "learning_rate": 4.989051981511133e-05,
270
+ "loss": 0.6222,
271
+ "step": 68
272
+ },
273
+ {
274
+ "epoch": 0.1761006289308176,
275
+ "grad_norm": 0.12040385603904724,
276
+ "learning_rate": 4.9871530945194654e-05,
277
+ "loss": 0.6394,
278
+ "step": 70
279
+ },
280
+ {
281
+ "epoch": 0.1811320754716981,
282
+ "grad_norm": 0.11485008895397186,
283
+ "learning_rate": 4.985102897999687e-05,
284
+ "loss": 0.5755,
285
+ "step": 72
286
+ },
287
+ {
288
+ "epoch": 0.1861635220125786,
289
+ "grad_norm": 0.1206885501742363,
290
+ "learning_rate": 4.982901530616545e-05,
291
+ "loss": 0.6354,
292
+ "step": 74
293
+ },
294
+ {
295
+ "epoch": 0.19119496855345913,
296
+ "grad_norm": 0.12174931168556213,
297
+ "learning_rate": 4.980549141259205e-05,
298
+ "loss": 0.6362,
299
+ "step": 76
300
+ },
301
+ {
302
+ "epoch": 0.19622641509433963,
303
+ "grad_norm": 0.13587461411952972,
304
+ "learning_rate": 4.9780458890311846e-05,
305
+ "loss": 0.616,
306
+ "step": 78
307
+ },
308
+ {
309
+ "epoch": 0.20125786163522014,
310
+ "grad_norm": 0.12933531403541565,
311
+ "learning_rate": 4.9753919432395876e-05,
312
+ "loss": 0.6363,
313
+ "step": 80
314
+ },
315
+ {
316
+ "epoch": 0.20125786163522014,
317
+ "eval_loss": 0.6132378578186035,
318
+ "eval_runtime": 463.9686,
319
+ "eval_samples_per_second": 23.075,
320
+ "eval_steps_per_second": 0.181,
321
+ "step": 80
322
+ },
323
+ {
324
+ "epoch": 0.20628930817610064,
325
+ "grad_norm": 0.11656603217124939,
326
+ "learning_rate": 4.9725874833836574e-05,
327
+ "loss": 0.62,
328
+ "step": 82
329
+ },
330
+ {
331
+ "epoch": 0.21132075471698114,
332
+ "grad_norm": 0.11784744262695312,
333
+ "learning_rate": 4.969632699142632e-05,
334
+ "loss": 0.6094,
335
+ "step": 84
336
+ },
337
+ {
338
+ "epoch": 0.21635220125786164,
339
+ "grad_norm": 0.11344680190086365,
340
+ "learning_rate": 4.966527790362919e-05,
341
+ "loss": 0.5968,
342
+ "step": 86
343
+ },
344
+ {
345
+ "epoch": 0.22138364779874214,
346
+ "grad_norm": 0.11833988130092621,
347
+ "learning_rate": 4.963272967044579e-05,
348
+ "loss": 0.6306,
349
+ "step": 88
350
+ },
351
+ {
352
+ "epoch": 0.22641509433962265,
353
+ "grad_norm": 0.11473698914051056,
354
+ "learning_rate": 4.959868449327119e-05,
355
+ "loss": 0.6152,
356
+ "step": 90
357
+ },
358
+ {
359
+ "epoch": 0.23144654088050315,
360
+ "grad_norm": 0.10996667295694351,
361
+ "learning_rate": 4.9563144674746046e-05,
362
+ "loss": 0.595,
363
+ "step": 92
364
+ },
365
+ {
366
+ "epoch": 0.23647798742138365,
367
+ "grad_norm": 0.11677283048629761,
368
+ "learning_rate": 4.952611261860089e-05,
369
+ "loss": 0.5967,
370
+ "step": 94
371
+ },
372
+ {
373
+ "epoch": 0.24150943396226415,
374
+ "grad_norm": 0.1206679567694664,
375
+ "learning_rate": 4.9487590829493514e-05,
376
+ "loss": 0.604,
377
+ "step": 96
378
+ },
379
+ {
380
+ "epoch": 0.24654088050314465,
381
+ "grad_norm": 0.12233300507068634,
382
+ "learning_rate": 4.944758191283959e-05,
383
+ "loss": 0.6043,
384
+ "step": 98
385
+ },
386
+ {
387
+ "epoch": 0.25157232704402516,
388
+ "grad_norm": 0.11701351404190063,
389
+ "learning_rate": 4.940608857463644e-05,
390
+ "loss": 0.6,
391
+ "step": 100
392
+ },
393
+ {
394
+ "epoch": 0.25157232704402516,
395
+ "eval_loss": 0.6065506339073181,
396
+ "eval_runtime": 422.5948,
397
+ "eval_samples_per_second": 25.334,
398
+ "eval_steps_per_second": 0.199,
399
+ "step": 100
400
+ },
401
+ {
402
+ "epoch": 0.25660377358490566,
403
+ "grad_norm": 0.1179809644818306,
404
+ "learning_rate": 4.9363113621280036e-05,
405
+ "loss": 0.6406,
406
+ "step": 102
407
+ },
408
+ {
409
+ "epoch": 0.26163522012578616,
410
+ "grad_norm": 0.12534746527671814,
411
+ "learning_rate": 4.931865995937519e-05,
412
+ "loss": 0.6227,
413
+ "step": 104
414
+ },
415
+ {
416
+ "epoch": 0.26666666666666666,
417
+ "grad_norm": 0.12294790893793106,
418
+ "learning_rate": 4.927273059553892e-05,
419
+ "loss": 0.5957,
420
+ "step": 106
421
+ },
422
+ {
423
+ "epoch": 0.27169811320754716,
424
+ "grad_norm": 0.12756744027137756,
425
+ "learning_rate": 4.9225328636197144e-05,
426
+ "loss": 0.6226,
427
+ "step": 108
428
+ },
429
+ {
430
+ "epoch": 0.27672955974842767,
431
+ "grad_norm": 0.12662391364574432,
432
+ "learning_rate": 4.9176457287374584e-05,
433
+ "loss": 0.5899,
434
+ "step": 110
435
+ },
436
+ {
437
+ "epoch": 0.28176100628930817,
438
+ "grad_norm": 0.12516862154006958,
439
+ "learning_rate": 4.912611985447789e-05,
440
+ "loss": 0.6238,
441
+ "step": 112
442
+ },
443
+ {
444
+ "epoch": 0.28679245283018867,
445
+ "grad_norm": 0.116559699177742,
446
+ "learning_rate": 4.907431974207211e-05,
447
+ "loss": 0.6112,
448
+ "step": 114
449
+ },
450
+ {
451
+ "epoch": 0.2918238993710692,
452
+ "grad_norm": 0.11833405494689941,
453
+ "learning_rate": 4.90210604536504e-05,
454
+ "loss": 0.6087,
455
+ "step": 116
456
+ },
457
+ {
458
+ "epoch": 0.2968553459119497,
459
+ "grad_norm": 0.1188972070813179,
460
+ "learning_rate": 4.896634559139707e-05,
461
+ "loss": 0.594,
462
+ "step": 118
463
+ },
464
+ {
465
+ "epoch": 0.3018867924528302,
466
+ "grad_norm": 0.11520268023014069,
467
+ "learning_rate": 4.891017885594399e-05,
468
+ "loss": 0.6059,
469
+ "step": 120
470
+ },
471
+ {
472
+ "epoch": 0.3018867924528302,
473
+ "eval_loss": 0.6016330718994141,
474
+ "eval_runtime": 415.9012,
475
+ "eval_samples_per_second": 25.742,
476
+ "eval_steps_per_second": 0.202,
477
+ "step": 120
478
+ },
479
+ {
480
+ "epoch": 0.3069182389937107,
481
+ "grad_norm": 0.11097536981105804,
482
+ "learning_rate": 4.885256404612022e-05,
483
+ "loss": 0.5963,
484
+ "step": 122
485
+ },
486
+ {
487
+ "epoch": 0.3119496855345912,
488
+ "grad_norm": 0.1210559606552124,
489
+ "learning_rate": 4.8793505058695155e-05,
490
+ "loss": 0.6367,
491
+ "step": 124
492
+ },
493
+ {
494
+ "epoch": 0.3169811320754717,
495
+ "grad_norm": 0.12011069059371948,
496
+ "learning_rate": 4.8733005888114915e-05,
497
+ "loss": 0.5889,
498
+ "step": 126
499
+ },
500
+ {
501
+ "epoch": 0.3220125786163522,
502
+ "grad_norm": 0.12133902311325073,
503
+ "learning_rate": 4.867107062623223e-05,
504
+ "loss": 0.5812,
505
+ "step": 128
506
+ },
507
+ {
508
+ "epoch": 0.3270440251572327,
509
+ "grad_norm": 0.11452817916870117,
510
+ "learning_rate": 4.860770346202962e-05,
511
+ "loss": 0.607,
512
+ "step": 130
513
+ },
514
+ {
515
+ "epoch": 0.3320754716981132,
516
+ "grad_norm": 0.1322205364704132,
517
+ "learning_rate": 4.854290868133614e-05,
518
+ "loss": 0.5789,
519
+ "step": 132
520
+ },
521
+ {
522
+ "epoch": 0.3371069182389937,
523
+ "grad_norm": 0.1177845150232315,
524
+ "learning_rate": 4.847669066653746e-05,
525
+ "loss": 0.5834,
526
+ "step": 134
527
+ },
528
+ {
529
+ "epoch": 0.3421383647798742,
530
+ "grad_norm": 0.12132906913757324,
531
+ "learning_rate": 4.840905389627951e-05,
532
+ "loss": 0.6178,
533
+ "step": 136
534
+ },
535
+ {
536
+ "epoch": 0.3471698113207547,
537
+ "grad_norm": 0.11665117740631104,
538
+ "learning_rate": 4.834000294516552e-05,
539
+ "loss": 0.5999,
540
+ "step": 138
541
+ },
542
+ {
543
+ "epoch": 0.3522012578616352,
544
+ "grad_norm": 0.12204349786043167,
545
+ "learning_rate": 4.8269542483446654e-05,
546
+ "loss": 0.602,
547
+ "step": 140
548
+ },
549
+ {
550
+ "epoch": 0.3522012578616352,
551
+ "eval_loss": 0.5978492498397827,
552
+ "eval_runtime": 462.6047,
553
+ "eval_samples_per_second": 23.143,
554
+ "eval_steps_per_second": 0.182,
555
+ "step": 140
556
+ },
557
+ {
558
+ "epoch": 0.3572327044025157,
559
+ "grad_norm": 0.1177980899810791,
560
+ "learning_rate": 4.819767727670612e-05,
561
+ "loss": 0.6035,
562
+ "step": 142
563
+ },
564
+ {
565
+ "epoch": 0.3622641509433962,
566
+ "grad_norm": 0.12166167795658112,
567
+ "learning_rate": 4.812441218553683e-05,
568
+ "loss": 0.5909,
569
+ "step": 144
570
+ },
571
+ {
572
+ "epoch": 0.3672955974842767,
573
+ "grad_norm": 0.13141223788261414,
574
+ "learning_rate": 4.804975216521272e-05,
575
+ "loss": 0.5985,
576
+ "step": 146
577
+ },
578
+ {
579
+ "epoch": 0.3723270440251572,
580
+ "grad_norm": 0.12138612568378448,
581
+ "learning_rate": 4.797370226535353e-05,
582
+ "loss": 0.5866,
583
+ "step": 148
584
+ },
585
+ {
586
+ "epoch": 0.37735849056603776,
587
+ "grad_norm": 0.12571364641189575,
588
+ "learning_rate": 4.789626762958331e-05,
589
+ "loss": 0.5789,
590
+ "step": 150
591
+ },
592
+ {
593
+ "epoch": 0.38238993710691827,
594
+ "grad_norm": 0.1183118149638176,
595
+ "learning_rate": 4.781745349518252e-05,
596
+ "loss": 0.5958,
597
+ "step": 152
598
+ },
599
+ {
600
+ "epoch": 0.38742138364779877,
601
+ "grad_norm": 0.11182838678359985,
602
+ "learning_rate": 4.7737265192733815e-05,
603
+ "loss": 0.5768,
604
+ "step": 154
605
+ },
606
+ {
607
+ "epoch": 0.39245283018867927,
608
+ "grad_norm": 0.12711752951145172,
609
+ "learning_rate": 4.765570814576153e-05,
610
+ "loss": 0.5951,
611
+ "step": 156
612
+ },
613
+ {
614
+ "epoch": 0.39748427672955977,
615
+ "grad_norm": 0.12628626823425293,
616
+ "learning_rate": 4.757278787036479e-05,
617
+ "loss": 0.5907,
618
+ "step": 158
619
+ },
620
+ {
621
+ "epoch": 0.4025157232704403,
622
+ "grad_norm": 0.11637621372938156,
623
+ "learning_rate": 4.748850997484452e-05,
624
+ "loss": 0.6115,
625
+ "step": 160
626
+ },
627
+ {
628
+ "epoch": 0.4025157232704403,
629
+ "eval_loss": 0.5944415926933289,
630
+ "eval_runtime": 416.2458,
631
+ "eval_samples_per_second": 25.72,
632
+ "eval_steps_per_second": 0.202,
633
+ "step": 160
634
+ },
635
+ {
636
+ "epoch": 0.4075471698113208,
637
+ "grad_norm": 0.11336452513933182,
638
+ "learning_rate": 4.7402880159324084e-05,
639
+ "loss": 0.5685,
640
+ "step": 162
641
+ },
642
+ {
643
+ "epoch": 0.4125786163522013,
644
+ "grad_norm": 0.12172479182481766,
645
+ "learning_rate": 4.7315904215363734e-05,
646
+ "loss": 0.5765,
647
+ "step": 164
648
+ },
649
+ {
650
+ "epoch": 0.4176100628930818,
651
+ "grad_norm": 0.11980731040239334,
652
+ "learning_rate": 4.722758802556896e-05,
653
+ "loss": 0.5948,
654
+ "step": 166
655
+ },
656
+ {
657
+ "epoch": 0.4226415094339623,
658
+ "grad_norm": 0.11705721169710159,
659
+ "learning_rate": 4.7137937563192555e-05,
660
+ "loss": 0.5749,
661
+ "step": 168
662
+ },
663
+ {
664
+ "epoch": 0.4276729559748428,
665
+ "grad_norm": 0.1204642727971077,
666
+ "learning_rate": 4.704695889173066e-05,
667
+ "loss": 0.6069,
668
+ "step": 170
669
+ },
670
+ {
671
+ "epoch": 0.4327044025157233,
672
+ "grad_norm": 0.11957567930221558,
673
+ "learning_rate": 4.695465816451266e-05,
674
+ "loss": 0.5724,
675
+ "step": 172
676
+ },
677
+ {
678
+ "epoch": 0.4377358490566038,
679
+ "grad_norm": 0.12370068579912186,
680
+ "learning_rate": 4.686104162428497e-05,
681
+ "loss": 0.6164,
682
+ "step": 174
683
+ },
684
+ {
685
+ "epoch": 0.4427672955974843,
686
+ "grad_norm": 0.14216069877147675,
687
+ "learning_rate": 4.676611560278884e-05,
688
+ "loss": 0.6018,
689
+ "step": 176
690
+ },
691
+ {
692
+ "epoch": 0.4477987421383648,
693
+ "grad_norm": 0.12077496945858002,
694
+ "learning_rate": 4.66698865203321e-05,
695
+ "loss": 0.5919,
696
+ "step": 178
697
+ },
698
+ {
699
+ "epoch": 0.4528301886792453,
700
+ "grad_norm": 0.12382370233535767,
701
+ "learning_rate": 4.6572360885354905e-05,
702
+ "loss": 0.6304,
703
+ "step": 180
704
+ },
705
+ {
706
+ "epoch": 0.4528301886792453,
707
+ "eval_loss": 0.5917236804962158,
708
+ "eval_runtime": 416.6125,
709
+ "eval_samples_per_second": 25.698,
710
+ "eval_steps_per_second": 0.202,
711
+ "step": 180
712
+ },
713
+ {
714
+ "epoch": 0.4578616352201258,
715
+ "grad_norm": 0.11310654878616333,
716
+ "learning_rate": 4.647354529398957e-05,
717
+ "loss": 0.5984,
718
+ "step": 182
719
+ },
720
+ {
721
+ "epoch": 0.4628930817610063,
722
+ "grad_norm": 0.11125874519348145,
723
+ "learning_rate": 4.637344642961442e-05,
724
+ "loss": 0.5993,
725
+ "step": 184
726
+ },
727
+ {
728
+ "epoch": 0.4679245283018868,
729
+ "grad_norm": 0.10250482708215714,
730
+ "learning_rate": 4.627207106240176e-05,
731
+ "loss": 0.5958,
732
+ "step": 186
733
+ },
734
+ {
735
+ "epoch": 0.4729559748427673,
736
+ "grad_norm": 0.10660769045352936,
737
+ "learning_rate": 4.6169426048859994e-05,
738
+ "loss": 0.5602,
739
+ "step": 188
740
+ },
741
+ {
742
+ "epoch": 0.4779874213836478,
743
+ "grad_norm": 0.10431323200464249,
744
+ "learning_rate": 4.606551833136985e-05,
745
+ "loss": 0.5903,
746
+ "step": 190
747
+ },
748
+ {
749
+ "epoch": 0.4830188679245283,
750
+ "grad_norm": 0.11766020953655243,
751
+ "learning_rate": 4.596035493771488e-05,
752
+ "loss": 0.6004,
753
+ "step": 192
754
+ },
755
+ {
756
+ "epoch": 0.4880503144654088,
757
+ "grad_norm": 0.1166047751903534,
758
+ "learning_rate": 4.585394298060611e-05,
759
+ "loss": 0.5977,
760
+ "step": 194
761
+ },
762
+ {
763
+ "epoch": 0.4930817610062893,
764
+ "grad_norm": 0.1147083193063736,
765
+ "learning_rate": 4.574628965720097e-05,
766
+ "loss": 0.5897,
767
+ "step": 196
768
+ },
769
+ {
770
+ "epoch": 0.4981132075471698,
771
+ "grad_norm": 0.10733990371227264,
772
+ "learning_rate": 4.5637402248616506e-05,
773
+ "loss": 0.5978,
774
+ "step": 198
775
+ },
776
+ {
777
+ "epoch": 0.5031446540880503,
778
+ "grad_norm": 0.10964643210172653,
779
+ "learning_rate": 4.552728811943696e-05,
780
+ "loss": 0.602,
781
+ "step": 200
782
+ },
783
+ {
784
+ "epoch": 0.5031446540880503,
785
+ "eval_loss": 0.5892359018325806,
786
+ "eval_runtime": 416.8902,
787
+ "eval_samples_per_second": 25.681,
788
+ "eval_steps_per_second": 0.201,
789
+ "step": 200
790
+ },
791
+ {
792
+ "epoch": 0.5081761006289308,
793
+ "grad_norm": 0.11032500118017197,
794
+ "learning_rate": 4.54717733587572e-05,
795
+ "loss": 0.5817,
796
+ "step": 202
797
+ },
798
+ {
799
+ "epoch": 0.5132075471698113,
800
+ "grad_norm": 0.1092258021235466,
801
+ "learning_rate": 4.5359833138637734e-05,
802
+ "loss": 0.5982,
803
+ "step": 204
804
+ },
805
+ {
806
+ "epoch": 0.5182389937106918,
807
+ "grad_norm": 0.10684628039598465,
808
+ "learning_rate": 4.524668497127006e-05,
809
+ "loss": 0.5923,
810
+ "step": 206
811
+ },
812
+ {
813
+ "epoch": 0.5232704402515723,
814
+ "grad_norm": 0.11228887736797333,
815
+ "learning_rate": 4.513233650941422e-05,
816
+ "loss": 0.5742,
817
+ "step": 208
818
+ },
819
+ {
820
+ "epoch": 0.5283018867924528,
821
+ "grad_norm": 0.1152106523513794,
822
+ "learning_rate": 4.501679548701201e-05,
823
+ "loss": 0.5955,
824
+ "step": 210
825
+ },
826
+ {
827
+ "epoch": 0.5333333333333333,
828
+ "grad_norm": 0.11588042229413986,
829
+ "learning_rate": 4.490006971866385e-05,
830
+ "loss": 0.5936,
831
+ "step": 212
832
+ },
833
+ {
834
+ "epoch": 0.5383647798742138,
835
+ "grad_norm": 0.11394830793142319,
836
+ "learning_rate": 4.478216709910035e-05,
837
+ "loss": 0.5937,
838
+ "step": 214
839
+ },
840
+ {
841
+ "epoch": 0.5433962264150943,
842
+ "grad_norm": 0.11031538993120193,
843
+ "learning_rate": 4.466309560264822e-05,
844
+ "loss": 0.5973,
845
+ "step": 216
846
+ },
847
+ {
848
+ "epoch": 0.5484276729559748,
849
+ "grad_norm": 0.11161042749881744,
850
+ "learning_rate": 4.4542863282691014e-05,
851
+ "loss": 0.5701,
852
+ "step": 218
853
+ },
854
+ {
855
+ "epoch": 0.5534591194968553,
856
+ "grad_norm": 0.11783146113157272,
857
+ "learning_rate": 4.4421478271124426e-05,
858
+ "loss": 0.603,
859
+ "step": 220
860
+ },
861
+ {
862
+ "epoch": 0.5534591194968553,
863
+ "eval_loss": 0.5872239470481873,
864
+ "eval_runtime": 412.7815,
865
+ "eval_samples_per_second": 25.936,
866
+ "eval_steps_per_second": 0.203,
867
+ "step": 220
868
+ },
869
+ {
870
+ "epoch": 0.5584905660377358,
871
+ "grad_norm": 0.10747622698545456,
872
+ "learning_rate": 4.429894877780627e-05,
873
+ "loss": 0.6089,
874
+ "step": 222
875
+ },
876
+ {
877
+ "epoch": 0.5635220125786163,
878
+ "grad_norm": 0.1075616180896759,
879
+ "learning_rate": 4.4175283090001225e-05,
880
+ "loss": 0.6042,
881
+ "step": 224
882
+ },
883
+ {
884
+ "epoch": 0.5685534591194968,
885
+ "grad_norm": 0.11763885617256165,
886
+ "learning_rate": 4.4050489571820306e-05,
887
+ "loss": 0.5854,
888
+ "step": 226
889
+ },
890
+ {
891
+ "epoch": 0.5735849056603773,
892
+ "grad_norm": 0.1110839769244194,
893
+ "learning_rate": 4.392457666365519e-05,
894
+ "loss": 0.5731,
895
+ "step": 228
896
+ },
897
+ {
898
+ "epoch": 0.5786163522012578,
899
+ "grad_norm": 0.1211901530623436,
900
+ "learning_rate": 4.379755288160733e-05,
901
+ "loss": 0.5571,
902
+ "step": 230
903
+ },
904
+ {
905
+ "epoch": 0.5836477987421383,
906
+ "grad_norm": 0.10990186035633087,
907
+ "learning_rate": 4.3669426816911985e-05,
908
+ "loss": 0.5919,
909
+ "step": 232
910
+ },
911
+ {
912
+ "epoch": 0.5886792452830188,
913
+ "grad_norm": 0.1129634901881218,
914
+ "learning_rate": 4.354020713535711e-05,
915
+ "loss": 0.5853,
916
+ "step": 234
917
+ },
918
+ {
919
+ "epoch": 0.5937106918238994,
920
+ "grad_norm": 0.11691266298294067,
921
+ "learning_rate": 4.340990257669732e-05,
922
+ "loss": 0.5878,
923
+ "step": 236
924
+ },
925
+ {
926
+ "epoch": 0.5987421383647799,
927
+ "grad_norm": 0.11645273119211197,
928
+ "learning_rate": 4.327852195406271e-05,
929
+ "loss": 0.5946,
930
+ "step": 238
931
+ },
932
+ {
933
+ "epoch": 0.6037735849056604,
934
+ "grad_norm": 0.1155095249414444,
935
+ "learning_rate": 4.314607415336281e-05,
936
+ "loss": 0.6004,
937
+ "step": 240
938
+ },
939
+ {
940
+ "epoch": 0.6037735849056604,
941
+ "eval_loss": 0.5851526856422424,
942
+ "eval_runtime": 413.8117,
943
+ "eval_samples_per_second": 25.872,
944
+ "eval_steps_per_second": 0.203,
945
+ "step": 240
946
+ },
947
+ {
948
+ "epoch": 0.6088050314465409,
949
+ "grad_norm": 0.11004958301782608,
950
+ "learning_rate": 4.301256813268559e-05,
951
+ "loss": 0.5846,
952
+ "step": 242
953
+ },
954
+ {
955
+ "epoch": 0.6138364779874214,
956
+ "grad_norm": 0.10904593765735626,
957
+ "learning_rate": 4.287801292169159e-05,
958
+ "loss": 0.5871,
959
+ "step": 244
960
+ },
961
+ {
962
+ "epoch": 0.6188679245283019,
963
+ "grad_norm": 0.11857830733060837,
964
+ "learning_rate": 4.274241762100315e-05,
965
+ "loss": 0.5826,
966
+ "step": 246
967
+ },
968
+ {
969
+ "epoch": 0.6238993710691824,
970
+ "grad_norm": 0.11342642456293106,
971
+ "learning_rate": 4.260579140158898e-05,
972
+ "loss": 0.5807,
973
+ "step": 248
974
+ },
975
+ {
976
+ "epoch": 0.6289308176100629,
977
+ "grad_norm": 0.10923154652118683,
978
+ "learning_rate": 4.246814350414377e-05,
979
+ "loss": 0.5732,
980
+ "step": 250
981
+ },
982
+ {
983
+ "epoch": 0.6339622641509434,
984
+ "grad_norm": 0.11175252497196198,
985
+ "learning_rate": 4.2329483238463304e-05,
986
+ "loss": 0.5649,
987
+ "step": 252
988
+ },
989
+ {
990
+ "epoch": 0.6389937106918239,
991
+ "grad_norm": 0.10550232976675034,
992
+ "learning_rate": 4.218981998281471e-05,
993
+ "loss": 0.5853,
994
+ "step": 254
995
+ },
996
+ {
997
+ "epoch": 0.6440251572327044,
998
+ "grad_norm": 0.10999017208814621,
999
+ "learning_rate": 4.204916318330225e-05,
1000
+ "loss": 0.5864,
1001
+ "step": 256
1002
+ },
1003
+ {
1004
+ "epoch": 0.6490566037735849,
1005
+ "grad_norm": 0.13525208830833435,
1006
+ "learning_rate": 4.190752235322832e-05,
1007
+ "loss": 0.5842,
1008
+ "step": 258
1009
+ },
1010
+ {
1011
+ "epoch": 0.6540880503144654,
1012
+ "grad_norm": 0.11547064781188965,
1013
+ "learning_rate": 4.176490707245011e-05,
1014
+ "loss": 0.5891,
1015
+ "step": 260
1016
+ },
1017
+ {
1018
+ "epoch": 0.6540880503144654,
1019
+ "eval_loss": 0.5836161971092224,
1020
+ "eval_runtime": 414.2177,
1021
+ "eval_samples_per_second": 25.846,
1022
+ "eval_steps_per_second": 0.203,
1023
+ "step": 260
1024
+ },
1025
+ {
1026
+ "epoch": 0.6591194968553459,
1027
+ "grad_norm": 0.10233239829540253,
1028
+ "learning_rate": 4.162132698673167e-05,
1029
+ "loss": 0.5708,
1030
+ "step": 262
1031
+ },
1032
+ {
1033
+ "epoch": 0.6641509433962264,
1034
+ "grad_norm": 0.11854418367147446,
1035
+ "learning_rate": 4.1476791807091445e-05,
1036
+ "loss": 0.6074,
1037
+ "step": 264
1038
+ },
1039
+ {
1040
+ "epoch": 0.6691823899371069,
1041
+ "grad_norm": 0.13011477887630463,
1042
+ "learning_rate": 4.133131130914555e-05,
1043
+ "loss": 0.597,
1044
+ "step": 266
1045
+ },
1046
+ {
1047
+ "epoch": 0.6742138364779874,
1048
+ "grad_norm": 0.11256586760282516,
1049
+ "learning_rate": 4.118489533244655e-05,
1050
+ "loss": 0.5895,
1051
+ "step": 268
1052
+ },
1053
+ {
1054
+ "epoch": 0.6792452830188679,
1055
+ "grad_norm": 0.10651887208223343,
1056
+ "learning_rate": 4.1037553779818016e-05,
1057
+ "loss": 0.5934,
1058
+ "step": 270
1059
+ },
1060
+ {
1061
+ "epoch": 0.6842767295597484,
1062
+ "grad_norm": 0.11151424050331116,
1063
+ "learning_rate": 4.088929661668468e-05,
1064
+ "loss": 0.6028,
1065
+ "step": 272
1066
+ },
1067
+ {
1068
+ "epoch": 0.6893081761006289,
1069
+ "grad_norm": 0.10524857044219971,
1070
+ "learning_rate": 4.0740133870398456e-05,
1071
+ "loss": 0.608,
1072
+ "step": 274
1073
+ },
1074
+ {
1075
+ "epoch": 0.6943396226415094,
1076
+ "grad_norm": 0.12080366164445877,
1077
+ "learning_rate": 4.059007562956027e-05,
1078
+ "loss": 0.6175,
1079
+ "step": 276
1080
+ },
1081
+ {
1082
+ "epoch": 0.6993710691823899,
1083
+ "grad_norm": 0.1079036071896553,
1084
+ "learning_rate": 4.0439132043337666e-05,
1085
+ "loss": 0.5938,
1086
+ "step": 278
1087
+ },
1088
+ {
1089
+ "epoch": 0.7044025157232704,
1090
+ "grad_norm": 0.11142345517873764,
1091
+ "learning_rate": 4.028731332077843e-05,
1092
+ "loss": 0.5752,
1093
+ "step": 280
1094
+ },
1095
+ {
1096
+ "epoch": 0.7044025157232704,
1097
+ "eval_loss": 0.5814208984375,
1098
+ "eval_runtime": 413.672,
1099
+ "eval_samples_per_second": 25.88,
1100
+ "eval_steps_per_second": 0.203,
1101
+ "step": 280
1102
+ },
1103
+ {
1104
+ "epoch": 0.7094339622641509,
1105
+ "grad_norm": 0.11195844411849976,
1106
+ "learning_rate": 4.0134629730120045e-05,
1107
+ "loss": 0.583,
1108
+ "step": 282
1109
+ },
1110
+ {
1111
+ "epoch": 0.7144654088050314,
1112
+ "grad_norm": 0.10488727688789368,
1113
+ "learning_rate": 3.9981091598095213e-05,
1114
+ "loss": 0.5593,
1115
+ "step": 284
1116
+ },
1117
+ {
1118
+ "epoch": 0.7194968553459119,
1119
+ "grad_norm": 0.10145172476768494,
1120
+ "learning_rate": 3.9826709309233454e-05,
1121
+ "loss": 0.5839,
1122
+ "step": 286
1123
+ },
1124
+ {
1125
+ "epoch": 0.7245283018867924,
1126
+ "grad_norm": 0.1036810651421547,
1127
+ "learning_rate": 3.967149330515867e-05,
1128
+ "loss": 0.5796,
1129
+ "step": 288
1130
+ },
1131
+ {
1132
+ "epoch": 0.7295597484276729,
1133
+ "grad_norm": 0.1167021244764328,
1134
+ "learning_rate": 3.951545408388301e-05,
1135
+ "loss": 0.6006,
1136
+ "step": 290
1137
+ },
1138
+ {
1139
+ "epoch": 0.7345911949685534,
1140
+ "grad_norm": 0.10975436121225357,
1141
+ "learning_rate": 3.935860219909679e-05,
1142
+ "loss": 0.5802,
1143
+ "step": 292
1144
+ },
1145
+ {
1146
+ "epoch": 0.7396226415094339,
1147
+ "grad_norm": 0.11112164705991745,
1148
+ "learning_rate": 3.920094825945468e-05,
1149
+ "loss": 0.5851,
1150
+ "step": 294
1151
+ },
1152
+ {
1153
+ "epoch": 0.7446540880503144,
1154
+ "grad_norm": 0.10697871446609497,
1155
+ "learning_rate": 3.904250292785825e-05,
1156
+ "loss": 0.5855,
1157
+ "step": 296
1158
+ },
1159
+ {
1160
+ "epoch": 0.7496855345911949,
1161
+ "grad_norm": 0.10061946511268616,
1162
+ "learning_rate": 3.8883276920734736e-05,
1163
+ "loss": 0.5941,
1164
+ "step": 298
1165
+ },
1166
+ {
1167
+ "epoch": 0.7547169811320755,
1168
+ "grad_norm": 0.102944515645504,
1169
+ "learning_rate": 3.8723281007312256e-05,
1170
+ "loss": 0.5732,
1171
+ "step": 300
1172
+ },
1173
+ {
1174
+ "epoch": 0.7547169811320755,
1175
+ "eval_loss": 0.5801501870155334,
1176
+ "eval_runtime": 412.1928,
1177
+ "eval_samples_per_second": 25.973,
1178
+ "eval_steps_per_second": 0.204,
1179
+ "step": 300
1180
+ },
1181
+ {
1182
+ "epoch": 0.759748427672956,
1183
+ "grad_norm": 0.10275246948003769,
1184
+ "learning_rate": 3.856252600889143e-05,
1185
+ "loss": 0.5803,
1186
+ "step": 302
1187
+ },
1188
+ {
1189
+ "epoch": 0.7647798742138365,
1190
+ "grad_norm": 0.10686468333005905,
1191
+ "learning_rate": 3.840102279811345e-05,
1192
+ "loss": 0.585,
1193
+ "step": 304
1194
+ },
1195
+ {
1196
+ "epoch": 0.769811320754717,
1197
+ "grad_norm": 0.11284485459327698,
1198
+ "learning_rate": 3.82387822982248e-05,
1199
+ "loss": 0.5937,
1200
+ "step": 306
1201
+ },
1202
+ {
1203
+ "epoch": 0.7748427672955975,
1204
+ "grad_norm": 0.10271965712308884,
1205
+ "learning_rate": 3.807581548233837e-05,
1206
+ "loss": 0.5777,
1207
+ "step": 308
1208
+ },
1209
+ {
1210
+ "epoch": 0.779874213836478,
1211
+ "grad_norm": 0.10338881611824036,
1212
+ "learning_rate": 3.791213337269134e-05,
1213
+ "loss": 0.5888,
1214
+ "step": 310
1215
+ },
1216
+ {
1217
+ "epoch": 0.7849056603773585,
1218
+ "grad_norm": 0.11098296195268631,
1219
+ "learning_rate": 3.7747747039899676e-05,
1220
+ "loss": 0.5764,
1221
+ "step": 312
1222
+ },
1223
+ {
1224
+ "epoch": 0.789937106918239,
1225
+ "grad_norm": 0.10431266576051712,
1226
+ "learning_rate": 3.758266760220937e-05,
1227
+ "loss": 0.5985,
1228
+ "step": 314
1229
+ },
1230
+ {
1231
+ "epoch": 0.7949685534591195,
1232
+ "grad_norm": 0.10191661864519119,
1233
+ "learning_rate": 3.741690622474449e-05,
1234
+ "loss": 0.5626,
1235
+ "step": 316
1236
+ },
1237
+ {
1238
+ "epoch": 0.8,
1239
+ "grad_norm": 0.11264406889677048,
1240
+ "learning_rate": 3.7250474118751974e-05,
1241
+ "loss": 0.6094,
1242
+ "step": 318
1243
+ },
1244
+ {
1245
+ "epoch": 0.8050314465408805,
1246
+ "grad_norm": 0.10798995941877365,
1247
+ "learning_rate": 3.708338254084339e-05,
1248
+ "loss": 0.5714,
1249
+ "step": 320
1250
+ },
1251
+ {
1252
+ "epoch": 0.8050314465408805,
1253
+ "eval_loss": 0.5783895254135132,
1254
+ "eval_runtime": 411.3396,
1255
+ "eval_samples_per_second": 26.027,
1256
+ "eval_steps_per_second": 0.204,
1257
+ "step": 320
1258
+ },
1259
+ {
1260
+ "epoch": 0.810062893081761,
1261
+ "grad_norm": 0.10546964406967163,
1262
+ "learning_rate": 3.69156427922336e-05,
1263
+ "loss": 0.5682,
1264
+ "step": 322
1265
+ },
1266
+ {
1267
+ "epoch": 0.8150943396226416,
1268
+ "grad_norm": 0.11086619645357132,
1269
+ "learning_rate": 3.6747266217976414e-05,
1270
+ "loss": 0.5682,
1271
+ "step": 324
1272
+ },
1273
+ {
1274
+ "epoch": 0.820125786163522,
1275
+ "grad_norm": 0.11166223883628845,
1276
+ "learning_rate": 3.6578264206197245e-05,
1277
+ "loss": 0.5798,
1278
+ "step": 326
1279
+ },
1280
+ {
1281
+ "epoch": 0.8251572327044026,
1282
+ "grad_norm": 0.1012871265411377,
1283
+ "learning_rate": 3.6408648187322854e-05,
1284
+ "loss": 0.5694,
1285
+ "step": 328
1286
+ },
1287
+ {
1288
+ "epoch": 0.8301886792452831,
1289
+ "grad_norm": 0.11129719018936157,
1290
+ "learning_rate": 3.623842963330832e-05,
1291
+ "loss": 0.5827,
1292
+ "step": 330
1293
+ },
1294
+ {
1295
+ "epoch": 0.8352201257861636,
1296
+ "grad_norm": 0.10445208847522736,
1297
+ "learning_rate": 3.6067620056861086e-05,
1298
+ "loss": 0.5684,
1299
+ "step": 332
1300
+ },
1301
+ {
1302
+ "epoch": 0.8402515723270441,
1303
+ "grad_norm": 0.09836594015359879,
1304
+ "learning_rate": 3.589623101066232e-05,
1305
+ "loss": 0.5755,
1306
+ "step": 334
1307
+ },
1308
+ {
1309
+ "epoch": 0.8452830188679246,
1310
+ "grad_norm": 0.10354923456907272,
1311
+ "learning_rate": 3.572427408658552e-05,
1312
+ "loss": 0.5782,
1313
+ "step": 336
1314
+ },
1315
+ {
1316
+ "epoch": 0.8503144654088051,
1317
+ "grad_norm": 0.10813874751329422,
1318
+ "learning_rate": 3.5551760914912546e-05,
1319
+ "loss": 0.5939,
1320
+ "step": 338
1321
+ },
1322
+ {
1323
+ "epoch": 0.8553459119496856,
1324
+ "grad_norm": 0.10268381237983704,
1325
+ "learning_rate": 3.537870316354699e-05,
1326
+ "loss": 0.5697,
1327
+ "step": 340
1328
+ },
1329
+ {
1330
+ "epoch": 0.8553459119496856,
1331
+ "eval_loss": 0.5766698122024536,
1332
+ "eval_runtime": 412.4072,
1333
+ "eval_samples_per_second": 25.96,
1334
+ "eval_steps_per_second": 0.204,
1335
+ "step": 340
1336
+ },
1337
+ {
1338
+ "epoch": 0.8603773584905661,
1339
+ "grad_norm": 0.10261879861354828,
1340
+ "learning_rate": 3.5205112537224974e-05,
1341
+ "loss": 0.5694,
1342
+ "step": 342
1343
+ },
1344
+ {
1345
+ "epoch": 0.8654088050314466,
1346
+ "grad_norm": 0.10596223920583725,
1347
+ "learning_rate": 3.50310007767236e-05,
1348
+ "loss": 0.6091,
1349
+ "step": 344
1350
+ },
1351
+ {
1352
+ "epoch": 0.8704402515723271,
1353
+ "grad_norm": 0.09846257418394089,
1354
+ "learning_rate": 3.485637965806674e-05,
1355
+ "loss": 0.5808,
1356
+ "step": 346
1357
+ },
1358
+ {
1359
+ "epoch": 0.8754716981132076,
1360
+ "grad_norm": 0.10605087131261826,
1361
+ "learning_rate": 3.4681260991728685e-05,
1362
+ "loss": 0.5737,
1363
+ "step": 348
1364
+ },
1365
+ {
1366
+ "epoch": 0.8805031446540881,
1367
+ "grad_norm": 0.11082804203033447,
1368
+ "learning_rate": 3.450565662183527e-05,
1369
+ "loss": 0.5826,
1370
+ "step": 350
1371
+ },
1372
+ {
1373
+ "epoch": 0.8855345911949686,
1374
+ "grad_norm": 0.10169358551502228,
1375
+ "learning_rate": 3.432957842536282e-05,
1376
+ "loss": 0.5724,
1377
+ "step": 352
1378
+ },
1379
+ {
1380
+ "epoch": 0.8905660377358491,
1381
+ "grad_norm": 0.10837408900260925,
1382
+ "learning_rate": 3.415303831133485e-05,
1383
+ "loss": 0.5736,
1384
+ "step": 354
1385
+ },
1386
+ {
1387
+ "epoch": 0.8955974842767296,
1388
+ "grad_norm": 0.11302381008863449,
1389
+ "learning_rate": 3.3976048220016604e-05,
1390
+ "loss": 0.589,
1391
+ "step": 356
1392
+ },
1393
+ {
1394
+ "epoch": 0.9006289308176101,
1395
+ "grad_norm": 0.10759381949901581,
1396
+ "learning_rate": 3.37986201221075e-05,
1397
+ "loss": 0.5769,
1398
+ "step": 358
1399
+ },
1400
+ {
1401
+ "epoch": 0.9056603773584906,
1402
+ "grad_norm": 0.11644274741411209,
1403
+ "learning_rate": 3.362076601793142e-05,
1404
+ "loss": 0.5603,
1405
+ "step": 360
1406
+ },
1407
+ {
1408
+ "epoch": 0.9056603773584906,
1409
+ "eval_loss": 0.5751825571060181,
1410
+ "eval_runtime": 411.2176,
1411
+ "eval_samples_per_second": 26.035,
1412
+ "eval_steps_per_second": 0.204,
1413
+ "step": 360
1414
+ },
1415
+ {
1416
+ "epoch": 0.9106918238993711,
1417
+ "grad_norm": 0.10910682380199432,
1418
+ "learning_rate": 3.344249793662514e-05,
1419
+ "loss": 0.5632,
1420
+ "step": 362
1421
+ },
1422
+ {
1423
+ "epoch": 0.9157232704402516,
1424
+ "grad_norm": 0.10915949195623398,
1425
+ "learning_rate": 3.326382793532476e-05,
1426
+ "loss": 0.5903,
1427
+ "step": 364
1428
+ },
1429
+ {
1430
+ "epoch": 0.9207547169811321,
1431
+ "grad_norm": 0.11586213856935501,
1432
+ "learning_rate": 3.308476809835013e-05,
1433
+ "loss": 0.594,
1434
+ "step": 366
1435
+ },
1436
+ {
1437
+ "epoch": 0.9257861635220126,
1438
+ "grad_norm": 0.11010655015707016,
1439
+ "learning_rate": 3.290533053638759e-05,
1440
+ "loss": 0.5723,
1441
+ "step": 368
1442
+ },
1443
+ {
1444
+ "epoch": 0.9308176100628931,
1445
+ "grad_norm": 0.11362364888191223,
1446
+ "learning_rate": 3.272552738567086e-05,
1447
+ "loss": 0.5825,
1448
+ "step": 370
1449
+ },
1450
+ {
1451
+ "epoch": 0.9358490566037736,
1452
+ "grad_norm": 0.10686285048723221,
1453
+ "learning_rate": 3.254537080716021e-05,
1454
+ "loss": 0.586,
1455
+ "step": 372
1456
+ },
1457
+ {
1458
+ "epoch": 0.9408805031446541,
1459
+ "grad_norm": 0.11016613245010376,
1460
+ "learning_rate": 3.236487298571996e-05,
1461
+ "loss": 0.5813,
1462
+ "step": 374
1463
+ },
1464
+ {
1465
+ "epoch": 0.9459119496855346,
1466
+ "grad_norm": 0.10999295115470886,
1467
+ "learning_rate": 3.2184046129294295e-05,
1468
+ "loss": 0.5716,
1469
+ "step": 376
1470
+ },
1471
+ {
1472
+ "epoch": 0.9509433962264151,
1473
+ "grad_norm": 0.1100740134716034,
1474
+ "learning_rate": 3.20029024680817e-05,
1475
+ "loss": 0.5424,
1476
+ "step": 378
1477
+ },
1478
+ {
1479
+ "epoch": 0.9559748427672956,
1480
+ "grad_norm": 0.105403371155262,
1481
+ "learning_rate": 3.1821454253707646e-05,
1482
+ "loss": 0.5963,
1483
+ "step": 380
1484
+ },
1485
+ {
1486
+ "epoch": 0.9559748427672956,
1487
+ "eval_loss": 0.5737633109092712,
1488
+ "eval_runtime": 412.702,
1489
+ "eval_samples_per_second": 25.941,
1490
+ "eval_steps_per_second": 0.204,
1491
+ "step": 380
1492
+ },
1493
+ {
1494
+ "epoch": 0.9610062893081761,
1495
+ "grad_norm": 0.10489460825920105,
1496
+ "learning_rate": 3.1639713758396055e-05,
1497
+ "loss": 0.567,
1498
+ "step": 382
1499
+ },
1500
+ {
1501
+ "epoch": 0.9660377358490566,
1502
+ "grad_norm": 0.10349351167678833,
1503
+ "learning_rate": 3.145769327413922e-05,
1504
+ "loss": 0.5721,
1505
+ "step": 384
1506
+ },
1507
+ {
1508
+ "epoch": 0.9710691823899371,
1509
+ "grad_norm": 0.10395889729261398,
1510
+ "learning_rate": 3.127540511186643e-05,
1511
+ "loss": 0.569,
1512
+ "step": 386
1513
+ },
1514
+ {
1515
+ "epoch": 0.9761006289308176,
1516
+ "grad_norm": 0.10146255046129227,
1517
+ "learning_rate": 3.109286160061136e-05,
1518
+ "loss": 0.5699,
1519
+ "step": 388
1520
+ },
1521
+ {
1522
+ "epoch": 0.9811320754716981,
1523
+ "grad_norm": 0.10547154396772385,
1524
+ "learning_rate": 3.091007508667814e-05,
1525
+ "loss": 0.5686,
1526
+ "step": 390
1527
+ },
1528
+ {
1529
+ "epoch": 0.9861635220125786,
1530
+ "grad_norm": 0.10124453902244568,
1531
+ "learning_rate": 3.072705793280642e-05,
1532
+ "loss": 0.5983,
1533
+ "step": 392
1534
+ },
1535
+ {
1536
+ "epoch": 0.9911949685534591,
1537
+ "grad_norm": 0.10909611731767654,
1538
+ "learning_rate": 3.054382251733507e-05,
1539
+ "loss": 0.5881,
1540
+ "step": 394
1541
+ },
1542
+ {
1543
+ "epoch": 0.9962264150943396,
1544
+ "grad_norm": 0.10168620944023132,
1545
+ "learning_rate": 3.0360381233365105e-05,
1546
+ "loss": 0.5978,
1547
+ "step": 396
1548
+ },
1549
+ {
1550
+ "epoch": 1.0012578616352201,
1551
+ "grad_norm": 0.1318611353635788,
1552
+ "learning_rate": 3.0176746487921404e-05,
1553
+ "loss": 0.5694,
1554
+ "step": 398
1555
+ },
1556
+ {
1557
+ "epoch": 1.0062893081761006,
1558
+ "grad_norm": 0.12533968687057495,
1559
+ "learning_rate": 2.9992930701113586e-05,
1560
+ "loss": 0.5082,
1561
+ "step": 400
1562
+ },
1563
+ {
1564
+ "epoch": 1.0062893081761006,
1565
+ "eval_loss": 0.5776596665382385,
1566
+ "eval_runtime": 410.8135,
1567
+ "eval_samples_per_second": 26.06,
1568
+ "eval_steps_per_second": 0.204,
1569
+ "step": 400
1570
+ },
1571
+ {
1572
+ "epoch": 1.0113207547169811,
1573
+ "grad_norm": 0.12798835337162018,
1574
+ "learning_rate": 2.9808946305295988e-05,
1575
+ "loss": 0.4912,
1576
+ "step": 402
1577
+ },
1578
+ {
1579
+ "epoch": 1.0163522012578616,
1580
+ "grad_norm": 0.13782812654972076,
1581
+ "learning_rate": 2.962480574422678e-05,
1582
+ "loss": 0.5288,
1583
+ "step": 404
1584
+ },
1585
+ {
1586
+ "epoch": 1.0213836477987421,
1587
+ "grad_norm": 0.11341769248247147,
1588
+ "learning_rate": 2.9440521472226368e-05,
1589
+ "loss": 0.5032,
1590
+ "step": 406
1591
+ },
1592
+ {
1593
+ "epoch": 1.0264150943396226,
1594
+ "grad_norm": 0.1318856179714203,
1595
+ "learning_rate": 2.9256105953334982e-05,
1596
+ "loss": 0.5038,
1597
+ "step": 408
1598
+ },
1599
+ {
1600
+ "epoch": 1.0314465408805031,
1601
+ "grad_norm": 0.11981856822967529,
1602
+ "learning_rate": 2.9071571660469775e-05,
1603
+ "loss": 0.4965,
1604
+ "step": 410
1605
+ },
1606
+ {
1607
+ "epoch": 1.0364779874213836,
1608
+ "grad_norm": 0.11475057154893875,
1609
+ "learning_rate": 2.888693107458111e-05,
1610
+ "loss": 0.4912,
1611
+ "step": 412
1612
+ },
1613
+ {
1614
+ "epoch": 1.0415094339622641,
1615
+ "grad_norm": 0.13232818245887756,
1616
+ "learning_rate": 2.8702196683808496e-05,
1617
+ "loss": 0.5065,
1618
+ "step": 414
1619
+ },
1620
+ {
1621
+ "epoch": 1.0465408805031446,
1622
+ "grad_norm": 0.1348014920949936,
1623
+ "learning_rate": 2.8517380982635906e-05,
1624
+ "loss": 0.5293,
1625
+ "step": 416
1626
+ },
1627
+ {
1628
+ "epoch": 1.0515723270440251,
1629
+ "grad_norm": 0.11755809187889099,
1630
+ "learning_rate": 2.8332496471046737e-05,
1631
+ "loss": 0.486,
1632
+ "step": 418
1633
+ },
1634
+ {
1635
+ "epoch": 1.0566037735849056,
1636
+ "grad_norm": 0.1252366006374359,
1637
+ "learning_rate": 2.8147555653678353e-05,
1638
+ "loss": 0.4975,
1639
+ "step": 420
1640
+ },
1641
+ {
1642
+ "epoch": 1.0566037735849056,
1643
+ "eval_loss": 0.5800932049751282,
1644
+ "eval_runtime": 411.2332,
1645
+ "eval_samples_per_second": 26.034,
1646
+ "eval_steps_per_second": 0.204,
1647
+ "step": 420
1648
+ },
1649
+ {
1650
+ "epoch": 1.0616352201257861,
1651
+ "grad_norm": 0.11638514697551727,
1652
+ "learning_rate": 2.7962571038976376e-05,
1653
+ "loss": 0.5021,
1654
+ "step": 422
1655
+ },
1656
+ {
1657
+ "epoch": 1.0666666666666667,
1658
+ "grad_norm": 0.11196921765804291,
1659
+ "learning_rate": 2.777755513834865e-05,
1660
+ "loss": 0.5081,
1661
+ "step": 424
1662
+ },
1663
+ {
1664
+ "epoch": 1.0716981132075472,
1665
+ "grad_norm": 0.12703673541545868,
1666
+ "learning_rate": 2.7592520465319012e-05,
1667
+ "loss": 0.5186,
1668
+ "step": 426
1669
+ },
1670
+ {
1671
+ "epoch": 1.0767295597484277,
1672
+ "grad_norm": 0.12006295472383499,
1673
+ "learning_rate": 2.7407479534680997e-05,
1674
+ "loss": 0.5123,
1675
+ "step": 428
1676
+ },
1677
+ {
1678
+ "epoch": 1.0817610062893082,
1679
+ "grad_norm": 0.11968886107206345,
1680
+ "learning_rate": 2.722244486165136e-05,
1681
+ "loss": 0.5135,
1682
+ "step": 430
1683
+ },
1684
+ {
1685
+ "epoch": 1.0867924528301887,
1686
+ "grad_norm": 0.11439641565084457,
1687
+ "learning_rate": 2.7037428961023632e-05,
1688
+ "loss": 0.4964,
1689
+ "step": 432
1690
+ },
1691
+ {
1692
+ "epoch": 1.0918238993710692,
1693
+ "grad_norm": 0.12513814866542816,
1694
+ "learning_rate": 2.685244434632166e-05,
1695
+ "loss": 0.5103,
1696
+ "step": 434
1697
+ },
1698
+ {
1699
+ "epoch": 1.0968553459119497,
1700
+ "grad_norm": 0.11105407029390335,
1701
+ "learning_rate": 2.6667503528953275e-05,
1702
+ "loss": 0.4915,
1703
+ "step": 436
1704
+ },
1705
+ {
1706
+ "epoch": 1.1018867924528302,
1707
+ "grad_norm": 0.1270296275615692,
1708
+ "learning_rate": 2.6482619017364096e-05,
1709
+ "loss": 0.5197,
1710
+ "step": 438
1711
+ },
1712
+ {
1713
+ "epoch": 1.1069182389937107,
1714
+ "grad_norm": 0.12292016297578812,
1715
+ "learning_rate": 2.629780331619151e-05,
1716
+ "loss": 0.5131,
1717
+ "step": 440
1718
+ },
1719
+ {
1720
+ "epoch": 1.1069182389937107,
1721
+ "eval_loss": 0.5787190198898315,
1722
+ "eval_runtime": 410.8629,
1723
+ "eval_samples_per_second": 26.057,
1724
+ "eval_steps_per_second": 0.204,
1725
+ "step": 440
1726
+ },
1727
+ {
1728
+ "epoch": 1.1119496855345912,
1729
+ "grad_norm": 0.10833011567592621,
1730
+ "learning_rate": 2.6113068925418892e-05,
1731
+ "loss": 0.4747,
1732
+ "step": 442
1733
+ },
1734
+ {
1735
+ "epoch": 1.1169811320754717,
1736
+ "grad_norm": 0.11480649560689926,
1737
+ "learning_rate": 2.592842833953023e-05,
1738
+ "loss": 0.5033,
1739
+ "step": 444
1740
+ },
1741
+ {
1742
+ "epoch": 1.1220125786163522,
1743
+ "grad_norm": 0.11621647328138351,
1744
+ "learning_rate": 2.5743894046665013e-05,
1745
+ "loss": 0.5041,
1746
+ "step": 446
1747
+ },
1748
+ {
1749
+ "epoch": 1.1270440251572327,
1750
+ "grad_norm": 0.11848396062850952,
1751
+ "learning_rate": 2.555947852777364e-05,
1752
+ "loss": 0.4868,
1753
+ "step": 448
1754
+ },
1755
+ {
1756
+ "epoch": 1.1320754716981132,
1757
+ "grad_norm": 0.11532817780971527,
1758
+ "learning_rate": 2.537519425577322e-05,
1759
+ "loss": 0.485,
1760
+ "step": 450
1761
+ },
1762
+ {
1763
+ "epoch": 1.1371069182389937,
1764
+ "grad_norm": 0.11470374464988708,
1765
+ "learning_rate": 2.519105369470402e-05,
1766
+ "loss": 0.5009,
1767
+ "step": 452
1768
+ },
1769
+ {
1770
+ "epoch": 1.1421383647798742,
1771
+ "grad_norm": 0.11918400973081589,
1772
+ "learning_rate": 2.5007069298886416e-05,
1773
+ "loss": 0.492,
1774
+ "step": 454
1775
+ },
1776
+ {
1777
+ "epoch": 1.1471698113207547,
1778
+ "grad_norm": 0.11399897933006287,
1779
+ "learning_rate": 2.4823253512078605e-05,
1780
+ "loss": 0.5227,
1781
+ "step": 456
1782
+ },
1783
+ {
1784
+ "epoch": 1.1522012578616352,
1785
+ "grad_norm": 0.1135721504688263,
1786
+ "learning_rate": 2.4639618766634904e-05,
1787
+ "loss": 0.4948,
1788
+ "step": 458
1789
+ },
1790
+ {
1791
+ "epoch": 1.1572327044025157,
1792
+ "grad_norm": 0.11768464744091034,
1793
+ "learning_rate": 2.4456177482664932e-05,
1794
+ "loss": 0.5069,
1795
+ "step": 460
1796
+ },
1797
+ {
1798
+ "epoch": 1.1572327044025157,
1799
+ "eval_loss": 0.5783973932266235,
1800
+ "eval_runtime": 410.9352,
1801
+ "eval_samples_per_second": 26.053,
1802
+ "eval_steps_per_second": 0.204,
1803
+ "step": 460
1804
+ },
1805
+ {
1806
+ "epoch": 1.1622641509433962,
1807
+ "grad_norm": 0.11413212865591049,
1808
+ "learning_rate": 2.4272942067193593e-05,
1809
+ "loss": 0.4919,
1810
+ "step": 462
1811
+ },
1812
+ {
1813
+ "epoch": 1.1672955974842767,
1814
+ "grad_norm": 0.11099102348089218,
1815
+ "learning_rate": 2.4089924913321854e-05,
1816
+ "loss": 0.5131,
1817
+ "step": 464
1818
+ },
1819
+ {
1820
+ "epoch": 1.1723270440251572,
1821
+ "grad_norm": 0.12233982235193253,
1822
+ "learning_rate": 2.3907138399388656e-05,
1823
+ "loss": 0.5152,
1824
+ "step": 466
1825
+ },
1826
+ {
1827
+ "epoch": 1.1773584905660377,
1828
+ "grad_norm": 0.109150230884552,
1829
+ "learning_rate": 2.3724594888133578e-05,
1830
+ "loss": 0.4942,
1831
+ "step": 468
1832
+ },
1833
+ {
1834
+ "epoch": 1.1823899371069182,
1835
+ "grad_norm": 0.11547227948904037,
1836
+ "learning_rate": 2.354230672586079e-05,
1837
+ "loss": 0.5087,
1838
+ "step": 470
1839
+ },
1840
+ {
1841
+ "epoch": 1.1874213836477987,
1842
+ "grad_norm": 0.11096334457397461,
1843
+ "learning_rate": 2.3360286241603947e-05,
1844
+ "loss": 0.528,
1845
+ "step": 472
1846
+ },
1847
+ {
1848
+ "epoch": 1.1924528301886792,
1849
+ "grad_norm": 0.11583118885755539,
1850
+ "learning_rate": 2.3178545746292363e-05,
1851
+ "loss": 0.5053,
1852
+ "step": 474
1853
+ },
1854
+ {
1855
+ "epoch": 1.1974842767295597,
1856
+ "grad_norm": 0.12394651025533676,
1857
+ "learning_rate": 2.299709753191831e-05,
1858
+ "loss": 0.519,
1859
+ "step": 476
1860
+ },
1861
+ {
1862
+ "epoch": 1.2025157232704402,
1863
+ "grad_norm": 0.10962869971990585,
1864
+ "learning_rate": 2.281595387070571e-05,
1865
+ "loss": 0.5119,
1866
+ "step": 478
1867
+ },
1868
+ {
1869
+ "epoch": 1.2075471698113207,
1870
+ "grad_norm": 0.11548969149589539,
1871
+ "learning_rate": 2.263512701428005e-05,
1872
+ "loss": 0.5053,
1873
+ "step": 480
1874
+ },
1875
+ {
1876
+ "epoch": 1.2075471698113207,
1877
+ "eval_loss": 0.5777500867843628,
1878
+ "eval_runtime": 410.7907,
1879
+ "eval_samples_per_second": 26.062,
1880
+ "eval_steps_per_second": 0.204,
1881
+ "step": 480
1882
+ },
1883
+ {
1884
+ "epoch": 1.2125786163522012,
1885
+ "grad_norm": 0.11964194476604462,
1886
+ "learning_rate": 2.2454629192839782e-05,
1887
+ "loss": 0.5067,
1888
+ "step": 482
1889
+ },
1890
+ {
1891
+ "epoch": 1.2176100628930817,
1892
+ "grad_norm": 0.11280932277441025,
1893
+ "learning_rate": 2.2274472614329146e-05,
1894
+ "loss": 0.5097,
1895
+ "step": 484
1896
+ },
1897
+ {
1898
+ "epoch": 1.2226415094339622,
1899
+ "grad_norm": 0.11903873831033707,
1900
+ "learning_rate": 2.2094669463612417e-05,
1901
+ "loss": 0.4973,
1902
+ "step": 486
1903
+ },
1904
+ {
1905
+ "epoch": 1.2276729559748427,
1906
+ "grad_norm": 0.11274790018796921,
1907
+ "learning_rate": 2.191523190164988e-05,
1908
+ "loss": 0.4802,
1909
+ "step": 488
1910
+ },
1911
+ {
1912
+ "epoch": 1.2327044025157232,
1913
+ "grad_norm": 0.12155181169509888,
1914
+ "learning_rate": 2.1736172064675242e-05,
1915
+ "loss": 0.5039,
1916
+ "step": 490
1917
+ },
1918
+ {
1919
+ "epoch": 1.2377358490566037,
1920
+ "grad_norm": 0.1149788573384285,
1921
+ "learning_rate": 2.1557502063374863e-05,
1922
+ "loss": 0.5018,
1923
+ "step": 492
1924
+ },
1925
+ {
1926
+ "epoch": 1.2427672955974842,
1927
+ "grad_norm": 0.11619096249341965,
1928
+ "learning_rate": 2.1379233982068597e-05,
1929
+ "loss": 0.5001,
1930
+ "step": 494
1931
+ },
1932
+ {
1933
+ "epoch": 1.2477987421383647,
1934
+ "grad_norm": 0.11911306530237198,
1935
+ "learning_rate": 2.120137987789252e-05,
1936
+ "loss": 0.5257,
1937
+ "step": 496
1938
+ },
1939
+ {
1940
+ "epoch": 1.2528301886792452,
1941
+ "grad_norm": 0.11577396094799042,
1942
+ "learning_rate": 2.1023951779983408e-05,
1943
+ "loss": 0.5156,
1944
+ "step": 498
1945
+ },
1946
+ {
1947
+ "epoch": 1.2578616352201257,
1948
+ "grad_norm": 0.11911614239215851,
1949
+ "learning_rate": 2.0846961688665158e-05,
1950
+ "loss": 0.5189,
1951
+ "step": 500
1952
+ },
1953
+ {
1954
+ "epoch": 1.2578616352201257,
1955
+ "eval_loss": 0.5771186947822571,
1956
+ "eval_runtime": 410.8096,
1957
+ "eval_samples_per_second": 26.061,
1958
+ "eval_steps_per_second": 0.204,
1959
+ "step": 500
1960
+ },
1961
+ {
1962
+ "epoch": 1.2628930817610062,
1963
+ "grad_norm": 0.12026111036539078,
1964
+ "learning_rate": 2.0670421574637182e-05,
1965
+ "loss": 0.4965,
1966
+ "step": 502
1967
+ },
1968
+ {
1969
+ "epoch": 1.2679245283018867,
1970
+ "grad_norm": 0.11090611666440964,
1971
+ "learning_rate": 2.0494343378164736e-05,
1972
+ "loss": 0.4924,
1973
+ "step": 504
1974
+ },
1975
+ {
1976
+ "epoch": 1.2729559748427672,
1977
+ "grad_norm": 0.11498820036649704,
1978
+ "learning_rate": 2.0318739008271327e-05,
1979
+ "loss": 0.5069,
1980
+ "step": 506
1981
+ },
1982
+ {
1983
+ "epoch": 1.2779874213836477,
1984
+ "grad_norm": 0.11681642383337021,
1985
+ "learning_rate": 2.014362034193326e-05,
1986
+ "loss": 0.5208,
1987
+ "step": 508
1988
+ },
1989
+ {
1990
+ "epoch": 1.2830188679245282,
1991
+ "grad_norm": 0.11289618164300919,
1992
+ "learning_rate": 1.9968999223276406e-05,
1993
+ "loss": 0.497,
1994
+ "step": 510
1995
+ },
1996
+ {
1997
+ "epoch": 1.2880503144654087,
1998
+ "grad_norm": 0.11700893938541412,
1999
+ "learning_rate": 1.979488746277503e-05,
2000
+ "loss": 0.4872,
2001
+ "step": 512
2002
+ },
2003
+ {
2004
+ "epoch": 1.2930817610062892,
2005
+ "grad_norm": 0.11669134348630905,
2006
+ "learning_rate": 1.9621296836453025e-05,
2007
+ "loss": 0.5117,
2008
+ "step": 514
2009
+ },
2010
+ {
2011
+ "epoch": 1.2981132075471697,
2012
+ "grad_norm": 0.11578242480754852,
2013
+ "learning_rate": 1.944823908508745e-05,
2014
+ "loss": 0.5046,
2015
+ "step": 516
2016
+ },
2017
+ {
2018
+ "epoch": 1.3031446540880502,
2019
+ "grad_norm": 0.11336881667375565,
2020
+ "learning_rate": 1.9275725913414483e-05,
2021
+ "loss": 0.4828,
2022
+ "step": 518
2023
+ },
2024
+ {
2025
+ "epoch": 1.3081761006289307,
2026
+ "grad_norm": 0.1218356043100357,
2027
+ "learning_rate": 1.910376898933769e-05,
2028
+ "loss": 0.5173,
2029
+ "step": 520
2030
+ },
2031
+ {
2032
+ "epoch": 1.3081761006289307,
2033
+ "eval_loss": 0.5762000679969788,
2034
+ "eval_runtime": 410.9091,
2035
+ "eval_samples_per_second": 26.054,
2036
+ "eval_steps_per_second": 0.204,
2037
+ "step": 520
2038
+ },
2039
+ {
2040
+ "epoch": 1.3132075471698113,
2041
+ "grad_norm": 0.11644181609153748,
2042
+ "learning_rate": 1.8932379943138916e-05,
2043
+ "loss": 0.5002,
2044
+ "step": 522
2045
+ },
2046
+ {
2047
+ "epoch": 1.3182389937106918,
2048
+ "grad_norm": 0.11215106397867203,
2049
+ "learning_rate": 1.8761570366691684e-05,
2050
+ "loss": 0.4808,
2051
+ "step": 524
2052
+ },
2053
+ {
2054
+ "epoch": 1.3232704402515723,
2055
+ "grad_norm": 0.11506900936365128,
2056
+ "learning_rate": 1.8591351812677144e-05,
2057
+ "loss": 0.4915,
2058
+ "step": 526
2059
+ },
2060
+ {
2061
+ "epoch": 1.3283018867924528,
2062
+ "grad_norm": 0.11646901071071625,
2063
+ "learning_rate": 1.8421735793802763e-05,
2064
+ "loss": 0.5067,
2065
+ "step": 528
2066
+ },
2067
+ {
2068
+ "epoch": 1.3333333333333333,
2069
+ "grad_norm": 0.1217241957783699,
2070
+ "learning_rate": 1.8252733782023584e-05,
2071
+ "loss": 0.5105,
2072
+ "step": 530
2073
+ },
2074
+ {
2075
+ "epoch": 1.3383647798742138,
2076
+ "grad_norm": 0.12330880761146545,
2077
+ "learning_rate": 1.8084357207766406e-05,
2078
+ "loss": 0.5107,
2079
+ "step": 532
2080
+ },
2081
+ {
2082
+ "epoch": 1.3433962264150943,
2083
+ "grad_norm": 0.10948923975229263,
2084
+ "learning_rate": 1.7916617459156615e-05,
2085
+ "loss": 0.4929,
2086
+ "step": 534
2087
+ },
2088
+ {
2089
+ "epoch": 1.3484276729559748,
2090
+ "grad_norm": 0.11415420472621918,
2091
+ "learning_rate": 1.7749525881248035e-05,
2092
+ "loss": 0.5123,
2093
+ "step": 536
2094
+ },
2095
+ {
2096
+ "epoch": 1.3534591194968553,
2097
+ "grad_norm": 0.11750365048646927,
2098
+ "learning_rate": 1.7583093775255516e-05,
2099
+ "loss": 0.5082,
2100
+ "step": 538
2101
+ },
2102
+ {
2103
+ "epoch": 1.3584905660377358,
2104
+ "grad_norm": 0.11664094030857086,
2105
+ "learning_rate": 1.741733239779063e-05,
2106
+ "loss": 0.5048,
2107
+ "step": 540
2108
+ },
2109
+ {
2110
+ "epoch": 1.3584905660377358,
2111
+ "eval_loss": 0.5755621194839478,
2112
+ "eval_runtime": 410.7838,
2113
+ "eval_samples_per_second": 26.062,
2114
+ "eval_steps_per_second": 0.204,
2115
+ "step": 540
2116
+ },
2117
+ {
2118
+ "epoch": 1.3635220125786163,
2119
+ "grad_norm": 0.11655986309051514,
2120
+ "learning_rate": 1.725225296010034e-05,
2121
+ "loss": 0.4923,
2122
+ "step": 542
2123
+ },
2124
+ {
2125
+ "epoch": 1.3685534591194968,
2126
+ "grad_norm": 0.11432712525129318,
2127
+ "learning_rate": 1.7087866627308664e-05,
2128
+ "loss": 0.4976,
2129
+ "step": 544
2130
+ },
2131
+ {
2132
+ "epoch": 1.3735849056603773,
2133
+ "grad_norm": 0.11400482058525085,
2134
+ "learning_rate": 1.692418451766163e-05,
2135
+ "loss": 0.5026,
2136
+ "step": 546
2137
+ },
2138
+ {
2139
+ "epoch": 1.378616352201258,
2140
+ "grad_norm": 0.11588790267705917,
2141
+ "learning_rate": 1.6761217701775207e-05,
2142
+ "loss": 0.5031,
2143
+ "step": 548
2144
+ },
2145
+ {
2146
+ "epoch": 1.3836477987421385,
2147
+ "grad_norm": 0.11426915228366852,
2148
+ "learning_rate": 1.6598977201886558e-05,
2149
+ "loss": 0.5001,
2150
+ "step": 550
2151
+ },
2152
+ {
2153
+ "epoch": 1.388679245283019,
2154
+ "grad_norm": 0.11552328616380692,
2155
+ "learning_rate": 1.6437473991108585e-05,
2156
+ "loss": 0.4928,
2157
+ "step": 552
2158
+ },
2159
+ {
2160
+ "epoch": 1.3937106918238995,
2161
+ "grad_norm": 0.11312104761600494,
2162
+ "learning_rate": 1.6276718992687746e-05,
2163
+ "loss": 0.4977,
2164
+ "step": 554
2165
+ },
2166
+ {
2167
+ "epoch": 1.39874213836478,
2168
+ "grad_norm": 0.11197475343942642,
2169
+ "learning_rate": 1.6116723079265263e-05,
2170
+ "loss": 0.489,
2171
+ "step": 556
2172
+ },
2173
+ {
2174
+ "epoch": 1.4037735849056605,
2175
+ "grad_norm": 0.11652438342571259,
2176
+ "learning_rate": 1.5957497072141758e-05,
2177
+ "loss": 0.4971,
2178
+ "step": 558
2179
+ },
2180
+ {
2181
+ "epoch": 1.408805031446541,
2182
+ "grad_norm": 0.1163628101348877,
2183
+ "learning_rate": 1.579905174054533e-05,
2184
+ "loss": 0.4986,
2185
+ "step": 560
2186
+ },
2187
+ {
2188
+ "epoch": 1.408805031446541,
2189
+ "eval_loss": 0.5742356777191162,
2190
+ "eval_runtime": 410.5998,
2191
+ "eval_samples_per_second": 26.074,
2192
+ "eval_steps_per_second": 0.205,
2193
+ "step": 560
2194
+ },
2195
+ {
2196
+ "epoch": 1.4138364779874215,
2197
+ "grad_norm": 0.1128329187631607,
2198
+ "learning_rate": 1.5641397800903222e-05,
2199
+ "loss": 0.5068,
2200
+ "step": 562
2201
+ },
2202
+ {
2203
+ "epoch": 1.418867924528302,
2204
+ "grad_norm": 0.11648018658161163,
2205
+ "learning_rate": 1.5484545916116995e-05,
2206
+ "loss": 0.4958,
2207
+ "step": 564
2208
+ },
2209
+ {
2210
+ "epoch": 1.4238993710691825,
2211
+ "grad_norm": 0.1150885596871376,
2212
+ "learning_rate": 1.5328506694841334e-05,
2213
+ "loss": 0.4855,
2214
+ "step": 566
2215
+ },
2216
+ {
2217
+ "epoch": 1.428930817610063,
2218
+ "grad_norm": 0.11181043833494186,
2219
+ "learning_rate": 1.5173290690766553e-05,
2220
+ "loss": 0.5114,
2221
+ "step": 568
2222
+ },
2223
+ {
2224
+ "epoch": 1.4339622641509435,
2225
+ "grad_norm": 0.11899517476558685,
2226
+ "learning_rate": 1.5018908401904785e-05,
2227
+ "loss": 0.5048,
2228
+ "step": 570
2229
+ },
2230
+ {
2231
+ "epoch": 1.438993710691824,
2232
+ "grad_norm": 0.11897191405296326,
2233
+ "learning_rate": 1.4865370269879955e-05,
2234
+ "loss": 0.5308,
2235
+ "step": 572
2236
+ },
2237
+ {
2238
+ "epoch": 1.4440251572327045,
2239
+ "grad_norm": 0.11142674088478088,
2240
+ "learning_rate": 1.471268667922157e-05,
2241
+ "loss": 0.4958,
2242
+ "step": 574
2243
+ },
2244
+ {
2245
+ "epoch": 1.449056603773585,
2246
+ "grad_norm": 0.1150866225361824,
2247
+ "learning_rate": 1.4560867956662336e-05,
2248
+ "loss": 0.4939,
2249
+ "step": 576
2250
+ },
2251
+ {
2252
+ "epoch": 1.4540880503144655,
2253
+ "grad_norm": 0.11816877871751785,
2254
+ "learning_rate": 1.4409924370439737e-05,
2255
+ "loss": 0.4913,
2256
+ "step": 578
2257
+ },
2258
+ {
2259
+ "epoch": 1.459119496855346,
2260
+ "grad_norm": 0.11381349712610245,
2261
+ "learning_rate": 1.425986612960155e-05,
2262
+ "loss": 0.5039,
2263
+ "step": 580
2264
+ },
2265
+ {
2266
+ "epoch": 1.459119496855346,
2267
+ "eval_loss": 0.573836624622345,
2268
+ "eval_runtime": 410.5985,
2269
+ "eval_samples_per_second": 26.074,
2270
+ "eval_steps_per_second": 0.205,
2271
+ "step": 580
2272
+ },
2273
+ {
2274
+ "epoch": 1.4641509433962265,
2275
+ "grad_norm": 0.1094905436038971,
2276
+ "learning_rate": 1.4110703383315326e-05,
2277
+ "loss": 0.4901,
2278
+ "step": 582
2279
+ },
2280
+ {
2281
+ "epoch": 1.469182389937107,
2282
+ "grad_norm": 0.11396130174398422,
2283
+ "learning_rate": 1.396244622018199e-05,
2284
+ "loss": 0.5081,
2285
+ "step": 584
2286
+ },
2287
+ {
2288
+ "epoch": 1.4742138364779875,
2289
+ "grad_norm": 0.1160426139831543,
2290
+ "learning_rate": 1.3815104667553452e-05,
2291
+ "loss": 0.4869,
2292
+ "step": 586
2293
+ },
2294
+ {
2295
+ "epoch": 1.479245283018868,
2296
+ "grad_norm": 0.11492225527763367,
2297
+ "learning_rate": 1.3668688690854453e-05,
2298
+ "loss": 0.4888,
2299
+ "step": 588
2300
+ },
2301
+ {
2302
+ "epoch": 1.4842767295597485,
2303
+ "grad_norm": 0.11282163113355637,
2304
+ "learning_rate": 1.3523208192908562e-05,
2305
+ "loss": 0.4983,
2306
+ "step": 590
2307
+ },
2308
+ {
2309
+ "epoch": 1.489308176100629,
2310
+ "grad_norm": 0.11276757717132568,
2311
+ "learning_rate": 1.3378673013268336e-05,
2312
+ "loss": 0.517,
2313
+ "step": 592
2314
+ },
2315
+ {
2316
+ "epoch": 1.4943396226415095,
2317
+ "grad_norm": 0.11005326360464096,
2318
+ "learning_rate": 1.3235092927549888e-05,
2319
+ "loss": 0.4933,
2320
+ "step": 594
2321
+ },
2322
+ {
2323
+ "epoch": 1.49937106918239,
2324
+ "grad_norm": 0.11391846090555191,
2325
+ "learning_rate": 1.3092477646771686e-05,
2326
+ "loss": 0.5047,
2327
+ "step": 596
2328
+ },
2329
+ {
2330
+ "epoch": 1.5044025157232706,
2331
+ "grad_norm": 0.112746462225914,
2332
+ "learning_rate": 1.2950836816697753e-05,
2333
+ "loss": 0.4933,
2334
+ "step": 598
2335
+ },
2336
+ {
2337
+ "epoch": 1.509433962264151,
2338
+ "grad_norm": 0.11274772882461548,
2339
+ "learning_rate": 1.2810180017185286e-05,
2340
+ "loss": 0.4928,
2341
+ "step": 600
2342
+ },
2343
+ {
2344
+ "epoch": 1.509433962264151,
2345
+ "eval_loss": 0.5733875632286072,
2346
+ "eval_runtime": 410.9812,
2347
+ "eval_samples_per_second": 26.05,
2348
+ "eval_steps_per_second": 0.204,
2349
+ "step": 600
2350
+ },
2351
+ {
2352
+ "epoch": 1.5144654088050316,
2353
+ "grad_norm": 0.11344069242477417,
2354
+ "learning_rate": 1.2670516761536705e-05,
2355
+ "loss": 0.5083,
2356
+ "step": 602
2357
+ },
2358
+ {
2359
+ "epoch": 1.519496855345912,
2360
+ "grad_norm": 0.1206919476389885,
2361
+ "learning_rate": 1.2531856495856234e-05,
2362
+ "loss": 0.4931,
2363
+ "step": 604
2364
+ },
2365
+ {
2366
+ "epoch": 1.5245283018867926,
2367
+ "grad_norm": 0.11568621546030045,
2368
+ "learning_rate": 1.2394208598411026e-05,
2369
+ "loss": 0.4961,
2370
+ "step": 606
2371
+ },
2372
+ {
2373
+ "epoch": 1.529559748427673,
2374
+ "grad_norm": 0.11288320273160934,
2375
+ "learning_rate": 1.2257582378996846e-05,
2376
+ "loss": 0.493,
2377
+ "step": 608
2378
+ },
2379
+ {
2380
+ "epoch": 1.5345911949685536,
2381
+ "grad_norm": 0.11447000503540039,
2382
+ "learning_rate": 1.2121987078308414e-05,
2383
+ "loss": 0.487,
2384
+ "step": 610
2385
+ },
2386
+ {
2387
+ "epoch": 1.539622641509434,
2388
+ "grad_norm": 0.1167483702301979,
2389
+ "learning_rate": 1.1987431867314417e-05,
2390
+ "loss": 0.5078,
2391
+ "step": 612
2392
+ },
2393
+ {
2394
+ "epoch": 1.5446540880503146,
2395
+ "grad_norm": 0.11339499801397324,
2396
+ "learning_rate": 1.1853925846637192e-05,
2397
+ "loss": 0.5101,
2398
+ "step": 614
2399
+ },
2400
+ {
2401
+ "epoch": 1.549685534591195,
2402
+ "grad_norm": 0.11238376796245575,
2403
+ "learning_rate": 1.1721478045937298e-05,
2404
+ "loss": 0.5075,
2405
+ "step": 616
2406
+ },
2407
+ {
2408
+ "epoch": 1.5547169811320756,
2409
+ "grad_norm": 0.11750028282403946,
2410
+ "learning_rate": 1.1590097423302684e-05,
2411
+ "loss": 0.5223,
2412
+ "step": 618
2413
+ },
2414
+ {
2415
+ "epoch": 1.559748427672956,
2416
+ "grad_norm": 0.11243315786123276,
2417
+ "learning_rate": 1.1459792864642889e-05,
2418
+ "loss": 0.5014,
2419
+ "step": 620
2420
+ },
2421
+ {
2422
+ "epoch": 1.559748427672956,
2423
+ "eval_loss": 0.5725140571594238,
2424
+ "eval_runtime": 411.304,
2425
+ "eval_samples_per_second": 26.029,
2426
+ "eval_steps_per_second": 0.204,
2427
+ "step": 620
2428
+ },
2429
+ {
2430
+ "epoch": 1.5647798742138366,
2431
+ "grad_norm": 0.10880452394485474,
2432
+ "learning_rate": 1.1330573183088027e-05,
2433
+ "loss": 0.4946,
2434
+ "step": 622
2435
+ },
2436
+ {
2437
+ "epoch": 1.569811320754717,
2438
+ "grad_norm": 0.11715767532587051,
2439
+ "learning_rate": 1.1202447118392666e-05,
2440
+ "loss": 0.4934,
2441
+ "step": 624
2442
+ },
2443
+ {
2444
+ "epoch": 1.5748427672955976,
2445
+ "grad_norm": 0.1085837110877037,
2446
+ "learning_rate": 1.1075423336344815e-05,
2447
+ "loss": 0.4918,
2448
+ "step": 626
2449
+ },
2450
+ {
2451
+ "epoch": 1.579874213836478,
2452
+ "grad_norm": 0.11400571465492249,
2453
+ "learning_rate": 1.0949510428179703e-05,
2454
+ "loss": 0.4907,
2455
+ "step": 628
2456
+ },
2457
+ {
2458
+ "epoch": 1.5849056603773586,
2459
+ "grad_norm": 0.11114535480737686,
2460
+ "learning_rate": 1.0824716909998783e-05,
2461
+ "loss": 0.504,
2462
+ "step": 630
2463
+ },
2464
+ {
2465
+ "epoch": 1.589937106918239,
2466
+ "grad_norm": 0.10678807646036148,
2467
+ "learning_rate": 1.0701051222193734e-05,
2468
+ "loss": 0.4757,
2469
+ "step": 632
2470
+ },
2471
+ {
2472
+ "epoch": 1.5949685534591196,
2473
+ "grad_norm": 0.11523126810789108,
2474
+ "learning_rate": 1.0578521728875578e-05,
2475
+ "loss": 0.5019,
2476
+ "step": 634
2477
+ },
2478
+ {
2479
+ "epoch": 1.6,
2480
+ "grad_norm": 0.11389489471912384,
2481
+ "learning_rate": 1.0457136717308988e-05,
2482
+ "loss": 0.5162,
2483
+ "step": 636
2484
+ },
2485
+ {
2486
+ "epoch": 1.6050314465408806,
2487
+ "grad_norm": 0.11754269152879715,
2488
+ "learning_rate": 1.0336904397351794e-05,
2489
+ "loss": 0.4991,
2490
+ "step": 638
2491
+ },
2492
+ {
2493
+ "epoch": 1.610062893081761,
2494
+ "grad_norm": 0.11521238088607788,
2495
+ "learning_rate": 1.021783290089966e-05,
2496
+ "loss": 0.5041,
2497
+ "step": 640
2498
+ },
2499
+ {
2500
+ "epoch": 1.610062893081761,
2501
+ "eval_loss": 0.57233065366745,
2502
+ "eval_runtime": 417.7875,
2503
+ "eval_samples_per_second": 25.625,
2504
+ "eval_steps_per_second": 0.201,
2505
+ "step": 640
2506
+ },
2507
+ {
2508
+ "epoch": 1.6150943396226416,
2509
+ "grad_norm": 0.1117156520485878,
2510
+ "learning_rate": 1.009993028133615e-05,
2511
+ "loss": 0.4919,
2512
+ "step": 642
2513
+ },
2514
+ {
2515
+ "epoch": 1.620125786163522,
2516
+ "grad_norm": 0.11549975723028183,
2517
+ "learning_rate": 9.983204512988004e-06,
2518
+ "loss": 0.4988,
2519
+ "step": 644
2520
+ },
2521
+ {
2522
+ "epoch": 1.6251572327044026,
2523
+ "grad_norm": 0.11243242025375366,
2524
+ "learning_rate": 9.867663490585783e-06,
2525
+ "loss": 0.5128,
2526
+ "step": 646
2527
+ },
2528
+ {
2529
+ "epoch": 1.630188679245283,
2530
+ "grad_norm": 0.11129079759120941,
2531
+ "learning_rate": 9.753315028729948e-06,
2532
+ "loss": 0.4893,
2533
+ "step": 648
2534
+ },
2535
+ {
2536
+ "epoch": 1.6352201257861636,
2537
+ "grad_norm": 0.11360695213079453,
2538
+ "learning_rate": 9.640166861362268e-06,
2539
+ "loss": 0.503,
2540
+ "step": 650
2541
+ },
2542
+ {
2543
+ "epoch": 1.640251572327044,
2544
+ "grad_norm": 0.11027677357196808,
2545
+ "learning_rate": 9.528226641242804e-06,
2546
+ "loss": 0.4933,
2547
+ "step": 652
2548
+ },
2549
+ {
2550
+ "epoch": 1.6452830188679246,
2551
+ "grad_norm": 0.11328162252902985,
2552
+ "learning_rate": 9.417501939432257e-06,
2553
+ "loss": 0.4969,
2554
+ "step": 654
2555
+ },
2556
+ {
2557
+ "epoch": 1.650314465408805,
2558
+ "grad_norm": 0.111870676279068,
2559
+ "learning_rate": 9.308000244779918e-06,
2560
+ "loss": 0.5009,
2561
+ "step": 656
2562
+ },
2563
+ {
2564
+ "epoch": 1.6553459119496856,
2565
+ "grad_norm": 0.11578749120235443,
2566
+ "learning_rate": 9.19972896341717e-06,
2567
+ "loss": 0.5226,
2568
+ "step": 658
2569
+ },
2570
+ {
2571
+ "epoch": 1.6603773584905661,
2572
+ "grad_norm": 0.11840783059597015,
2573
+ "learning_rate": 9.09269541825658e-06,
2574
+ "loss": 0.4876,
2575
+ "step": 660
2576
+ },
2577
+ {
2578
+ "epoch": 1.6603773584905661,
2579
+ "eval_loss": 0.5716937184333801,
2580
+ "eval_runtime": 411.8982,
2581
+ "eval_samples_per_second": 25.992,
2582
+ "eval_steps_per_second": 0.204,
2583
+ "step": 660
2584
+ },
2585
+ {
2586
+ "epoch": 1.6654088050314466,
2587
+ "grad_norm": 0.10933776944875717,
2588
+ "learning_rate": 8.98690684849659e-06,
2589
+ "loss": 0.5217,
2590
+ "step": 662
2591
+ },
2592
+ {
2593
+ "epoch": 1.6704402515723271,
2594
+ "grad_norm": 0.11809239536523819,
2595
+ "learning_rate": 8.882370409131924e-06,
2596
+ "loss": 0.5182,
2597
+ "step": 664
2598
+ },
2599
+ {
2600
+ "epoch": 1.6754716981132076,
2601
+ "grad_norm": 0.1144753098487854,
2602
+ "learning_rate": 8.779093170469629e-06,
2603
+ "loss": 0.4999,
2604
+ "step": 666
2605
+ },
2606
+ {
2607
+ "epoch": 1.6805031446540881,
2608
+ "grad_norm": 0.11396916210651398,
2609
+ "learning_rate": 8.677082117650906e-06,
2610
+ "loss": 0.507,
2611
+ "step": 668
2612
+ },
2613
+ {
2614
+ "epoch": 1.6855345911949686,
2615
+ "grad_norm": 0.11068397760391235,
2616
+ "learning_rate": 8.576344150178653e-06,
2617
+ "loss": 0.5136,
2618
+ "step": 670
2619
+ },
2620
+ {
2621
+ "epoch": 1.6905660377358491,
2622
+ "grad_norm": 0.1053067147731781,
2623
+ "learning_rate": 8.47688608145083e-06,
2624
+ "loss": 0.4907,
2625
+ "step": 672
2626
+ },
2627
+ {
2628
+ "epoch": 1.6955974842767296,
2629
+ "grad_norm": 0.1102994978427887,
2630
+ "learning_rate": 8.378714638299628e-06,
2631
+ "loss": 0.4881,
2632
+ "step": 674
2633
+ },
2634
+ {
2635
+ "epoch": 1.7006289308176101,
2636
+ "grad_norm": 0.10971739888191223,
2637
+ "learning_rate": 8.28183646053649e-06,
2638
+ "loss": 0.5176,
2639
+ "step": 676
2640
+ },
2641
+ {
2642
+ "epoch": 1.7056603773584906,
2643
+ "grad_norm": 0.11169516295194626,
2644
+ "learning_rate": 8.186258100503058e-06,
2645
+ "loss": 0.5102,
2646
+ "step": 678
2647
+ },
2648
+ {
2649
+ "epoch": 1.7106918238993711,
2650
+ "grad_norm": 0.1112278550863266,
2651
+ "learning_rate": 8.091986022627978e-06,
2652
+ "loss": 0.5272,
2653
+ "step": 680
2654
+ },
2655
+ {
2656
+ "epoch": 1.7106918238993711,
2657
+ "eval_loss": 0.5712010860443115,
2658
+ "eval_runtime": 411.4247,
2659
+ "eval_samples_per_second": 26.022,
2660
+ "eval_steps_per_second": 0.204,
2661
+ "step": 680
2662
+ },
2663
+ {
2664
+ "epoch": 1.7157232704402516,
2665
+ "grad_norm": 0.11466188728809357,
2666
+ "learning_rate": 7.999026602989687e-06,
2667
+ "loss": 0.4974,
2668
+ "step": 682
2669
+ },
2670
+ {
2671
+ "epoch": 1.7207547169811321,
2672
+ "grad_norm": 0.12257977575063705,
2673
+ "learning_rate": 7.907386128885182e-06,
2674
+ "loss": 0.4946,
2675
+ "step": 684
2676
+ },
2677
+ {
2678
+ "epoch": 1.7257861635220126,
2679
+ "grad_norm": 0.12135323882102966,
2680
+ "learning_rate": 7.817070798404755e-06,
2681
+ "loss": 0.5374,
2682
+ "step": 686
2683
+ },
2684
+ {
2685
+ "epoch": 1.7308176100628931,
2686
+ "grad_norm": 0.11566798388957977,
2687
+ "learning_rate": 7.728086720012813e-06,
2688
+ "loss": 0.5048,
2689
+ "step": 688
2690
+ },
2691
+ {
2692
+ "epoch": 1.7358490566037736,
2693
+ "grad_norm": 0.11244137585163116,
2694
+ "learning_rate": 7.640439912134711e-06,
2695
+ "loss": 0.5169,
2696
+ "step": 690
2697
+ },
2698
+ {
2699
+ "epoch": 1.7408805031446541,
2700
+ "grad_norm": 0.1125202625989914,
2701
+ "learning_rate": 7.554136302749705e-06,
2702
+ "loss": 0.5076,
2703
+ "step": 692
2704
+ },
2705
+ {
2706
+ "epoch": 1.7459119496855346,
2707
+ "grad_norm": 0.1143079325556755,
2708
+ "learning_rate": 7.469181728990013e-06,
2709
+ "loss": 0.4961,
2710
+ "step": 694
2711
+ },
2712
+ {
2713
+ "epoch": 1.7509433962264151,
2714
+ "grad_norm": 0.11676593869924545,
2715
+ "learning_rate": 7.385581936746035e-06,
2716
+ "loss": 0.5003,
2717
+ "step": 696
2718
+ },
2719
+ {
2720
+ "epoch": 1.7559748427672957,
2721
+ "grad_norm": 0.1095028892159462,
2722
+ "learning_rate": 7.303342580277696e-06,
2723
+ "loss": 0.4755,
2724
+ "step": 698
2725
+ },
2726
+ {
2727
+ "epoch": 1.7610062893081762,
2728
+ "grad_norm": 0.11439331620931625,
2729
+ "learning_rate": 7.222469221832061e-06,
2730
+ "loss": 0.5057,
2731
+ "step": 700
2732
+ },
2733
+ {
2734
+ "epoch": 1.7610062893081762,
2735
+ "eval_loss": 0.5707039833068848,
2736
+ "eval_runtime": 411.7854,
2737
+ "eval_samples_per_second": 25.999,
2738
+ "eval_steps_per_second": 0.204,
2739
+ "step": 700
2740
+ },
2741
+ {
2742
+ "epoch": 1.7660377358490567,
2743
+ "grad_norm": 0.11388733237981796,
2744
+ "learning_rate": 7.142967331267113e-06,
2745
+ "loss": 0.4748,
2746
+ "step": 702
2747
+ },
2748
+ {
2749
+ "epoch": 1.7710691823899372,
2750
+ "grad_norm": 0.11583738774061203,
2751
+ "learning_rate": 7.064842285681781e-06,
2752
+ "loss": 0.494,
2753
+ "step": 704
2754
+ },
2755
+ {
2756
+ "epoch": 1.7761006289308177,
2757
+ "grad_norm": 0.11597929149866104,
2758
+ "learning_rate": 6.988099369052318e-06,
2759
+ "loss": 0.5106,
2760
+ "step": 706
2761
+ },
2762
+ {
2763
+ "epoch": 1.7811320754716982,
2764
+ "grad_norm": 0.1117326021194458,
2765
+ "learning_rate": 6.9127437718748465e-06,
2766
+ "loss": 0.4844,
2767
+ "step": 708
2768
+ },
2769
+ {
2770
+ "epoch": 1.7861635220125787,
2771
+ "grad_norm": 0.11276806890964508,
2772
+ "learning_rate": 6.838780590814366e-06,
2773
+ "loss": 0.5221,
2774
+ "step": 710
2775
+ },
2776
+ {
2777
+ "epoch": 1.7911949685534592,
2778
+ "grad_norm": 0.11557289958000183,
2779
+ "learning_rate": 6.7662148283599955e-06,
2780
+ "loss": 0.5021,
2781
+ "step": 712
2782
+ },
2783
+ {
2784
+ "epoch": 1.7962264150943397,
2785
+ "grad_norm": 0.11254438757896423,
2786
+ "learning_rate": 6.695051392486652e-06,
2787
+ "loss": 0.4999,
2788
+ "step": 714
2789
+ },
2790
+ {
2791
+ "epoch": 1.8012578616352202,
2792
+ "grad_norm": 0.11114822328090668,
2793
+ "learning_rate": 6.625295096323097e-06,
2794
+ "loss": 0.4849,
2795
+ "step": 716
2796
+ },
2797
+ {
2798
+ "epoch": 1.8062893081761007,
2799
+ "grad_norm": 0.11419788002967834,
2800
+ "learning_rate": 6.556950657826405e-06,
2801
+ "loss": 0.5227,
2802
+ "step": 718
2803
+ },
2804
+ {
2805
+ "epoch": 1.8113207547169812,
2806
+ "grad_norm": 0.11834213882684708,
2807
+ "learning_rate": 6.490022699462844e-06,
2808
+ "loss": 0.5043,
2809
+ "step": 720
2810
+ },
2811
+ {
2812
+ "epoch": 1.8113207547169812,
2813
+ "eval_loss": 0.5703166723251343,
2814
+ "eval_runtime": 411.34,
2815
+ "eval_samples_per_second": 26.027,
2816
+ "eval_steps_per_second": 0.204,
2817
+ "step": 720
2818
+ },
2819
+ {
2820
+ "epoch": 1.8163522012578617,
2821
+ "grad_norm": 0.11034736037254333,
2822
+ "learning_rate": 6.424515747895265e-06,
2823
+ "loss": 0.48,
2824
+ "step": 722
2825
+ },
2826
+ {
2827
+ "epoch": 1.8213836477987422,
2828
+ "grad_norm": 0.11254922300577164,
2829
+ "learning_rate": 6.360434233676926e-06,
2830
+ "loss": 0.4864,
2831
+ "step": 724
2832
+ },
2833
+ {
2834
+ "epoch": 1.8264150943396227,
2835
+ "grad_norm": 0.10830461978912354,
2836
+ "learning_rate": 6.297782490951833e-06,
2837
+ "loss": 0.4943,
2838
+ "step": 726
2839
+ },
2840
+ {
2841
+ "epoch": 1.8314465408805032,
2842
+ "grad_norm": 0.1077997013926506,
2843
+ "learning_rate": 6.236564757161608e-06,
2844
+ "loss": 0.4865,
2845
+ "step": 728
2846
+ },
2847
+ {
2848
+ "epoch": 1.8364779874213837,
2849
+ "grad_norm": 0.11505385488271713,
2850
+ "learning_rate": 6.176785172758871e-06,
2851
+ "loss": 0.5039,
2852
+ "step": 730
2853
+ },
2854
+ {
2855
+ "epoch": 1.8415094339622642,
2856
+ "grad_norm": 0.12043328583240509,
2857
+ "learning_rate": 6.118447780927233e-06,
2858
+ "loss": 0.4909,
2859
+ "step": 732
2860
+ },
2861
+ {
2862
+ "epoch": 1.8465408805031447,
2863
+ "grad_norm": 0.11934798955917358,
2864
+ "learning_rate": 6.0615565273078025e-06,
2865
+ "loss": 0.4978,
2866
+ "step": 734
2867
+ },
2868
+ {
2869
+ "epoch": 1.8515723270440252,
2870
+ "grad_norm": 0.11153744161128998,
2871
+ "learning_rate": 6.006115259732345e-06,
2872
+ "loss": 0.4924,
2873
+ "step": 736
2874
+ },
2875
+ {
2876
+ "epoch": 1.8566037735849057,
2877
+ "grad_norm": 0.11369643360376358,
2878
+ "learning_rate": 5.952127727963029e-06,
2879
+ "loss": 0.4938,
2880
+ "step": 738
2881
+ },
2882
+ {
2883
+ "epoch": 1.8616352201257862,
2884
+ "grad_norm": 0.1063813716173172,
2885
+ "learning_rate": 5.899597583438808e-06,
2886
+ "loss": 0.5059,
2887
+ "step": 740
2888
+ },
2889
+ {
2890
+ "epoch": 1.8616352201257862,
2891
+ "eval_loss": 0.5707431435585022,
2892
+ "eval_runtime": 411.3909,
2893
+ "eval_samples_per_second": 26.024,
2894
+ "eval_steps_per_second": 0.204,
2895
+ "step": 740
2896
+ },
2897
+ {
2898
+ "epoch": 1.8666666666666667,
2899
+ "grad_norm": 0.12777185440063477,
2900
+ "learning_rate": 5.848528379028456e-06,
2901
+ "loss": 0.5138,
2902
+ "step": 742
2903
+ },
2904
+ {
2905
+ "epoch": 1.8716981132075472,
2906
+ "grad_norm": 0.11385183781385422,
2907
+ "learning_rate": 5.798923568790283e-06,
2908
+ "loss": 0.5101,
2909
+ "step": 744
2910
+ },
2911
+ {
2912
+ "epoch": 1.8767295597484277,
2913
+ "grad_norm": 0.11212068051099777,
2914
+ "learning_rate": 5.750786507738497e-06,
2915
+ "loss": 0.4845,
2916
+ "step": 746
2917
+ },
2918
+ {
2919
+ "epoch": 1.8817610062893082,
2920
+ "grad_norm": 0.11255759000778198,
2921
+ "learning_rate": 5.704120451616305e-06,
2922
+ "loss": 0.5019,
2923
+ "step": 748
2924
+ },
2925
+ {
2926
+ "epoch": 1.8867924528301887,
2927
+ "grad_norm": 0.11542278528213501,
2928
+ "learning_rate": 5.6589285566757095e-06,
2929
+ "loss": 0.5014,
2930
+ "step": 750
2931
+ },
2932
+ {
2933
+ "epoch": 1.8918238993710692,
2934
+ "grad_norm": 0.11562332510948181,
2935
+ "learning_rate": 5.61521387946403e-06,
2936
+ "loss": 0.5077,
2937
+ "step": 752
2938
+ },
2939
+ {
2940
+ "epoch": 1.8968553459119497,
2941
+ "grad_norm": 0.10956519842147827,
2942
+ "learning_rate": 5.572979376617183e-06,
2943
+ "loss": 0.4842,
2944
+ "step": 754
2945
+ },
2946
+ {
2947
+ "epoch": 1.9018867924528302,
2948
+ "grad_norm": 0.11526080965995789,
2949
+ "learning_rate": 5.532227904659695e-06,
2950
+ "loss": 0.488,
2951
+ "step": 756
2952
+ },
2953
+ {
2954
+ "epoch": 1.9069182389937107,
2955
+ "grad_norm": 0.11320438235998154,
2956
+ "learning_rate": 5.49296221981152e-06,
2957
+ "loss": 0.5021,
2958
+ "step": 758
2959
+ },
2960
+ {
2961
+ "epoch": 1.9119496855345912,
2962
+ "grad_norm": 0.11186771839857101,
2963
+ "learning_rate": 5.455184977801612e-06,
2964
+ "loss": 0.4996,
2965
+ "step": 760
2966
+ },
2967
+ {
2968
+ "epoch": 1.9119496855345912,
2969
+ "eval_loss": 0.5699969530105591,
2970
+ "eval_runtime": 411.4136,
2971
+ "eval_samples_per_second": 26.022,
2972
+ "eval_steps_per_second": 0.204,
2973
+ "step": 760
2974
+ },
2975
+ {
2976
+ "epoch": 1.9169811320754717,
2977
+ "grad_norm": 0.1088699921965599,
2978
+ "learning_rate": 5.418898733688302e-06,
2979
+ "loss": 0.4933,
2980
+ "step": 762
2981
+ },
2982
+ {
2983
+ "epoch": 1.9220125786163522,
2984
+ "grad_norm": 0.11521943658590317,
2985
+ "learning_rate": 5.384105941686499e-06,
2986
+ "loss": 0.5018,
2987
+ "step": 764
2988
+ },
2989
+ {
2990
+ "epoch": 1.9270440251572327,
2991
+ "grad_norm": 0.1104261726140976,
2992
+ "learning_rate": 5.350808955001693e-06,
2993
+ "loss": 0.4945,
2994
+ "step": 766
2995
+ },
2996
+ {
2997
+ "epoch": 1.9320754716981132,
2998
+ "grad_norm": 0.11762084811925888,
2999
+ "learning_rate": 5.3190100256707905e-06,
3000
+ "loss": 0.4892,
3001
+ "step": 768
3002
+ },
3003
+ {
3004
+ "epoch": 1.9371069182389937,
3005
+ "grad_norm": 0.1136779636144638,
3006
+ "learning_rate": 5.288711304409814e-06,
3007
+ "loss": 0.5075,
3008
+ "step": 770
3009
+ },
3010
+ {
3011
+ "epoch": 1.9421383647798742,
3012
+ "grad_norm": 0.11609724909067154,
3013
+ "learning_rate": 5.259914840468416e-06,
3014
+ "loss": 0.5052,
3015
+ "step": 772
3016
+ },
3017
+ {
3018
+ "epoch": 1.9471698113207547,
3019
+ "grad_norm": 0.11565674096345901,
3020
+ "learning_rate": 5.2326225814913e-06,
3021
+ "loss": 0.5054,
3022
+ "step": 774
3023
+ },
3024
+ {
3025
+ "epoch": 1.9522012578616352,
3026
+ "grad_norm": 0.10971004515886307,
3027
+ "learning_rate": 5.206836373386482e-06,
3028
+ "loss": 0.4711,
3029
+ "step": 776
3030
+ },
3031
+ {
3032
+ "epoch": 1.9572327044025157,
3033
+ "grad_norm": 0.11187240481376648,
3034
+ "learning_rate": 5.182557960200441e-06,
3035
+ "loss": 0.4946,
3036
+ "step": 778
3037
+ },
3038
+ {
3039
+ "epoch": 1.9622641509433962,
3040
+ "grad_norm": 0.11195476353168488,
3041
+ "learning_rate": 5.1597889840001635e-06,
3042
+ "loss": 0.4975,
3043
+ "step": 780
3044
+ },
3045
+ {
3046
+ "epoch": 1.9622641509433962,
3047
+ "eval_loss": 0.5697274208068848,
3048
+ "eval_runtime": 411.2337,
3049
+ "eval_samples_per_second": 26.034,
3050
+ "eval_steps_per_second": 0.204,
3051
+ "step": 780
3052
+ },
3053
+ {
3054
+ "epoch": 1.9672955974842767,
3055
+ "grad_norm": 0.11138684302568436,
3056
+ "learning_rate": 5.138530984762087e-06,
3057
+ "loss": 0.4915,
3058
+ "step": 782
3059
+ },
3060
+ {
3061
+ "epoch": 1.9723270440251572,
3062
+ "grad_norm": 0.10923836380243301,
3063
+ "learning_rate": 5.118785400267929e-06,
3064
+ "loss": 0.4855,
3065
+ "step": 784
3066
+ },
3067
+ {
3068
+ "epoch": 1.9773584905660377,
3069
+ "grad_norm": 0.10922150313854218,
3070
+ "learning_rate": 5.100553566007467e-06,
3071
+ "loss": 0.4794,
3072
+ "step": 786
3073
+ },
3074
+ {
3075
+ "epoch": 1.9823899371069182,
3076
+ "grad_norm": 0.10965920239686966,
3077
+ "learning_rate": 5.083836715088188e-06,
3078
+ "loss": 0.4836,
3079
+ "step": 788
3080
+ },
3081
+ {
3082
+ "epoch": 1.9874213836477987,
3083
+ "grad_norm": 0.11357378959655762,
3084
+ "learning_rate": 5.068635978151901e-06,
3085
+ "loss": 0.4942,
3086
+ "step": 790
3087
+ },
3088
+ {
3089
+ "epoch": 1.9924528301886792,
3090
+ "grad_norm": 0.11112317442893982,
3091
+ "learning_rate": 5.0549523832982645e-06,
3092
+ "loss": 0.4939,
3093
+ "step": 792
3094
+ },
3095
+ {
3096
+ "epoch": 1.9974842767295597,
3097
+ "grad_norm": 0.11381904780864716,
3098
+ "learning_rate": 5.042786856015253e-06,
3099
+ "loss": 0.5013,
3100
+ "step": 794
3101
+ },
3102
+ {
3103
+ "epoch": 1.9974842767295597,
3104
+ "step": 794,
3105
+ "total_flos": 946984181301248.0,
3106
+ "train_loss": 0.5561736259862818,
3107
+ "train_runtime": 112418.4712,
3108
+ "train_samples_per_second": 3.619,
3109
+ "train_steps_per_second": 0.007
3110
+ }
3111
+ ],
3112
+ "logging_steps": 2,
3113
+ "max_steps": 794,
3114
+ "num_input_tokens_seen": 0,
3115
+ "num_train_epochs": 2,
3116
+ "save_steps": 1000,
3117
+ "stateful_callbacks": {
3118
+ "TrainerControl": {
3119
+ "args": {
3120
+ "should_epoch_stop": false,
3121
+ "should_evaluate": false,
3122
+ "should_log": false,
3123
+ "should_save": true,
3124
+ "should_training_stop": true
3125
+ },
3126
+ "attributes": {}
3127
+ }
3128
+ },
3129
+ "total_flos": 946984181301248.0,
3130
+ "train_batch_size": 16,
3131
+ "trial_name": null,
3132
+ "trial_params": null
3133
+ }