agentlans committed
Commit 0f821c2
1 parent: 5c25919

Retrained with larger dataset

all_results.json CHANGED
@@ -1,9 +1,9 @@
 {
     "epoch": 10.0,
-    "total_flos": 1.7358769019486208e+16,
-    "train_loss": 1.7481338489563856,
-    "train_runtime": 1121.6717,
-    "train_samples": 10693,
-    "train_samples_per_second": 95.331,
-    "train_steps_per_second": 11.92
+    "total_flos": 3.2973681787109376e+16,
+    "train_loss": 1.6927287036132812,
+    "train_runtime": 2178.3738,
+    "train_samples": 20000,
+    "train_samples_per_second": 91.812,
+    "train_steps_per_second": 11.476
 }
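The new run's headline numbers are internally consistent. A minimal sanity-check sketch, assuming the per-device train_batch_size of 8 recorded in trainer_state.json below:

# Values copied from all_results.json and trainer_state.json in this commit.
train_samples = 20000
epochs = 10
batch_size = 8                      # "train_batch_size" in trainer_state.json
train_runtime = 2178.3738           # seconds

steps_per_epoch = train_samples // batch_size              # 2500
total_steps = steps_per_epoch * epochs                     # 25000, matches "global_step"
samples_per_sec = train_samples * epochs / train_runtime   # ~91.8, matches the report
steps_per_sec = total_steps / train_runtime                # ~11.5, matches the report
print(total_steps, round(samples_per_sec, 3), round(steps_per_sec, 3))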
config.json CHANGED
@@ -56,7 +56,7 @@
   },
   "tie_word_embeddings": false,
   "torch_dtype": "float32",
-  "transformers_version": "4.44.2",
+  "transformers_version": "4.45.1",
   "use_cache": true,
   "vocab_size": 32128
 }
generation_config.json CHANGED
@@ -3,5 +3,5 @@
   "decoder_start_token_id": 0,
   "eos_token_id": 1,
   "pad_token_id": 0,
-  "transformers_version": "4.44.2"
+  "transformers_version": "4.45.1"
 }
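Both configs now record transformers 4.45.1, the version used for the retraining run. The fields here (decoder_start_token_id, vocab_size 32128, <extra_id_*> tokens) look like a T5-style encoder-decoder, so a plausible loading sketch follows; the repository id is a placeholder, since the Hub name is not part of this diff:

# Hypothetical usage sketch; substitute the real repo id before running.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

repo_id = "agentlans/<model-name>"   # placeholder, not taken from this commit
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSeq2SeqLM.from_pretrained(repo_id)

inputs = tokenizer("example input text", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))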
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0e210fd97954bbbde1bc52e61324bb89dc084dc87d3f900928df9f351dacd6a9
+oid sha256:92c440e1e5c5a811a5e853a1f7889a4c1dbb779680e9164c82fb83d1fa7ae909
 size 307867048
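Only the Git LFS pointer changes here: the file size is identical, but the content hash is new. A small sketch for verifying a downloaded model.safetensors against the hash recorded in this commit:

# Check a local model.safetensors against the sha256 in the LFS pointer above.
import hashlib

expected = "92c440e1e5c5a811a5e853a1f7889a4c1dbb779680e9164c82fb83d1fa7ae909"
h = hashlib.sha256()
with open("model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        h.update(chunk)
assert h.hexdigest() == expected, "file does not match the LFS pointer in this commit"
print("sha256 OK")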
tokenizer_config.json CHANGED
@@ -927,7 +927,7 @@
     "<extra_id_98>",
     "<extra_id_99>"
   ],
-  "clean_up_tokenization_spaces": true,
+  "clean_up_tokenization_spaces": false,
   "eos_token": "</s>",
   "extra_ids": 100,
   "model_max_length": 512,
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
     "epoch": 10.0,
-    "total_flos": 1.7358769019486208e+16,
-    "train_loss": 1.7481338489563856,
-    "train_runtime": 1121.6717,
-    "train_samples": 10693,
-    "train_samples_per_second": 95.331,
-    "train_steps_per_second": 11.92
+    "total_flos": 3.2973681787109376e+16,
+    "train_loss": 1.6927287036132812,
+    "train_runtime": 2178.3738,
+    "train_samples": 20000,
+    "train_samples_per_second": 91.812,
+    "train_steps_per_second": 11.476
 }
trainer_state.json CHANGED
@@ -2,206 +2,374 @@
   "best_metric": null,
   "best_model_checkpoint": null,
   "epoch": 10.0,
-  "eval_steps": 500,
-  "global_step": 13370,
+  "eval_steps": 1000.0,
+  "global_step": 25000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.3739715781600598,
-      "grad_norm": 3.664752721786499,
-      "learning_rate": 4.813014210919971e-05,
-      "loss": 2.1969,
+      "epoch": 0.2,
+      "grad_norm": 3.1640737056732178,
+      "learning_rate": 4.9e-05,
+      "loss": 2.212,
       "step": 500
     },
     {
-      "epoch": 0.7479431563201197,
-      "grad_norm": 2.6418004035949707,
-      "learning_rate": 4.62602842183994e-05,
-      "loss": 2.0162,
+      "epoch": 0.4,
+      "grad_norm": 23.492929458618164,
+      "learning_rate": 4.8e-05,
+      "loss": 2.0008,
       "step": 1000
     },
     {
-      "epoch": 1.1219147344801794,
-      "grad_norm": 2.6499061584472656,
-      "learning_rate": 4.4390426327599105e-05,
-      "loss": 1.928,
+      "epoch": 0.6,
+      "grad_norm": 2.86879563331604,
+      "learning_rate": 4.7e-05,
+      "loss": 1.9454,
       "step": 1500
     },
     {
-      "epoch": 1.4958863126402393,
-      "grad_norm": 2.92419695854187,
-      "learning_rate": 4.252056843679881e-05,
-      "loss": 1.8714,
+      "epoch": 0.8,
+      "grad_norm": 2.5470571517944336,
+      "learning_rate": 4.600000000000001e-05,
+      "loss": 1.9098,
       "step": 2000
     },
     {
-      "epoch": 1.8698578908002992,
-      "grad_norm": 2.9675424098968506,
-      "learning_rate": 4.06507105459985e-05,
-      "loss": 1.8688,
+      "epoch": 1.0,
+      "grad_norm": 3.299417018890381,
+      "learning_rate": 4.5e-05,
+      "loss": 1.8775,
       "step": 2500
     },
     {
-      "epoch": 2.243829468960359,
-      "grad_norm": 2.4510722160339355,
-      "learning_rate": 3.878085265519821e-05,
-      "loss": 1.8199,
+      "epoch": 1.2,
+      "grad_norm": 3.0483977794647217,
+      "learning_rate": 4.4000000000000006e-05,
+      "loss": 1.8398,
       "step": 3000
     },
     {
-      "epoch": 2.6178010471204187,
-      "grad_norm": 3.3887815475463867,
-      "learning_rate": 3.691099476439791e-05,
-      "loss": 1.7878,
+      "epoch": 1.4,
+      "grad_norm": 3.0028903484344482,
+      "learning_rate": 4.3e-05,
+      "loss": 1.8147,
       "step": 3500
     },
     {
-      "epoch": 2.9917726252804786,
-      "grad_norm": 2.7540111541748047,
-      "learning_rate": 3.5041136873597606e-05,
-      "loss": 1.804,
+      "epoch": 1.6,
+      "grad_norm": 2.972844123840332,
+      "learning_rate": 4.2e-05,
+      "loss": 1.8035,
       "step": 4000
     },
     {
-      "epoch": 3.3657442034405385,
-      "grad_norm": 2.7998127937316895,
-      "learning_rate": 3.317127898279731e-05,
-      "loss": 1.764,
+      "epoch": 1.8,
+      "grad_norm": 2.82574200630188,
+      "learning_rate": 4.1e-05,
+      "loss": 1.7974,
       "step": 4500
     },
     {
-      "epoch": 3.7397157816005984,
-      "grad_norm": 3.000854015350342,
-      "learning_rate": 3.130142109199701e-05,
-      "loss": 1.7486,
+      "epoch": 2.0,
+      "grad_norm": 2.6531100273132324,
+      "learning_rate": 4e-05,
+      "loss": 1.7818,
       "step": 5000
     },
     {
-      "epoch": 4.113687359760658,
-      "grad_norm": 2.85657000541687,
-      "learning_rate": 2.9431563201196712e-05,
-      "loss": 1.7418,
+      "epoch": 2.2,
+      "grad_norm": 3.6078109741210938,
+      "learning_rate": 3.9000000000000006e-05,
+      "loss": 1.7492,
       "step": 5500
     },
     {
-      "epoch": 4.487658937920718,
-      "grad_norm": 2.8628413677215576,
-      "learning_rate": 2.7561705310396414e-05,
-      "loss": 1.7193,
+      "epoch": 2.4,
+      "grad_norm": 2.573765516281128,
+      "learning_rate": 3.8e-05,
+      "loss": 1.7536,
       "step": 6000
     },
     {
-      "epoch": 4.861630516080778,
-      "grad_norm": 3.066136360168457,
-      "learning_rate": 2.569184741959611e-05,
-      "loss": 1.7083,
+      "epoch": 2.6,
+      "grad_norm": 3.07545804977417,
+      "learning_rate": 3.7e-05,
+      "loss": 1.7341,
       "step": 6500
     },
     {
-      "epoch": 5.2356020942408374,
-      "grad_norm": 2.46604585647583,
-      "learning_rate": 2.382198952879581e-05,
-      "loss": 1.6959,
+      "epoch": 2.8,
+      "grad_norm": 2.4871368408203125,
+      "learning_rate": 3.6e-05,
+      "loss": 1.7362,
       "step": 7000
     },
     {
-      "epoch": 5.609573672400898,
-      "grad_norm": 2.4070425033569336,
-      "learning_rate": 2.1952131637995513e-05,
-      "loss": 1.6787,
+      "epoch": 3.0,
+      "grad_norm": 2.881721019744873,
+      "learning_rate": 3.5e-05,
+      "loss": 1.7426,
       "step": 7500
     },
     {
-      "epoch": 5.983545250560957,
-      "grad_norm": 2.59104585647583,
-      "learning_rate": 2.0082273747195215e-05,
-      "loss": 1.6979,
+      "epoch": 3.2,
+      "grad_norm": 3.6232352256774902,
+      "learning_rate": 3.4000000000000007e-05,
+      "loss": 1.6958,
       "step": 8000
     },
     {
-      "epoch": 6.3575168287210175,
-      "grad_norm": 2.270085334777832,
-      "learning_rate": 1.8212415856394914e-05,
-      "loss": 1.6618,
+      "epoch": 3.4,
+      "grad_norm": 2.4172329902648926,
+      "learning_rate": 3.3e-05,
+      "loss": 1.6961,
       "step": 8500
     },
     {
-      "epoch": 6.731488406881077,
-      "grad_norm": 2.6211342811584473,
-      "learning_rate": 1.6342557965594616e-05,
-      "loss": 1.665,
+      "epoch": 3.6,
+      "grad_norm": 2.5114290714263916,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 1.6961,
       "step": 9000
     },
     {
-      "epoch": 7.105459985041137,
-      "grad_norm": 2.876598358154297,
-      "learning_rate": 1.4472700074794315e-05,
-      "loss": 1.6716,
+      "epoch": 3.8,
+      "grad_norm": 2.7974891662597656,
+      "learning_rate": 3.1e-05,
+      "loss": 1.6831,
       "step": 9500
     },
     {
-      "epoch": 7.479431563201197,
-      "grad_norm": 2.5392422676086426,
-      "learning_rate": 1.2602842183994019e-05,
-      "loss": 1.6354,
+      "epoch": 4.0,
+      "grad_norm": 2.511326551437378,
+      "learning_rate": 3e-05,
+      "loss": 1.7117,
       "step": 10000
     },
     {
-      "epoch": 7.853403141361256,
-      "grad_norm": 2.6859071254730225,
-      "learning_rate": 1.0732984293193717e-05,
-      "loss": 1.6581,
+      "epoch": 4.2,
+      "grad_norm": 2.3756415843963623,
+      "learning_rate": 2.9e-05,
+      "loss": 1.6684,
       "step": 10500
     },
     {
-      "epoch": 8.227374719521316,
-      "grad_norm": 2.5266973972320557,
-      "learning_rate": 8.863126402393418e-06,
-      "loss": 1.6501,
+      "epoch": 4.4,
+      "grad_norm": 2.2897322177886963,
+      "learning_rate": 2.8000000000000003e-05,
+      "loss": 1.6826,
       "step": 11000
     },
     {
-      "epoch": 8.601346297681376,
-      "grad_norm": 1.9921244382858276,
-      "learning_rate": 6.993268511593119e-06,
-      "loss": 1.6351,
+      "epoch": 4.6,
+      "grad_norm": 2.7029635906219482,
+      "learning_rate": 2.7000000000000002e-05,
+      "loss": 1.6649,
       "step": 11500
     },
     {
-      "epoch": 8.975317875841435,
-      "grad_norm": 2.633855104446411,
-      "learning_rate": 5.12341062079282e-06,
-      "loss": 1.6507,
+      "epoch": 4.8,
+      "grad_norm": 2.453059673309326,
+      "learning_rate": 2.6000000000000002e-05,
+      "loss": 1.6488,
       "step": 12000
     },
     {
-      "epoch": 9.349289454001497,
-      "grad_norm": 2.9301204681396484,
-      "learning_rate": 3.2535527299925206e-06,
-      "loss": 1.6296,
+      "epoch": 5.0,
+      "grad_norm": 2.173687219619751,
+      "learning_rate": 2.5e-05,
+      "loss": 1.652,
       "step": 12500
     },
     {
-      "epoch": 9.723261032161556,
-      "grad_norm": 2.9712412357330322,
-      "learning_rate": 1.3836948391922214e-06,
-      "loss": 1.6316,
+      "epoch": 5.2,
+      "grad_norm": 2.9122655391693115,
+      "learning_rate": 2.4e-05,
+      "loss": 1.6354,
       "step": 13000
     },
+    {
+      "epoch": 5.4,
+      "grad_norm": 2.8792638778686523,
+      "learning_rate": 2.3000000000000003e-05,
+      "loss": 1.6423,
+      "step": 13500
+    },
+    {
+      "epoch": 5.6,
+      "grad_norm": 3.0389626026153564,
+      "learning_rate": 2.2000000000000003e-05,
+      "loss": 1.6591,
+      "step": 14000
+    },
+    {
+      "epoch": 5.8,
+      "grad_norm": 2.6412346363067627,
+      "learning_rate": 2.1e-05,
+      "loss": 1.6365,
+      "step": 14500
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 2.6655163764953613,
+      "learning_rate": 2e-05,
+      "loss": 1.6133,
+      "step": 15000
+    },
+    {
+      "epoch": 6.2,
+      "grad_norm": 2.6776795387268066,
+      "learning_rate": 1.9e-05,
+      "loss": 1.603,
+      "step": 15500
+    },
+    {
+      "epoch": 6.4,
+      "grad_norm": 3.4206550121307373,
+      "learning_rate": 1.8e-05,
+      "loss": 1.6142,
+      "step": 16000
+    },
+    {
+      "epoch": 6.6,
+      "grad_norm": 3.59487247467041,
+      "learning_rate": 1.7000000000000003e-05,
+      "loss": 1.6193,
+      "step": 16500
+    },
+    {
+      "epoch": 6.8,
+      "grad_norm": 2.707260847091675,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 1.618,
+      "step": 17000
+    },
+    {
+      "epoch": 7.0,
+      "grad_norm": 2.4932329654693604,
+      "learning_rate": 1.5e-05,
+      "loss": 1.6179,
+      "step": 17500
+    },
+    {
+      "epoch": 7.2,
+      "grad_norm": 2.7114312648773193,
+      "learning_rate": 1.4000000000000001e-05,
+      "loss": 1.6064,
+      "step": 18000
+    },
+    {
+      "epoch": 7.4,
+      "grad_norm": 2.9900901317596436,
+      "learning_rate": 1.3000000000000001e-05,
+      "loss": 1.6079,
+      "step": 18500
+    },
+    {
+      "epoch": 7.6,
+      "grad_norm": 2.5069007873535156,
+      "learning_rate": 1.2e-05,
+      "loss": 1.5917,
+      "step": 19000
+    },
+    {
+      "epoch": 7.8,
+      "grad_norm": 2.816830635070801,
+      "learning_rate": 1.1000000000000001e-05,
+      "loss": 1.6159,
+      "step": 19500
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 2.500258445739746,
+      "learning_rate": 1e-05,
+      "loss": 1.586,
+      "step": 20000
+    },
+    {
+      "epoch": 8.2,
+      "grad_norm": 2.392134189605713,
+      "learning_rate": 9e-06,
+      "loss": 1.5948,
+      "step": 20500
+    },
+    {
+      "epoch": 8.4,
+      "grad_norm": 2.19275164604187,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 1.5904,
+      "step": 21000
+    },
+    {
+      "epoch": 8.6,
+      "grad_norm": 2.1356213092803955,
+      "learning_rate": 7.000000000000001e-06,
+      "loss": 1.5772,
+      "step": 21500
+    },
+    {
+      "epoch": 8.8,
+      "grad_norm": 2.0939230918884277,
+      "learning_rate": 6e-06,
+      "loss": 1.604,
+      "step": 22000
+    },
+    {
+      "epoch": 9.0,
+      "grad_norm": 2.8503177165985107,
+      "learning_rate": 5e-06,
+      "loss": 1.5878,
+      "step": 22500
+    },
+    {
+      "epoch": 9.2,
+      "grad_norm": 2.6212220191955566,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 1.5893,
+      "step": 23000
+    },
+    {
+      "epoch": 9.4,
+      "grad_norm": 2.9165279865264893,
+      "learning_rate": 3e-06,
+      "loss": 1.5796,
+      "step": 23500
+    },
+    {
+      "epoch": 9.6,
+      "grad_norm": 2.8491413593292236,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 1.5832,
+      "step": 24000
+    },
+    {
+      "epoch": 9.8,
+      "grad_norm": 3.690595865249634,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 1.5872,
+      "step": 24500
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 3.193439245223999,
+      "learning_rate": 0.0,
+      "loss": 1.5781,
+      "step": 25000
+    },
     {
       "epoch": 10.0,
-      "step": 13370,
-      "total_flos": 1.7358769019486208e+16,
-      "train_loss": 1.7481338489563856,
-      "train_runtime": 1121.6717,
-      "train_samples_per_second": 95.331,
-      "train_steps_per_second": 11.92
+      "step": 25000,
+      "total_flos": 3.2973681787109376e+16,
+      "train_loss": 1.6927287036132812,
+      "train_runtime": 2178.3738,
+      "train_samples_per_second": 91.812,
+      "train_steps_per_second": 11.476
     }
   ],
   "logging_steps": 500,
-  "max_steps": 13370,
+  "max_steps": 25000,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 500,
@@ -217,7 +385,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.7358769019486208e+16,
+  "total_flos": 3.2973681787109376e+16,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null