1024m committed on
Commit
e67cb15
1 Parent(s): ce5020a

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +135 -262
  6. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:843cb11d766af36ac71491251c6a59e1c18acf437ae2a81f66e4f4c75fe0c1fd
3
  size 1629436964
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f242ee095f7638b7463b942830986f1a347b04482aa841b4677146bd81ef888d
3
  size 1629436964
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c37b72249ab670a79f95e7a8a28b4295bc0d2474b70ef75088b7f94377a21cd
3
  size 3259184731
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64f655fa0dd0081f2268b67b7028d6c401ddca176d518cebfa125e630d681ef2
3
  size 3259184731
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c615480f9132ca44d1a64f61d0ad33b4cfee708b0079b7d90597306c265b6e39
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b8b8f78f8ab97f155a53e04798b283255c70c7d783ca3734914d92eb7738c00
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20f8ea15f450d448adf5f088f5e69b322ba185360623de1fad0e8c5fa0932135
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb84971a8233865d3879ef75d98a5c44792abce4fbe173916e56341b9753aae6
3
  size 1064
trainer_state.json CHANGED
@@ -1,378 +1,251 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 8.0,
5
  "eval_steps": 500,
6
- "global_step": 4232,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.19,
13
- "grad_norm": 6.645576477050781,
14
- "learning_rate": 1.9810964083175805e-05,
15
- "loss": 0.5588,
16
  "step": 100
17
  },
18
  {
19
- "epoch": 0.38,
20
- "grad_norm": 4.662398815155029,
21
- "learning_rate": 1.962192816635161e-05,
22
- "loss": 0.3737,
23
  "step": 200
24
  },
25
  {
26
- "epoch": 0.57,
27
- "grad_norm": 5.282985687255859,
28
- "learning_rate": 1.9432892249527412e-05,
29
- "loss": 0.2863,
30
  "step": 300
31
  },
32
  {
33
- "epoch": 0.76,
34
- "grad_norm": 1.5854848623275757,
35
- "learning_rate": 1.9243856332703215e-05,
36
- "loss": 0.323,
37
  "step": 400
38
  },
39
  {
40
- "epoch": 0.95,
41
- "grad_norm": 19.09914779663086,
42
- "learning_rate": 1.905482041587902e-05,
43
- "loss": 0.2956,
44
  "step": 500
45
  },
46
  {
47
- "epoch": 1.0,
48
- "eval_loss": 0.20477035641670227,
49
- "eval_runtime": 4.245,
50
- "eval_samples_per_second": 91.638,
51
- "eval_steps_per_second": 6.596,
52
- "step": 529
53
- },
54
- {
55
- "epoch": 1.13,
56
- "grad_norm": 26.59105682373047,
57
- "learning_rate": 1.8865784499054822e-05,
58
- "loss": 0.2269,
59
  "step": 600
60
  },
61
  {
62
- "epoch": 1.32,
63
- "grad_norm": 9.394390106201172,
64
- "learning_rate": 1.8676748582230626e-05,
65
- "loss": 0.2393,
66
  "step": 700
67
  },
68
  {
69
- "epoch": 1.51,
70
- "grad_norm": 13.779593467712402,
71
- "learning_rate": 1.848771266540643e-05,
72
- "loss": 0.2135,
73
  "step": 800
74
  },
75
  {
76
- "epoch": 1.7,
77
- "grad_norm": 0.6053704619407654,
78
- "learning_rate": 1.8298676748582232e-05,
79
- "loss": 0.2241,
80
  "step": 900
81
  },
82
  {
83
- "epoch": 1.89,
84
- "grad_norm": 16.190296173095703,
85
- "learning_rate": 1.8109640831758036e-05,
86
- "loss": 0.2355,
87
  "step": 1000
88
  },
89
  {
90
- "epoch": 2.0,
91
- "eval_loss": 0.21921110153198242,
92
- "eval_runtime": 4.2197,
93
- "eval_samples_per_second": 92.187,
94
- "eval_steps_per_second": 6.636,
95
- "step": 1058
96
- },
97
- {
98
- "epoch": 2.08,
99
- "grad_norm": 0.10688259452581406,
100
- "learning_rate": 1.792060491493384e-05,
101
- "loss": 0.1909,
102
  "step": 1100
103
  },
104
  {
105
- "epoch": 2.27,
106
- "grad_norm": 29.609710693359375,
107
- "learning_rate": 1.7731568998109643e-05,
108
- "loss": 0.1637,
109
  "step": 1200
110
  },
111
  {
112
- "epoch": 2.46,
113
- "grad_norm": 0.16163112223148346,
114
- "learning_rate": 1.7542533081285446e-05,
115
- "loss": 0.1828,
116
  "step": 1300
117
  },
118
  {
119
- "epoch": 2.65,
120
- "grad_norm": 2.127821207046509,
121
- "learning_rate": 1.735349716446125e-05,
122
- "loss": 0.1827,
123
  "step": 1400
124
  },
125
  {
126
- "epoch": 2.84,
127
- "grad_norm": 0.030160142108798027,
128
- "learning_rate": 1.7164461247637053e-05,
129
- "loss": 0.1528,
130
  "step": 1500
131
  },
132
  {
133
- "epoch": 3.0,
134
- "eval_loss": 0.2590792179107666,
135
- "eval_runtime": 4.2371,
136
- "eval_samples_per_second": 91.808,
137
- "eval_steps_per_second": 6.608,
138
- "step": 1587
139
- },
140
- {
141
- "epoch": 3.02,
142
- "grad_norm": 0.0637928918004036,
143
- "learning_rate": 1.6975425330812856e-05,
144
- "loss": 0.1574,
145
  "step": 1600
146
  },
147
  {
148
- "epoch": 3.21,
149
- "grad_norm": 0.01419603731483221,
150
- "learning_rate": 1.678638941398866e-05,
151
- "loss": 0.0918,
152
  "step": 1700
153
  },
154
  {
155
- "epoch": 3.4,
156
- "grad_norm": 9.31396770477295,
157
- "learning_rate": 1.6597353497164463e-05,
158
- "loss": 0.1175,
159
  "step": 1800
160
  },
161
  {
162
- "epoch": 3.59,
163
- "grad_norm": 0.023506687954068184,
164
- "learning_rate": 1.6408317580340267e-05,
165
- "loss": 0.1094,
166
  "step": 1900
167
  },
168
  {
169
- "epoch": 3.78,
170
- "grad_norm": 13.4344482421875,
171
- "learning_rate": 1.621928166351607e-05,
172
- "loss": 0.1103,
173
  "step": 2000
174
  },
175
  {
176
- "epoch": 3.97,
177
- "grad_norm": 30.01926040649414,
178
- "learning_rate": 1.6030245746691873e-05,
179
- "loss": 0.1398,
180
  "step": 2100
181
  },
182
  {
183
- "epoch": 4.0,
184
- "eval_loss": 0.33959057927131653,
185
- "eval_runtime": 4.2467,
186
- "eval_samples_per_second": 91.601,
187
- "eval_steps_per_second": 6.593,
188
- "step": 2116
189
- },
190
- {
191
- "epoch": 4.16,
192
- "grad_norm": 0.12888406217098236,
193
- "learning_rate": 1.5841209829867677e-05,
194
- "loss": 0.0536,
195
  "step": 2200
196
  },
197
  {
198
- "epoch": 4.35,
199
- "grad_norm": 0.06800971180200577,
200
- "learning_rate": 1.565217391304348e-05,
201
- "loss": 0.1039,
202
  "step": 2300
203
  },
204
  {
205
- "epoch": 4.54,
206
- "grad_norm": 2.495684862136841,
207
- "learning_rate": 1.5463137996219284e-05,
208
- "loss": 0.0886,
209
  "step": 2400
210
  },
211
  {
212
- "epoch": 4.73,
213
- "grad_norm": 3.7909159660339355,
214
- "learning_rate": 1.5274102079395087e-05,
215
- "loss": 0.0766,
216
  "step": 2500
217
  },
218
  {
219
- "epoch": 4.91,
220
- "grad_norm": 0.025901077315211296,
221
- "learning_rate": 1.5085066162570889e-05,
222
- "loss": 0.0916,
223
  "step": 2600
224
  },
225
  {
226
- "epoch": 5.0,
227
- "eval_loss": 0.3040069341659546,
228
- "eval_runtime": 4.2484,
229
- "eval_samples_per_second": 91.564,
230
- "eval_steps_per_second": 6.591,
231
- "step": 2645
232
- },
233
- {
234
- "epoch": 5.1,
235
- "grad_norm": 0.011547481641173363,
236
- "learning_rate": 1.4896030245746694e-05,
237
- "loss": 0.0644,
238
  "step": 2700
239
  },
240
  {
241
- "epoch": 5.29,
242
- "grad_norm": 0.026049258187413216,
243
- "learning_rate": 1.4706994328922497e-05,
244
- "loss": 0.0542,
245
  "step": 2800
246
  },
247
  {
248
- "epoch": 5.48,
249
- "grad_norm": 0.35288381576538086,
250
- "learning_rate": 1.45179584120983e-05,
251
- "loss": 0.0512,
252
  "step": 2900
253
  },
254
  {
255
- "epoch": 5.67,
256
- "grad_norm": 0.023430563509464264,
257
- "learning_rate": 1.4328922495274103e-05,
258
  "loss": 0.0491,
259
  "step": 3000
260
  },
261
  {
262
- "epoch": 5.86,
263
- "grad_norm": 0.0032355256844311953,
264
- "learning_rate": 1.4139886578449906e-05,
265
- "loss": 0.0326,
266
  "step": 3100
267
  },
268
  {
269
- "epoch": 6.0,
270
- "eval_loss": 0.3605019450187683,
271
- "eval_runtime": 4.2462,
272
- "eval_samples_per_second": 91.611,
273
- "eval_steps_per_second": 6.594,
274
- "step": 3174
275
- },
276
- {
277
- "epoch": 6.05,
278
- "grad_norm": 0.1259104311466217,
279
- "learning_rate": 1.395085066162571e-05,
280
- "loss": 0.0665,
281
  "step": 3200
282
  },
283
  {
284
- "epoch": 6.24,
285
- "grad_norm": 0.08823427557945251,
286
- "learning_rate": 1.3761814744801514e-05,
287
- "loss": 0.0252,
288
  "step": 3300
289
- },
290
- {
291
- "epoch": 6.43,
292
- "grad_norm": 101.22339630126953,
293
- "learning_rate": 1.3572778827977318e-05,
294
- "loss": 0.0416,
295
- "step": 3400
296
- },
297
- {
298
- "epoch": 6.62,
299
- "grad_norm": 0.033835213631391525,
300
- "learning_rate": 1.338374291115312e-05,
301
- "loss": 0.0465,
302
- "step": 3500
303
- },
304
- {
305
- "epoch": 6.81,
306
- "grad_norm": 0.15451325476169586,
307
- "learning_rate": 1.3194706994328923e-05,
308
- "loss": 0.0358,
309
- "step": 3600
310
- },
311
- {
312
- "epoch": 6.99,
313
- "grad_norm": 0.0031314522493630648,
314
- "learning_rate": 1.3005671077504726e-05,
315
- "loss": 0.0344,
316
- "step": 3700
317
- },
318
- {
319
- "epoch": 7.0,
320
- "eval_loss": 0.41286924481391907,
321
- "eval_runtime": 4.2519,
322
- "eval_samples_per_second": 91.488,
323
- "eval_steps_per_second": 6.585,
324
- "step": 3703
325
- },
326
- {
327
- "epoch": 7.18,
328
- "grad_norm": 0.0014451502356678247,
329
- "learning_rate": 1.281663516068053e-05,
330
- "loss": 0.0148,
331
- "step": 3800
332
- },
333
- {
334
- "epoch": 7.37,
335
- "grad_norm": 17.679460525512695,
336
- "learning_rate": 1.2627599243856335e-05,
337
- "loss": 0.0309,
338
- "step": 3900
339
- },
340
- {
341
- "epoch": 7.56,
342
- "grad_norm": 0.03577155992388725,
343
- "learning_rate": 1.2438563327032138e-05,
344
- "loss": 0.0235,
345
- "step": 4000
346
- },
347
- {
348
- "epoch": 7.75,
349
- "grad_norm": 28.639484405517578,
350
- "learning_rate": 1.224952741020794e-05,
351
- "loss": 0.0296,
352
- "step": 4100
353
- },
354
- {
355
- "epoch": 7.94,
356
- "grad_norm": 0.00975144561380148,
357
- "learning_rate": 1.2060491493383744e-05,
358
- "loss": 0.0218,
359
- "step": 4200
360
- },
361
- {
362
- "epoch": 8.0,
363
- "eval_loss": 0.3556581735610962,
364
- "eval_runtime": 4.245,
365
- "eval_samples_per_second": 91.637,
366
- "eval_steps_per_second": 6.596,
367
- "step": 4232
368
  }
369
  ],
370
  "logging_steps": 100,
371
- "max_steps": 10580,
372
  "num_input_tokens_seen": 0,
373
  "num_train_epochs": 20,
374
  "save_steps": 500,
375
- "total_flos": 4.321523491876685e+16,
376
  "train_batch_size": 14,
377
  "trial_name": null,
378
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 6.0,
5
  "eval_steps": 500,
6
+ "global_step": 3342,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.18,
13
+ "grad_norm": 6.054772853851318,
14
+ "learning_rate": 1.9820466786355476e-05,
15
+ "loss": 0.5721,
16
  "step": 100
17
  },
18
  {
19
+ "epoch": 0.36,
20
+ "grad_norm": 9.810357093811035,
21
+ "learning_rate": 1.9640933572710953e-05,
22
+ "loss": 0.3846,
23
  "step": 200
24
  },
25
  {
26
+ "epoch": 0.54,
27
+ "grad_norm": 9.026122093200684,
28
+ "learning_rate": 1.9461400359066428e-05,
29
+ "loss": 0.3019,
30
  "step": 300
31
  },
32
  {
33
+ "epoch": 0.72,
34
+ "grad_norm": 11.95788288116455,
35
+ "learning_rate": 1.9281867145421905e-05,
36
+ "loss": 0.3071,
37
  "step": 400
38
  },
39
  {
40
+ "epoch": 0.9,
41
+ "grad_norm": 15.329608917236328,
42
+ "learning_rate": 1.910233393177738e-05,
43
+ "loss": 0.2907,
44
  "step": 500
45
  },
46
  {
47
+ "epoch": 1.08,
48
+ "grad_norm": 12.364314079284668,
49
+ "learning_rate": 1.8922800718132857e-05,
50
+ "loss": 0.2646,
 
 
 
 
 
 
 
 
51
  "step": 600
52
  },
53
  {
54
+ "epoch": 1.26,
55
+ "grad_norm": 14.555986404418945,
56
+ "learning_rate": 1.874326750448833e-05,
57
+ "loss": 0.2097,
58
  "step": 700
59
  },
60
  {
61
+ "epoch": 1.44,
62
+ "grad_norm": 4.199421405792236,
63
+ "learning_rate": 1.8563734290843805e-05,
64
+ "loss": 0.2389,
65
  "step": 800
66
  },
67
  {
68
+ "epoch": 1.62,
69
+ "grad_norm": 22.485984802246094,
70
+ "learning_rate": 1.8384201077199283e-05,
71
+ "loss": 0.2019,
72
  "step": 900
73
  },
74
  {
75
+ "epoch": 1.8,
76
+ "grad_norm": 9.688520431518555,
77
+ "learning_rate": 1.820466786355476e-05,
78
+ "loss": 0.2594,
79
  "step": 1000
80
  },
81
  {
82
+ "epoch": 1.97,
83
+ "grad_norm": 18.052719116210938,
84
+ "learning_rate": 1.8025134649910235e-05,
85
+ "loss": 0.2044,
 
 
 
 
 
 
 
 
86
  "step": 1100
87
  },
88
  {
89
+ "epoch": 2.15,
90
+ "grad_norm": 0.7371789216995239,
91
+ "learning_rate": 1.7845601436265712e-05,
92
+ "loss": 0.1551,
93
  "step": 1200
94
  },
95
  {
96
+ "epoch": 2.33,
97
+ "grad_norm": 20.938648223876953,
98
+ "learning_rate": 1.7666068222621186e-05,
99
+ "loss": 0.1463,
100
  "step": 1300
101
  },
102
  {
103
+ "epoch": 2.51,
104
+ "grad_norm": 0.25227147340774536,
105
+ "learning_rate": 1.748653500897666e-05,
106
+ "loss": 0.1493,
107
  "step": 1400
108
  },
109
  {
110
+ "epoch": 2.69,
111
+ "grad_norm": 0.27634137868881226,
112
+ "learning_rate": 1.7307001795332138e-05,
113
+ "loss": 0.1649,
114
  "step": 1500
115
  },
116
  {
117
+ "epoch": 2.87,
118
+ "grad_norm": 0.2588340938091278,
119
+ "learning_rate": 1.7127468581687616e-05,
120
+ "loss": 0.1521,
 
 
 
 
 
 
 
 
121
  "step": 1600
122
  },
123
  {
124
+ "epoch": 3.05,
125
+ "grad_norm": 0.05350634083151817,
126
+ "learning_rate": 1.694793536804309e-05,
127
+ "loss": 0.1343,
128
  "step": 1700
129
  },
130
  {
131
+ "epoch": 3.23,
132
+ "grad_norm": 0.02972230687737465,
133
+ "learning_rate": 1.6768402154398564e-05,
134
+ "loss": 0.1068,
135
  "step": 1800
136
  },
137
  {
138
+ "epoch": 3.41,
139
+ "grad_norm": 0.09572970867156982,
140
+ "learning_rate": 1.658886894075404e-05,
141
+ "loss": 0.1151,
142
  "step": 1900
143
  },
144
  {
145
+ "epoch": 3.59,
146
+ "grad_norm": 21.431325912475586,
147
+ "learning_rate": 1.6409335727109516e-05,
148
+ "loss": 0.1073,
149
  "step": 2000
150
  },
151
  {
152
+ "epoch": 3.77,
153
+ "grad_norm": 1.4688669443130493,
154
+ "learning_rate": 1.6229802513464993e-05,
155
+ "loss": 0.1098,
156
  "step": 2100
157
  },
158
  {
159
+ "epoch": 3.95,
160
+ "grad_norm": 19.461355209350586,
161
+ "learning_rate": 1.6050269299820467e-05,
162
+ "loss": 0.1238,
 
 
 
 
 
 
 
 
163
  "step": 2200
164
  },
165
  {
166
+ "epoch": 4.13,
167
+ "grad_norm": 6.33543586730957,
168
+ "learning_rate": 1.5870736086175945e-05,
169
+ "loss": 0.0934,
170
  "step": 2300
171
  },
172
  {
173
+ "epoch": 4.31,
174
+ "grad_norm": 10.25698184967041,
175
+ "learning_rate": 1.569120287253142e-05,
176
+ "loss": 0.068,
177
  "step": 2400
178
  },
179
  {
180
+ "epoch": 4.49,
181
+ "grad_norm": 0.05421575903892517,
182
+ "learning_rate": 1.5511669658886893e-05,
183
+ "loss": 0.0767,
184
  "step": 2500
185
  },
186
  {
187
+ "epoch": 4.67,
188
+ "grad_norm": 0.04917303845286369,
189
+ "learning_rate": 1.533213644524237e-05,
190
+ "loss": 0.1053,
191
  "step": 2600
192
  },
193
  {
194
+ "epoch": 4.85,
195
+ "grad_norm": 1.722901463508606,
196
+ "learning_rate": 1.5152603231597847e-05,
197
+ "loss": 0.0513,
 
 
 
 
 
 
 
 
198
  "step": 2700
199
  },
200
  {
201
+ "epoch": 5.03,
202
+ "grad_norm": 28.548158645629883,
203
+ "learning_rate": 1.4973070017953321e-05,
204
+ "loss": 0.0611,
205
  "step": 2800
206
  },
207
  {
208
+ "epoch": 5.21,
209
+ "grad_norm": 1.0562294721603394,
210
+ "learning_rate": 1.4793536804308799e-05,
211
+ "loss": 0.0543,
212
  "step": 2900
213
  },
214
  {
215
+ "epoch": 5.39,
216
+ "grad_norm": 0.011326675303280354,
217
+ "learning_rate": 1.4614003590664274e-05,
218
  "loss": 0.0491,
219
  "step": 3000
220
  },
221
  {
222
+ "epoch": 5.57,
223
+ "grad_norm": 0.009987740777432919,
224
+ "learning_rate": 1.4434470377019749e-05,
225
+ "loss": 0.0567,
226
  "step": 3100
227
  },
228
  {
229
+ "epoch": 5.75,
230
+ "grad_norm": 26.81354331970215,
231
+ "learning_rate": 1.4254937163375226e-05,
232
+ "loss": 0.0641,
 
 
 
 
 
 
 
 
233
  "step": 3200
234
  },
235
  {
236
+ "epoch": 5.92,
237
+ "grad_norm": 0.042816389352083206,
238
+ "learning_rate": 1.4075403949730702e-05,
239
+ "loss": 0.0488,
240
  "step": 3300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  }
242
  ],
243
  "logging_steps": 100,
244
+ "max_steps": 11140,
245
  "num_input_tokens_seen": 0,
246
  "num_train_epochs": 20,
247
  "save_steps": 500,
248
+ "total_flos": 3.4115676633458784e+16,
249
  "train_batch_size": 14,
250
  "trial_name": null,
251
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e38d84d4a8a34ed1c3d4c1693843cfd19f859389604ec942e9d423aad70c82fc
3
  size 4856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:491d871fc503f6c09f6650ccce3a497d9ac84504786a77113305d7344d9a5d08
3
  size 4856