shihaozou commited on
Commit
4b56d25
1 Parent(s): d430465

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -27,5 +27,5 @@
27
  "transformers_version": "4.41.2",
28
  "type_vocab_size": 2,
29
  "use_cache": true,
30
- "vocab_size": 105879
31
  }
 
27
  "transformers_version": "4.41.2",
28
  "type_vocab_size": 2,
29
  "use_cache": true,
30
+ "vocab_size": 176008
31
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1227fc9d063afe8a916525265451a011ad282688af85f24698698f65acebf31
3
- size 669879044
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac2d545e95d53dbd847e3dfb313fc59d64125bf026261ddc71edbe677f2edb81
3
+ size 885595848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d70e5e168a8b362f4912c842ebf94d0b361157d833012e71f64d87a5a60fcb56
3
- size 1339879610
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbc5e79a5f32a4ee4d255a20aeadc3cfbad6eb4cc0d92611370c0dfe67fc9dd5
3
+ size 1771313210
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:120972ded6aee57bd52e0678e4bc85f1cc274f4a37aa9bd80c07bf52890268c4
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd920835b7e1d3f942ead294d5d63bc784d9e6ec27e5553642cbea5f2c9e77c4
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f37a3a9153cdef7e42b212fa871ec5e0fa2010d69e2cd26dedc7993b39ac3e58
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5252eff43b3c3312cd40cbc7cbc932038114747237577193618542ca149ff6e
3
  size 1064
last-checkpoint/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer_config.json CHANGED
@@ -8,7 +8,7 @@
8
  "single_word": false,
9
  "special": true
10
  },
11
- "100": {
12
  "content": "[UNK]",
13
  "lstrip": false,
14
  "normalized": false,
@@ -16,7 +16,7 @@
16
  "single_word": false,
17
  "special": true
18
  },
19
- "101": {
20
  "content": "[CLS]",
21
  "lstrip": false,
22
  "normalized": false,
@@ -24,7 +24,7 @@
24
  "single_word": false,
25
  "special": true
26
  },
27
- "102": {
28
  "content": "[SEP]",
29
  "lstrip": false,
30
  "normalized": false,
@@ -32,7 +32,7 @@
32
  "single_word": false,
33
  "special": true
34
  },
35
- "103": {
36
  "content": "[MASK]",
37
  "lstrip": false,
38
  "normalized": false,
 
8
  "single_word": false,
9
  "special": true
10
  },
11
+ "1": {
12
  "content": "[UNK]",
13
  "lstrip": false,
14
  "normalized": false,
 
16
  "single_word": false,
17
  "special": true
18
  },
19
+ "2": {
20
  "content": "[CLS]",
21
  "lstrip": false,
22
  "normalized": false,
 
24
  "single_word": false,
25
  "special": true
26
  },
27
+ "3": {
28
  "content": "[SEP]",
29
  "lstrip": false,
30
  "normalized": false,
 
32
  "single_word": false,
33
  "special": true
34
  },
35
+ "4": {
36
  "content": "[MASK]",
37
  "lstrip": false,
38
  "normalized": false,
last-checkpoint/trainer_state.json CHANGED
@@ -1,663 +1,19 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.352412529447692,
5
  "eval_steps": 500,
6
- "global_step": 46500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.014542070209114969,
13
- "grad_norm": 14.519265174865723,
14
  "learning_rate": 1.9903052865272566e-05,
15
- "loss": 0.9249,
16
  "step": 500
17
- },
18
- {
19
- "epoch": 0.029084140418229938,
20
- "grad_norm": 16.82585906982422,
21
- "learning_rate": 1.9806105730545135e-05,
22
- "loss": 0.7196,
23
- "step": 1000
24
- },
25
- {
26
- "epoch": 0.04362621062734491,
27
- "grad_norm": 7.649247646331787,
28
- "learning_rate": 1.9709158595817703e-05,
29
- "loss": 0.6657,
30
- "step": 1500
31
- },
32
- {
33
- "epoch": 0.058168280836459876,
34
- "grad_norm": 15.57618522644043,
35
- "learning_rate": 1.961221146109027e-05,
36
- "loss": 0.6333,
37
- "step": 2000
38
- },
39
- {
40
- "epoch": 0.07271035104557484,
41
- "grad_norm": 8.144426345825195,
42
- "learning_rate": 1.9515264326362836e-05,
43
- "loss": 0.6274,
44
- "step": 2500
45
- },
46
- {
47
- "epoch": 0.08725242125468982,
48
- "grad_norm": 5.530069828033447,
49
- "learning_rate": 1.9418317191635404e-05,
50
- "loss": 0.5996,
51
- "step": 3000
52
- },
53
- {
54
- "epoch": 0.10179449146380479,
55
- "grad_norm": 17.099382400512695,
56
- "learning_rate": 1.932137005690797e-05,
57
- "loss": 0.5851,
58
- "step": 3500
59
- },
60
- {
61
- "epoch": 0.11633656167291975,
62
- "grad_norm": 2.200979709625244,
63
- "learning_rate": 1.9224422922180537e-05,
64
- "loss": 0.6288,
65
- "step": 4000
66
- },
67
- {
68
- "epoch": 0.13087863188203472,
69
- "grad_norm": 13.900425910949707,
70
- "learning_rate": 1.91274757874531e-05,
71
- "loss": 0.6023,
72
- "step": 4500
73
- },
74
- {
75
- "epoch": 0.14542070209114968,
76
- "grad_norm": 2.3364081382751465,
77
- "learning_rate": 1.903052865272567e-05,
78
- "loss": 0.5997,
79
- "step": 5000
80
- },
81
- {
82
- "epoch": 0.15996277230026468,
83
- "grad_norm": 60.922637939453125,
84
- "learning_rate": 1.8933581517998238e-05,
85
- "loss": 0.5695,
86
- "step": 5500
87
- },
88
- {
89
- "epoch": 0.17450484250937964,
90
- "grad_norm": 5.009929656982422,
91
- "learning_rate": 1.8836634383270806e-05,
92
- "loss": 0.5434,
93
- "step": 6000
94
- },
95
- {
96
- "epoch": 0.1890469127184946,
97
- "grad_norm": 10.972416877746582,
98
- "learning_rate": 1.873968724854337e-05,
99
- "loss": 0.5832,
100
- "step": 6500
101
- },
102
- {
103
- "epoch": 0.20358898292760957,
104
- "grad_norm": 3.8107187747955322,
105
- "learning_rate": 1.864274011381594e-05,
106
- "loss": 0.5418,
107
- "step": 7000
108
- },
109
- {
110
- "epoch": 0.21813105313672454,
111
- "grad_norm": 1.1021103858947754,
112
- "learning_rate": 1.8545792979088504e-05,
113
- "loss": 0.5869,
114
- "step": 7500
115
- },
116
- {
117
- "epoch": 0.2326731233458395,
118
- "grad_norm": 4.626884460449219,
119
- "learning_rate": 1.844884584436107e-05,
120
- "loss": 0.5532,
121
- "step": 8000
122
- },
123
- {
124
- "epoch": 0.24721519355495447,
125
- "grad_norm": 2.25512957572937,
126
- "learning_rate": 1.8351898709633636e-05,
127
- "loss": 0.5484,
128
- "step": 8500
129
- },
130
- {
131
- "epoch": 0.26175726376406944,
132
- "grad_norm": 10.878036499023438,
133
- "learning_rate": 1.8254951574906205e-05,
134
- "loss": 0.517,
135
- "step": 9000
136
- },
137
- {
138
- "epoch": 0.2762993339731844,
139
- "grad_norm": 0.1631862372159958,
140
- "learning_rate": 1.8158004440178773e-05,
141
- "loss": 0.5495,
142
- "step": 9500
143
- },
144
- {
145
- "epoch": 0.29084140418229937,
146
- "grad_norm": 4.393772125244141,
147
- "learning_rate": 1.806105730545134e-05,
148
- "loss": 0.5365,
149
- "step": 10000
150
- },
151
- {
152
- "epoch": 0.3053834743914144,
153
- "grad_norm": 3.9716999530792236,
154
- "learning_rate": 1.7964110170723906e-05,
155
- "loss": 0.5463,
156
- "step": 10500
157
- },
158
- {
159
- "epoch": 0.31992554460052935,
160
- "grad_norm": 16.515634536743164,
161
- "learning_rate": 1.7867163035996474e-05,
162
- "loss": 0.5367,
163
- "step": 11000
164
- },
165
- {
166
- "epoch": 0.3344676148096443,
167
- "grad_norm": 8.901307106018066,
168
- "learning_rate": 1.777021590126904e-05,
169
- "loss": 0.5279,
170
- "step": 11500
171
- },
172
- {
173
- "epoch": 0.3490096850187593,
174
- "grad_norm": 5.7714524269104,
175
- "learning_rate": 1.7673268766541607e-05,
176
- "loss": 0.5694,
177
- "step": 12000
178
- },
179
- {
180
- "epoch": 0.36355175522787425,
181
- "grad_norm": 14.238178253173828,
182
- "learning_rate": 1.757632163181417e-05,
183
- "loss": 0.5453,
184
- "step": 12500
185
- },
186
- {
187
- "epoch": 0.3780938254369892,
188
- "grad_norm": 2.3930513858795166,
189
- "learning_rate": 1.747937449708674e-05,
190
- "loss": 0.5055,
191
- "step": 13000
192
- },
193
- {
194
- "epoch": 0.3926358956461042,
195
- "grad_norm": 0.9438181519508362,
196
- "learning_rate": 1.7382427362359308e-05,
197
- "loss": 0.5561,
198
- "step": 13500
199
- },
200
- {
201
- "epoch": 0.40717796585521915,
202
- "grad_norm": 19.889507293701172,
203
- "learning_rate": 1.7285480227631876e-05,
204
- "loss": 0.5267,
205
- "step": 14000
206
- },
207
- {
208
- "epoch": 0.4217200360643341,
209
- "grad_norm": 9.12895393371582,
210
- "learning_rate": 1.718853309290444e-05,
211
- "loss": 0.5353,
212
- "step": 14500
213
- },
214
- {
215
- "epoch": 0.4362621062734491,
216
- "grad_norm": 6.552937030792236,
217
- "learning_rate": 1.709158595817701e-05,
218
- "loss": 0.5231,
219
- "step": 15000
220
- },
221
- {
222
- "epoch": 0.45080417648256405,
223
- "grad_norm": 6.750959396362305,
224
- "learning_rate": 1.6994638823449574e-05,
225
- "loss": 0.4972,
226
- "step": 15500
227
- },
228
- {
229
- "epoch": 0.465346246691679,
230
- "grad_norm": 10.956033706665039,
231
- "learning_rate": 1.689769168872214e-05,
232
- "loss": 0.4949,
233
- "step": 16000
234
- },
235
- {
236
- "epoch": 0.479888316900794,
237
- "grad_norm": 3.459519863128662,
238
- "learning_rate": 1.6800744553994706e-05,
239
- "loss": 0.5219,
240
- "step": 16500
241
- },
242
- {
243
- "epoch": 0.49443038710990894,
244
- "grad_norm": 7.870626926422119,
245
- "learning_rate": 1.6703797419267275e-05,
246
- "loss": 0.5049,
247
- "step": 17000
248
- },
249
- {
250
- "epoch": 0.508972457319024,
251
- "grad_norm": 9.513204574584961,
252
- "learning_rate": 1.6606850284539843e-05,
253
- "loss": 0.5365,
254
- "step": 17500
255
- },
256
- {
257
- "epoch": 0.5235145275281389,
258
- "grad_norm": 4.7530951499938965,
259
- "learning_rate": 1.650990314981241e-05,
260
- "loss": 0.4979,
261
- "step": 18000
262
- },
263
- {
264
- "epoch": 0.5380565977372539,
265
- "grad_norm": 4.865274906158447,
266
- "learning_rate": 1.6412956015084976e-05,
267
- "loss": 0.5273,
268
- "step": 18500
269
- },
270
- {
271
- "epoch": 0.5525986679463688,
272
- "grad_norm": 2.539562940597534,
273
- "learning_rate": 1.6316008880357544e-05,
274
- "loss": 0.5029,
275
- "step": 19000
276
- },
277
- {
278
- "epoch": 0.5671407381554838,
279
- "grad_norm": 2.071009874343872,
280
- "learning_rate": 1.621906174563011e-05,
281
- "loss": 0.5166,
282
- "step": 19500
283
- },
284
- {
285
- "epoch": 0.5816828083645987,
286
- "grad_norm": 7.213927268981934,
287
- "learning_rate": 1.6122114610902677e-05,
288
- "loss": 0.5628,
289
- "step": 20000
290
- },
291
- {
292
- "epoch": 0.5962248785737138,
293
- "grad_norm": 19.586095809936523,
294
- "learning_rate": 1.602516747617524e-05,
295
- "loss": 0.5259,
296
- "step": 20500
297
- },
298
- {
299
- "epoch": 0.6107669487828288,
300
- "grad_norm": 2.6470067501068115,
301
- "learning_rate": 1.592822034144781e-05,
302
- "loss": 0.5044,
303
- "step": 21000
304
- },
305
- {
306
- "epoch": 0.6253090189919437,
307
- "grad_norm": 8.12119197845459,
308
- "learning_rate": 1.5831273206720378e-05,
309
- "loss": 0.4909,
310
- "step": 21500
311
- },
312
- {
313
- "epoch": 0.6398510892010587,
314
- "grad_norm": 11.704862594604492,
315
- "learning_rate": 1.5734326071992943e-05,
316
- "loss": 0.5103,
317
- "step": 22000
318
- },
319
- {
320
- "epoch": 0.6543931594101736,
321
- "grad_norm": 5.466031551361084,
322
- "learning_rate": 1.563737893726551e-05,
323
- "loss": 0.5097,
324
- "step": 22500
325
- },
326
- {
327
- "epoch": 0.6689352296192886,
328
- "grad_norm": 1.2860121726989746,
329
- "learning_rate": 1.5540431802538075e-05,
330
- "loss": 0.4858,
331
- "step": 23000
332
- },
333
- {
334
- "epoch": 0.6834772998284036,
335
- "grad_norm": 5.133608341217041,
336
- "learning_rate": 1.5443484667810644e-05,
337
- "loss": 0.4814,
338
- "step": 23500
339
- },
340
- {
341
- "epoch": 0.6980193700375186,
342
- "grad_norm": 10.405769348144531,
343
- "learning_rate": 1.5346537533083212e-05,
344
- "loss": 0.4985,
345
- "step": 24000
346
- },
347
- {
348
- "epoch": 0.7125614402466335,
349
- "grad_norm": 3.4990031719207764,
350
- "learning_rate": 1.5249590398355778e-05,
351
- "loss": 0.493,
352
- "step": 24500
353
- },
354
- {
355
- "epoch": 0.7271035104557485,
356
- "grad_norm": 5.292512893676758,
357
- "learning_rate": 1.5152643263628345e-05,
358
- "loss": 0.4807,
359
- "step": 25000
360
- },
361
- {
362
- "epoch": 0.7416455806648634,
363
- "grad_norm": 17.39272117614746,
364
- "learning_rate": 1.5055696128900911e-05,
365
- "loss": 0.4966,
366
- "step": 25500
367
- },
368
- {
369
- "epoch": 0.7561876508739784,
370
- "grad_norm": 18.466636657714844,
371
- "learning_rate": 1.4958748994173478e-05,
372
- "loss": 0.4851,
373
- "step": 26000
374
- },
375
- {
376
- "epoch": 0.7707297210830933,
377
- "grad_norm": 2.161870241165161,
378
- "learning_rate": 1.4861801859446046e-05,
379
- "loss": 0.4922,
380
- "step": 26500
381
- },
382
- {
383
- "epoch": 0.7852717912922084,
384
- "grad_norm": 1.0825892686843872,
385
- "learning_rate": 1.4764854724718612e-05,
386
- "loss": 0.4987,
387
- "step": 27000
388
- },
389
- {
390
- "epoch": 0.7998138615013233,
391
- "grad_norm": 11.1233491897583,
392
- "learning_rate": 1.4667907589991179e-05,
393
- "loss": 0.4728,
394
- "step": 27500
395
- },
396
- {
397
- "epoch": 0.8143559317104383,
398
- "grad_norm": 8.120223045349121,
399
- "learning_rate": 1.4570960455263745e-05,
400
- "loss": 0.5043,
401
- "step": 28000
402
- },
403
- {
404
- "epoch": 0.8288980019195533,
405
- "grad_norm": 6.8163933753967285,
406
- "learning_rate": 1.4474013320536313e-05,
407
- "loss": 0.5224,
408
- "step": 28500
409
- },
410
- {
411
- "epoch": 0.8434400721286682,
412
- "grad_norm": 15.210949897766113,
413
- "learning_rate": 1.437706618580888e-05,
414
- "loss": 0.4532,
415
- "step": 29000
416
- },
417
- {
418
- "epoch": 0.8579821423377832,
419
- "grad_norm": 3.8793272972106934,
420
- "learning_rate": 1.4280119051081446e-05,
421
- "loss": 0.4864,
422
- "step": 29500
423
- },
424
- {
425
- "epoch": 0.8725242125468982,
426
- "grad_norm": 8.277094841003418,
427
- "learning_rate": 1.4183171916354013e-05,
428
- "loss": 0.5122,
429
- "step": 30000
430
- },
431
- {
432
- "epoch": 0.8870662827560132,
433
- "grad_norm": 6.275518417358398,
434
- "learning_rate": 1.408622478162658e-05,
435
- "loss": 0.49,
436
- "step": 30500
437
- },
438
- {
439
- "epoch": 0.9016083529651281,
440
- "grad_norm": 8.79964542388916,
441
- "learning_rate": 1.3989277646899147e-05,
442
- "loss": 0.4991,
443
- "step": 31000
444
- },
445
- {
446
- "epoch": 0.9161504231742431,
447
- "grad_norm": 23.397424697875977,
448
- "learning_rate": 1.3892330512171715e-05,
449
- "loss": 0.4966,
450
- "step": 31500
451
- },
452
- {
453
- "epoch": 0.930692493383358,
454
- "grad_norm": 9.809805870056152,
455
- "learning_rate": 1.379538337744428e-05,
456
- "loss": 0.5091,
457
- "step": 32000
458
- },
459
- {
460
- "epoch": 0.945234563592473,
461
- "grad_norm": 1.723449468612671,
462
- "learning_rate": 1.3698436242716848e-05,
463
- "loss": 0.4963,
464
- "step": 32500
465
- },
466
- {
467
- "epoch": 0.959776633801588,
468
- "grad_norm": 9.49936580657959,
469
- "learning_rate": 1.3601489107989415e-05,
470
- "loss": 0.4739,
471
- "step": 33000
472
- },
473
- {
474
- "epoch": 0.974318704010703,
475
- "grad_norm": 5.465103626251221,
476
- "learning_rate": 1.3504541973261983e-05,
477
- "loss": 0.4947,
478
- "step": 33500
479
- },
480
- {
481
- "epoch": 0.9888607742198179,
482
- "grad_norm": 5.015740871429443,
483
- "learning_rate": 1.3407594838534548e-05,
484
- "loss": 0.4692,
485
- "step": 34000
486
- },
487
- {
488
- "epoch": 1.003402844428933,
489
- "grad_norm": 11.569725036621094,
490
- "learning_rate": 1.3310647703807116e-05,
491
- "loss": 0.4667,
492
- "step": 34500
493
- },
494
- {
495
- "epoch": 1.017944914638048,
496
- "grad_norm": 6.356573581695557,
497
- "learning_rate": 1.3213700569079682e-05,
498
- "loss": 0.4682,
499
- "step": 35000
500
- },
501
- {
502
- "epoch": 1.032486984847163,
503
- "grad_norm": 5.768576622009277,
504
- "learning_rate": 1.311675343435225e-05,
505
- "loss": 0.486,
506
- "step": 35500
507
- },
508
- {
509
- "epoch": 1.0470290550562777,
510
- "grad_norm": 5.155892372131348,
511
- "learning_rate": 1.3019806299624815e-05,
512
- "loss": 0.4968,
513
- "step": 36000
514
- },
515
- {
516
- "epoch": 1.0615711252653928,
517
- "grad_norm": 3.3541529178619385,
518
- "learning_rate": 1.2922859164897383e-05,
519
- "loss": 0.4944,
520
- "step": 36500
521
- },
522
- {
523
- "epoch": 1.0761131954745078,
524
- "grad_norm": 8.195282936096191,
525
- "learning_rate": 1.282591203016995e-05,
526
- "loss": 0.4467,
527
- "step": 37000
528
- },
529
- {
530
- "epoch": 1.0906552656836228,
531
- "grad_norm": 3.4823594093322754,
532
- "learning_rate": 1.2728964895442518e-05,
533
- "loss": 0.4544,
534
- "step": 37500
535
- },
536
- {
537
- "epoch": 1.1051973358927376,
538
- "grad_norm": 2.6191506385803223,
539
- "learning_rate": 1.2632017760715083e-05,
540
- "loss": 0.4807,
541
- "step": 38000
542
- },
543
- {
544
- "epoch": 1.1197394061018526,
545
- "grad_norm": 3.6867098808288574,
546
- "learning_rate": 1.253507062598765e-05,
547
- "loss": 0.5017,
548
- "step": 38500
549
- },
550
- {
551
- "epoch": 1.1342814763109677,
552
- "grad_norm": 4.41229772567749,
553
- "learning_rate": 1.2438123491260217e-05,
554
- "loss": 0.4981,
555
- "step": 39000
556
- },
557
- {
558
- "epoch": 1.1488235465200827,
559
- "grad_norm": 6.181690692901611,
560
- "learning_rate": 1.2341176356532785e-05,
561
- "loss": 0.4632,
562
- "step": 39500
563
- },
564
- {
565
- "epoch": 1.1633656167291975,
566
- "grad_norm": 1.7811199426651,
567
- "learning_rate": 1.224422922180535e-05,
568
- "loss": 0.4935,
569
- "step": 40000
570
- },
571
- {
572
- "epoch": 1.1779076869383125,
573
- "grad_norm": 7.902093410491943,
574
- "learning_rate": 1.2147282087077918e-05,
575
- "loss": 0.467,
576
- "step": 40500
577
- },
578
- {
579
- "epoch": 1.1924497571474275,
580
- "grad_norm": 7.348107814788818,
581
- "learning_rate": 1.2050334952350485e-05,
582
- "loss": 0.4732,
583
- "step": 41000
584
- },
585
- {
586
- "epoch": 1.2069918273565425,
587
- "grad_norm": 1.9056262969970703,
588
- "learning_rate": 1.1953387817623053e-05,
589
- "loss": 0.4689,
590
- "step": 41500
591
- },
592
- {
593
- "epoch": 1.2215338975656573,
594
- "grad_norm": 3.8321168422698975,
595
- "learning_rate": 1.1856440682895618e-05,
596
- "loss": 0.4957,
597
- "step": 42000
598
- },
599
- {
600
- "epoch": 1.2360759677747724,
601
- "grad_norm": 0.5265329480171204,
602
- "learning_rate": 1.1759493548168184e-05,
603
- "loss": 0.4738,
604
- "step": 42500
605
- },
606
- {
607
- "epoch": 1.2506180379838874,
608
- "grad_norm": 7.372343063354492,
609
- "learning_rate": 1.1662546413440752e-05,
610
- "loss": 0.485,
611
- "step": 43000
612
- },
613
- {
614
- "epoch": 1.2651601081930024,
615
- "grad_norm": 2.4991230964660645,
616
- "learning_rate": 1.1565599278713317e-05,
617
- "loss": 0.4497,
618
- "step": 43500
619
- },
620
- {
621
- "epoch": 1.2797021784021174,
622
- "grad_norm": 11.783917427062988,
623
- "learning_rate": 1.1468652143985885e-05,
624
- "loss": 0.5148,
625
- "step": 44000
626
- },
627
- {
628
- "epoch": 1.2942442486112322,
629
- "grad_norm": 12.756231307983398,
630
- "learning_rate": 1.1371705009258452e-05,
631
- "loss": 0.4676,
632
- "step": 44500
633
- },
634
- {
635
- "epoch": 1.3087863188203472,
636
- "grad_norm": 4.692300319671631,
637
- "learning_rate": 1.127475787453102e-05,
638
- "loss": 0.4647,
639
- "step": 45000
640
- },
641
- {
642
- "epoch": 1.3233283890294623,
643
- "grad_norm": 7.03782844543457,
644
- "learning_rate": 1.1177810739803584e-05,
645
- "loss": 0.4572,
646
- "step": 45500
647
- },
648
- {
649
- "epoch": 1.3378704592385773,
650
- "grad_norm": 3.1515519618988037,
651
- "learning_rate": 1.1080863605076153e-05,
652
- "loss": 0.5162,
653
- "step": 46000
654
- },
655
- {
656
- "epoch": 1.352412529447692,
657
- "grad_norm": 13.067205429077148,
658
- "learning_rate": 1.0983916470348719e-05,
659
- "loss": 0.5158,
660
- "step": 46500
661
  }
662
  ],
663
  "logging_steps": 500,
@@ -677,7 +33,7 @@
677
  "attributes": {}
678
  }
679
  },
680
- "total_flos": 2.416368314067739e+16,
681
  "train_batch_size": 16,
682
  "trial_name": null,
683
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.014542070209114969,
5
  "eval_steps": 500,
6
+ "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.014542070209114969,
13
+ "grad_norm": 12.663599014282227,
14
  "learning_rate": 1.9903052865272566e-05,
15
+ "loss": 7.4704,
16
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  }
18
  ],
19
  "logging_steps": 500,
 
33
  "attributes": {}
34
  }
35
  },
36
+ "total_flos": 268694085430272.0,
37
  "train_batch_size": 16,
38
  "trial_name": null,
39
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1cde46356f72f91161f863cec40376c04f3c90f38cae9b314f1588079bbc3ffe
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cf779800eed062a2e03f99378369c4384c63e953f13f7419f13d9c53267c737
3
  size 5176
last-checkpoint/vocab.txt CHANGED
The diff for this file is too large to render. See raw diff