trinhxuankhai committed on
Commit
3e5f9bf
1 Parent(s): 4a8ae2e

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -19,10 +19,10 @@
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
22
- "c_attn",
23
  "attn.c_proj",
 
24
  "w2",
25
- "w1"
26
  ],
27
  "task_type": "CAUSAL_LM",
28
  "use_dora": false,
 
19
  "rank_pattern": {},
20
  "revision": null,
21
  "target_modules": [
 
22
  "attn.c_proj",
23
+ "w1",
24
  "w2",
25
+ "c_attn"
26
  ],
27
  "task_type": "CAUSAL_LM",
28
  "use_dora": false,
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c77fcb65521bd73c5533b114bafb223fb0bec716242b658c0d8bde0902b8f5ac
3
- size 224486538
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a16606a639a21b14a52ca5ac566f39a90be2304619731424fca8998ec4fcf9b
3
+ size 224483018
qwen.tiktoken ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "pad_token": "<|endoftext|>"
3
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "Qwen/Qwen-VL-Chat--tokenization_qwen.QWenTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "clean_up_tokenization_spaces": true,
9
+ "model_max_length": 1024,
10
+ "padding_side": "right",
11
+ "tokenizer_class": "QWenTokenizer"
12
+ }
trainer_state.json ADDED
@@ -0,0 +1,914 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7727344036102295,
3
+ "best_model_checkpoint": "ckpt/origin/vehicle_environment/checkpoint-91",
4
+ "epoch": 4.830917874396135,
5
+ "eval_steps": 7,
6
+ "global_step": 125,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04,
13
+ "learning_rate": 0.0,
14
+ "loss": 0.8677,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.08,
19
+ "learning_rate": 1e-05,
20
+ "loss": 0.8182,
21
+ "step": 2
22
+ },
23
+ {
24
+ "epoch": 0.12,
25
+ "learning_rate": 1e-05,
26
+ "loss": 0.8928,
27
+ "step": 3
28
+ },
29
+ {
30
+ "epoch": 0.15,
31
+ "learning_rate": 1e-05,
32
+ "loss": 0.9758,
33
+ "step": 4
34
+ },
35
+ {
36
+ "epoch": 0.19,
37
+ "learning_rate": 1e-05,
38
+ "loss": 0.8773,
39
+ "step": 5
40
+ },
41
+ {
42
+ "epoch": 0.23,
43
+ "learning_rate": 1e-05,
44
+ "loss": 0.8984,
45
+ "step": 6
46
+ },
47
+ {
48
+ "epoch": 0.27,
49
+ "learning_rate": 1e-05,
50
+ "loss": 0.793,
51
+ "step": 7
52
+ },
53
+ {
54
+ "epoch": 0.27,
55
+ "eval_loss": 0.954296886920929,
56
+ "eval_runtime": 44.1101,
57
+ "eval_samples_per_second": 1.134,
58
+ "eval_steps_per_second": 0.567,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.31,
63
+ "learning_rate": 1e-05,
64
+ "loss": 0.8718,
65
+ "step": 8
66
+ },
67
+ {
68
+ "epoch": 0.35,
69
+ "learning_rate": 1e-05,
70
+ "loss": 0.8806,
71
+ "step": 9
72
+ },
73
+ {
74
+ "epoch": 0.39,
75
+ "learning_rate": 1e-05,
76
+ "loss": 0.7152,
77
+ "step": 10
78
+ },
79
+ {
80
+ "epoch": 0.43,
81
+ "learning_rate": 1e-05,
82
+ "loss": 0.7653,
83
+ "step": 11
84
+ },
85
+ {
86
+ "epoch": 0.46,
87
+ "learning_rate": 1e-05,
88
+ "loss": 0.7864,
89
+ "step": 12
90
+ },
91
+ {
92
+ "epoch": 0.5,
93
+ "learning_rate": 1e-05,
94
+ "loss": 0.7629,
95
+ "step": 13
96
+ },
97
+ {
98
+ "epoch": 0.54,
99
+ "learning_rate": 1e-05,
100
+ "loss": 0.8066,
101
+ "step": 14
102
+ },
103
+ {
104
+ "epoch": 0.54,
105
+ "eval_loss": 0.8725000023841858,
106
+ "eval_runtime": 43.3056,
107
+ "eval_samples_per_second": 1.155,
108
+ "eval_steps_per_second": 0.577,
109
+ "step": 14
110
+ },
111
+ {
112
+ "epoch": 0.58,
113
+ "learning_rate": 1e-05,
114
+ "loss": 0.793,
115
+ "step": 15
116
+ },
117
+ {
118
+ "epoch": 0.62,
119
+ "learning_rate": 1e-05,
120
+ "loss": 0.7722,
121
+ "step": 16
122
+ },
123
+ {
124
+ "epoch": 0.66,
125
+ "learning_rate": 1e-05,
126
+ "loss": 0.8317,
127
+ "step": 17
128
+ },
129
+ {
130
+ "epoch": 0.7,
131
+ "learning_rate": 1e-05,
132
+ "loss": 0.7793,
133
+ "step": 18
134
+ },
135
+ {
136
+ "epoch": 0.73,
137
+ "learning_rate": 1e-05,
138
+ "loss": 0.6925,
139
+ "step": 19
140
+ },
141
+ {
142
+ "epoch": 0.77,
143
+ "learning_rate": 1e-05,
144
+ "loss": 0.7365,
145
+ "step": 20
146
+ },
147
+ {
148
+ "epoch": 0.81,
149
+ "learning_rate": 1e-05,
150
+ "loss": 0.7505,
151
+ "step": 21
152
+ },
153
+ {
154
+ "epoch": 0.81,
155
+ "eval_loss": 0.8274219036102295,
156
+ "eval_runtime": 43.2778,
157
+ "eval_samples_per_second": 1.155,
158
+ "eval_steps_per_second": 0.578,
159
+ "step": 21
160
+ },
161
+ {
162
+ "epoch": 0.85,
163
+ "learning_rate": 1e-05,
164
+ "loss": 0.7391,
165
+ "step": 22
166
+ },
167
+ {
168
+ "epoch": 0.89,
169
+ "learning_rate": 1e-05,
170
+ "loss": 0.7058,
171
+ "step": 23
172
+ },
173
+ {
174
+ "epoch": 0.93,
175
+ "learning_rate": 1e-05,
176
+ "loss": 0.6859,
177
+ "step": 24
178
+ },
179
+ {
180
+ "epoch": 0.97,
181
+ "learning_rate": 1e-05,
182
+ "loss": 0.679,
183
+ "step": 25
184
+ },
185
+ {
186
+ "epoch": 1.0,
187
+ "learning_rate": 1e-05,
188
+ "loss": 0.6708,
189
+ "step": 26
190
+ },
191
+ {
192
+ "epoch": 1.04,
193
+ "learning_rate": 1e-05,
194
+ "loss": 0.6968,
195
+ "step": 27
196
+ },
197
+ {
198
+ "epoch": 1.08,
199
+ "learning_rate": 1e-05,
200
+ "loss": 0.6937,
201
+ "step": 28
202
+ },
203
+ {
204
+ "epoch": 1.08,
205
+ "eval_loss": 0.802539050579071,
206
+ "eval_runtime": 43.6,
207
+ "eval_samples_per_second": 1.147,
208
+ "eval_steps_per_second": 0.573,
209
+ "step": 28
210
+ },
211
+ {
212
+ "epoch": 1.12,
213
+ "learning_rate": 1e-05,
214
+ "loss": 0.7578,
215
+ "step": 29
216
+ },
217
+ {
218
+ "epoch": 1.16,
219
+ "learning_rate": 1e-05,
220
+ "loss": 0.6307,
221
+ "step": 30
222
+ },
223
+ {
224
+ "epoch": 1.2,
225
+ "learning_rate": 1e-05,
226
+ "loss": 0.6949,
227
+ "step": 31
228
+ },
229
+ {
230
+ "epoch": 1.24,
231
+ "learning_rate": 1e-05,
232
+ "loss": 0.6743,
233
+ "step": 32
234
+ },
235
+ {
236
+ "epoch": 1.28,
237
+ "learning_rate": 1e-05,
238
+ "loss": 0.6814,
239
+ "step": 33
240
+ },
241
+ {
242
+ "epoch": 1.31,
243
+ "learning_rate": 1e-05,
244
+ "loss": 0.5865,
245
+ "step": 34
246
+ },
247
+ {
248
+ "epoch": 1.35,
249
+ "learning_rate": 1e-05,
250
+ "loss": 0.5945,
251
+ "step": 35
252
+ },
253
+ {
254
+ "epoch": 1.35,
255
+ "eval_loss": 0.7889453172683716,
256
+ "eval_runtime": 43.2969,
257
+ "eval_samples_per_second": 1.155,
258
+ "eval_steps_per_second": 0.577,
259
+ "step": 35
260
+ },
261
+ {
262
+ "epoch": 1.39,
263
+ "learning_rate": 1e-05,
264
+ "loss": 0.6505,
265
+ "step": 36
266
+ },
267
+ {
268
+ "epoch": 1.43,
269
+ "learning_rate": 1e-05,
270
+ "loss": 0.7418,
271
+ "step": 37
272
+ },
273
+ {
274
+ "epoch": 1.47,
275
+ "learning_rate": 1e-05,
276
+ "loss": 0.6592,
277
+ "step": 38
278
+ },
279
+ {
280
+ "epoch": 1.51,
281
+ "learning_rate": 1e-05,
282
+ "loss": 0.6829,
283
+ "step": 39
284
+ },
285
+ {
286
+ "epoch": 1.55,
287
+ "learning_rate": 1e-05,
288
+ "loss": 0.7168,
289
+ "step": 40
290
+ },
291
+ {
292
+ "epoch": 1.58,
293
+ "learning_rate": 1e-05,
294
+ "loss": 0.7255,
295
+ "step": 41
296
+ },
297
+ {
298
+ "epoch": 1.62,
299
+ "learning_rate": 1e-05,
300
+ "loss": 0.7029,
301
+ "step": 42
302
+ },
303
+ {
304
+ "epoch": 1.62,
305
+ "eval_loss": 0.7841406464576721,
306
+ "eval_runtime": 43.3346,
307
+ "eval_samples_per_second": 1.154,
308
+ "eval_steps_per_second": 0.577,
309
+ "step": 42
310
+ },
311
+ {
312
+ "epoch": 1.66,
313
+ "learning_rate": 1e-05,
314
+ "loss": 0.589,
315
+ "step": 43
316
+ },
317
+ {
318
+ "epoch": 1.7,
319
+ "learning_rate": 1e-05,
320
+ "loss": 0.6757,
321
+ "step": 44
322
+ },
323
+ {
324
+ "epoch": 1.74,
325
+ "learning_rate": 1e-05,
326
+ "loss": 0.7625,
327
+ "step": 45
328
+ },
329
+ {
330
+ "epoch": 1.78,
331
+ "learning_rate": 1e-05,
332
+ "loss": 0.6877,
333
+ "step": 46
334
+ },
335
+ {
336
+ "epoch": 1.82,
337
+ "learning_rate": 1e-05,
338
+ "loss": 0.6104,
339
+ "step": 47
340
+ },
341
+ {
342
+ "epoch": 1.86,
343
+ "learning_rate": 1e-05,
344
+ "loss": 0.6581,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 1.89,
349
+ "learning_rate": 1e-05,
350
+ "loss": 0.6228,
351
+ "step": 49
352
+ },
353
+ {
354
+ "epoch": 1.89,
355
+ "eval_loss": 0.7757421731948853,
356
+ "eval_runtime": 43.3021,
357
+ "eval_samples_per_second": 1.155,
358
+ "eval_steps_per_second": 0.577,
359
+ "step": 49
360
+ },
361
+ {
362
+ "epoch": 1.93,
363
+ "learning_rate": 1e-05,
364
+ "loss": 0.7766,
365
+ "step": 50
366
+ },
367
+ {
368
+ "epoch": 1.97,
369
+ "learning_rate": 1e-05,
370
+ "loss": 0.5925,
371
+ "step": 51
372
+ },
373
+ {
374
+ "epoch": 2.01,
375
+ "learning_rate": 1e-05,
376
+ "loss": 0.6484,
377
+ "step": 52
378
+ },
379
+ {
380
+ "epoch": 2.05,
381
+ "learning_rate": 1e-05,
382
+ "loss": 0.7354,
383
+ "step": 53
384
+ },
385
+ {
386
+ "epoch": 2.09,
387
+ "learning_rate": 1e-05,
388
+ "loss": 0.6472,
389
+ "step": 54
390
+ },
391
+ {
392
+ "epoch": 2.13,
393
+ "learning_rate": 1e-05,
394
+ "loss": 0.64,
395
+ "step": 55
396
+ },
397
+ {
398
+ "epoch": 2.16,
399
+ "learning_rate": 1e-05,
400
+ "loss": 0.6172,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 2.16,
405
+ "eval_loss": 0.7679296731948853,
406
+ "eval_runtime": 43.362,
407
+ "eval_samples_per_second": 1.153,
408
+ "eval_steps_per_second": 0.577,
409
+ "step": 56
410
+ },
411
+ {
412
+ "epoch": 2.2,
413
+ "learning_rate": 1e-05,
414
+ "loss": 0.6298,
415
+ "step": 57
416
+ },
417
+ {
418
+ "epoch": 2.24,
419
+ "learning_rate": 1e-05,
420
+ "loss": 0.5544,
421
+ "step": 58
422
+ },
423
+ {
424
+ "epoch": 2.28,
425
+ "learning_rate": 1e-05,
426
+ "loss": 0.6791,
427
+ "step": 59
428
+ },
429
+ {
430
+ "epoch": 2.32,
431
+ "learning_rate": 1e-05,
432
+ "loss": 0.6732,
433
+ "step": 60
434
+ },
435
+ {
436
+ "epoch": 2.36,
437
+ "learning_rate": 1e-05,
438
+ "loss": 0.6169,
439
+ "step": 61
440
+ },
441
+ {
442
+ "epoch": 2.4,
443
+ "learning_rate": 1e-05,
444
+ "loss": 0.6215,
445
+ "step": 62
446
+ },
447
+ {
448
+ "epoch": 2.43,
449
+ "learning_rate": 1e-05,
450
+ "loss": 0.5446,
451
+ "step": 63
452
+ },
453
+ {
454
+ "epoch": 2.43,
455
+ "eval_loss": 0.7676171660423279,
456
+ "eval_runtime": 43.2939,
457
+ "eval_samples_per_second": 1.155,
458
+ "eval_steps_per_second": 0.577,
459
+ "step": 63
460
+ },
461
+ {
462
+ "epoch": 2.47,
463
+ "learning_rate": 1e-05,
464
+ "loss": 0.5829,
465
+ "step": 64
466
+ },
467
+ {
468
+ "epoch": 2.51,
469
+ "learning_rate": 1e-05,
470
+ "loss": 0.6256,
471
+ "step": 65
472
+ },
473
+ {
474
+ "epoch": 2.55,
475
+ "learning_rate": 1e-05,
476
+ "loss": 0.5826,
477
+ "step": 66
478
+ },
479
+ {
480
+ "epoch": 2.59,
481
+ "learning_rate": 1e-05,
482
+ "loss": 0.6113,
483
+ "step": 67
484
+ },
485
+ {
486
+ "epoch": 2.63,
487
+ "learning_rate": 1e-05,
488
+ "loss": 0.6292,
489
+ "step": 68
490
+ },
491
+ {
492
+ "epoch": 2.67,
493
+ "learning_rate": 1e-05,
494
+ "loss": 0.6434,
495
+ "step": 69
496
+ },
497
+ {
498
+ "epoch": 2.71,
499
+ "learning_rate": 1e-05,
500
+ "loss": 0.5857,
501
+ "step": 70
502
+ },
503
+ {
504
+ "epoch": 2.71,
505
+ "eval_loss": 0.7694922089576721,
506
+ "eval_runtime": 43.2962,
507
+ "eval_samples_per_second": 1.155,
508
+ "eval_steps_per_second": 0.577,
509
+ "step": 70
510
+ },
511
+ {
512
+ "epoch": 2.74,
513
+ "learning_rate": 1e-05,
514
+ "loss": 0.5844,
515
+ "step": 71
516
+ },
517
+ {
518
+ "epoch": 2.78,
519
+ "learning_rate": 1e-05,
520
+ "loss": 0.63,
521
+ "step": 72
522
+ },
523
+ {
524
+ "epoch": 2.82,
525
+ "learning_rate": 1e-05,
526
+ "loss": 0.5413,
527
+ "step": 73
528
+ },
529
+ {
530
+ "epoch": 2.86,
531
+ "learning_rate": 1e-05,
532
+ "loss": 0.6852,
533
+ "step": 74
534
+ },
535
+ {
536
+ "epoch": 2.9,
537
+ "learning_rate": 1e-05,
538
+ "loss": 0.5592,
539
+ "step": 75
540
+ },
541
+ {
542
+ "epoch": 2.94,
543
+ "learning_rate": 1e-05,
544
+ "loss": 0.6597,
545
+ "step": 76
546
+ },
547
+ {
548
+ "epoch": 2.98,
549
+ "learning_rate": 1e-05,
550
+ "loss": 0.5186,
551
+ "step": 77
552
+ },
553
+ {
554
+ "epoch": 2.98,
555
+ "eval_loss": 0.7741405963897705,
556
+ "eval_runtime": 43.2995,
557
+ "eval_samples_per_second": 1.155,
558
+ "eval_steps_per_second": 0.577,
559
+ "step": 77
560
+ },
561
+ {
562
+ "epoch": 3.01,
563
+ "learning_rate": 1e-05,
564
+ "loss": 0.6163,
565
+ "step": 78
566
+ },
567
+ {
568
+ "epoch": 3.05,
569
+ "learning_rate": 1e-05,
570
+ "loss": 0.5931,
571
+ "step": 79
572
+ },
573
+ {
574
+ "epoch": 3.09,
575
+ "learning_rate": 1e-05,
576
+ "loss": 0.6509,
577
+ "step": 80
578
+ },
579
+ {
580
+ "epoch": 3.13,
581
+ "learning_rate": 1e-05,
582
+ "loss": 0.6136,
583
+ "step": 81
584
+ },
585
+ {
586
+ "epoch": 3.17,
587
+ "learning_rate": 1e-05,
588
+ "loss": 0.5426,
589
+ "step": 82
590
+ },
591
+ {
592
+ "epoch": 3.21,
593
+ "learning_rate": 1e-05,
594
+ "loss": 0.575,
595
+ "step": 83
596
+ },
597
+ {
598
+ "epoch": 3.25,
599
+ "learning_rate": 1e-05,
600
+ "loss": 0.5386,
601
+ "step": 84
602
+ },
603
+ {
604
+ "epoch": 3.25,
605
+ "eval_loss": 0.7715234160423279,
606
+ "eval_runtime": 43.3518,
607
+ "eval_samples_per_second": 1.153,
608
+ "eval_steps_per_second": 0.577,
609
+ "step": 84
610
+ },
611
+ {
612
+ "epoch": 3.29,
613
+ "learning_rate": 1e-05,
614
+ "loss": 0.5588,
615
+ "step": 85
616
+ },
617
+ {
618
+ "epoch": 3.32,
619
+ "learning_rate": 1e-05,
620
+ "loss": 0.5428,
621
+ "step": 86
622
+ },
623
+ {
624
+ "epoch": 3.36,
625
+ "learning_rate": 1e-05,
626
+ "loss": 0.6062,
627
+ "step": 87
628
+ },
629
+ {
630
+ "epoch": 3.4,
631
+ "learning_rate": 1e-05,
632
+ "loss": 0.5421,
633
+ "step": 88
634
+ },
635
+ {
636
+ "epoch": 3.44,
637
+ "learning_rate": 1e-05,
638
+ "loss": 0.6211,
639
+ "step": 89
640
+ },
641
+ {
642
+ "epoch": 3.48,
643
+ "learning_rate": 1e-05,
644
+ "loss": 0.6537,
645
+ "step": 90
646
+ },
647
+ {
648
+ "epoch": 3.52,
649
+ "learning_rate": 1e-05,
650
+ "loss": 0.5364,
651
+ "step": 91
652
+ },
653
+ {
654
+ "epoch": 3.52,
655
+ "eval_loss": 0.7727344036102295,
656
+ "eval_runtime": 43.2938,
657
+ "eval_samples_per_second": 1.155,
658
+ "eval_steps_per_second": 0.577,
659
+ "step": 91
660
+ },
661
+ {
662
+ "epoch": 3.56,
663
+ "learning_rate": 1e-05,
664
+ "loss": 0.5734,
665
+ "step": 92
666
+ },
667
+ {
668
+ "epoch": 3.59,
669
+ "learning_rate": 1e-05,
670
+ "loss": 0.5648,
671
+ "step": 93
672
+ },
673
+ {
674
+ "epoch": 3.63,
675
+ "learning_rate": 1e-05,
676
+ "loss": 0.5326,
677
+ "step": 94
678
+ },
679
+ {
680
+ "epoch": 3.67,
681
+ "learning_rate": 1e-05,
682
+ "loss": 0.5787,
683
+ "step": 95
684
+ },
685
+ {
686
+ "epoch": 3.71,
687
+ "learning_rate": 1e-05,
688
+ "loss": 0.5831,
689
+ "step": 96
690
+ },
691
+ {
692
+ "epoch": 3.75,
693
+ "learning_rate": 1e-05,
694
+ "loss": 0.6093,
695
+ "step": 97
696
+ },
697
+ {
698
+ "epoch": 3.79,
699
+ "learning_rate": 1e-05,
700
+ "loss": 0.6548,
701
+ "step": 98
702
+ },
703
+ {
704
+ "epoch": 3.79,
705
+ "eval_loss": 0.7698047161102295,
706
+ "eval_runtime": 43.3828,
707
+ "eval_samples_per_second": 1.153,
708
+ "eval_steps_per_second": 0.576,
709
+ "step": 98
710
+ },
711
+ {
712
+ "epoch": 3.83,
713
+ "learning_rate": 1e-05,
714
+ "loss": 0.5649,
715
+ "step": 99
716
+ },
717
+ {
718
+ "epoch": 3.86,
719
+ "learning_rate": 1e-05,
720
+ "loss": 0.5951,
721
+ "step": 100
722
+ },
723
+ {
724
+ "epoch": 3.9,
725
+ "learning_rate": 1e-05,
726
+ "loss": 0.5956,
727
+ "step": 101
728
+ },
729
+ {
730
+ "epoch": 3.94,
731
+ "learning_rate": 1e-05,
732
+ "loss": 0.5413,
733
+ "step": 102
734
+ },
735
+ {
736
+ "epoch": 3.98,
737
+ "learning_rate": 1e-05,
738
+ "loss": 0.5199,
739
+ "step": 103
740
+ },
741
+ {
742
+ "epoch": 4.02,
743
+ "learning_rate": 1e-05,
744
+ "loss": 0.5608,
745
+ "step": 104
746
+ },
747
+ {
748
+ "epoch": 4.06,
749
+ "learning_rate": 1e-05,
750
+ "loss": 0.5592,
751
+ "step": 105
752
+ },
753
+ {
754
+ "epoch": 4.06,
755
+ "eval_loss": 0.7690625190734863,
756
+ "eval_runtime": 43.3059,
757
+ "eval_samples_per_second": 1.155,
758
+ "eval_steps_per_second": 0.577,
759
+ "step": 105
760
+ },
761
+ {
762
+ "epoch": 4.1,
763
+ "learning_rate": 1e-05,
764
+ "loss": 0.5203,
765
+ "step": 106
766
+ },
767
+ {
768
+ "epoch": 4.14,
769
+ "learning_rate": 1e-05,
770
+ "loss": 0.5927,
771
+ "step": 107
772
+ },
773
+ {
774
+ "epoch": 4.17,
775
+ "learning_rate": 1e-05,
776
+ "loss": 0.6014,
777
+ "step": 108
778
+ },
779
+ {
780
+ "epoch": 4.21,
781
+ "learning_rate": 1e-05,
782
+ "loss": 0.5519,
783
+ "step": 109
784
+ },
785
+ {
786
+ "epoch": 4.25,
787
+ "learning_rate": 1e-05,
788
+ "loss": 0.5568,
789
+ "step": 110
790
+ },
791
+ {
792
+ "epoch": 4.29,
793
+ "learning_rate": 1e-05,
794
+ "loss": 0.5682,
795
+ "step": 111
796
+ },
797
+ {
798
+ "epoch": 4.33,
799
+ "learning_rate": 1e-05,
800
+ "loss": 0.5782,
801
+ "step": 112
802
+ },
803
+ {
804
+ "epoch": 4.33,
805
+ "eval_loss": 0.7662500143051147,
806
+ "eval_runtime": 43.2755,
807
+ "eval_samples_per_second": 1.155,
808
+ "eval_steps_per_second": 0.578,
809
+ "step": 112
810
+ },
811
+ {
812
+ "epoch": 4.37,
813
+ "learning_rate": 1e-05,
814
+ "loss": 0.4916,
815
+ "step": 113
816
+ },
817
+ {
818
+ "epoch": 4.41,
819
+ "learning_rate": 1e-05,
820
+ "loss": 0.5217,
821
+ "step": 114
822
+ },
823
+ {
824
+ "epoch": 4.44,
825
+ "learning_rate": 1e-05,
826
+ "loss": 0.6367,
827
+ "step": 115
828
+ },
829
+ {
830
+ "epoch": 4.48,
831
+ "learning_rate": 1e-05,
832
+ "loss": 0.562,
833
+ "step": 116
834
+ },
835
+ {
836
+ "epoch": 4.52,
837
+ "learning_rate": 1e-05,
838
+ "loss": 0.5706,
839
+ "step": 117
840
+ },
841
+ {
842
+ "epoch": 4.56,
843
+ "learning_rate": 1e-05,
844
+ "loss": 0.5658,
845
+ "step": 118
846
+ },
847
+ {
848
+ "epoch": 4.6,
849
+ "learning_rate": 1e-05,
850
+ "loss": 0.549,
851
+ "step": 119
852
+ },
853
+ {
854
+ "epoch": 4.6,
855
+ "eval_loss": 0.7637890577316284,
856
+ "eval_runtime": 43.2782,
857
+ "eval_samples_per_second": 1.155,
858
+ "eval_steps_per_second": 0.578,
859
+ "step": 119
860
+ },
861
+ {
862
+ "epoch": 4.64,
863
+ "learning_rate": 1e-05,
864
+ "loss": 0.5286,
865
+ "step": 120
866
+ },
867
+ {
868
+ "epoch": 4.68,
869
+ "learning_rate": 1e-05,
870
+ "loss": 0.4961,
871
+ "step": 121
872
+ },
873
+ {
874
+ "epoch": 4.71,
875
+ "learning_rate": 1e-05,
876
+ "loss": 0.5203,
877
+ "step": 122
878
+ },
879
+ {
880
+ "epoch": 4.75,
881
+ "learning_rate": 1e-05,
882
+ "loss": 0.5485,
883
+ "step": 123
884
+ },
885
+ {
886
+ "epoch": 4.79,
887
+ "learning_rate": 1e-05,
888
+ "loss": 0.599,
889
+ "step": 124
890
+ },
891
+ {
892
+ "epoch": 4.83,
893
+ "learning_rate": 1e-05,
894
+ "loss": 0.4822,
895
+ "step": 125
896
+ },
897
+ {
898
+ "epoch": 4.83,
899
+ "step": 125,
900
+ "total_flos": 66871008165888.0,
901
+ "train_loss": 0.6458125,
902
+ "train_runtime": 5591.1555,
903
+ "train_samples_per_second": 1.11,
904
+ "train_steps_per_second": 0.022
905
+ }
906
+ ],
907
+ "logging_steps": 1.0,
908
+ "max_steps": 125,
909
+ "num_train_epochs": 5,
910
+ "save_steps": 13,
911
+ "total_flos": 66871008165888.0,
912
+ "trial_name": null,
913
+ "trial_params": null
914
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d66831c86105266f9455ffb7eae420d11ec18e07ef4907e28579ecf2fa873afd
3
+ size 6840