yyx123 committed
Commit 363cd93
1 Parent(s): 728493c

Model save

Files changed (5)
  1. README.md +26 -11
  2. all_results.json +11 -11
  3. eval_results.json +6 -6
  4. train_results.json +6 -6
  5. trainer_state.json +497 -39
README.md CHANGED
@@ -2,13 +2,9 @@
  license: other
  library_name: peft
  tags:
- - alignment-handbook
- - generated_from_trainer
  - trl
  - sft
  - generated_from_trainer
- datasets:
- - ruozhiba
  base_model: 01-ai/Yi-6B
  model-index:
  - name: Yi-6B-ruozhiba3
@@ -20,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->

  # Yi-6B-ruozhiba3

- This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the ruozhiba dataset.
  It achieves the following results on the evaluation set:
- - Loss: 2.0470

  ## Model description

@@ -41,20 +37,39 @@ More information needed
  ### Training hyperparameters

  The following hyperparameters were used during training:
- - learning_rate: 0.0002
- - train_batch_size: 1
- - eval_batch_size: 1
  - seed: 42
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: cosine
  - lr_scheduler_warmup_ratio: 0.1
- - num_epochs: 1

  ### Training results

  | Training Loss | Epoch | Step | Validation Loss |
  |:-------------:|:-----:|:----:|:---------------:|
- | 2.1518 | 1.0 | 20 | 2.0470 |


  ### Framework versions
 
  license: other
  library_name: peft
  tags:
  - trl
  - sft
  - generated_from_trainer
  base_model: 01-ai/Yi-6B
  model-index:
  - name: Yi-6B-ruozhiba3

  # Yi-6B-ruozhiba3

+ This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the None dataset.
  It achieves the following results on the evaluation set:
+ - Loss: 3.9909

  ## Model description

  ### Training hyperparameters

  The following hyperparameters were used during training:
+ - learning_rate: 5e-05
+ - train_batch_size: 4
+ - eval_batch_size: 4
  - seed: 42
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: cosine
  - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 20

  ### Training results

  | Training Loss | Epoch | Step | Validation Loss |
  |:-------------:|:-----:|:----:|:---------------:|
+ | 2.644 | 1.0 | 55 | 2.3047 |
+ | 1.9548 | 2.0 | 110 | 1.9419 |
+ | 1.788 | 3.0 | 165 | 1.9135 |
+ | 1.6342 | 4.0 | 220 | 1.9499 |
+ | 1.3781 | 5.0 | 275 | 2.1321 |
+ | 1.0617 | 6.0 | 330 | 2.3518 |
+ | 0.8104 | 7.0 | 385 | 2.6090 |
+ | 0.5864 | 8.0 | 440 | 2.8890 |
+ | 0.4159 | 9.0 | 495 | 3.1356 |
+ | 0.3344 | 10.0 | 550 | 3.3190 |
+ | 0.2446 | 11.0 | 605 | 3.5470 |
+ | 0.199 | 12.0 | 660 | 3.6840 |
+ | 0.1245 | 13.0 | 715 | 3.7653 |
+ | 0.1208 | 14.0 | 770 | 3.8722 |
+ | 0.1003 | 15.0 | 825 | 3.9575 |
+ | 0.0767 | 16.0 | 880 | 3.9671 |
+ | 0.0913 | 17.0 | 935 | 3.9921 |
+ | 0.0895 | 18.0 | 990 | 3.9940 |
+ | 0.0671 | 19.0 | 1045 | 3.9915 |
+ | 0.0671 | 20.0 | 1100 | 3.9909 |


  ### Framework versions
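
The updated hyperparameter list above corresponds to a standard transformers/TRL SFT configuration. As a rough illustration only (the training script is not part of this commit, and anything not listed in the card, such as `output_dir` or the evaluation strategy, is an assumption), the listed values would map to a `TrainingArguments` object roughly like this:

```python
# Rough reconstruction of the configuration listed in the card above.
# The actual training script is not in this commit; argument names and values
# not shown in the card (e.g. output_dir, evaluation_strategy) are assumptions.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="Yi-6B-ruozhiba3",    # assumed; matches the model name
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    seed=42,
    adam_beta1=0.9,                  # Adam with betas=(0.9, 0.999)
    adam_beta2=0.999,
    adam_epsilon=1e-8,               # and epsilon=1e-08
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=20,
    logging_steps=20,                # from trainer_state.json below
    save_steps=20,
    evaluation_strategy="epoch",     # eval loss is recorded once per epoch
)
```

The cosine schedule with 10% warmup matches the learning-rate trace recorded in trainer_state.json below: it peaks near 5e-05 around step 110-120 (roughly epoch 2) and decays to 0 at step 1100.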
all_results.json CHANGED
@@ -1,13 +1,13 @@
  {
- "epoch": 1.0,
- "eval_loss": 2.047029972076416,
- "eval_runtime": 6.0756,
- "eval_samples": 240,
- "eval_samples_per_second": 3.292,
- "eval_steps_per_second": 3.292,
- "train_loss": 2.146018850803375,
- "train_runtime": 37.3509,
- "train_samples": 240,
- "train_samples_per_second": 0.535,
- "train_steps_per_second": 0.535
  }

  {
+ "epoch": 20.0,
+ "eval_loss": 3.990852117538452,
+ "eval_runtime": 1.2395,
+ "eval_samples": 23,
+ "eval_samples_per_second": 18.556,
+ "eval_steps_per_second": 4.841,
+ "train_loss": 0.6849299516461113,
+ "train_runtime": 917.6848,
+ "train_samples": 217,
+ "train_samples_per_second": 4.729,
+ "train_steps_per_second": 1.199
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 1.0,
- "eval_loss": 2.047029972076416,
- "eval_runtime": 6.0756,
- "eval_samples": 240,
- "eval_samples_per_second": 3.292,
- "eval_steps_per_second": 3.292
  }

  {
+ "epoch": 20.0,
+ "eval_loss": 3.990852117538452,
+ "eval_runtime": 1.2395,
+ "eval_samples": 23,
+ "eval_samples_per_second": 18.556,
+ "eval_steps_per_second": 4.841
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 1.0,
- "train_loss": 2.146018850803375,
- "train_runtime": 37.3509,
- "train_samples": 240,
- "train_samples_per_second": 0.535,
- "train_steps_per_second": 0.535
  }

  {
+ "epoch": 20.0,
+ "train_loss": 0.6849299516461113,
+ "train_runtime": 917.6848,
+ "train_samples": 217,
+ "train_samples_per_second": 4.729,
+ "train_steps_per_second": 1.199
  }
trainer_state.json CHANGED
@@ -1,68 +1,526 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 1.0,
  "eval_steps": 500,
- "global_step": 20,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
- "epoch": 0.05,
- "learning_rate": 0.0001,
- "loss": 2.1566,
  "step": 1
  },
  {
- "epoch": 0.25,
- "learning_rate": 0.00018660254037844388,
- "loss": 2.2312,
- "step": 5
  },
  {
- "epoch": 0.5,
- "learning_rate": 0.00011736481776669306,
- "loss": 2.0678,
- "step": 10
  },
  {
- "epoch": 0.75,
- "learning_rate": 3.5721239031346066e-05,
- "loss": 2.1481,
- "step": 15
  },
  {
- "epoch": 1.0,
  "learning_rate": 0.0,
- "loss": 2.1518,
- "step": 20
  },
  {
- "epoch": 1.0,
- "eval_loss": 2.047029972076416,
- "eval_runtime": 6.0766,
- "eval_samples_per_second": 3.291,
- "eval_steps_per_second": 3.291,
- "step": 20
  },
  {
- "epoch": 1.0,
- "step": 20,
- "total_flos": 1434058374512640.0,
- "train_loss": 2.146018850803375,
- "train_runtime": 37.3509,
- "train_samples_per_second": 0.535,
- "train_steps_per_second": 0.535
  }
  ],
- "logging_steps": 5,
- "max_steps": 20,
  "num_input_tokens_seen": 0,
- "num_train_epochs": 1,
- "save_steps": 100,
- "total_flos": 1434058374512640.0,
- "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
  }
 
  {
  "best_metric": null,
  "best_model_checkpoint": null,
+ "epoch": 20.0,
  "eval_steps": 500,
+ "global_step": 1100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
+ "epoch": 0.02,
+ "learning_rate": 4.545454545454545e-07,
+ "loss": 2.5611,
  "step": 1
  },
  {
+ "epoch": 0.36,
+ "learning_rate": 9.090909090909091e-06,
+ "loss": 2.5692,
+ "step": 20
  },
  {
+ "epoch": 0.73,
+ "learning_rate": 1.8181818181818182e-05,
+ "loss": 2.644,
+ "step": 40
  },
  {
+ "epoch": 1.0,
+ "eval_loss": 2.304738759994507,
+ "eval_runtime": 1.217,
+ "eval_samples_per_second": 18.899,
+ "eval_steps_per_second": 4.93,
+ "step": 55
  },
  {
+ "epoch": 1.09,
+ "learning_rate": 2.7272727272727273e-05,
+ "loss": 2.3827,
+ "step": 60
+ },
+ {
+ "epoch": 1.45,
+ "learning_rate": 3.6363636363636364e-05,
+ "loss": 2.0781,
+ "step": 80
+ },
+ {
+ "epoch": 1.82,
+ "learning_rate": 4.545454545454546e-05,
+ "loss": 1.9548,
+ "step": 100
+ },
+ {
+ "epoch": 2.0,
+ "eval_loss": 1.9419394731521606,
+ "eval_runtime": 1.2171,
+ "eval_samples_per_second": 18.898,
+ "eval_steps_per_second": 4.93,
+ "step": 110
+ },
+ {
+ "epoch": 2.18,
+ "learning_rate": 4.9987413559579636e-05,
+ "loss": 1.8022,
+ "step": 120
+ },
+ {
+ "epoch": 2.55,
+ "learning_rate": 4.988679806432712e-05,
+ "loss": 1.8295,
+ "step": 140
+ },
+ {
+ "epoch": 2.91,
+ "learning_rate": 4.968597221690986e-05,
+ "loss": 1.788,
+ "step": 160
+ },
+ {
+ "epoch": 3.0,
+ "eval_loss": 1.9134601354599,
+ "eval_runtime": 1.2132,
+ "eval_samples_per_second": 18.958,
+ "eval_steps_per_second": 4.946,
+ "step": 165
+ },
+ {
+ "epoch": 3.27,
+ "learning_rate": 4.938574467213518e-05,
+ "loss": 1.6784,
+ "step": 180
+ },
+ {
+ "epoch": 3.64,
+ "learning_rate": 4.898732434036244e-05,
+ "loss": 1.6528,
+ "step": 200
+ },
+ {
+ "epoch": 4.0,
+ "learning_rate": 4.849231551964771e-05,
+ "loss": 1.6342,
+ "step": 220
+ },
+ {
+ "epoch": 4.0,
+ "eval_loss": 1.9498955011367798,
+ "eval_runtime": 1.2502,
+ "eval_samples_per_second": 18.397,
+ "eval_steps_per_second": 4.799,
+ "step": 220
+ },
+ {
+ "epoch": 4.36,
+ "learning_rate": 4.790271143580174e-05,
+ "loss": 1.3818,
+ "step": 240
+ },
+ {
+ "epoch": 4.73,
+ "learning_rate": 4.722088621637309e-05,
+ "loss": 1.3781,
+ "step": 260
+ },
+ {
+ "epoch": 5.0,
+ "eval_loss": 2.1321451663970947,
+ "eval_runtime": 1.2118,
+ "eval_samples_per_second": 18.98,
+ "eval_steps_per_second": 4.951,
+ "step": 275
+ },
+ {
+ "epoch": 5.09,
+ "learning_rate": 4.644958533087443e-05,
+ "loss": 1.2512,
+ "step": 280
+ },
+ {
+ "epoch": 5.45,
+ "learning_rate": 4.559191453574582e-05,
+ "loss": 1.0475,
+ "step": 300
+ },
+ {
+ "epoch": 5.82,
+ "learning_rate": 4.465132736856969e-05,
+ "loss": 1.0617,
+ "step": 320
+ },
+ {
+ "epoch": 6.0,
+ "eval_loss": 2.3518364429473877,
+ "eval_runtime": 1.216,
+ "eval_samples_per_second": 18.914,
+ "eval_steps_per_second": 4.934,
+ "step": 330
+ },
+ {
+ "epoch": 6.18,
+ "learning_rate": 4.3631611241893874e-05,
+ "loss": 0.9003,
+ "step": 340
+ },
+ {
+ "epoch": 6.55,
+ "learning_rate": 4.2536872192658036e-05,
+ "loss": 0.7805,
+ "step": 360
+ },
+ {
+ "epoch": 6.91,
+ "learning_rate": 4.137151834863213e-05,
+ "loss": 0.8104,
+ "step": 380
+ },
+ {
+ "epoch": 7.0,
+ "eval_loss": 2.609004259109497,
+ "eval_runtime": 1.2167,
+ "eval_samples_per_second": 18.904,
+ "eval_steps_per_second": 4.932,
+ "step": 385
+ },
+ {
+ "epoch": 7.27,
+ "learning_rate": 4.014024217844167e-05,
+ "loss": 0.6542,
+ "step": 400
+ },
+ {
+ "epoch": 7.64,
+ "learning_rate": 3.884800159665276e-05,
+ "loss": 0.5753,
+ "step": 420
+ },
+ {
+ "epoch": 8.0,
+ "learning_rate": 3.7500000000000003e-05,
+ "loss": 0.5864,
+ "step": 440
+ },
+ {
+ "epoch": 8.0,
+ "eval_loss": 2.8889544010162354,
+ "eval_runtime": 1.2434,
+ "eval_samples_per_second": 18.498,
+ "eval_steps_per_second": 4.825,
+ "step": 440
+ },
+ {
+ "epoch": 8.36,
+ "learning_rate": 3.610166531514436e-05,
+ "loss": 0.4181,
+ "step": 460
+ },
+ {
+ "epoch": 8.73,
+ "learning_rate": 3.465862814232822e-05,
+ "loss": 0.4159,
+ "step": 480
+ },
+ {
+ "epoch": 9.0,
+ "eval_loss": 3.1356287002563477,
+ "eval_runtime": 1.2192,
+ "eval_samples_per_second": 18.865,
+ "eval_steps_per_second": 4.921,
+ "step": 495
+ },
+ {
+ "epoch": 9.09,
+ "learning_rate": 3.3176699082935545e-05,
+ "loss": 0.4188,
+ "step": 500
+ },
+ {
+ "epoch": 9.45,
+ "learning_rate": 3.166184534225087e-05,
+ "loss": 0.3131,
+ "step": 520
+ },
+ {
+ "epoch": 9.82,
+ "learning_rate": 3.012016670162977e-05,
+ "loss": 0.3344,
+ "step": 540
+ },
+ {
+ "epoch": 10.0,
+ "eval_loss": 3.3189520835876465,
+ "eval_runtime": 1.2131,
+ "eval_samples_per_second": 18.96,
+ "eval_steps_per_second": 4.946,
+ "step": 550
+ },
+ {
+ "epoch": 10.18,
+ "learning_rate": 2.8557870956832132e-05,
+ "loss": 0.3005,
+ "step": 560
+ },
+ {
+ "epoch": 10.55,
+ "learning_rate": 2.698124892141971e-05,
+ "loss": 0.2527,
+ "step": 580
+ },
+ {
+ "epoch": 10.91,
+ "learning_rate": 2.5396649095870202e-05,
+ "loss": 0.2446,
+ "step": 600
+ },
+ {
+ "epoch": 11.0,
+ "eval_loss": 3.5470495223999023,
+ "eval_runtime": 1.216,
+ "eval_samples_per_second": 18.914,
+ "eval_steps_per_second": 4.934,
+ "step": 605
+ },
+ {
+ "epoch": 11.27,
+ "learning_rate": 2.3810452104406444e-05,
+ "loss": 0.1855,
+ "step": 620
+ },
+ {
+ "epoch": 11.64,
+ "learning_rate": 2.222904500247473e-05,
+ "loss": 0.1745,
+ "step": 640
+ },
+ {
+ "epoch": 12.0,
+ "learning_rate": 2.0658795558326743e-05,
+ "loss": 0.199,
+ "step": 660
+ },
+ {
+ "epoch": 12.0,
+ "eval_loss": 3.68398118019104,
+ "eval_runtime": 1.2409,
+ "eval_samples_per_second": 18.535,
+ "eval_steps_per_second": 4.835,
+ "step": 660
+ },
+ {
+ "epoch": 12.36,
+ "learning_rate": 1.9106026612264316e-05,
+ "loss": 0.1455,
+ "step": 680
+ },
+ {
+ "epoch": 12.73,
+ "learning_rate": 1.7576990616793137e-05,
+ "loss": 0.1245,
+ "step": 700
+ },
+ {
+ "epoch": 13.0,
+ "eval_loss": 3.765277147293091,
+ "eval_runtime": 1.2162,
+ "eval_samples_per_second": 18.911,
+ "eval_steps_per_second": 4.933,
+ "step": 715
+ },
+ {
+ "epoch": 13.09,
+ "learning_rate": 1.6077844460203206e-05,
+ "loss": 0.1351,
+ "step": 720
+ },
+ {
+ "epoch": 13.45,
+ "learning_rate": 1.4614624674952842e-05,
+ "loss": 0.0967,
+ "step": 740
+ },
+ {
+ "epoch": 13.82,
+ "learning_rate": 1.3193223130682936e-05,
+ "loss": 0.1208,
+ "step": 760
+ },
+ {
+ "epoch": 14.0,
+ "eval_loss": 3.8721702098846436,
+ "eval_runtime": 1.2167,
+ "eval_samples_per_second": 18.903,
+ "eval_steps_per_second": 4.931,
+ "step": 770
+ },
+ {
+ "epoch": 14.18,
+ "learning_rate": 1.181936330973744e-05,
+ "loss": 0.0853,
+ "step": 780
+ },
+ {
+ "epoch": 14.55,
+ "learning_rate": 1.049857726072005e-05,
+ "loss": 0.0854,
+ "step": 800
+ },
+ {
+ "epoch": 14.91,
+ "learning_rate": 9.236183322886945e-06,
+ "loss": 0.1003,
+ "step": 820
+ },
+ {
+ "epoch": 15.0,
+ "eval_loss": 3.9574601650238037,
+ "eval_runtime": 1.2205,
+ "eval_samples_per_second": 18.844,
+ "eval_steps_per_second": 4.916,
+ "step": 825
+ },
+ {
+ "epoch": 15.27,
+ "learning_rate": 8.0372647110717e-06,
+ "loss": 0.0753,
+ "step": 840
+ },
+ {
+ "epoch": 15.64,
+ "learning_rate": 6.906649047373246e-06,
+ "loss": 0.0928,
+ "step": 860
+ },
+ {
+ "epoch": 16.0,
+ "learning_rate": 5.848888922025553e-06,
+ "loss": 0.0767,
+ "step": 880
+ },
+ {
+ "epoch": 16.0,
+ "eval_loss": 3.9671382904052734,
+ "eval_runtime": 1.2561,
+ "eval_samples_per_second": 18.311,
+ "eval_steps_per_second": 4.777,
+ "step": 880
+ },
+ {
+ "epoch": 16.36,
+ "learning_rate": 4.868243561723535e-06,
+ "loss": 0.0702,
+ "step": 900
+ },
+ {
+ "epoch": 16.73,
+ "learning_rate": 3.968661679220468e-06,
+ "loss": 0.0913,
+ "step": 920
+ },
+ {
+ "epoch": 17.0,
+ "eval_loss": 3.9921278953552246,
+ "eval_runtime": 1.2166,
+ "eval_samples_per_second": 18.905,
+ "eval_steps_per_second": 4.932,
+ "step": 935
+ },
+ {
+ "epoch": 17.09,
+ "learning_rate": 3.1537655732553768e-06,
+ "loss": 0.0698,
+ "step": 940
+ },
+ {
+ "epoch": 17.45,
+ "learning_rate": 2.4268365428344736e-06,
+ "loss": 0.0661,
+ "step": 960
+ },
+ {
+ "epoch": 17.82,
+ "learning_rate": 1.790801674598186e-06,
+ "loss": 0.0895,
+ "step": 980
+ },
+ {
+ "epoch": 18.0,
+ "eval_loss": 3.9939558506011963,
+ "eval_runtime": 1.2161,
+ "eval_samples_per_second": 18.913,
+ "eval_steps_per_second": 4.934,
+ "step": 990
+ },
+ {
+ "epoch": 18.18,
+ "learning_rate": 1.248222056476367e-06,
+ "loss": 0.0695,
+ "step": 1000
+ },
+ {
+ "epoch": 18.55,
+ "learning_rate": 8.012824650910938e-07,
+ "loss": 0.086,
+ "step": 1020
+ },
+ {
+ "epoch": 18.91,
+ "learning_rate": 4.517825684323324e-07,
+ "loss": 0.0671,
+ "step": 1040
+ },
+ {
+ "epoch": 19.0,
+ "eval_loss": 3.9915316104888916,
+ "eval_runtime": 1.2143,
+ "eval_samples_per_second": 18.941,
+ "eval_steps_per_second": 4.941,
+ "step": 1045
+ },
+ {
+ "epoch": 19.27,
+ "learning_rate": 2.011296792301165e-07,
+ "loss": 0.0681,
+ "step": 1060
+ },
+ {
+ "epoch": 19.64,
+ "learning_rate": 5.033308820289184e-08,
+ "loss": 0.09,
+ "step": 1080
+ },
+ {
+ "epoch": 20.0,
  "learning_rate": 0.0,
+ "loss": 0.0671,
+ "step": 1100
  },
  {
+ "epoch": 20.0,
+ "eval_loss": 3.990852117538452,
+ "eval_runtime": 1.2506,
+ "eval_samples_per_second": 18.391,
+ "eval_steps_per_second": 4.798,
+ "step": 1100
  },
  {
+ "epoch": 20.0,
+ "step": 1100,
+ "total_flos": 3.807078373542298e+16,
+ "train_loss": 0.6849299516461113,
+ "train_runtime": 917.6848,
+ "train_samples_per_second": 4.729,
+ "train_steps_per_second": 1.199
  }
  ],
+ "logging_steps": 20,
+ "max_steps": 1100,
  "num_input_tokens_seen": 0,
+ "num_train_epochs": 20,
+ "save_steps": 20,
+ "total_flos": 3.807078373542298e+16,
+ "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
  }
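
Since the repository stores a PEFT adapter (library_name: peft) rather than full model weights, downstream use means loading the adapter on top of the 01-ai/Yi-6B base model. A minimal inference sketch, assuming the adapter is published as yyx123/Yi-6B-ruozhiba3 (a repo id inferred from the committer and model name, not confirmed by this commit) and that `peft` and `transformers` are installed:

```python
# Hypothetical inference sketch: load the PEFT adapter on top of the base model.
# The adapter repo id below is an assumption; substitute the actual path if it differs.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model = AutoModelForCausalLM.from_pretrained("01-ai/Yi-6B", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("01-ai/Yi-6B")

# Attach the fine-tuned adapter saved by this training run (assumed repo id).
model = PeftModel.from_pretrained(base_model, "yyx123/Yi-6B-ruozhiba3")
model.eval()

prompt = "Example prompt"  # placeholder input
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```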