tiagoblima commited on
Commit
7b3f575
1 Parent(s): bd7bcda

End of training

Browse files
Files changed (5) hide show
  1. README.md +4 -2
  2. all_results.json +8 -8
  3. eval_results.json +4 -4
  4. train_results.json +4 -4
  5. trainer_state.json +883 -157
README.md CHANGED
@@ -3,6 +3,8 @@ license: mit
3
  base_model: unicamp-dl/ptt5-base-t5-vocab
4
  tags:
5
  - generated_from_trainer
 
 
6
  model-index:
7
  - name: t5_base-qg-ap-nopeft
8
  results: []
@@ -13,9 +15,9 @@ should probably proofread and complete it, then remove this comment. -->
13
 
14
  # t5_base-qg-ap-nopeft
15
 
16
- This model is a fine-tuned version of [unicamp-dl/ptt5-base-t5-vocab](https://huggingface.co/unicamp-dl/ptt5-base-t5-vocab) on an unknown dataset.
17
  It achieves the following results on the evaluation set:
18
- - Loss: 1.2074
19
 
20
  ## Model description
21
 
 
3
  base_model: unicamp-dl/ptt5-base-t5-vocab
4
  tags:
5
  - generated_from_trainer
6
+ datasets:
7
+ - tiagoblima/qg_squad_v1_pt
8
  model-index:
9
  - name: t5_base-qg-ap-nopeft
10
  results: []
 
15
 
16
  # t5_base-qg-ap-nopeft
17
 
18
+ This model is a fine-tuned version of [unicamp-dl/ptt5-base-t5-vocab](https://huggingface.co/unicamp-dl/ptt5-base-t5-vocab) on the tiagoblima/qg_squad_v1_pt dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 1.1980
21
 
22
  ## Model description
23
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_loss": 1.2066526412963867,
4
- "eval_runtime": 62.6318,
5
  "eval_samples": 3585,
6
- "eval_samples_per_second": 57.239,
7
- "eval_steps_per_second": 14.322,
8
- "train_loss": 1.1255827186131242,
9
- "train_runtime": 15011.7488,
10
  "train_samples": 51704,
11
- "train_samples_per_second": 17.221,
12
- "train_steps_per_second": 0.269
13
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_loss": 1.1979700326919556,
4
+ "eval_runtime": 100.8452,
5
  "eval_samples": 3585,
6
+ "eval_samples_per_second": 35.55,
7
+ "eval_steps_per_second": 8.895,
8
+ "train_loss": 1.0387100059157441,
9
+ "train_runtime": 30224.2981,
10
  "train_samples": 51704,
11
+ "train_samples_per_second": 8.553,
12
+ "train_steps_per_second": 0.535
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_loss": 1.2066526412963867,
4
- "eval_runtime": 62.6318,
5
  "eval_samples": 3585,
6
- "eval_samples_per_second": 57.239,
7
- "eval_steps_per_second": 14.322
8
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_loss": 1.1979700326919556,
4
+ "eval_runtime": 100.8452,
5
  "eval_samples": 3585,
6
+ "eval_samples_per_second": 35.55,
7
+ "eval_steps_per_second": 8.895
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
- "train_loss": 1.1255827186131242,
4
- "train_runtime": 15011.7488,
5
  "train_samples": 51704,
6
- "train_samples_per_second": 17.221,
7
- "train_steps_per_second": 0.269
8
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "train_loss": 1.0387100059157441,
4
+ "train_runtime": 30224.2981,
5
  "train_samples": 51704,
6
+ "train_samples_per_second": 8.553,
7
+ "train_steps_per_second": 0.535
8
  }
trainer_state.json CHANGED
@@ -1,308 +1,1034 @@
1
  {
2
- "best_metric": 1.2066526412963867,
3
- "best_model_checkpoint": "/temp/t5_base-qg-ap-nopeft/checkpoint-3232",
4
- "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 4040,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.12,
13
- "learning_rate": 9.752475247524753e-05,
14
- "loss": 1.9421,
15
  "step": 100
16
  },
17
  {
18
- "epoch": 0.25,
19
- "learning_rate": 9.504950495049505e-05,
20
- "loss": 1.3367,
21
  "step": 200
22
  },
23
  {
24
- "epoch": 0.37,
25
- "learning_rate": 9.257425742574259e-05,
26
- "loss": 1.2944,
27
  "step": 300
28
  },
29
  {
30
- "epoch": 0.5,
31
- "learning_rate": 9.009900990099011e-05,
32
- "loss": 1.267,
33
  "step": 400
34
  },
35
  {
36
- "epoch": 0.62,
37
- "learning_rate": 8.762376237623763e-05,
38
- "loss": 1.2404,
39
  "step": 500
40
  },
41
  {
42
- "epoch": 0.74,
43
- "learning_rate": 8.514851485148515e-05,
44
- "loss": 1.2277,
45
  "step": 600
46
  },
47
  {
48
- "epoch": 0.87,
49
- "learning_rate": 8.267326732673268e-05,
50
- "loss": 1.2245,
51
  "step": 700
52
  },
53
  {
54
- "epoch": 0.99,
55
- "learning_rate": 8.019801980198021e-05,
56
- "loss": 1.2123,
57
  "step": 800
58
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  {
60
  "epoch": 1.0,
61
- "eval_loss": 1.24958074092865,
62
- "eval_runtime": 62.4161,
63
- "eval_samples_per_second": 57.437,
64
- "eval_steps_per_second": 14.371,
65
- "step": 808
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  },
67
  {
68
  "epoch": 1.11,
69
- "learning_rate": 7.772277227722773e-05,
70
- "loss": 1.1527,
71
- "step": 900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  },
73
  {
74
  "epoch": 1.24,
75
- "learning_rate": 7.524752475247526e-05,
76
- "loss": 1.1486,
77
- "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  },
79
  {
80
  "epoch": 1.36,
81
- "learning_rate": 7.277227722772278e-05,
82
- "loss": 1.1403,
83
- "step": 1100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  },
85
  {
86
  "epoch": 1.49,
87
- "learning_rate": 7.02970297029703e-05,
88
- "loss": 1.1483,
89
- "step": 1200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  },
91
  {
92
  "epoch": 1.61,
93
- "learning_rate": 6.782178217821783e-05,
94
- "loss": 1.1423,
95
- "step": 1300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  },
97
  {
98
  "epoch": 1.73,
99
- "learning_rate": 6.534653465346535e-05,
100
- "loss": 1.1231,
101
- "step": 1400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  },
103
  {
104
  "epoch": 1.86,
105
- "learning_rate": 6.287128712871287e-05,
106
- "loss": 1.1312,
107
- "step": 1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  },
109
  {
110
  "epoch": 1.98,
111
- "learning_rate": 6.03960396039604e-05,
112
- "loss": 1.1329,
113
- "step": 1600
114
  },
115
  {
116
  "epoch": 2.0,
117
- "eval_loss": 1.2206790447235107,
118
- "eval_runtime": 62.4282,
119
- "eval_samples_per_second": 57.426,
120
- "eval_steps_per_second": 14.369,
121
- "step": 1616
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  },
123
  {
124
  "epoch": 2.1,
125
- "learning_rate": 5.792079207920792e-05,
126
- "loss": 1.0849,
127
- "step": 1700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  },
129
  {
130
  "epoch": 2.23,
131
- "learning_rate": 5.544554455445545e-05,
132
- "loss": 1.0893,
133
- "step": 1800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  },
135
  {
136
  "epoch": 2.35,
137
- "learning_rate": 5.2970297029702974e-05,
138
- "loss": 1.0767,
139
- "step": 1900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  },
141
  {
142
  "epoch": 2.48,
143
- "learning_rate": 5.0495049504950497e-05,
144
- "loss": 1.0773,
145
- "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  },
147
  {
148
  "epoch": 2.6,
149
- "learning_rate": 4.801980198019802e-05,
150
- "loss": 1.0791,
151
- "step": 2100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  },
153
  {
154
  "epoch": 2.72,
155
- "learning_rate": 4.554455445544555e-05,
156
- "loss": 1.0749,
157
- "step": 2200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  },
159
  {
160
  "epoch": 2.85,
161
- "learning_rate": 4.306930693069307e-05,
162
- "loss": 1.0918,
163
- "step": 2300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  },
165
  {
166
  "epoch": 2.97,
167
- "learning_rate": 4.05940594059406e-05,
168
- "loss": 1.0819,
169
- "step": 2400
170
  },
171
  {
172
  "epoch": 3.0,
173
- "eval_loss": 1.2097089290618896,
174
- "eval_runtime": 62.4812,
175
- "eval_samples_per_second": 57.377,
176
- "eval_steps_per_second": 14.356,
177
- "step": 2424
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  },
179
  {
180
  "epoch": 3.09,
181
- "learning_rate": 3.811881188118812e-05,
182
- "loss": 1.0497,
183
- "step": 2500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  },
185
  {
186
  "epoch": 3.22,
187
- "learning_rate": 3.5643564356435645e-05,
188
- "loss": 1.0372,
189
- "step": 2600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  },
191
  {
192
  "epoch": 3.34,
193
- "learning_rate": 3.3168316831683175e-05,
194
- "loss": 1.0545,
195
- "step": 2700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  },
197
  {
198
  "epoch": 3.47,
199
- "learning_rate": 3.06930693069307e-05,
200
- "loss": 1.0511,
201
- "step": 2800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  },
203
  {
204
  "epoch": 3.59,
205
- "learning_rate": 2.8217821782178216e-05,
206
- "loss": 1.0473,
207
- "step": 2900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  },
209
  {
210
  "epoch": 3.71,
211
- "learning_rate": 2.5742574257425746e-05,
212
- "loss": 1.0321,
213
- "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  },
215
  {
216
  "epoch": 3.84,
217
- "learning_rate": 2.326732673267327e-05,
218
- "loss": 1.0439,
219
- "step": 3100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  },
221
  {
222
  "epoch": 3.96,
223
- "learning_rate": 2.079207920792079e-05,
224
- "loss": 1.0447,
225
- "step": 3200
 
 
 
 
 
 
226
  },
227
  {
228
  "epoch": 4.0,
229
- "eval_loss": 1.2066526412963867,
230
- "eval_runtime": 62.569,
231
- "eval_samples_per_second": 57.297,
232
- "eval_steps_per_second": 14.336,
233
- "step": 3232
 
 
 
 
 
 
 
 
 
 
 
 
234
  },
235
  {
236
  "epoch": 4.08,
237
- "learning_rate": 1.8316831683168317e-05,
238
- "loss": 1.0334,
239
- "step": 3300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  },
241
  {
242
  "epoch": 4.21,
243
- "learning_rate": 1.5841584158415843e-05,
244
- "loss": 1.0142,
245
- "step": 3400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  },
247
  {
248
  "epoch": 4.33,
249
- "learning_rate": 1.3366336633663367e-05,
250
- "loss": 1.0318,
251
- "step": 3500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  },
253
  {
254
  "epoch": 4.46,
255
- "learning_rate": 1.0891089108910891e-05,
256
- "loss": 1.0127,
257
- "step": 3600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  },
259
  {
260
  "epoch": 4.58,
261
- "learning_rate": 8.415841584158417e-06,
262
- "loss": 1.0219,
263
- "step": 3700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  },
265
  {
266
  "epoch": 4.7,
267
- "learning_rate": 5.940594059405941e-06,
268
- "loss": 1.0135,
269
- "step": 3800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  },
271
  {
272
  "epoch": 4.83,
273
- "learning_rate": 3.4653465346534657e-06,
274
- "loss": 1.0245,
275
- "step": 3900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  },
277
  {
278
  "epoch": 4.95,
279
- "learning_rate": 9.900990099009902e-07,
280
- "loss": 1.0244,
281
- "step": 4000
 
 
 
 
 
 
282
  },
283
  {
284
  "epoch": 5.0,
285
- "eval_loss": 1.2074495553970337,
286
- "eval_runtime": 62.6556,
287
- "eval_samples_per_second": 57.218,
288
- "eval_steps_per_second": 14.316,
289
- "step": 4040
290
  },
291
  {
292
  "epoch": 5.0,
293
- "step": 4040,
294
- "total_flos": 1.574277938675712e+17,
295
- "train_loss": 1.1255827186131242,
296
- "train_runtime": 15011.7488,
297
- "train_samples_per_second": 17.221,
298
- "train_steps_per_second": 0.269
299
  }
300
  ],
301
  "logging_steps": 100,
302
- "max_steps": 4040,
303
  "num_train_epochs": 5,
304
  "save_steps": 500,
305
- "total_flos": 1.574277938675712e+17,
306
  "trial_name": null,
307
  "trial_params": null
308
  }
 
1
  {
2
+ "best_metric": 1.1979700326919556,
3
+ "best_model_checkpoint": "/temp/t5_base-qg-ap-nopeft/checkpoint-9694",
4
+ "epoch": 4.9992263654649545,
5
  "eval_steps": 500,
6
+ "global_step": 16155,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03,
13
+ "learning_rate": 9.938099659548128e-05,
14
+ "loss": 2.1468,
15
  "step": 100
16
  },
17
  {
18
+ "epoch": 0.06,
19
+ "learning_rate": 9.876199319096255e-05,
20
+ "loss": 1.4194,
21
  "step": 200
22
  },
23
  {
24
+ "epoch": 0.09,
25
+ "learning_rate": 9.814298978644383e-05,
26
+ "loss": 1.3613,
27
  "step": 300
28
  },
29
  {
30
+ "epoch": 0.12,
31
+ "learning_rate": 9.752398638192511e-05,
32
+ "loss": 1.3139,
33
  "step": 400
34
  },
35
  {
36
+ "epoch": 0.15,
37
+ "learning_rate": 9.690498297740637e-05,
38
+ "loss": 1.3216,
39
  "step": 500
40
  },
41
  {
42
+ "epoch": 0.19,
43
+ "learning_rate": 9.628597957288765e-05,
44
+ "loss": 1.286,
45
  "step": 600
46
  },
47
  {
48
+ "epoch": 0.22,
49
+ "learning_rate": 9.566697616836893e-05,
50
+ "loss": 1.2877,
51
  "step": 700
52
  },
53
  {
54
+ "epoch": 0.25,
55
+ "learning_rate": 9.50479727638502e-05,
56
+ "loss": 1.246,
57
  "step": 800
58
  },
59
+ {
60
+ "epoch": 0.28,
61
+ "learning_rate": 9.442896935933148e-05,
62
+ "loss": 1.2309,
63
+ "step": 900
64
+ },
65
+ {
66
+ "epoch": 0.31,
67
+ "learning_rate": 9.380996595481276e-05,
68
+ "loss": 1.2611,
69
+ "step": 1000
70
+ },
71
+ {
72
+ "epoch": 0.34,
73
+ "learning_rate": 9.319096255029403e-05,
74
+ "loss": 1.247,
75
+ "step": 1100
76
+ },
77
+ {
78
+ "epoch": 0.37,
79
+ "learning_rate": 9.257195914577531e-05,
80
+ "loss": 1.2398,
81
+ "step": 1200
82
+ },
83
+ {
84
+ "epoch": 0.4,
85
+ "learning_rate": 9.195295574125658e-05,
86
+ "loss": 1.2403,
87
+ "step": 1300
88
+ },
89
+ {
90
+ "epoch": 0.43,
91
+ "learning_rate": 9.133395233673786e-05,
92
+ "loss": 1.2324,
93
+ "step": 1400
94
+ },
95
+ {
96
+ "epoch": 0.46,
97
+ "learning_rate": 9.071494893221914e-05,
98
+ "loss": 1.2069,
99
+ "step": 1500
100
+ },
101
+ {
102
+ "epoch": 0.5,
103
+ "learning_rate": 9.00959455277004e-05,
104
+ "loss": 1.2273,
105
+ "step": 1600
106
+ },
107
+ {
108
+ "epoch": 0.53,
109
+ "learning_rate": 8.947694212318168e-05,
110
+ "loss": 1.2199,
111
+ "step": 1700
112
+ },
113
+ {
114
+ "epoch": 0.56,
115
+ "learning_rate": 8.885793871866296e-05,
116
+ "loss": 1.1963,
117
+ "step": 1800
118
+ },
119
+ {
120
+ "epoch": 0.59,
121
+ "learning_rate": 8.823893531414423e-05,
122
+ "loss": 1.2081,
123
+ "step": 1900
124
+ },
125
+ {
126
+ "epoch": 0.62,
127
+ "learning_rate": 8.76199319096255e-05,
128
+ "loss": 1.1895,
129
+ "step": 2000
130
+ },
131
+ {
132
+ "epoch": 0.65,
133
+ "learning_rate": 8.700092850510679e-05,
134
+ "loss": 1.203,
135
+ "step": 2100
136
+ },
137
+ {
138
+ "epoch": 0.68,
139
+ "learning_rate": 8.638192510058805e-05,
140
+ "loss": 1.1738,
141
+ "step": 2200
142
+ },
143
+ {
144
+ "epoch": 0.71,
145
+ "learning_rate": 8.576292169606933e-05,
146
+ "loss": 1.187,
147
+ "step": 2300
148
+ },
149
+ {
150
+ "epoch": 0.74,
151
+ "learning_rate": 8.514391829155061e-05,
152
+ "loss": 1.1857,
153
+ "step": 2400
154
+ },
155
+ {
156
+ "epoch": 0.77,
157
+ "learning_rate": 8.452491488703188e-05,
158
+ "loss": 1.169,
159
+ "step": 2500
160
+ },
161
+ {
162
+ "epoch": 0.8,
163
+ "learning_rate": 8.390591148251315e-05,
164
+ "loss": 1.2174,
165
+ "step": 2600
166
+ },
167
+ {
168
+ "epoch": 0.84,
169
+ "learning_rate": 8.328690807799443e-05,
170
+ "loss": 1.1737,
171
+ "step": 2700
172
+ },
173
+ {
174
+ "epoch": 0.87,
175
+ "learning_rate": 8.26679046734757e-05,
176
+ "loss": 1.174,
177
+ "step": 2800
178
+ },
179
+ {
180
+ "epoch": 0.9,
181
+ "learning_rate": 8.204890126895698e-05,
182
+ "loss": 1.1709,
183
+ "step": 2900
184
+ },
185
+ {
186
+ "epoch": 0.93,
187
+ "learning_rate": 8.142989786443826e-05,
188
+ "loss": 1.1882,
189
+ "step": 3000
190
+ },
191
+ {
192
+ "epoch": 0.96,
193
+ "learning_rate": 8.081089445991952e-05,
194
+ "loss": 1.1793,
195
+ "step": 3100
196
+ },
197
+ {
198
+ "epoch": 0.99,
199
+ "learning_rate": 8.019189105540082e-05,
200
+ "loss": 1.1634,
201
+ "step": 3200
202
+ },
203
  {
204
  "epoch": 1.0,
205
+ "eval_loss": 1.224627137184143,
206
+ "eval_runtime": 100.7601,
207
+ "eval_samples_per_second": 35.58,
208
+ "eval_steps_per_second": 8.902,
209
+ "step": 3231
210
+ },
211
+ {
212
+ "epoch": 1.02,
213
+ "learning_rate": 7.95728876508821e-05,
214
+ "loss": 1.101,
215
+ "step": 3300
216
+ },
217
+ {
218
+ "epoch": 1.05,
219
+ "learning_rate": 7.895388424636336e-05,
220
+ "loss": 1.0723,
221
+ "step": 3400
222
+ },
223
+ {
224
+ "epoch": 1.08,
225
+ "learning_rate": 7.833488084184464e-05,
226
+ "loss": 1.0846,
227
+ "step": 3500
228
  },
229
  {
230
  "epoch": 1.11,
231
+ "learning_rate": 7.771587743732592e-05,
232
+ "loss": 1.0802,
233
+ "step": 3600
234
+ },
235
+ {
236
+ "epoch": 1.14,
237
+ "learning_rate": 7.709687403280718e-05,
238
+ "loss": 1.0806,
239
+ "step": 3700
240
+ },
241
+ {
242
+ "epoch": 1.18,
243
+ "learning_rate": 7.647787062828846e-05,
244
+ "loss": 1.0844,
245
+ "step": 3800
246
+ },
247
+ {
248
+ "epoch": 1.21,
249
+ "learning_rate": 7.585886722376974e-05,
250
+ "loss": 1.0838,
251
+ "step": 3900
252
  },
253
  {
254
  "epoch": 1.24,
255
+ "learning_rate": 7.523986381925101e-05,
256
+ "loss": 1.0977,
257
+ "step": 4000
258
+ },
259
+ {
260
+ "epoch": 1.27,
261
+ "learning_rate": 7.462086041473229e-05,
262
+ "loss": 1.0832,
263
+ "step": 4100
264
+ },
265
+ {
266
+ "epoch": 1.3,
267
+ "learning_rate": 7.400185701021357e-05,
268
+ "loss": 1.074,
269
+ "step": 4200
270
+ },
271
+ {
272
+ "epoch": 1.33,
273
+ "learning_rate": 7.338285360569483e-05,
274
+ "loss": 1.0835,
275
+ "step": 4300
276
  },
277
  {
278
  "epoch": 1.36,
279
+ "learning_rate": 7.276385020117611e-05,
280
+ "loss": 1.0725,
281
+ "step": 4400
282
+ },
283
+ {
284
+ "epoch": 1.39,
285
+ "learning_rate": 7.214484679665738e-05,
286
+ "loss": 1.0854,
287
+ "step": 4500
288
+ },
289
+ {
290
+ "epoch": 1.42,
291
+ "learning_rate": 7.152584339213866e-05,
292
+ "loss": 1.0879,
293
+ "step": 4600
294
+ },
295
+ {
296
+ "epoch": 1.45,
297
+ "learning_rate": 7.090683998761994e-05,
298
+ "loss": 1.0916,
299
+ "step": 4700
300
  },
301
  {
302
  "epoch": 1.49,
303
+ "learning_rate": 7.02878365831012e-05,
304
+ "loss": 1.0758,
305
+ "step": 4800
306
+ },
307
+ {
308
+ "epoch": 1.52,
309
+ "learning_rate": 6.966883317858248e-05,
310
+ "loss": 1.0953,
311
+ "step": 4900
312
+ },
313
+ {
314
+ "epoch": 1.55,
315
+ "learning_rate": 6.904982977406376e-05,
316
+ "loss": 1.0733,
317
+ "step": 5000
318
+ },
319
+ {
320
+ "epoch": 1.58,
321
+ "learning_rate": 6.843082636954502e-05,
322
+ "loss": 1.0782,
323
+ "step": 5100
324
  },
325
  {
326
  "epoch": 1.61,
327
+ "learning_rate": 6.78118229650263e-05,
328
+ "loss": 1.0785,
329
+ "step": 5200
330
+ },
331
+ {
332
+ "epoch": 1.64,
333
+ "learning_rate": 6.719281956050758e-05,
334
+ "loss": 1.0708,
335
+ "step": 5300
336
+ },
337
+ {
338
+ "epoch": 1.67,
339
+ "learning_rate": 6.657381615598886e-05,
340
+ "loss": 1.0541,
341
+ "step": 5400
342
+ },
343
+ {
344
+ "epoch": 1.7,
345
+ "learning_rate": 6.595481275147014e-05,
346
+ "loss": 1.0629,
347
+ "step": 5500
348
  },
349
  {
350
  "epoch": 1.73,
351
+ "learning_rate": 6.533580934695142e-05,
352
+ "loss": 1.0707,
353
+ "step": 5600
354
+ },
355
+ {
356
+ "epoch": 1.76,
357
+ "learning_rate": 6.471680594243269e-05,
358
+ "loss": 1.077,
359
+ "step": 5700
360
+ },
361
+ {
362
+ "epoch": 1.79,
363
+ "learning_rate": 6.409780253791397e-05,
364
+ "loss": 1.0926,
365
+ "step": 5800
366
+ },
367
+ {
368
+ "epoch": 1.83,
369
+ "learning_rate": 6.347879913339524e-05,
370
+ "loss": 1.062,
371
+ "step": 5900
372
  },
373
  {
374
  "epoch": 1.86,
375
+ "learning_rate": 6.285979572887651e-05,
376
+ "loss": 1.079,
377
+ "step": 6000
378
+ },
379
+ {
380
+ "epoch": 1.89,
381
+ "learning_rate": 6.224079232435779e-05,
382
+ "loss": 1.065,
383
+ "step": 6100
384
+ },
385
+ {
386
+ "epoch": 1.92,
387
+ "learning_rate": 6.162178891983907e-05,
388
+ "loss": 1.0766,
389
+ "step": 6200
390
+ },
391
+ {
392
+ "epoch": 1.95,
393
+ "learning_rate": 6.100278551532034e-05,
394
+ "loss": 1.0814,
395
+ "step": 6300
396
  },
397
  {
398
  "epoch": 1.98,
399
+ "learning_rate": 6.038378211080161e-05,
400
+ "loss": 1.0645,
401
+ "step": 6400
402
  },
403
  {
404
  "epoch": 2.0,
405
+ "eval_loss": 1.2034717798233032,
406
+ "eval_runtime": 100.7994,
407
+ "eval_samples_per_second": 35.566,
408
+ "eval_steps_per_second": 8.899,
409
+ "step": 6463
410
+ },
411
+ {
412
+ "epoch": 2.01,
413
+ "learning_rate": 5.9764778706282886e-05,
414
+ "loss": 1.0357,
415
+ "step": 6500
416
+ },
417
+ {
418
+ "epoch": 2.04,
419
+ "learning_rate": 5.914577530176416e-05,
420
+ "loss": 0.9951,
421
+ "step": 6600
422
+ },
423
+ {
424
+ "epoch": 2.07,
425
+ "learning_rate": 5.852677189724544e-05,
426
+ "loss": 0.9866,
427
+ "step": 6700
428
  },
429
  {
430
  "epoch": 2.1,
431
+ "learning_rate": 5.790776849272671e-05,
432
+ "loss": 0.9925,
433
+ "step": 6800
434
+ },
435
+ {
436
+ "epoch": 2.14,
437
+ "learning_rate": 5.728876508820798e-05,
438
+ "loss": 1.006,
439
+ "step": 6900
440
+ },
441
+ {
442
+ "epoch": 2.17,
443
+ "learning_rate": 5.666976168368926e-05,
444
+ "loss": 1.0164,
445
+ "step": 7000
446
+ },
447
+ {
448
+ "epoch": 2.2,
449
+ "learning_rate": 5.6050758279170534e-05,
450
+ "loss": 1.0098,
451
+ "step": 7100
452
  },
453
  {
454
  "epoch": 2.23,
455
+ "learning_rate": 5.5431754874651806e-05,
456
+ "loss": 0.9917,
457
+ "step": 7200
458
+ },
459
+ {
460
+ "epoch": 2.26,
461
+ "learning_rate": 5.4812751470133085e-05,
462
+ "loss": 0.9891,
463
+ "step": 7300
464
+ },
465
+ {
466
+ "epoch": 2.29,
467
+ "learning_rate": 5.419374806561436e-05,
468
+ "loss": 0.9932,
469
+ "step": 7400
470
+ },
471
+ {
472
+ "epoch": 2.32,
473
+ "learning_rate": 5.357474466109564e-05,
474
+ "loss": 0.9881,
475
+ "step": 7500
476
  },
477
  {
478
  "epoch": 2.35,
479
+ "learning_rate": 5.2955741256576916e-05,
480
+ "loss": 1.0001,
481
+ "step": 7600
482
+ },
483
+ {
484
+ "epoch": 2.38,
485
+ "learning_rate": 5.2336737852058195e-05,
486
+ "loss": 0.9836,
487
+ "step": 7700
488
+ },
489
+ {
490
+ "epoch": 2.41,
491
+ "learning_rate": 5.171773444753947e-05,
492
+ "loss": 0.9965,
493
+ "step": 7800
494
+ },
495
+ {
496
+ "epoch": 2.44,
497
+ "learning_rate": 5.109873104302074e-05,
498
+ "loss": 1.0102,
499
+ "step": 7900
500
  },
501
  {
502
  "epoch": 2.48,
503
+ "learning_rate": 5.047972763850202e-05,
504
+ "loss": 1.0023,
505
+ "step": 8000
506
+ },
507
+ {
508
+ "epoch": 2.51,
509
+ "learning_rate": 4.986072423398329e-05,
510
+ "loss": 1.0118,
511
+ "step": 8100
512
+ },
513
+ {
514
+ "epoch": 2.54,
515
+ "learning_rate": 4.9241720829464564e-05,
516
+ "loss": 0.9785,
517
+ "step": 8200
518
+ },
519
+ {
520
+ "epoch": 2.57,
521
+ "learning_rate": 4.862271742494584e-05,
522
+ "loss": 1.0105,
523
+ "step": 8300
524
  },
525
  {
526
  "epoch": 2.6,
527
+ "learning_rate": 4.8003714020427115e-05,
528
+ "loss": 0.998,
529
+ "step": 8400
530
+ },
531
+ {
532
+ "epoch": 2.63,
533
+ "learning_rate": 4.738471061590839e-05,
534
+ "loss": 0.9986,
535
+ "step": 8500
536
+ },
537
+ {
538
+ "epoch": 2.66,
539
+ "learning_rate": 4.676570721138967e-05,
540
+ "loss": 0.9938,
541
+ "step": 8600
542
+ },
543
+ {
544
+ "epoch": 2.69,
545
+ "learning_rate": 4.614670380687094e-05,
546
+ "loss": 0.9939,
547
+ "step": 8700
548
  },
549
  {
550
  "epoch": 2.72,
551
+ "learning_rate": 4.552770040235221e-05,
552
+ "loss": 0.9846,
553
+ "step": 8800
554
+ },
555
+ {
556
+ "epoch": 2.75,
557
+ "learning_rate": 4.490869699783349e-05,
558
+ "loss": 1.0124,
559
+ "step": 8900
560
+ },
561
+ {
562
+ "epoch": 2.79,
563
+ "learning_rate": 4.428969359331476e-05,
564
+ "loss": 0.9964,
565
+ "step": 9000
566
+ },
567
+ {
568
+ "epoch": 2.82,
569
+ "learning_rate": 4.367069018879604e-05,
570
+ "loss": 1.0029,
571
+ "step": 9100
572
  },
573
  {
574
  "epoch": 2.85,
575
+ "learning_rate": 4.3051686784277315e-05,
576
+ "loss": 1.0148,
577
+ "step": 9200
578
+ },
579
+ {
580
+ "epoch": 2.88,
581
+ "learning_rate": 4.2432683379758594e-05,
582
+ "loss": 1.0039,
583
+ "step": 9300
584
+ },
585
+ {
586
+ "epoch": 2.91,
587
+ "learning_rate": 4.1813679975239866e-05,
588
+ "loss": 0.9917,
589
+ "step": 9400
590
+ },
591
+ {
592
+ "epoch": 2.94,
593
+ "learning_rate": 4.119467657072114e-05,
594
+ "loss": 1.0066,
595
+ "step": 9500
596
  },
597
  {
598
  "epoch": 2.97,
599
+ "learning_rate": 4.057567316620242e-05,
600
+ "loss": 0.991,
601
+ "step": 9600
602
  },
603
  {
604
  "epoch": 3.0,
605
+ "eval_loss": 1.1979700326919556,
606
+ "eval_runtime": 100.8034,
607
+ "eval_samples_per_second": 35.564,
608
+ "eval_steps_per_second": 8.899,
609
+ "step": 9694
610
+ },
611
+ {
612
+ "epoch": 3.0,
613
+ "learning_rate": 3.995666976168369e-05,
614
+ "loss": 0.9994,
615
+ "step": 9700
616
+ },
617
+ {
618
+ "epoch": 3.03,
619
+ "learning_rate": 3.933766635716496e-05,
620
+ "loss": 0.9555,
621
+ "step": 9800
622
+ },
623
+ {
624
+ "epoch": 3.06,
625
+ "learning_rate": 3.871866295264624e-05,
626
+ "loss": 0.9364,
627
+ "step": 9900
628
  },
629
  {
630
  "epoch": 3.09,
631
+ "learning_rate": 3.8099659548127514e-05,
632
+ "loss": 0.9392,
633
+ "step": 10000
634
+ },
635
+ {
636
+ "epoch": 3.13,
637
+ "learning_rate": 3.748065614360879e-05,
638
+ "loss": 0.9439,
639
+ "step": 10100
640
+ },
641
+ {
642
+ "epoch": 3.16,
643
+ "learning_rate": 3.686165273909007e-05,
644
+ "loss": 0.9474,
645
+ "step": 10200
646
+ },
647
+ {
648
+ "epoch": 3.19,
649
+ "learning_rate": 3.6242649334571345e-05,
650
+ "loss": 0.9298,
651
+ "step": 10300
652
  },
653
  {
654
  "epoch": 3.22,
655
+ "learning_rate": 3.562364593005262e-05,
656
+ "loss": 0.9245,
657
+ "step": 10400
658
+ },
659
+ {
660
+ "epoch": 3.25,
661
+ "learning_rate": 3.5004642525533896e-05,
662
+ "loss": 0.9724,
663
+ "step": 10500
664
+ },
665
+ {
666
+ "epoch": 3.28,
667
+ "learning_rate": 3.438563912101517e-05,
668
+ "loss": 0.9484,
669
+ "step": 10600
670
+ },
671
+ {
672
+ "epoch": 3.31,
673
+ "learning_rate": 3.376663571649644e-05,
674
+ "loss": 0.9376,
675
+ "step": 10700
676
  },
677
  {
678
  "epoch": 3.34,
679
+ "learning_rate": 3.314763231197771e-05,
680
+ "loss": 0.9549,
681
+ "step": 10800
682
+ },
683
+ {
684
+ "epoch": 3.37,
685
+ "learning_rate": 3.252862890745899e-05,
686
+ "loss": 0.957,
687
+ "step": 10900
688
+ },
689
+ {
690
+ "epoch": 3.4,
691
+ "learning_rate": 3.1909625502940265e-05,
692
+ "loss": 0.9509,
693
+ "step": 11000
694
+ },
695
+ {
696
+ "epoch": 3.43,
697
+ "learning_rate": 3.129062209842154e-05,
698
+ "loss": 0.9427,
699
+ "step": 11100
700
  },
701
  {
702
  "epoch": 3.47,
703
+ "learning_rate": 3.067161869390282e-05,
704
+ "loss": 0.9598,
705
+ "step": 11200
706
+ },
707
+ {
708
+ "epoch": 3.5,
709
+ "learning_rate": 3.0052615289384095e-05,
710
+ "loss": 0.9584,
711
+ "step": 11300
712
+ },
713
+ {
714
+ "epoch": 3.53,
715
+ "learning_rate": 2.943361188486537e-05,
716
+ "loss": 0.9544,
717
+ "step": 11400
718
+ },
719
+ {
720
+ "epoch": 3.56,
721
+ "learning_rate": 2.8814608480346644e-05,
722
+ "loss": 0.9561,
723
+ "step": 11500
724
  },
725
  {
726
  "epoch": 3.59,
727
+ "learning_rate": 2.819560507582792e-05,
728
+ "loss": 0.9374,
729
+ "step": 11600
730
+ },
731
+ {
732
+ "epoch": 3.62,
733
+ "learning_rate": 2.7576601671309192e-05,
734
+ "loss": 0.9236,
735
+ "step": 11700
736
+ },
737
+ {
738
+ "epoch": 3.65,
739
+ "learning_rate": 2.6957598266790468e-05,
740
+ "loss": 0.9434,
741
+ "step": 11800
742
+ },
743
+ {
744
+ "epoch": 3.68,
745
+ "learning_rate": 2.6338594862271743e-05,
746
+ "loss": 0.9363,
747
+ "step": 11900
748
  },
749
  {
750
  "epoch": 3.71,
751
+ "learning_rate": 2.5719591457753016e-05,
752
+ "loss": 0.9381,
753
+ "step": 12000
754
+ },
755
+ {
756
+ "epoch": 3.74,
757
+ "learning_rate": 2.510058805323429e-05,
758
+ "loss": 0.9535,
759
+ "step": 12100
760
+ },
761
+ {
762
+ "epoch": 3.78,
763
+ "learning_rate": 2.448158464871557e-05,
764
+ "loss": 0.9577,
765
+ "step": 12200
766
+ },
767
+ {
768
+ "epoch": 3.81,
769
+ "learning_rate": 2.3862581244196843e-05,
770
+ "loss": 0.9373,
771
+ "step": 12300
772
  },
773
  {
774
  "epoch": 3.84,
775
+ "learning_rate": 2.324357783967812e-05,
776
+ "loss": 0.9499,
777
+ "step": 12400
778
+ },
779
+ {
780
+ "epoch": 3.87,
781
+ "learning_rate": 2.2624574435159395e-05,
782
+ "loss": 0.9442,
783
+ "step": 12500
784
+ },
785
+ {
786
+ "epoch": 3.9,
787
+ "learning_rate": 2.200557103064067e-05,
788
+ "loss": 0.9419,
789
+ "step": 12600
790
+ },
791
+ {
792
+ "epoch": 3.93,
793
+ "learning_rate": 2.1386567626121946e-05,
794
+ "loss": 0.9509,
795
+ "step": 12700
796
  },
797
  {
798
  "epoch": 3.96,
799
+ "learning_rate": 2.076756422160322e-05,
800
+ "loss": 0.9562,
801
+ "step": 12800
802
+ },
803
+ {
804
+ "epoch": 3.99,
805
+ "learning_rate": 2.0148560817084494e-05,
806
+ "loss": 0.9459,
807
+ "step": 12900
808
  },
809
  {
810
  "epoch": 4.0,
811
+ "eval_loss": 1.2027287483215332,
812
+ "eval_runtime": 100.7364,
813
+ "eval_samples_per_second": 35.588,
814
+ "eval_steps_per_second": 8.904,
815
+ "step": 12926
816
+ },
817
+ {
818
+ "epoch": 4.02,
819
+ "learning_rate": 1.952955741256577e-05,
820
+ "loss": 0.9155,
821
+ "step": 13000
822
+ },
823
+ {
824
+ "epoch": 4.05,
825
+ "learning_rate": 1.8910554008047046e-05,
826
+ "loss": 0.9265,
827
+ "step": 13100
828
  },
829
  {
830
  "epoch": 4.08,
831
+ "learning_rate": 1.829155060352832e-05,
832
+ "loss": 0.9157,
833
+ "step": 13200
834
+ },
835
+ {
836
+ "epoch": 4.12,
837
+ "learning_rate": 1.7672547199009594e-05,
838
+ "loss": 0.9087,
839
+ "step": 13300
840
+ },
841
+ {
842
+ "epoch": 4.15,
843
+ "learning_rate": 1.705354379449087e-05,
844
+ "loss": 0.9044,
845
+ "step": 13400
846
+ },
847
+ {
848
+ "epoch": 4.18,
849
+ "learning_rate": 1.6434540389972145e-05,
850
+ "loss": 0.9176,
851
+ "step": 13500
852
  },
853
  {
854
  "epoch": 4.21,
855
+ "learning_rate": 1.581553698545342e-05,
856
+ "loss": 0.904,
857
+ "step": 13600
858
+ },
859
+ {
860
+ "epoch": 4.24,
861
+ "learning_rate": 1.5196533580934697e-05,
862
+ "loss": 0.9292,
863
+ "step": 13700
864
+ },
865
+ {
866
+ "epoch": 4.27,
867
+ "learning_rate": 1.4577530176415971e-05,
868
+ "loss": 0.9154,
869
+ "step": 13800
870
+ },
871
+ {
872
+ "epoch": 4.3,
873
+ "learning_rate": 1.3958526771897245e-05,
874
+ "loss": 0.9146,
875
+ "step": 13900
876
  },
877
  {
878
  "epoch": 4.33,
879
+ "learning_rate": 1.3339523367378521e-05,
880
+ "loss": 0.9293,
881
+ "step": 14000
882
+ },
883
+ {
884
+ "epoch": 4.36,
885
+ "learning_rate": 1.2720519962859798e-05,
886
+ "loss": 0.9019,
887
+ "step": 14100
888
+ },
889
+ {
890
+ "epoch": 4.39,
891
+ "learning_rate": 1.210151655834107e-05,
892
+ "loss": 0.902,
893
+ "step": 14200
894
+ },
895
+ {
896
+ "epoch": 4.43,
897
+ "learning_rate": 1.1482513153822347e-05,
898
+ "loss": 0.9085,
899
+ "step": 14300
900
  },
901
  {
902
  "epoch": 4.46,
903
+ "learning_rate": 1.086350974930362e-05,
904
+ "loss": 0.8968,
905
+ "step": 14400
906
+ },
907
+ {
908
+ "epoch": 4.49,
909
+ "learning_rate": 1.0244506344784898e-05,
910
+ "loss": 0.9086,
911
+ "step": 14500
912
+ },
913
+ {
914
+ "epoch": 4.52,
915
+ "learning_rate": 9.625502940266172e-06,
916
+ "loss": 0.9153,
917
+ "step": 14600
918
+ },
919
+ {
920
+ "epoch": 4.55,
921
+ "learning_rate": 9.006499535747446e-06,
922
+ "loss": 0.9088,
923
+ "step": 14700
924
  },
925
  {
926
  "epoch": 4.58,
927
+ "learning_rate": 8.387496131228722e-06,
928
+ "loss": 0.9106,
929
+ "step": 14800
930
+ },
931
+ {
932
+ "epoch": 4.61,
933
+ "learning_rate": 7.768492726709998e-06,
934
+ "loss": 0.9198,
935
+ "step": 14900
936
+ },
937
+ {
938
+ "epoch": 4.64,
939
+ "learning_rate": 7.149489322191272e-06,
940
+ "loss": 0.8956,
941
+ "step": 15000
942
+ },
943
+ {
944
+ "epoch": 4.67,
945
+ "learning_rate": 6.530485917672548e-06,
946
+ "loss": 0.8981,
947
+ "step": 15100
948
  },
949
  {
950
  "epoch": 4.7,
951
+ "learning_rate": 5.9114825131538225e-06,
952
+ "loss": 0.9142,
953
+ "step": 15200
954
+ },
955
+ {
956
+ "epoch": 4.73,
957
+ "learning_rate": 5.2924791086350974e-06,
958
+ "loss": 0.9095,
959
+ "step": 15300
960
+ },
961
+ {
962
+ "epoch": 4.77,
963
+ "learning_rate": 4.673475704116373e-06,
964
+ "loss": 0.922,
965
+ "step": 15400
966
+ },
967
+ {
968
+ "epoch": 4.8,
969
+ "learning_rate": 4.054472299597648e-06,
970
+ "loss": 0.9119,
971
+ "step": 15500
972
  },
973
  {
974
  "epoch": 4.83,
975
+ "learning_rate": 3.4354688950789226e-06,
976
+ "loss": 0.9313,
977
+ "step": 15600
978
+ },
979
+ {
980
+ "epoch": 4.86,
981
+ "learning_rate": 2.816465490560198e-06,
982
+ "loss": 0.9035,
983
+ "step": 15700
984
+ },
985
+ {
986
+ "epoch": 4.89,
987
+ "learning_rate": 2.1974620860414733e-06,
988
+ "loss": 0.9041,
989
+ "step": 15800
990
+ },
991
+ {
992
+ "epoch": 4.92,
993
+ "learning_rate": 1.5784586815227482e-06,
994
+ "loss": 0.9316,
995
+ "step": 15900
996
  },
997
  {
998
  "epoch": 4.95,
999
+ "learning_rate": 9.594552770040236e-07,
1000
+ "loss": 0.9313,
1001
+ "step": 16000
1002
+ },
1003
+ {
1004
+ "epoch": 4.98,
1005
+ "learning_rate": 3.404518724852987e-07,
1006
+ "loss": 0.9191,
1007
+ "step": 16100
1008
  },
1009
  {
1010
  "epoch": 5.0,
1011
+ "eval_loss": 1.2074109315872192,
1012
+ "eval_runtime": 100.8821,
1013
+ "eval_samples_per_second": 35.537,
1014
+ "eval_steps_per_second": 8.892,
1015
+ "step": 16155
1016
  },
1017
  {
1018
  "epoch": 5.0,
1019
+ "step": 16155,
1020
+ "total_flos": 1.574034355519488e+17,
1021
+ "train_loss": 1.0387100059157441,
1022
+ "train_runtime": 30224.2981,
1023
+ "train_samples_per_second": 8.553,
1024
+ "train_steps_per_second": 0.535
1025
  }
1026
  ],
1027
  "logging_steps": 100,
1028
+ "max_steps": 16155,
1029
  "num_train_epochs": 5,
1030
  "save_steps": 500,
1031
+ "total_flos": 1.574034355519488e+17,
1032
  "trial_name": null,
1033
  "trial_params": null
1034
  }