terry69 commited on
Commit
c7c0b30
1 Parent(s): 7cbe347

Model save

Browse files
README.md CHANGED
@@ -3,15 +3,9 @@ library_name: transformers
3
  license: other
4
  base_model: Qwen/Qwen1.5-7B-Chat
5
  tags:
6
- - alignment-handbook
7
  - trl
8
  - sft
9
  - generated_from_trainer
10
- - trl
11
- - sft
12
- - generated_from_trainer
13
- datasets:
14
- - preference-data
15
  model-index:
16
  - name: qwen_feedback
17
  results: []
@@ -22,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
22
 
23
  # qwen_feedback
24
 
25
- This model is a fine-tuned version of [Qwen/Qwen1.5-7B-Chat](https://huggingface.co/Qwen/Qwen1.5-7B-Chat) on the preference-data dataset.
26
 
27
  ## Model description
28
 
@@ -60,7 +54,7 @@ The following hyperparameters were used during training:
60
 
61
  ### Framework versions
62
 
63
- - Transformers 4.44.2
64
- - Pytorch 2.3.1+cu121
65
- - Datasets 2.19.1
66
- - Tokenizers 0.19.1
 
3
  license: other
4
  base_model: Qwen/Qwen1.5-7B-Chat
5
  tags:
 
6
  - trl
7
  - sft
8
  - generated_from_trainer
 
 
 
 
 
9
  model-index:
10
  - name: qwen_feedback
11
  results: []
 
16
 
17
  # qwen_feedback
18
 
19
+ This model is a fine-tuned version of [Qwen/Qwen1.5-7B-Chat](https://huggingface.co/Qwen/Qwen1.5-7B-Chat) on the None dataset.
20
 
21
  ## Model description
22
 
 
54
 
55
  ### Framework versions
56
 
57
+ - Transformers 4.45.1
58
+ - Pytorch 2.4.1+cu121
59
+ - Datasets 3.0.1
60
+ - Tokenizers 0.20.0
all_results.json CHANGED
@@ -1,13 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_runtime": 2.7767,
4
- "eval_samples": 10,
5
- "eval_samples_per_second": 3.601,
6
- "eval_steps_per_second": 1.08,
7
  "total_flos": 430711668473856.0,
8
- "train_loss": 0.6697713777575416,
9
- "train_runtime": 21206.7621,
10
  "train_samples": 98952,
11
- "train_samples_per_second": 1.253,
12
- "train_steps_per_second": 0.078
13
  }
 
1
  {
2
  "epoch": 1.0,
 
 
 
 
3
  "total_flos": 430711668473856.0,
4
+ "train_loss": 0.669738974236064,
5
+ "train_runtime": 16716.2181,
6
  "train_samples": 98952,
7
+ "train_samples_per_second": 1.59,
8
+ "train_steps_per_second": 0.099
9
  }
generation_config.json CHANGED
@@ -10,5 +10,5 @@
10
  "temperature": 0.7,
11
  "top_k": 20,
12
  "top_p": 0.8,
13
- "transformers_version": "4.44.2"
14
  }
 
10
  "temperature": 0.7,
11
  "top_k": 20,
12
  "top_p": 0.8,
13
+ "transformers_version": "4.45.1"
14
  }
runs/Sep30_00-59-37_COE-CS-sv003/events.out.tfevents.1727658063.COE-CS-sv003.1275764.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4173c61236c5a0006868d6dc382488d4d2c2f03e8ecac629407c2dc7183c9e94
3
- size 76126
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0bb502083473c9dca5785241acdc808c2e1cfe690930fb3631d064ff9e42585
3
+ size 76480
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 430711668473856.0,
4
- "train_loss": 0.6697713777575416,
5
- "train_runtime": 21206.7621,
6
  "train_samples": 98952,
7
- "train_samples_per_second": 1.253,
8
- "train_steps_per_second": 0.078
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 430711668473856.0,
4
+ "train_loss": 0.669738974236064,
5
+ "train_runtime": 16716.2181,
6
  "train_samples": 98952,
7
+ "train_samples_per_second": 1.59,
8
+ "train_steps_per_second": 0.099
9
  }
trainer_state.json CHANGED
@@ -10,2350 +10,2350 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0006020469596628537,
13
- "grad_norm": 41.92205807920962,
14
  "learning_rate": 5.98802395209581e-08,
15
- "loss": 2.0696,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.0030102347983142685,
20
- "grad_norm": 41.95143902657454,
21
  "learning_rate": 2.9940119760479047e-07,
22
- "loss": 2.0656,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.006020469596628537,
27
- "grad_norm": 27.748468191825033,
28
  "learning_rate": 5.988023952095809e-07,
29
- "loss": 1.9946,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.009030704394942806,
34
- "grad_norm": 15.587842890269123,
35
  "learning_rate": 8.982035928143713e-07,
36
- "loss": 1.7623,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.012040939193257074,
41
- "grad_norm": 5.238640819714065,
42
  "learning_rate": 1.1976047904191619e-06,
43
- "loss": 1.5059,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.015051173991571343,
48
- "grad_norm": 4.531923782640025,
49
  "learning_rate": 1.4970059880239521e-06,
50
- "loss": 1.3322,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.018061408789885613,
55
- "grad_norm": 2.9213704309561614,
56
  "learning_rate": 1.7964071856287426e-06,
57
- "loss": 1.255,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.02107164358819988,
62
- "grad_norm": 1.9235170352981705,
63
  "learning_rate": 2.095808383233533e-06,
64
- "loss": 1.1734,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.024081878386514148,
69
- "grad_norm": 1.4658856095298773,
70
  "learning_rate": 2.3952095808383237e-06,
71
- "loss": 1.1022,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.027092113184828417,
76
- "grad_norm": 1.4116919266440267,
77
  "learning_rate": 2.694610778443114e-06,
78
- "loss": 1.0786,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.030102347983142687,
83
- "grad_norm": 1.2856710577917903,
84
  "learning_rate": 2.9940119760479042e-06,
85
- "loss": 1.0395,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.033112582781456956,
90
- "grad_norm": 1.191081117611533,
91
  "learning_rate": 3.2934131736526947e-06,
92
- "loss": 1.0493,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.036122817579771226,
97
- "grad_norm": 1.2407336303580925,
98
  "learning_rate": 3.592814371257485e-06,
99
- "loss": 1.0163,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.03913305237808549,
104
- "grad_norm": 1.2517904434964362,
105
  "learning_rate": 3.892215568862276e-06,
106
- "loss": 1.0031,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.04214328717639976,
111
- "grad_norm": 1.1898573918091981,
112
  "learning_rate": 4.191616766467066e-06,
113
- "loss": 0.9945,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.04515352197471403,
118
- "grad_norm": 1.2209003810778682,
119
  "learning_rate": 4.4910179640718566e-06,
120
- "loss": 0.9881,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.048163756773028296,
125
- "grad_norm": 1.2515970343619347,
126
  "learning_rate": 4.7904191616766475e-06,
127
- "loss": 0.9797,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.051173991571342566,
132
- "grad_norm": 1.192783734551512,
133
  "learning_rate": 5.0898203592814375e-06,
134
- "loss": 0.9627,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.054184226369656835,
139
- "grad_norm": 1.223525320981026,
140
  "learning_rate": 5.389221556886228e-06,
141
- "loss": 0.9511,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.057194461167971104,
146
- "grad_norm": 1.2516482707774526,
147
  "learning_rate": 5.6886227544910184e-06,
148
- "loss": 0.9366,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.060204695966285374,
153
- "grad_norm": 1.2648194370111776,
154
  "learning_rate": 5.9880239520958085e-06,
155
- "loss": 0.942,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.06321493076459964,
160
- "grad_norm": 1.301961795790511,
161
  "learning_rate": 6.2874251497005985e-06,
162
- "loss": 0.9239,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.06622516556291391,
167
- "grad_norm": 1.318910357881307,
168
  "learning_rate": 6.586826347305389e-06,
169
- "loss": 0.9295,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.06923540036122817,
174
- "grad_norm": 1.4146859865499117,
175
  "learning_rate": 6.88622754491018e-06,
176
- "loss": 0.9184,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.07224563515954245,
181
- "grad_norm": 1.3765402810382459,
182
  "learning_rate": 7.18562874251497e-06,
183
- "loss": 0.9104,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.07525586995785671,
188
- "grad_norm": 1.329497309221002,
189
  "learning_rate": 7.485029940119761e-06,
190
- "loss": 0.8829,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.07826610475617098,
195
- "grad_norm": 1.2582931017325312,
196
  "learning_rate": 7.784431137724551e-06,
197
- "loss": 0.8772,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.08127633955448525,
202
- "grad_norm": 1.2761906563557504,
203
  "learning_rate": 8.083832335329342e-06,
204
- "loss": 0.8863,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.08428657435279951,
209
- "grad_norm": 1.305371614001374,
210
  "learning_rate": 8.383233532934131e-06,
211
- "loss": 0.8736,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.08729680915111379,
216
- "grad_norm": 1.2522121501580266,
217
  "learning_rate": 8.682634730538922e-06,
218
- "loss": 0.87,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.09030704394942805,
223
- "grad_norm": 1.2869728645207454,
224
  "learning_rate": 8.982035928143713e-06,
225
- "loss": 0.8669,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.09331727874774233,
230
- "grad_norm": 1.2991051636455195,
231
  "learning_rate": 9.281437125748504e-06,
232
- "loss": 0.8576,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.09632751354605659,
237
- "grad_norm": 1.2786814630152046,
238
  "learning_rate": 9.580838323353295e-06,
239
- "loss": 0.8496,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.09933774834437085,
244
- "grad_norm": 1.2277883256106235,
245
  "learning_rate": 9.880239520958084e-06,
246
- "loss": 0.8393,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.10234798314268513,
251
- "grad_norm": 1.2307747824783823,
252
  "learning_rate": 9.999900509954779e-06,
253
- "loss": 0.8226,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.1053582179409994,
258
- "grad_norm": 1.2949808017913165,
259
  "learning_rate": 9.999292529572152e-06,
260
- "loss": 0.8414,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.10836845273931367,
265
- "grad_norm": 1.3249580865142179,
266
  "learning_rate": 9.998131908181262e-06,
267
- "loss": 0.8137,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.11137868753762793,
272
- "grad_norm": 1.371692604105339,
273
  "learning_rate": 9.996418774081658e-06,
274
- "loss": 0.8237,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.11438892233594221,
279
- "grad_norm": 1.2829546119130226,
280
  "learning_rate": 9.994153316649769e-06,
281
- "loss": 0.8205,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.11739915713425647,
286
- "grad_norm": 1.342998197089531,
287
  "learning_rate": 9.991335786317964e-06,
288
- "loss": 0.8239,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.12040939193257075,
293
- "grad_norm": 1.2976871743628908,
294
  "learning_rate": 9.987966494546873e-06,
295
- "loss": 0.8146,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.12341962673088501,
300
- "grad_norm": 1.2301463331443308,
301
  "learning_rate": 9.984045813790959e-06,
302
- "loss": 0.8283,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.12642986152919927,
307
- "grad_norm": 1.2412905970181487,
308
  "learning_rate": 9.979574177457337e-06,
309
- "loss": 0.8124,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.12944009632751355,
314
- "grad_norm": 1.283058093189711,
315
  "learning_rate": 9.974552079857873e-06,
316
- "loss": 0.8084,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.13245033112582782,
321
- "grad_norm": 1.2963566163720668,
322
  "learning_rate": 9.968980076154533e-06,
323
- "loss": 0.8188,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.13546056592414207,
328
- "grad_norm": 1.307056994934901,
329
  "learning_rate": 9.962858782298023e-06,
330
- "loss": 0.8244,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.13847080072245635,
335
- "grad_norm": 1.2138422771404471,
336
  "learning_rate": 9.956188874959686e-06,
337
- "loss": 0.8046,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.14148103552077063,
342
- "grad_norm": 1.24733957721358,
343
  "learning_rate": 9.948971091456715e-06,
344
- "loss": 0.8058,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.1444912703190849,
349
- "grad_norm": 1.3355706431877798,
350
  "learning_rate": 9.941206229670634e-06,
351
- "loss": 0.8117,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.14750150511739915,
356
- "grad_norm": 1.2561128971062672,
357
  "learning_rate": 9.932895147959106e-06,
358
- "loss": 0.798,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.15051173991571343,
363
- "grad_norm": 1.270591111220033,
364
  "learning_rate": 9.924038765061042e-06,
365
- "loss": 0.7829,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.1535219747140277,
370
- "grad_norm": 1.3107584214302355,
371
  "learning_rate": 9.91463805999504e-06,
372
- "loss": 0.7941,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.15653220951234195,
377
- "grad_norm": 1.2232660141148983,
378
  "learning_rate": 9.904694071951167e-06,
379
- "loss": 0.792,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.15954244431065623,
384
- "grad_norm": 1.2409348272552818,
385
  "learning_rate": 9.894207900176074e-06,
386
- "loss": 0.7951,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.1625526791089705,
391
- "grad_norm": 1.278980886871779,
392
  "learning_rate": 9.883180703851488e-06,
393
- "loss": 0.7945,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.16556291390728478,
398
- "grad_norm": 1.291936431929788,
399
  "learning_rate": 9.871613701966067e-06,
400
- "loss": 0.8016,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.16857314870559903,
405
- "grad_norm": 1.2668111499149215,
406
  "learning_rate": 9.859508173180653e-06,
407
- "loss": 0.7926,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.1715833835039133,
412
- "grad_norm": 1.250463723228362,
413
  "learning_rate": 9.846865455686915e-06,
414
- "loss": 0.7879,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.17459361830222758,
419
- "grad_norm": 1.3112862736174495,
420
  "learning_rate": 9.833686947059436e-06,
421
- "loss": 0.7929,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.17760385310054183,
426
- "grad_norm": 1.1820374232575113,
427
  "learning_rate": 9.819974104101198e-06,
428
- "loss": 0.7912,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.1806140878988561,
433
- "grad_norm": 1.1994367974309528,
434
  "learning_rate": 9.80572844268256e-06,
435
- "loss": 0.7709,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.18362432269717038,
440
- "grad_norm": 1.2106571499448653,
441
  "learning_rate": 9.790951537573686e-06,
442
  "loss": 0.7819,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.18663455749548466,
447
- "grad_norm": 1.2411822431871422,
448
  "learning_rate": 9.775645022270448e-06,
449
- "loss": 0.7798,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.1896447922937989,
454
- "grad_norm": 1.2173832414288233,
455
  "learning_rate": 9.759810588813872e-06,
456
- "loss": 0.7698,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.19265502709211318,
461
- "grad_norm": 1.206540724864329,
462
  "learning_rate": 9.743449987603082e-06,
463
- "loss": 0.7721,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.19566526189042746,
468
- "grad_norm": 1.211521580157664,
469
  "learning_rate": 9.726565027201813e-06,
470
- "loss": 0.7873,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.1986754966887417,
475
- "grad_norm": 1.19240172696107,
476
  "learning_rate": 9.70915757413847e-06,
477
- "loss": 0.7699,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.20168573148705599,
482
- "grad_norm": 1.1585014114964425,
483
  "learning_rate": 9.691229552699817e-06,
484
- "loss": 0.7662,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.20469596628537026,
489
- "grad_norm": 1.2388838685746641,
490
  "learning_rate": 9.672782944718234e-06,
491
- "loss": 0.7721,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.20770620108368454,
496
- "grad_norm": 1.1676447847746696,
497
  "learning_rate": 9.65381978935266e-06,
498
- "loss": 0.7926,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.2107164358819988,
503
- "grad_norm": 1.1880515427609326,
504
  "learning_rate": 9.634342182863163e-06,
505
- "loss": 0.7666,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.21372667068031306,
510
- "grad_norm": 1.3394374470348678,
511
  "learning_rate": 9.614352278379217e-06,
512
- "loss": 0.7751,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.21673690547862734,
517
- "grad_norm": 1.2148169383488967,
518
  "learning_rate": 9.593852285661684e-06,
519
- "loss": 0.7736,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.2197471402769416,
524
- "grad_norm": 1.2418535249704257,
525
  "learning_rate": 9.572844470858537e-06,
526
- "loss": 0.7667,
527
  "step": 365
528
  },
529
  {
530
  "epoch": 0.22275737507525586,
531
- "grad_norm": 1.1836007116668403,
532
  "learning_rate": 9.551331156254358e-06,
533
- "loss": 0.7587,
534
  "step": 370
535
  },
536
  {
537
  "epoch": 0.22576760987357014,
538
- "grad_norm": 1.2576722953575195,
539
  "learning_rate": 9.529314720013618e-06,
540
- "loss": 0.7649,
541
  "step": 375
542
  },
543
  {
544
  "epoch": 0.22877784467188442,
545
- "grad_norm": 1.3127260473166227,
546
  "learning_rate": 9.506797595917787e-06,
547
- "loss": 0.7556,
548
  "step": 380
549
  },
550
  {
551
  "epoch": 0.23178807947019867,
552
- "grad_norm": 1.2123856962626416,
553
  "learning_rate": 9.483782273096295e-06,
554
- "loss": 0.749,
555
  "step": 385
556
  },
557
  {
558
  "epoch": 0.23479831426851294,
559
- "grad_norm": 1.2872889149850437,
560
  "learning_rate": 9.460271295751373e-06,
561
- "loss": 0.7574,
562
  "step": 390
563
  },
564
  {
565
  "epoch": 0.23780854906682722,
566
- "grad_norm": 1.263319415183585,
567
  "learning_rate": 9.436267262876808e-06,
568
- "loss": 0.7587,
569
  "step": 395
570
  },
571
  {
572
  "epoch": 0.2408187838651415,
573
- "grad_norm": 1.1849312644592684,
574
  "learning_rate": 9.411772827970642e-06,
575
- "loss": 0.7326,
576
  "step": 400
577
  },
578
  {
579
  "epoch": 0.24382901866345574,
580
- "grad_norm": 1.1766218556431003,
581
  "learning_rate": 9.38679069874184e-06,
582
- "loss": 0.754,
583
  "step": 405
584
  },
585
  {
586
  "epoch": 0.24683925346177002,
587
- "grad_norm": 1.290526908514049,
588
  "learning_rate": 9.36132363681097e-06,
589
- "loss": 0.7492,
590
  "step": 410
591
  },
592
  {
593
  "epoch": 0.2498494882600843,
594
- "grad_norm": 1.1826527637981796,
595
  "learning_rate": 9.335374457404928e-06,
596
- "loss": 0.7458,
597
  "step": 415
598
  },
599
  {
600
  "epoch": 0.25285972305839854,
601
- "grad_norm": 1.178690042871564,
602
  "learning_rate": 9.308946029045726e-06,
603
- "loss": 0.7409,
604
  "step": 420
605
  },
606
  {
607
  "epoch": 0.25586995785671285,
608
- "grad_norm": 1.2484570253176477,
609
  "learning_rate": 9.282041273233402e-06,
610
- "loss": 0.7417,
611
  "step": 425
612
  },
613
  {
614
  "epoch": 0.2588801926550271,
615
- "grad_norm": 1.273386773114662,
616
  "learning_rate": 9.254663164123052e-06,
617
- "loss": 0.7499,
618
  "step": 430
619
  },
620
  {
621
  "epoch": 0.26189042745334135,
622
- "grad_norm": 1.1934961011332912,
623
  "learning_rate": 9.226814728196072e-06,
624
- "loss": 0.7149,
625
  "step": 435
626
  },
627
  {
628
  "epoch": 0.26490066225165565,
629
- "grad_norm": 1.3353496110689795,
630
  "learning_rate": 9.198499043925591e-06,
631
- "loss": 0.7453,
632
  "step": 440
633
  },
634
  {
635
  "epoch": 0.2679108970499699,
636
- "grad_norm": 1.2809286469157408,
637
  "learning_rate": 9.169719241436162e-06,
638
- "loss": 0.7326,
639
  "step": 445
640
  },
641
  {
642
  "epoch": 0.27092113184828415,
643
- "grad_norm": 1.3365855596712244,
644
  "learning_rate": 9.14047850215775e-06,
645
- "loss": 0.7415,
646
  "step": 450
647
  },
648
  {
649
  "epoch": 0.27393136664659845,
650
- "grad_norm": 1.2482839002741117,
651
  "learning_rate": 9.110780058474052e-06,
652
- "loss": 0.7363,
653
  "step": 455
654
  },
655
  {
656
  "epoch": 0.2769416014449127,
657
- "grad_norm": 1.193302574955275,
658
  "learning_rate": 9.080627193365155e-06,
659
- "loss": 0.7354,
660
  "step": 460
661
  },
662
  {
663
  "epoch": 0.27995183624322695,
664
- "grad_norm": 1.1845235529692022,
665
  "learning_rate": 9.050023240044649e-06,
666
- "loss": 0.7323,
667
  "step": 465
668
  },
669
  {
670
  "epoch": 0.28296207104154125,
671
- "grad_norm": 1.1721388192198845,
672
  "learning_rate": 9.018971581591141e-06,
673
- "loss": 0.7375,
674
  "step": 470
675
  },
676
  {
677
  "epoch": 0.2859723058398555,
678
- "grad_norm": 1.2075020325950543,
679
  "learning_rate": 8.987475650574289e-06,
680
- "loss": 0.731,
681
  "step": 475
682
  },
683
  {
684
  "epoch": 0.2889825406381698,
685
- "grad_norm": 1.328392371779474,
686
  "learning_rate": 8.955538928675343e-06,
687
- "loss": 0.7232,
688
  "step": 480
689
  },
690
  {
691
  "epoch": 0.29199277543648405,
692
- "grad_norm": 1.1739151729528425,
693
  "learning_rate": 8.923164946302274e-06,
694
- "loss": 0.7321,
695
  "step": 485
696
  },
697
  {
698
  "epoch": 0.2950030102347983,
699
- "grad_norm": 1.1939351821984825,
700
  "learning_rate": 8.890357282199504e-06,
701
- "loss": 0.7252,
702
  "step": 490
703
  },
704
  {
705
  "epoch": 0.2980132450331126,
706
- "grad_norm": 1.2277194033736811,
707
  "learning_rate": 8.857119563052301e-06,
708
- "loss": 0.7282,
709
  "step": 495
710
  },
711
  {
712
  "epoch": 0.30102347983142685,
713
- "grad_norm": 1.2708327611399552,
714
  "learning_rate": 8.823455463085873e-06,
715
- "loss": 0.6969,
716
  "step": 500
717
  },
718
  {
719
  "epoch": 0.3040337146297411,
720
- "grad_norm": 1.2115563428539284,
721
  "learning_rate": 8.789368703659199e-06,
722
- "loss": 0.7158,
723
  "step": 505
724
  },
725
  {
726
  "epoch": 0.3070439494280554,
727
- "grad_norm": 1.1978355348696688,
728
  "learning_rate": 8.754863052853658e-06,
729
- "loss": 0.7171,
730
  "step": 510
731
  },
732
  {
733
  "epoch": 0.31005418422636966,
734
- "grad_norm": 1.3124481912397998,
735
  "learning_rate": 8.719942325056496e-06,
736
- "loss": 0.7246,
737
  "step": 515
738
  },
739
  {
740
  "epoch": 0.3130644190246839,
741
- "grad_norm": 1.2314941185008341,
742
  "learning_rate": 8.68461038053916e-06,
743
- "loss": 0.7263,
744
  "step": 520
745
  },
746
  {
747
  "epoch": 0.3160746538229982,
748
- "grad_norm": 1.224013137534039,
749
  "learning_rate": 8.648871125030576e-06,
750
- "loss": 0.7083,
751
  "step": 525
752
  },
753
  {
754
  "epoch": 0.31908488862131246,
755
- "grad_norm": 1.227965310988214,
756
  "learning_rate": 8.612728509285395e-06,
757
- "loss": 0.7095,
758
  "step": 530
759
  },
760
  {
761
  "epoch": 0.3220951234196267,
762
- "grad_norm": 1.1959993758173042,
763
  "learning_rate": 8.576186528647253e-06,
764
- "loss": 0.694,
765
  "step": 535
766
  },
767
  {
768
  "epoch": 0.325105358217941,
769
- "grad_norm": 1.2534772083993118,
770
  "learning_rate": 8.53924922260712e-06,
771
- "loss": 0.7168,
772
  "step": 540
773
  },
774
  {
775
  "epoch": 0.32811559301625526,
776
- "grad_norm": 1.2274144164892773,
777
  "learning_rate": 8.501920674356755e-06,
778
- "loss": 0.7198,
779
  "step": 545
780
  },
781
  {
782
  "epoch": 0.33112582781456956,
783
- "grad_norm": 1.210111783105508,
784
  "learning_rate": 8.46420501033733e-06,
785
- "loss": 0.7109,
786
  "step": 550
787
  },
788
  {
789
  "epoch": 0.3341360626128838,
790
- "grad_norm": 1.2312597757641623,
791
  "learning_rate": 8.42610639978329e-06,
792
- "loss": 0.699,
793
  "step": 555
794
  },
795
  {
796
  "epoch": 0.33714629741119806,
797
- "grad_norm": 1.1910413172047565,
798
  "learning_rate": 8.387629054261454e-06,
799
- "loss": 0.7048,
800
  "step": 560
801
  },
802
  {
803
  "epoch": 0.34015653220951236,
804
- "grad_norm": 1.2182074145817383,
805
  "learning_rate": 8.348777227205462e-06,
806
- "loss": 0.7089,
807
  "step": 565
808
  },
809
  {
810
  "epoch": 0.3431667670078266,
811
- "grad_norm": 1.22978245210155,
812
  "learning_rate": 8.309555213445583e-06,
813
- "loss": 0.6895,
814
  "step": 570
815
  },
816
  {
817
  "epoch": 0.34617700180614086,
818
- "grad_norm": 1.2174592755837728,
819
  "learning_rate": 8.269967348733947e-06,
820
- "loss": 0.6912,
821
  "step": 575
822
  },
823
  {
824
  "epoch": 0.34918723660445516,
825
- "grad_norm": 1.1889473916563937,
826
  "learning_rate": 8.230018009265255e-06,
827
- "loss": 0.7013,
828
  "step": 580
829
  },
830
  {
831
  "epoch": 0.3521974714027694,
832
- "grad_norm": 1.236378472808724,
833
  "learning_rate": 8.189711611193012e-06,
834
- "loss": 0.6798,
835
  "step": 585
836
  },
837
  {
838
  "epoch": 0.35520770620108366,
839
- "grad_norm": 1.2136777205264142,
840
  "learning_rate": 8.149052610141357e-06,
841
- "loss": 0.6916,
842
  "step": 590
843
  },
844
  {
845
  "epoch": 0.35821794099939797,
846
- "grad_norm": 1.346765531962112,
847
  "learning_rate": 8.108045500712518e-06,
848
- "loss": 0.7018,
849
  "step": 595
850
  },
851
  {
852
  "epoch": 0.3612281757977122,
853
- "grad_norm": 1.3636064667620953,
854
  "learning_rate": 8.066694815989961e-06,
855
- "loss": 0.674,
856
  "step": 600
857
  },
858
  {
859
  "epoch": 0.36423841059602646,
860
- "grad_norm": 1.2275626464611498,
861
  "learning_rate": 8.025005127037282e-06,
862
- "loss": 0.6954,
863
  "step": 605
864
  },
865
  {
866
  "epoch": 0.36724864539434077,
867
- "grad_norm": 1.262810130773417,
868
  "learning_rate": 7.982981042392907e-06,
869
- "loss": 0.6891,
870
  "step": 610
871
  },
872
  {
873
  "epoch": 0.370258880192655,
874
- "grad_norm": 1.3116340438880467,
875
  "learning_rate": 7.940627207560655e-06,
876
- "loss": 0.7057,
877
  "step": 615
878
  },
879
  {
880
  "epoch": 0.3732691149909693,
881
- "grad_norm": 1.2968792501944857,
882
  "learning_rate": 7.897948304496189e-06,
883
- "loss": 0.6816,
884
  "step": 620
885
  },
886
  {
887
  "epoch": 0.37627934978928357,
888
- "grad_norm": 1.3837518855998683,
889
  "learning_rate": 7.854949051089467e-06,
890
- "loss": 0.6991,
891
  "step": 625
892
  },
893
  {
894
  "epoch": 0.3792895845875978,
895
- "grad_norm": 1.2584131007142432,
896
  "learning_rate": 7.811634200643202e-06,
897
- "loss": 0.6839,
898
  "step": 630
899
  },
900
  {
901
  "epoch": 0.3822998193859121,
902
- "grad_norm": 1.179717893419111,
903
  "learning_rate": 7.768008541347423e-06,
904
- "loss": 0.6781,
905
  "step": 635
906
  },
907
  {
908
  "epoch": 0.38531005418422637,
909
- "grad_norm": 1.1579172841485028,
910
  "learning_rate": 7.72407689575016e-06,
911
- "loss": 0.6776,
912
  "step": 640
913
  },
914
  {
915
  "epoch": 0.3883202889825406,
916
- "grad_norm": 1.2457079092742271,
917
  "learning_rate": 7.67984412022434e-06,
918
- "loss": 0.6885,
919
  "step": 645
920
  },
921
  {
922
  "epoch": 0.3913305237808549,
923
- "grad_norm": 1.1752744144417815,
924
  "learning_rate": 7.635315104430959e-06,
925
- "loss": 0.6626,
926
  "step": 650
927
  },
928
  {
929
  "epoch": 0.39434075857916917,
930
- "grad_norm": 1.2508542445727817,
931
  "learning_rate": 7.5904947707785434e-06,
932
- "loss": 0.6722,
933
  "step": 655
934
  },
935
  {
936
  "epoch": 0.3973509933774834,
937
- "grad_norm": 1.2550002014037438,
938
  "learning_rate": 7.545388073879018e-06,
939
- "loss": 0.6921,
940
  "step": 660
941
  },
942
  {
943
  "epoch": 0.4003612281757977,
944
- "grad_norm": 1.1364322698328715,
945
  "learning_rate": 7.500000000000001e-06,
946
- "loss": 0.6504,
947
  "step": 665
948
  },
949
  {
950
  "epoch": 0.40337146297411197,
951
- "grad_norm": 1.223430718844965,
952
  "learning_rate": 7.454335566513603e-06,
953
- "loss": 0.6627,
954
  "step": 670
955
  },
956
  {
957
  "epoch": 0.4063816977724263,
958
- "grad_norm": 1.325519859064528,
959
  "learning_rate": 7.408399821341787e-06,
960
- "loss": 0.6832,
961
  "step": 675
962
  },
963
  {
964
  "epoch": 0.4093919325707405,
965
- "grad_norm": 1.2864653106496968,
966
  "learning_rate": 7.362197842398355e-06,
967
- "loss": 0.6766,
968
  "step": 680
969
  },
970
  {
971
  "epoch": 0.4124021673690548,
972
- "grad_norm": 1.1824581903822915,
973
  "learning_rate": 7.315734737027612e-06,
974
- "loss": 0.6656,
975
  "step": 685
976
  },
977
  {
978
  "epoch": 0.4154124021673691,
979
- "grad_norm": 1.2157872690010714,
980
  "learning_rate": 7.2690156414397775e-06,
981
- "loss": 0.6709,
982
  "step": 690
983
  },
984
  {
985
  "epoch": 0.4184226369656833,
986
- "grad_norm": 1.19882231252741,
987
  "learning_rate": 7.22204572014322e-06,
988
- "loss": 0.6603,
989
  "step": 695
990
  },
991
  {
992
  "epoch": 0.4214328717639976,
993
- "grad_norm": 1.2166274243887607,
994
  "learning_rate": 7.174830165373542e-06,
995
- "loss": 0.6511,
996
  "step": 700
997
  },
998
  {
999
  "epoch": 0.4244431065623119,
1000
- "grad_norm": 1.2085721212618408,
1001
  "learning_rate": 7.127374196519616e-06,
1002
- "loss": 0.6745,
1003
  "step": 705
1004
  },
1005
  {
1006
  "epoch": 0.4274533413606261,
1007
- "grad_norm": 1.2438528141488636,
1008
  "learning_rate": 7.079683059546607e-06,
1009
- "loss": 0.6691,
1010
  "step": 710
1011
  },
1012
  {
1013
  "epoch": 0.4304635761589404,
1014
- "grad_norm": 1.2395889341588784,
1015
  "learning_rate": 7.031762026416074e-06,
1016
- "loss": 0.664,
1017
  "step": 715
1018
  },
1019
  {
1020
  "epoch": 0.4334738109572547,
1021
- "grad_norm": 1.209939412026367,
1022
  "learning_rate": 6.983616394503177e-06,
1023
- "loss": 0.6545,
1024
  "step": 720
1025
  },
1026
  {
1027
  "epoch": 0.43648404575556893,
1028
- "grad_norm": 1.2085222788343624,
1029
  "learning_rate": 6.9352514860110876e-06,
1030
- "loss": 0.6652,
1031
  "step": 725
1032
  },
1033
  {
1034
  "epoch": 0.4394942805538832,
1035
- "grad_norm": 1.215874760750364,
1036
  "learning_rate": 6.886672647382653e-06,
1037
- "loss": 0.6506,
1038
  "step": 730
1039
  },
1040
  {
1041
  "epoch": 0.4425045153521975,
1042
- "grad_norm": 1.2270402546979953,
1043
  "learning_rate": 6.837885248709386e-06,
1044
- "loss": 0.6555,
1045
  "step": 735
1046
  },
1047
  {
1048
  "epoch": 0.44551475015051173,
1049
- "grad_norm": 1.168121733189555,
1050
  "learning_rate": 6.788894683137822e-06,
1051
- "loss": 0.6496,
1052
  "step": 740
1053
  },
1054
  {
1055
  "epoch": 0.44852498494882603,
1056
- "grad_norm": 1.2729494639482952,
1057
  "learning_rate": 6.739706366273346e-06,
1058
- "loss": 0.6744,
1059
  "step": 745
1060
  },
1061
  {
1062
  "epoch": 0.4515352197471403,
1063
- "grad_norm": 1.1910337115343361,
1064
  "learning_rate": 6.690325735581532e-06,
1065
- "loss": 0.6649,
1066
  "step": 750
1067
  },
1068
  {
1069
  "epoch": 0.45454545454545453,
1070
- "grad_norm": 1.2834186580973637,
1071
  "learning_rate": 6.640758249787067e-06,
1072
- "loss": 0.6288,
1073
  "step": 755
1074
  },
1075
  {
1076
  "epoch": 0.45755568934376883,
1077
- "grad_norm": 1.3482463281404071,
1078
  "learning_rate": 6.591009388270315e-06,
1079
- "loss": 0.638,
1080
  "step": 760
1081
  },
1082
  {
1083
  "epoch": 0.4605659241420831,
1084
- "grad_norm": 1.2245377747283543,
1085
  "learning_rate": 6.54108465046161e-06,
1086
- "loss": 0.6577,
1087
  "step": 765
1088
  },
1089
  {
1090
  "epoch": 0.46357615894039733,
1091
- "grad_norm": 1.2532400498054517,
1092
  "learning_rate": 6.490989555233328e-06,
1093
- "loss": 0.6398,
1094
  "step": 770
1095
  },
1096
  {
1097
  "epoch": 0.46658639373871164,
1098
- "grad_norm": 1.2133221218062722,
1099
  "learning_rate": 6.440729640289809e-06,
1100
- "loss": 0.6493,
1101
  "step": 775
1102
  },
1103
  {
1104
  "epoch": 0.4695966285370259,
1105
- "grad_norm": 1.2934721949521188,
1106
  "learning_rate": 6.3903104615551956e-06,
1107
- "loss": 0.6337,
1108
  "step": 780
1109
  },
1110
  {
1111
  "epoch": 0.47260686333534013,
1112
- "grad_norm": 1.208460074456768,
1113
  "learning_rate": 6.3397375925592675e-06,
1114
- "loss": 0.6426,
1115
  "step": 785
1116
  },
1117
  {
1118
  "epoch": 0.47561709813365444,
1119
- "grad_norm": 1.2643602971960362,
1120
  "learning_rate": 6.289016623821308e-06,
1121
- "loss": 0.6414,
1122
  "step": 790
1123
  },
1124
  {
1125
  "epoch": 0.4786273329319687,
1126
- "grad_norm": 1.199529441446071,
1127
  "learning_rate": 6.2381531622321234e-06,
1128
- "loss": 0.6487,
1129
  "step": 795
1130
  },
1131
  {
1132
  "epoch": 0.481637567730283,
1133
- "grad_norm": 1.2923378006389945,
1134
  "learning_rate": 6.18715283043422e-06,
1135
- "loss": 0.6242,
1136
  "step": 800
1137
  },
1138
  {
1139
  "epoch": 0.48464780252859724,
1140
- "grad_norm": 1.2138746939708784,
1141
  "learning_rate": 6.136021266200271e-06,
1142
- "loss": 0.6326,
1143
  "step": 805
1144
  },
1145
  {
1146
  "epoch": 0.4876580373269115,
1147
- "grad_norm": 1.261846103287385,
1148
  "learning_rate": 6.084764121809878e-06,
1149
- "loss": 0.6309,
1150
  "step": 810
1151
  },
1152
  {
1153
  "epoch": 0.4906682721252258,
1154
- "grad_norm": 1.2939948005814752,
1155
  "learning_rate": 6.033387063424765e-06,
1156
- "loss": 0.6252,
1157
  "step": 815
1158
  },
1159
  {
1160
  "epoch": 0.49367850692354004,
1161
- "grad_norm": 1.3038410279687773,
1162
  "learning_rate": 5.9818957704624046e-06,
1163
- "loss": 0.6335,
1164
  "step": 820
1165
  },
1166
  {
1167
  "epoch": 0.4966887417218543,
1168
- "grad_norm": 1.201247170646155,
1169
  "learning_rate": 5.930295934968197e-06,
1170
- "loss": 0.6262,
1171
  "step": 825
1172
  },
1173
  {
1174
  "epoch": 0.4996989765201686,
1175
- "grad_norm": 1.194976486878025,
1176
  "learning_rate": 5.878593260986256e-06,
1177
- "loss": 0.6341,
1178
  "step": 830
1179
  },
1180
  {
1181
  "epoch": 0.5027092113184829,
1182
- "grad_norm": 1.285380478781876,
1183
  "learning_rate": 5.8267934639288525e-06,
1184
- "loss": 0.6423,
1185
  "step": 835
1186
  },
1187
  {
1188
  "epoch": 0.5057194461167971,
1189
- "grad_norm": 1.2142854289892278,
1190
  "learning_rate": 5.77490226994462e-06,
1191
- "loss": 0.6217,
1192
  "step": 840
1193
  },
1194
  {
1195
  "epoch": 0.5087296809151114,
1196
- "grad_norm": 1.2387931529444611,
1197
  "learning_rate": 5.722925415285555e-06,
1198
- "loss": 0.6325,
1199
  "step": 845
1200
  },
1201
  {
1202
  "epoch": 0.5117399157134257,
1203
- "grad_norm": 1.166501906073654,
1204
  "learning_rate": 5.670868645672916e-06,
1205
- "loss": 0.6265,
1206
  "step": 850
1207
  },
1208
  {
1209
  "epoch": 0.5147501505117399,
1210
- "grad_norm": 1.2315216626331218,
1211
  "learning_rate": 5.618737715662067e-06,
1212
- "loss": 0.6163,
1213
  "step": 855
1214
  },
1215
  {
1216
  "epoch": 0.5177603853100542,
1217
- "grad_norm": 1.2402640957841304,
1218
  "learning_rate": 5.566538388006351e-06,
1219
- "loss": 0.6376,
1220
  "step": 860
1221
  },
1222
  {
1223
  "epoch": 0.5207706201083685,
1224
- "grad_norm": 1.2844662700519471,
1225
  "learning_rate": 5.514276433020044e-06,
1226
- "loss": 0.6146,
1227
  "step": 865
1228
  },
1229
  {
1230
  "epoch": 0.5237808549066827,
1231
- "grad_norm": 1.2752483334016254,
1232
  "learning_rate": 5.461957627940489e-06,
1233
- "loss": 0.6394,
1234
  "step": 870
1235
  },
1236
  {
1237
  "epoch": 0.526791089704997,
1238
- "grad_norm": 1.227395524806422,
1239
  "learning_rate": 5.409587756289462e-06,
1240
- "loss": 0.6139,
1241
  "step": 875
1242
  },
1243
  {
1244
  "epoch": 0.5298013245033113,
1245
- "grad_norm": 1.2049284439880659,
1246
  "learning_rate": 5.357172607233831e-06,
1247
- "loss": 0.6144,
1248
  "step": 880
1249
  },
1250
  {
1251
  "epoch": 0.5328115593016255,
1252
- "grad_norm": 1.245827654247563,
1253
  "learning_rate": 5.304717974945596e-06,
1254
- "loss": 0.6152,
1255
  "step": 885
1256
  },
1257
  {
1258
  "epoch": 0.5358217940999398,
1259
- "grad_norm": 1.3299253926984027,
1260
  "learning_rate": 5.252229657961394e-06,
1261
- "loss": 0.6121,
1262
  "step": 890
1263
  },
1264
  {
1265
  "epoch": 0.5388320288982541,
1266
- "grad_norm": 1.209710810071064,
1267
  "learning_rate": 5.199713458541495e-06,
1268
- "loss": 0.5971,
1269
  "step": 895
1270
  },
1271
  {
1272
  "epoch": 0.5418422636965683,
1273
- "grad_norm": 1.254649968737726,
1274
  "learning_rate": 5.1471751820284e-06,
1275
- "loss": 0.6022,
1276
  "step": 900
1277
  },
1278
  {
1279
  "epoch": 0.5448524984948826,
1280
- "grad_norm": 1.267331556965335,
1281
  "learning_rate": 5.094620636205096e-06,
1282
- "loss": 0.6163,
1283
  "step": 905
1284
  },
1285
  {
1286
  "epoch": 0.5478627332931969,
1287
- "grad_norm": 1.2114393038072488,
1288
  "learning_rate": 5.042055630653042e-06,
1289
- "loss": 0.6162,
1290
  "step": 910
1291
  },
1292
  {
1293
  "epoch": 0.5508729680915111,
1294
- "grad_norm": 1.2119812194103736,
1295
  "learning_rate": 4.98948597610996e-06,
1296
- "loss": 0.6103,
1297
  "step": 915
1298
  },
1299
  {
1300
  "epoch": 0.5538832028898254,
1301
- "grad_norm": 1.2118940977952493,
1302
  "learning_rate": 4.936917483827483e-06,
1303
- "loss": 0.609,
1304
  "step": 920
1305
  },
1306
  {
1307
  "epoch": 0.5568934376881397,
1308
- "grad_norm": 1.216555361613017,
1309
  "learning_rate": 4.884355964928767e-06,
1310
- "loss": 0.5869,
1311
  "step": 925
1312
  },
1313
  {
1314
  "epoch": 0.5599036724864539,
1315
- "grad_norm": 1.2125436521321353,
1316
  "learning_rate": 4.831807229766101e-06,
1317
- "loss": 0.5959,
1318
  "step": 930
1319
  },
1320
  {
1321
  "epoch": 0.5629139072847682,
1322
- "grad_norm": 1.2273004433080854,
1323
  "learning_rate": 4.779277087278615e-06,
1324
- "loss": 0.5956,
1325
  "step": 935
1326
  },
1327
  {
1328
  "epoch": 0.5659241420830825,
1329
- "grad_norm": 1.327408954158384,
1330
  "learning_rate": 4.7267713443501274e-06,
1331
- "loss": 0.6143,
1332
  "step": 940
1333
  },
1334
  {
1335
  "epoch": 0.5689343768813967,
1336
- "grad_norm": 1.2414363124005465,
1337
  "learning_rate": 4.67429580516724e-06,
1338
- "loss": 0.5887,
1339
  "step": 945
1340
  },
1341
  {
1342
  "epoch": 0.571944611679711,
1343
- "grad_norm": 1.203442995074465,
1344
  "learning_rate": 4.6218562705777185e-06,
1345
- "loss": 0.6004,
1346
  "step": 950
1347
  },
1348
  {
1349
  "epoch": 0.5749548464780253,
1350
- "grad_norm": 1.2174481353824513,
1351
  "learning_rate": 4.5694585374492314e-06,
1352
- "loss": 0.5853,
1353
  "step": 955
1354
  },
1355
  {
1356
  "epoch": 0.5779650812763396,
1357
- "grad_norm": 1.2904799934518778,
1358
  "learning_rate": 4.517108398028566e-06,
1359
- "loss": 0.6017,
1360
  "step": 960
1361
  },
1362
  {
1363
  "epoch": 0.5809753160746538,
1364
- "grad_norm": 1.2287011726696084,
1365
  "learning_rate": 4.464811639301314e-06,
1366
- "loss": 0.606,
1367
  "step": 965
1368
  },
1369
  {
1370
  "epoch": 0.5839855508729681,
1371
- "grad_norm": 1.2571252783339275,
1372
  "learning_rate": 4.412574042352156e-06,
1373
- "loss": 0.6011,
1374
  "step": 970
1375
  },
1376
  {
1377
  "epoch": 0.5869957856712824,
1378
- "grad_norm": 1.2246675913495295,
1379
  "learning_rate": 4.360401381725806e-06,
1380
- "loss": 0.5845,
1381
  "step": 975
1382
  },
1383
  {
1384
  "epoch": 0.5900060204695966,
1385
- "grad_norm": 1.269602392062893,
1386
  "learning_rate": 4.308299424788667e-06,
1387
- "loss": 0.578,
1388
  "step": 980
1389
  },
1390
  {
1391
  "epoch": 0.5930162552679109,
1392
- "grad_norm": 1.2266667172046457,
1393
  "learning_rate": 4.256273931091284e-06,
1394
- "loss": 0.596,
1395
  "step": 985
1396
  },
1397
  {
1398
  "epoch": 0.5960264900662252,
1399
- "grad_norm": 1.205718345046694,
1400
  "learning_rate": 4.204330651731662e-06,
1401
- "loss": 0.5936,
1402
  "step": 990
1403
  },
1404
  {
1405
  "epoch": 0.5990367248645394,
1406
- "grad_norm": 1.1952203148817158,
1407
  "learning_rate": 4.152475328719517e-06,
1408
- "loss": 0.5738,
1409
  "step": 995
1410
  },
1411
  {
1412
  "epoch": 0.6020469596628537,
1413
- "grad_norm": 1.276847431530527,
1414
  "learning_rate": 4.1007136943415325e-06,
1415
- "loss": 0.5686,
1416
  "step": 1000
1417
  },
1418
  {
1419
  "epoch": 0.605057194461168,
1420
- "grad_norm": 1.2563772516846479,
1421
  "learning_rate": 4.049051470527692e-06,
1422
- "loss": 0.5825,
1423
  "step": 1005
1424
  },
1425
  {
1426
  "epoch": 0.6080674292594822,
1427
- "grad_norm": 1.3085315216721447,
1428
  "learning_rate": 3.997494368218745e-06,
1429
- "loss": 0.5796,
1430
  "step": 1010
1431
  },
1432
  {
1433
  "epoch": 0.6110776640577965,
1434
- "grad_norm": 1.252832406830381,
1435
  "learning_rate": 3.946048086734921e-06,
1436
- "loss": 0.5912,
1437
  "step": 1015
1438
  },
1439
  {
1440
  "epoch": 0.6140878988561108,
1441
- "grad_norm": 1.241503492216098,
1442
  "learning_rate": 3.894718313145873e-06,
1443
- "loss": 0.5811,
1444
  "step": 1020
1445
  },
1446
  {
1447
  "epoch": 0.617098133654425,
1448
- "grad_norm": 1.1913066136989139,
1449
  "learning_rate": 3.843510721642036e-06,
1450
- "loss": 0.5718,
1451
  "step": 1025
1452
  },
1453
  {
1454
  "epoch": 0.6201083684527393,
1455
- "grad_norm": 1.3275708487415967,
1456
  "learning_rate": 3.7924309729073616e-06,
1457
- "loss": 0.5609,
1458
  "step": 1030
1459
  },
1460
  {
1461
  "epoch": 0.6231186032510536,
1462
- "grad_norm": 1.256537476019095,
1463
  "learning_rate": 3.7414847134935716e-06,
1464
- "loss": 0.5825,
1465
  "step": 1035
1466
  },
1467
  {
1468
  "epoch": 0.6261288380493678,
1469
- "grad_norm": 1.221832356712663,
1470
  "learning_rate": 3.6906775751959667e-06,
1471
- "loss": 0.554,
1472
  "step": 1040
1473
  },
1474
  {
1475
  "epoch": 0.6291390728476821,
1476
- "grad_norm": 1.2248781750755409,
1477
  "learning_rate": 3.640015174430864e-06,
1478
- "loss": 0.5638,
1479
  "step": 1045
1480
  },
1481
  {
1482
  "epoch": 0.6321493076459964,
1483
- "grad_norm": 1.2047806028877057,
1484
  "learning_rate": 3.5895031116147355e-06,
1485
- "loss": 0.5733,
1486
  "step": 1050
1487
  },
1488
  {
1489
  "epoch": 0.6351595424443106,
1490
- "grad_norm": 1.275159952276474,
1491
  "learning_rate": 3.539146970545124e-06,
1492
- "loss": 0.5799,
1493
  "step": 1055
1494
  },
1495
  {
1496
  "epoch": 0.6381697772426249,
1497
- "grad_norm": 1.284257273469691,
1498
  "learning_rate": 3.488952317783374e-06,
1499
- "loss": 0.5726,
1500
  "step": 1060
1501
  },
1502
  {
1503
  "epoch": 0.6411800120409392,
1504
- "grad_norm": 1.2383109080935772,
1505
  "learning_rate": 3.438924702039301e-06,
1506
- "loss": 0.5708,
1507
  "step": 1065
1508
  },
1509
  {
1510
  "epoch": 0.6441902468392534,
1511
- "grad_norm": 1.2190673552207432,
1512
  "learning_rate": 3.389069653557805e-06,
1513
- "loss": 0.5538,
1514
  "step": 1070
1515
  },
1516
  {
1517
  "epoch": 0.6472004816375677,
1518
- "grad_norm": 1.2506993933070911,
1519
  "learning_rate": 3.3393926835075307e-06,
1520
- "loss": 0.5797,
1521
  "step": 1075
1522
  },
1523
  {
1524
  "epoch": 0.650210716435882,
1525
- "grad_norm": 1.2356328454342058,
1526
  "learning_rate": 3.289899283371657e-06,
1527
- "loss": 0.5585,
1528
  "step": 1080
1529
  },
1530
  {
1531
  "epoch": 0.6532209512341962,
1532
- "grad_norm": 1.3063384197021242,
1533
  "learning_rate": 3.240594924340835e-06,
1534
- "loss": 0.5796,
1535
  "step": 1085
1536
  },
1537
  {
1538
  "epoch": 0.6562311860325105,
1539
- "grad_norm": 1.2373220459625451,
1540
  "learning_rate": 3.1914850567083866e-06,
1541
- "loss": 0.5622,
1542
  "step": 1090
1543
  },
1544
  {
1545
  "epoch": 0.6592414208308248,
1546
- "grad_norm": 1.3531129912740778,
1547
  "learning_rate": 3.1425751092678064e-06,
1548
- "loss": 0.5596,
1549
  "step": 1095
1550
  },
1551
  {
1552
  "epoch": 0.6622516556291391,
1553
- "grad_norm": 1.2389296621775976,
1554
  "learning_rate": 3.0938704887126425e-06,
1555
- "loss": 0.5535,
1556
  "step": 1100
1557
  },
1558
  {
1559
  "epoch": 0.6652618904274533,
1560
- "grad_norm": 1.2515989438388835,
1561
  "learning_rate": 3.045376579038821e-06,
1562
- "loss": 0.5681,
1563
  "step": 1105
1564
  },
1565
  {
1566
  "epoch": 0.6682721252257676,
1567
- "grad_norm": 1.2185663400088427,
1568
  "learning_rate": 2.9970987409494784e-06,
1569
- "loss": 0.571,
1570
  "step": 1110
1571
  },
1572
  {
1573
  "epoch": 0.6712823600240819,
1574
- "grad_norm": 1.269873255758116,
1575
  "learning_rate": 2.9490423112623646e-06,
1576
- "loss": 0.5658,
1577
  "step": 1115
1578
  },
1579
  {
1580
  "epoch": 0.6742925948223961,
1581
- "grad_norm": 1.2244029668023797,
1582
  "learning_rate": 2.9012126023198973e-06,
1583
- "loss": 0.538,
1584
  "step": 1120
1585
  },
1586
  {
1587
  "epoch": 0.6773028296207104,
1588
- "grad_norm": 1.2816311543199834,
1589
  "learning_rate": 2.853614901401909e-06,
1590
- "loss": 0.5484,
1591
  "step": 1125
1592
  },
1593
  {
1594
  "epoch": 0.6803130644190247,
1595
- "grad_norm": 1.276081217254366,
1596
  "learning_rate": 2.806254470141174e-06,
1597
- "loss": 0.5691,
1598
  "step": 1130
1599
  },
1600
  {
1601
  "epoch": 0.6833232992173389,
1602
- "grad_norm": 1.266389306582438,
1603
  "learning_rate": 2.759136543941773e-06,
1604
- "loss": 0.5567,
1605
  "step": 1135
1606
  },
1607
  {
1608
  "epoch": 0.6863335340156532,
1609
- "grad_norm": 1.2924072931517483,
1610
  "learning_rate": 2.712266331400332e-06,
1611
- "loss": 0.5439,
1612
  "step": 1140
1613
  },
1614
  {
1615
  "epoch": 0.6893437688139675,
1616
- "grad_norm": 1.254669923456662,
1617
  "learning_rate": 2.66564901373027e-06,
1618
- "loss": 0.5613,
1619
  "step": 1145
1620
  },
1621
  {
1622
  "epoch": 0.6923540036122817,
1623
- "grad_norm": 1.3474144476855063,
1624
  "learning_rate": 2.6192897441890337e-06,
1625
- "loss": 0.5489,
1626
  "step": 1150
1627
  },
1628
  {
1629
  "epoch": 0.695364238410596,
1630
- "grad_norm": 1.269185993074116,
1631
  "learning_rate": 2.573193647508426e-06,
1632
- "loss": 0.5483,
1633
  "step": 1155
1634
  },
1635
  {
1636
  "epoch": 0.6983744732089103,
1637
- "grad_norm": 1.3562380015815203,
1638
  "learning_rate": 2.5273658193281252e-06,
1639
- "loss": 0.5567,
1640
  "step": 1160
1641
  },
1642
  {
1643
  "epoch": 0.7013847080072245,
1644
- "grad_norm": 1.2580827828033503,
1645
  "learning_rate": 2.4818113256323745e-06,
1646
- "loss": 0.5417,
1647
  "step": 1165
1648
  },
1649
  {
1650
  "epoch": 0.7043949428055388,
1651
- "grad_norm": 1.2738218692009522,
1652
  "learning_rate": 2.4365352021899635e-06,
1653
- "loss": 0.5418,
1654
  "step": 1170
1655
  },
1656
  {
1657
  "epoch": 0.7074051776038531,
1658
- "grad_norm": 1.257678286567073,
1659
  "learning_rate": 2.391542453997578e-06,
1660
- "loss": 0.5502,
1661
  "step": 1175
1662
  },
1663
  {
1664
  "epoch": 0.7104154124021673,
1665
- "grad_norm": 1.275010857155547,
1666
  "learning_rate": 2.346838054726505e-06,
1667
- "loss": 0.5281,
1668
  "step": 1180
1669
  },
1670
  {
1671
  "epoch": 0.7134256472004816,
1672
- "grad_norm": 1.3071543695724546,
1673
  "learning_rate": 2.302426946172836e-06,
1674
- "loss": 0.5324,
1675
  "step": 1185
1676
  },
1677
  {
1678
  "epoch": 0.7164358819987959,
1679
- "grad_norm": 1.2139842130794491,
1680
  "learning_rate": 2.258314037711184e-06,
1681
- "loss": 0.554,
1682
  "step": 1190
1683
  },
1684
  {
1685
  "epoch": 0.7194461167971101,
1686
- "grad_norm": 1.2099665519481047,
1687
  "learning_rate": 2.214504205751971e-06,
1688
- "loss": 0.5429,
1689
  "step": 1195
1690
  },
1691
  {
1692
  "epoch": 0.7224563515954244,
1693
- "grad_norm": 1.3437644441417467,
1694
  "learning_rate": 2.1710022932023805e-06,
1695
- "loss": 0.5472,
1696
  "step": 1200
1697
  },
1698
  {
1699
  "epoch": 0.7254665863937387,
1700
- "grad_norm": 1.26435415137662,
1701
  "learning_rate": 2.127813108931007e-06,
1702
- "loss": 0.5487,
1703
  "step": 1205
1704
  },
1705
  {
1706
  "epoch": 0.7284768211920529,
1707
- "grad_norm": 1.2092596990063904,
1708
  "learning_rate": 2.084941427236245e-06,
1709
- "loss": 0.533,
1710
  "step": 1210
1711
  },
1712
  {
1713
  "epoch": 0.7314870559903672,
1714
- "grad_norm": 1.2313416526959684,
1715
  "learning_rate": 2.04239198731855e-06,
1716
- "loss": 0.5349,
1717
  "step": 1215
1718
  },
1719
  {
1720
  "epoch": 0.7344972907886815,
1721
- "grad_norm": 1.3072779429716,
1722
  "learning_rate": 2.000169492756523e-06,
1723
- "loss": 0.5402,
1724
  "step": 1220
1725
  },
1726
  {
1727
  "epoch": 0.7375075255869958,
1728
- "grad_norm": 1.2795138536801312,
1729
  "learning_rate": 1.9582786109869713e-06,
1730
- "loss": 0.5478,
1731
  "step": 1225
1732
  },
1733
  {
1734
  "epoch": 0.74051776038531,
1735
- "grad_norm": 1.2961377151032474,
1736
  "learning_rate": 1.9167239727889527e-06,
1737
- "loss": 0.543,
1738
  "step": 1230
1739
  },
1740
  {
1741
  "epoch": 0.7435279951836243,
1742
- "grad_norm": 1.2566416288826483,
1743
  "learning_rate": 1.875510171771865e-06,
1744
- "loss": 0.5277,
1745
  "step": 1235
1746
  },
1747
  {
1748
  "epoch": 0.7465382299819386,
1749
- "grad_norm": 1.2674802415781152,
1750
  "learning_rate": 1.8346417638676533e-06,
1751
- "loss": 0.5401,
1752
  "step": 1240
1753
  },
1754
  {
1755
  "epoch": 0.7495484647802528,
1756
- "grad_norm": 1.2584995829737369,
1757
  "learning_rate": 1.7941232668271863e-06,
1758
- "loss": 0.5315,
1759
  "step": 1245
1760
  },
1761
  {
1762
  "epoch": 0.7525586995785671,
1763
- "grad_norm": 1.2445698965082652,
1764
  "learning_rate": 1.753959159720836e-06,
1765
- "loss": 0.5294,
1766
  "step": 1250
1767
  },
1768
  {
1769
  "epoch": 0.7555689343768814,
1770
- "grad_norm": 1.2674536046077867,
1771
  "learning_rate": 1.7141538824433506e-06,
1772
- "loss": 0.5278,
1773
  "step": 1255
1774
  },
1775
  {
1776
  "epoch": 0.7585791691751956,
1777
- "grad_norm": 1.294324607883709,
1778
  "learning_rate": 1.6747118352230495e-06,
1779
- "loss": 0.5392,
1780
  "step": 1260
1781
  },
1782
  {
1783
  "epoch": 0.7615894039735099,
1784
- "grad_norm": 1.2340338478309871,
1785
  "learning_rate": 1.6356373781354058e-06,
1786
- "loss": 0.5335,
1787
  "step": 1265
1788
  },
1789
  {
1790
  "epoch": 0.7645996387718242,
1791
- "grad_norm": 1.3398026544287736,
1792
  "learning_rate": 1.5969348306210692e-06,
1793
- "loss": 0.5322,
1794
  "step": 1270
1795
  },
1796
  {
1797
  "epoch": 0.7676098735701384,
1798
- "grad_norm": 1.3532898551242525,
1799
  "learning_rate": 1.5586084710083737e-06,
1800
- "loss": 0.5198,
1801
  "step": 1275
1802
  },
1803
  {
1804
  "epoch": 0.7706201083684527,
1805
- "grad_norm": 1.315480840691035,
1806
  "learning_rate": 1.5206625360403943e-06,
1807
- "loss": 0.5319,
1808
  "step": 1280
1809
  },
1810
  {
1811
  "epoch": 0.773630343166767,
1812
- "grad_norm": 1.255766242972646,
1813
  "learning_rate": 1.4831012204066114e-06,
1814
- "loss": 0.5101,
1815
  "step": 1285
1816
  },
1817
  {
1818
  "epoch": 0.7766405779650812,
1819
- "grad_norm": 1.3117533667822703,
1820
  "learning_rate": 1.445928676279199e-06,
1821
- "loss": 0.5369,
1822
  "step": 1290
1823
  },
1824
  {
1825
  "epoch": 0.7796508127633955,
1826
- "grad_norm": 1.288985322714688,
1827
  "learning_rate": 1.4091490128540374e-06,
1828
- "loss": 0.526,
1829
  "step": 1295
1830
  },
1831
  {
1832
  "epoch": 0.7826610475617098,
1833
- "grad_norm": 1.2093920386965087,
1834
  "learning_rate": 1.3727662958964627e-06,
1835
- "loss": 0.5152,
1836
  "step": 1300
1837
  },
1838
  {
1839
  "epoch": 0.785671282360024,
1840
- "grad_norm": 1.288741507825481,
1841
  "learning_rate": 1.3367845472918272e-06,
1842
- "loss": 0.5242,
1843
  "step": 1305
1844
  },
1845
  {
1846
  "epoch": 0.7886815171583383,
1847
- "grad_norm": 1.289266020619647,
1848
  "learning_rate": 1.3012077446008969e-06,
1849
- "loss": 0.519,
1850
  "step": 1310
1851
  },
1852
  {
1853
  "epoch": 0.7916917519566526,
1854
- "grad_norm": 1.305675104370065,
1855
  "learning_rate": 1.266039820620159e-06,
1856
- "loss": 0.5241,
1857
  "step": 1315
1858
  },
1859
  {
1860
  "epoch": 0.7947019867549668,
1861
- "grad_norm": 1.2633206609620018,
1862
  "learning_rate": 1.2312846629470826e-06,
1863
- "loss": 0.5298,
1864
  "step": 1320
1865
  },
1866
  {
1867
  "epoch": 0.7977122215532811,
1868
- "grad_norm": 1.2569813999396267,
1869
  "learning_rate": 1.1969461135503573e-06,
1870
- "loss": 0.5284,
1871
  "step": 1325
1872
  },
1873
  {
1874
  "epoch": 0.8007224563515954,
1875
- "grad_norm": 1.2500080013485295,
1876
  "learning_rate": 1.163027968345195e-06,
1877
- "loss": 0.5013,
1878
  "step": 1330
1879
  },
1880
  {
1881
  "epoch": 0.8037326911499096,
1882
- "grad_norm": 1.2227749344044816,
1883
  "learning_rate": 1.1295339767737125e-06,
1884
- "loss": 0.5181,
1885
  "step": 1335
1886
  },
1887
  {
1888
  "epoch": 0.8067429259482239,
1889
- "grad_norm": 1.2039813893211253,
1890
  "learning_rate": 1.0964678413904529e-06,
1891
- "loss": 0.5077,
1892
  "step": 1340
1893
  },
1894
  {
1895
  "epoch": 0.8097531607465382,
1896
- "grad_norm": 1.2432336763434526,
1897
  "learning_rate": 1.0638332174530953e-06,
1898
- "loss": 0.5183,
1899
  "step": 1345
1900
  },
1901
  {
1902
  "epoch": 0.8127633955448526,
1903
- "grad_norm": 1.3224425175232468,
1904
  "learning_rate": 1.0316337125183817e-06,
1905
- "loss": 0.5085,
1906
  "step": 1350
1907
  },
1908
  {
1909
  "epoch": 0.8157736303431667,
1910
- "grad_norm": 1.2490625646083682,
1911
  "learning_rate": 9.998728860433277e-07,
1912
  "loss": 0.512,
1913
  "step": 1355
1914
  },
1915
  {
1916
  "epoch": 0.818783865141481,
1917
- "grad_norm": 1.2834749344502785,
1918
  "learning_rate": 9.685542489917494e-07,
1919
- "loss": 0.5148,
1920
  "step": 1360
1921
  },
1922
  {
1923
  "epoch": 0.8217940999397954,
1924
- "grad_norm": 1.2717678687726444,
1925
  "learning_rate": 9.376812634461418e-07,
1926
- "loss": 0.5165,
1927
  "step": 1365
1928
  },
1929
  {
1930
  "epoch": 0.8248043347381095,
1931
- "grad_norm": 1.273693819691485,
1932
  "learning_rate": 9.072573422249692e-07,
1933
- "loss": 0.5138,
1934
  "step": 1370
1935
  },
1936
  {
1937
  "epoch": 0.8278145695364238,
1938
- "grad_norm": 1.2541602658022404,
1939
  "learning_rate": 8.772858485054042e-07,
1940
- "loss": 0.5341,
1941
  "step": 1375
1942
  },
1943
  {
1944
  "epoch": 0.8308248043347382,
1945
- "grad_norm": 1.311251853744712,
1946
  "learning_rate": 8.477700954515372e-07,
1947
- "loss": 0.5104,
1948
  "step": 1380
1949
  },
1950
  {
1951
  "epoch": 0.8338350391330523,
1952
- "grad_norm": 1.2486954380131434,
1953
  "learning_rate": 8.187133458481416e-07,
1954
- "loss": 0.5156,
1955
  "step": 1385
1956
  },
1957
  {
1958
  "epoch": 0.8368452739313667,
1959
- "grad_norm": 1.2747215761089814,
1960
  "learning_rate": 7.901188117399817e-07,
1961
- "loss": 0.5211,
1962
  "step": 1390
1963
  },
1964
  {
1965
  "epoch": 0.839855508729681,
1966
- "grad_norm": 1.234543288759014,
1967
  "learning_rate": 7.619896540767435e-07,
1968
- "loss": 0.5157,
1969
  "step": 1395
1970
  },
1971
  {
1972
  "epoch": 0.8428657435279951,
1973
- "grad_norm": 1.3263990218789858,
1974
  "learning_rate": 7.343289823636168e-07,
1975
- "loss": 0.5147,
1976
  "step": 1400
1977
  },
1978
  {
1979
  "epoch": 0.8458759783263095,
1980
- "grad_norm": 1.2622046736064572,
1981
  "learning_rate": 7.0713985431755e-07,
1982
- "loss": 0.5222,
1983
  "step": 1405
1984
  },
1985
  {
1986
  "epoch": 0.8488862131246238,
1987
- "grad_norm": 1.2345168269397355,
1988
  "learning_rate": 6.804252755292429e-07,
1989
- "loss": 0.5226,
1990
  "step": 1410
1991
  },
1992
  {
1993
  "epoch": 0.851896447922938,
1994
- "grad_norm": 1.244419436243612,
1995
  "learning_rate": 6.541881991309013e-07,
1996
- "loss": 0.5211,
1997
  "step": 1415
1998
  },
1999
  {
2000
  "epoch": 0.8549066827212523,
2001
- "grad_norm": 1.277021130264081,
2002
  "learning_rate": 6.284315254697726e-07,
2003
- "loss": 0.514,
2004
  "step": 1420
2005
  },
2006
  {
2007
  "epoch": 0.8579169175195666,
2008
- "grad_norm": 1.3651795136190366,
2009
  "learning_rate": 6.031581017875482e-07,
2010
- "loss": 0.5196,
2011
  "step": 1425
2012
  },
2013
  {
2014
  "epoch": 0.8609271523178808,
2015
- "grad_norm": 1.2924139249953397,
2016
  "learning_rate": 5.783707219056078e-07,
2017
- "loss": 0.4952,
2018
  "step": 1430
2019
  },
2020
  {
2021
  "epoch": 0.863937387116195,
2022
- "grad_norm": 1.29441777957266,
2023
  "learning_rate": 5.540721259161774e-07,
2024
- "loss": 0.5093,
2025
  "step": 1435
2026
  },
2027
  {
2028
  "epoch": 0.8669476219145094,
2029
- "grad_norm": 1.2349372001584507,
2030
  "learning_rate": 5.302649998794368e-07,
2031
- "loss": 0.5189,
2032
  "step": 1440
2033
  },
2034
  {
2035
  "epoch": 0.8699578567128236,
2036
- "grad_norm": 1.3010846373559104,
2037
  "learning_rate": 5.0695197552659e-07,
2038
- "loss": 0.5125,
2039
  "step": 1445
2040
  },
2041
  {
2042
  "epoch": 0.8729680915111379,
2043
- "grad_norm": 1.2313741385914343,
2044
  "learning_rate": 4.841356299689359e-07,
2045
- "loss": 0.5019,
2046
  "step": 1450
2047
  },
2048
  {
2049
  "epoch": 0.8759783263094522,
2050
- "grad_norm": 1.2736789351010163,
2051
  "learning_rate": 4.618184854129981e-07,
2052
- "loss": 0.5162,
2053
  "step": 1455
2054
  },
2055
  {
2056
  "epoch": 0.8789885611077664,
2057
- "grad_norm": 1.3450829294599627,
2058
  "learning_rate": 4.4000300888169753e-07,
2059
- "loss": 0.5122,
2060
  "step": 1460
2061
  },
2062
  {
2063
  "epoch": 0.8819987959060807,
2064
- "grad_norm": 1.3345961379400417,
2065
  "learning_rate": 4.1869161194164565e-07,
2066
- "loss": 0.5195,
2067
  "step": 1465
2068
  },
2069
  {
2070
  "epoch": 0.885009030704395,
2071
- "grad_norm": 1.2706980513616788,
2072
  "learning_rate": 3.9788665043656083e-07,
2073
- "loss": 0.5272,
2074
  "step": 1470
2075
  },
2076
  {
2077
  "epoch": 0.8880192655027093,
2078
- "grad_norm": 1.298097370684443,
2079
  "learning_rate": 3.775904242268391e-07,
2080
- "loss": 0.5189,
2081
  "step": 1475
2082
  },
2083
  {
2084
  "epoch": 0.8910295003010235,
2085
- "grad_norm": 1.264558600518237,
2086
  "learning_rate": 3.578051769353219e-07,
2087
- "loss": 0.5105,
2088
  "step": 1480
2089
  },
2090
  {
2091
  "epoch": 0.8940397350993378,
2092
- "grad_norm": 1.2417670245253374,
2093
  "learning_rate": 3.385330956992816e-07,
2094
- "loss": 0.4906,
2095
  "step": 1485
2096
  },
2097
  {
2098
  "epoch": 0.8970499698976521,
2099
- "grad_norm": 1.3169348811520523,
2100
  "learning_rate": 3.1977631092863613e-07,
2101
- "loss": 0.5162,
2102
  "step": 1490
2103
  },
2104
  {
2105
  "epoch": 0.9000602046959663,
2106
- "grad_norm": 1.3331828477675507,
2107
  "learning_rate": 3.015368960704584e-07,
2108
- "loss": 0.5118,
2109
  "step": 1495
2110
  },
2111
  {
2112
  "epoch": 0.9030704394942806,
2113
- "grad_norm": 1.2820985050226046,
2114
  "learning_rate": 2.8381686737975867e-07,
2115
- "loss": 0.5034,
2116
  "step": 1500
2117
  },
2118
  {
2119
  "epoch": 0.9060806742925949,
2120
- "grad_norm": 1.369084535263281,
2121
  "learning_rate": 2.666181836966053e-07,
2122
- "loss": 0.5111,
2123
  "step": 1505
2124
  },
2125
  {
2126
  "epoch": 0.9090909090909091,
2127
- "grad_norm": 1.2852930092164696,
2128
  "learning_rate": 2.4994274622958726e-07,
2129
- "loss": 0.4986,
2130
  "step": 1510
2131
  },
2132
  {
2133
  "epoch": 0.9121011438892234,
2134
- "grad_norm": 1.2402160274810259,
2135
  "learning_rate": 2.3379239834564526e-07,
2136
- "loss": 0.5118,
2137
  "step": 1515
2138
  },
2139
  {
2140
  "epoch": 0.9151113786875377,
2141
- "grad_norm": 1.237567686573788,
2142
  "learning_rate": 2.1816892536629775e-07,
2143
- "loss": 0.5035,
2144
  "step": 1520
2145
  },
2146
  {
2147
  "epoch": 0.9181216134858519,
2148
- "grad_norm": 1.2495841691480962,
2149
  "learning_rate": 2.0307405437029027e-07,
2150
- "loss": 0.5111,
2151
  "step": 1525
2152
  },
2153
  {
2154
  "epoch": 0.9211318482841662,
2155
- "grad_norm": 1.2363443537540169,
2156
  "learning_rate": 1.8850945400266994e-07,
2157
- "loss": 0.4958,
2158
  "step": 1530
2159
  },
2160
  {
2161
  "epoch": 0.9241420830824805,
2162
- "grad_norm": 1.2925244771883408,
2163
  "learning_rate": 1.7447673429033361e-07,
2164
- "loss": 0.5073,
2165
  "step": 1535
2166
  },
2167
  {
2168
  "epoch": 0.9271523178807947,
2169
- "grad_norm": 1.2918239018472708,
2170
  "learning_rate": 1.6097744646404457e-07,
2171
- "loss": 0.505,
2172
  "step": 1540
2173
  },
2174
  {
2175
  "epoch": 0.930162552679109,
2176
- "grad_norm": 1.3462621823153225,
2177
  "learning_rate": 1.4801308278695636e-07,
2178
- "loss": 0.5208,
2179
  "step": 1545
2180
  },
2181
  {
2182
  "epoch": 0.9331727874774233,
2183
- "grad_norm": 1.3040362281207198,
2184
  "learning_rate": 1.3558507638965158e-07,
2185
- "loss": 0.5068,
2186
  "step": 1550
2187
  },
2188
  {
2189
  "epoch": 0.9361830222757375,
2190
- "grad_norm": 1.2501483756470055,
2191
  "learning_rate": 1.2369480111171784e-07,
2192
- "loss": 0.4984,
2193
  "step": 1555
2194
  },
2195
  {
2196
  "epoch": 0.9391932570740518,
2197
- "grad_norm": 1.24010506497212,
2198
  "learning_rate": 1.1234357134987717e-07,
2199
- "loss": 0.5036,
2200
  "step": 1560
2201
  },
2202
  {
2203
  "epoch": 0.9422034918723661,
2204
- "grad_norm": 1.2731162265485154,
2205
  "learning_rate": 1.0153264191269052e-07,
2206
- "loss": 0.5159,
2207
  "step": 1565
2208
  },
2209
  {
2210
  "epoch": 0.9452137266706803,
2211
- "grad_norm": 1.241141289633183,
2212
  "learning_rate": 9.126320788184374e-08,
2213
- "loss": 0.5077,
2214
  "step": 1570
2215
  },
2216
  {
2217
  "epoch": 0.9482239614689946,
2218
- "grad_norm": 1.2598787839937213,
2219
  "learning_rate": 8.153640448003875e-08,
2220
- "loss": 0.4974,
2221
  "step": 1575
2222
  },
2223
  {
2224
  "epoch": 0.9512341962673089,
2225
- "grad_norm": 1.2922592226571687,
2226
  "learning_rate": 7.235330694550402e-08,
2227
- "loss": 0.4975,
2228
  "step": 1580
2229
  },
2230
  {
2231
  "epoch": 0.9542444310656231,
2232
- "grad_norm": 1.2596424466646803,
2233
  "learning_rate": 6.371493041313126e-08,
2234
- "loss": 0.52,
2235
  "step": 1585
2236
  },
2237
  {
2238
  "epoch": 0.9572546658639374,
2239
- "grad_norm": 1.2829293637589985,
2240
  "learning_rate": 5.562222980225907e-08,
2241
- "loss": 0.5118,
2242
  "step": 1590
2243
  },
2244
  {
2245
  "epoch": 0.9602649006622517,
2246
- "grad_norm": 1.2885001691506446,
2247
  "learning_rate": 4.807609971111238e-08,
2248
- "loss": 0.5082,
2249
  "step": 1595
2250
  },
2251
  {
2252
  "epoch": 0.963275135460566,
2253
- "grad_norm": 1.251029961869992,
2254
  "learning_rate": 4.107737431791159e-08,
2255
- "loss": 0.5135,
2256
  "step": 1600
2257
  },
2258
  {
2259
  "epoch": 0.9662853702588802,
2260
- "grad_norm": 1.2394267183031216,
2261
  "learning_rate": 3.462682728865685e-08,
2262
- "loss": 0.5104,
2263
  "step": 1605
2264
  },
2265
  {
2266
  "epoch": 0.9692956050571945,
2267
- "grad_norm": 1.2966890851556148,
2268
  "learning_rate": 2.8725171691605934e-08,
2269
- "loss": 0.5123,
2270
  "step": 1610
2271
  },
2272
  {
2273
  "epoch": 0.9723058398555088,
2274
- "grad_norm": 1.2442957383652233,
2275
  "learning_rate": 2.3373059918448958e-08,
2276
- "loss": 0.5047,
2277
  "step": 1615
2278
  },
2279
  {
2280
  "epoch": 0.975316074653823,
2281
- "grad_norm": 1.2908864465886667,
2282
  "learning_rate": 1.8571083612188845e-08,
2283
- "loss": 0.5063,
2284
  "step": 1620
2285
  },
2286
  {
2287
  "epoch": 0.9783263094521373,
2288
- "grad_norm": 1.2475359893632505,
2289
  "learning_rate": 1.431977360173975e-08,
2290
- "loss": 0.5029,
2291
  "step": 1625
2292
  },
2293
  {
2294
  "epoch": 0.9813365442504516,
2295
- "grad_norm": 1.2712485645750238,
2296
  "learning_rate": 1.0619599843249006e-08,
2297
- "loss": 0.5147,
2298
  "step": 1630
2299
  },
2300
  {
2301
  "epoch": 0.9843467790487658,
2302
- "grad_norm": 1.2710371635621938,
2303
  "learning_rate": 7.470971368142011e-09,
2304
- "loss": 0.4974,
2305
  "step": 1635
2306
  },
2307
  {
2308
  "epoch": 0.9873570138470801,
2309
- "grad_norm": 1.3076474088761778,
2310
  "learning_rate": 4.874236237911723e-09,
2311
- "loss": 0.504,
2312
  "step": 1640
2313
  },
2314
  {
2315
  "epoch": 0.9903672486453944,
2316
- "grad_norm": 1.2813980219783268,
2317
  "learning_rate": 2.8296815056377824e-09,
2318
- "loss": 0.5086,
2319
  "step": 1645
2320
  },
2321
  {
2322
  "epoch": 0.9933774834437086,
2323
- "grad_norm": 1.2247806264161647,
2324
  "learning_rate": 1.3375331842574446e-09,
2325
- "loss": 0.4984,
2326
  "step": 1650
2327
  },
2328
  {
2329
  "epoch": 0.9963877182420229,
2330
- "grad_norm": 1.2500205386368362,
2331
  "learning_rate": 3.9795622158111945e-10,
2332
- "loss": 0.5102,
2333
  "step": 1655
2334
  },
2335
  {
2336
  "epoch": 0.9993979530403372,
2337
- "grad_norm": 1.3276901052559071,
2338
  "learning_rate": 1.1054482056405136e-11,
2339
- "loss": 0.5088,
2340
  "step": 1660
2341
  },
2342
  {
2343
  "epoch": 1.0,
2344
- "eval_runtime": 3.3023,
2345
- "eval_samples_per_second": 3.028,
2346
- "eval_steps_per_second": 0.908,
2347
  "step": 1661
2348
  },
2349
  {
2350
  "epoch": 1.0,
2351
  "step": 1661,
2352
  "total_flos": 430711668473856.0,
2353
- "train_loss": 0.6697713777575416,
2354
- "train_runtime": 21206.7621,
2355
- "train_samples_per_second": 1.253,
2356
- "train_steps_per_second": 0.078
2357
  }
2358
  ],
2359
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0006020469596628537,
13
+ "grad_norm": 40.966713680527505,
14
  "learning_rate": 5.98802395209581e-08,
15
+ "loss": 2.0373,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.0030102347983142685,
20
+ "grad_norm": 39.14616421730173,
21
  "learning_rate": 2.9940119760479047e-07,
22
+ "loss": 2.0379,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.006020469596628537,
27
+ "grad_norm": 27.910266092453746,
28
  "learning_rate": 5.988023952095809e-07,
29
+ "loss": 1.9953,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.009030704394942806,
34
+ "grad_norm": 15.201013735116844,
35
  "learning_rate": 8.982035928143713e-07,
36
+ "loss": 1.7804,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.012040939193257074,
41
+ "grad_norm": 5.295520564674036,
42
  "learning_rate": 1.1976047904191619e-06,
43
+ "loss": 1.502,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.015051173991571343,
48
+ "grad_norm": 4.314026946089969,
49
  "learning_rate": 1.4970059880239521e-06,
50
+ "loss": 1.3422,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.018061408789885613,
55
+ "grad_norm": 2.8393277241114694,
56
  "learning_rate": 1.7964071856287426e-06,
57
+ "loss": 1.2716,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.02107164358819988,
62
+ "grad_norm": 1.9881246281959908,
63
  "learning_rate": 2.095808383233533e-06,
64
+ "loss": 1.142,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.024081878386514148,
69
+ "grad_norm": 1.4576393482889902,
70
  "learning_rate": 2.3952095808383237e-06,
71
+ "loss": 1.091,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.027092113184828417,
76
+ "grad_norm": 1.3452736306984228,
77
  "learning_rate": 2.694610778443114e-06,
78
+ "loss": 1.0808,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.030102347983142687,
83
+ "grad_norm": 1.2853833849515133,
84
  "learning_rate": 2.9940119760479042e-06,
85
+ "loss": 1.0534,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.033112582781456956,
90
+ "grad_norm": 1.3461267648995063,
91
  "learning_rate": 3.2934131736526947e-06,
92
+ "loss": 1.0335,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.036122817579771226,
97
+ "grad_norm": 1.2720929793887874,
98
  "learning_rate": 3.592814371257485e-06,
99
+ "loss": 1.0202,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.03913305237808549,
104
+ "grad_norm": 1.2405264880245372,
105
  "learning_rate": 3.892215568862276e-06,
106
+ "loss": 1.0059,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.04214328717639976,
111
+ "grad_norm": 1.1810599448574401,
112
  "learning_rate": 4.191616766467066e-06,
113
+ "loss": 1.0041,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.04515352197471403,
118
+ "grad_norm": 1.2433212471597048,
119
  "learning_rate": 4.4910179640718566e-06,
120
+ "loss": 0.9915,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.048163756773028296,
125
+ "grad_norm": 1.2370160588056265,
126
  "learning_rate": 4.7904191616766475e-06,
127
+ "loss": 0.9683,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.051173991571342566,
132
+ "grad_norm": 1.2799056245464302,
133
  "learning_rate": 5.0898203592814375e-06,
134
+ "loss": 0.9757,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.054184226369656835,
139
+ "grad_norm": 1.2232343401620152,
140
  "learning_rate": 5.389221556886228e-06,
141
+ "loss": 0.9664,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.057194461167971104,
146
+ "grad_norm": 1.2749429574013502,
147
  "learning_rate": 5.6886227544910184e-06,
148
+ "loss": 0.9531,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.060204695966285374,
153
+ "grad_norm": 1.341905992560118,
154
  "learning_rate": 5.9880239520958085e-06,
155
+ "loss": 0.936,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.06321493076459964,
160
+ "grad_norm": 1.3312129649486055,
161
  "learning_rate": 6.2874251497005985e-06,
162
+ "loss": 0.9288,
163
  "step": 105
164
  },
165
  {
166
  "epoch": 0.06622516556291391,
167
+ "grad_norm": 1.3057249774040953,
168
  "learning_rate": 6.586826347305389e-06,
169
+ "loss": 0.9137,
170
  "step": 110
171
  },
172
  {
173
  "epoch": 0.06923540036122817,
174
+ "grad_norm": 1.3480198308811908,
175
  "learning_rate": 6.88622754491018e-06,
176
+ "loss": 0.9087,
177
  "step": 115
178
  },
179
  {
180
  "epoch": 0.07224563515954245,
181
+ "grad_norm": 1.2851052893092942,
182
  "learning_rate": 7.18562874251497e-06,
183
+ "loss": 0.9006,
184
  "step": 120
185
  },
186
  {
187
  "epoch": 0.07525586995785671,
188
+ "grad_norm": 1.3337166654009118,
189
  "learning_rate": 7.485029940119761e-06,
190
+ "loss": 0.8995,
191
  "step": 125
192
  },
193
  {
194
  "epoch": 0.07826610475617098,
195
+ "grad_norm": 1.259701546947682,
196
  "learning_rate": 7.784431137724551e-06,
197
+ "loss": 0.877,
198
  "step": 130
199
  },
200
  {
201
  "epoch": 0.08127633955448525,
202
+ "grad_norm": 1.3284638098388712,
203
  "learning_rate": 8.083832335329342e-06,
204
+ "loss": 0.8847,
205
  "step": 135
206
  },
207
  {
208
  "epoch": 0.08428657435279951,
209
+ "grad_norm": 1.3016966435741442,
210
  "learning_rate": 8.383233532934131e-06,
211
+ "loss": 0.8562,
212
  "step": 140
213
  },
214
  {
215
  "epoch": 0.08729680915111379,
216
+ "grad_norm": 1.272771944986212,
217
  "learning_rate": 8.682634730538922e-06,
218
+ "loss": 0.8537,
219
  "step": 145
220
  },
221
  {
222
  "epoch": 0.09030704394942805,
223
+ "grad_norm": 1.2404717495109163,
224
  "learning_rate": 8.982035928143713e-06,
225
+ "loss": 0.8402,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.09331727874774233,
230
+ "grad_norm": 1.3171631151874839,
231
  "learning_rate": 9.281437125748504e-06,
232
+ "loss": 0.8596,
233
  "step": 155
234
  },
235
  {
236
  "epoch": 0.09632751354605659,
237
+ "grad_norm": 1.2739485119594887,
238
  "learning_rate": 9.580838323353295e-06,
239
+ "loss": 0.8548,
240
  "step": 160
241
  },
242
  {
243
  "epoch": 0.09933774834437085,
244
+ "grad_norm": 1.2415391381244691,
245
  "learning_rate": 9.880239520958084e-06,
246
+ "loss": 0.8348,
247
  "step": 165
248
  },
249
  {
250
  "epoch": 0.10234798314268513,
251
+ "grad_norm": 1.2732863554156713,
252
  "learning_rate": 9.999900509954779e-06,
253
+ "loss": 0.8326,
254
  "step": 170
255
  },
256
  {
257
  "epoch": 0.1053582179409994,
258
+ "grad_norm": 1.3293213562766069,
259
  "learning_rate": 9.999292529572152e-06,
260
+ "loss": 0.839,
261
  "step": 175
262
  },
263
  {
264
  "epoch": 0.10836845273931367,
265
+ "grad_norm": 1.3011820572360617,
266
  "learning_rate": 9.998131908181262e-06,
267
+ "loss": 0.8377,
268
  "step": 180
269
  },
270
  {
271
  "epoch": 0.11137868753762793,
272
+ "grad_norm": 1.2833557284441266,
273
  "learning_rate": 9.996418774081658e-06,
274
+ "loss": 0.8228,
275
  "step": 185
276
  },
277
  {
278
  "epoch": 0.11438892233594221,
279
+ "grad_norm": 1.286488000985355,
280
  "learning_rate": 9.994153316649769e-06,
281
+ "loss": 0.8318,
282
  "step": 190
283
  },
284
  {
285
  "epoch": 0.11739915713425647,
286
+ "grad_norm": 1.2384872269253941,
287
  "learning_rate": 9.991335786317964e-06,
288
+ "loss": 0.8209,
289
  "step": 195
290
  },
291
  {
292
  "epoch": 0.12040939193257075,
293
+ "grad_norm": 1.2713798885562295,
294
  "learning_rate": 9.987966494546873e-06,
295
+ "loss": 0.8069,
296
  "step": 200
297
  },
298
  {
299
  "epoch": 0.12341962673088501,
300
+ "grad_norm": 1.2482060631896188,
301
  "learning_rate": 9.984045813790959e-06,
302
+ "loss": 0.8142,
303
  "step": 205
304
  },
305
  {
306
  "epoch": 0.12642986152919927,
307
+ "grad_norm": 1.3415986154597055,
308
  "learning_rate": 9.979574177457337e-06,
309
+ "loss": 0.813,
310
  "step": 210
311
  },
312
  {
313
  "epoch": 0.12944009632751355,
314
+ "grad_norm": 1.2350908672442804,
315
  "learning_rate": 9.974552079857873e-06,
316
+ "loss": 0.8109,
317
  "step": 215
318
  },
319
  {
320
  "epoch": 0.13245033112582782,
321
+ "grad_norm": 1.3964490955664748,
322
  "learning_rate": 9.968980076154533e-06,
323
+ "loss": 0.8293,
324
  "step": 220
325
  },
326
  {
327
  "epoch": 0.13546056592414207,
328
+ "grad_norm": 1.2591771311022242,
329
  "learning_rate": 9.962858782298023e-06,
330
+ "loss": 0.802,
331
  "step": 225
332
  },
333
  {
334
  "epoch": 0.13847080072245635,
335
+ "grad_norm": 1.288853392082994,
336
  "learning_rate": 9.956188874959686e-06,
337
+ "loss": 0.8151,
338
  "step": 230
339
  },
340
  {
341
  "epoch": 0.14148103552077063,
342
+ "grad_norm": 1.197725597237524,
343
  "learning_rate": 9.948971091456715e-06,
344
+ "loss": 0.8071,
345
  "step": 235
346
  },
347
  {
348
  "epoch": 0.1444912703190849,
349
+ "grad_norm": 1.2226857692711433,
350
  "learning_rate": 9.941206229670634e-06,
351
+ "loss": 0.8053,
352
  "step": 240
353
  },
354
  {
355
  "epoch": 0.14750150511739915,
356
+ "grad_norm": 1.3119146876238303,
357
  "learning_rate": 9.932895147959106e-06,
358
+ "loss": 0.7955,
359
  "step": 245
360
  },
361
  {
362
  "epoch": 0.15051173991571343,
363
+ "grad_norm": 1.3326937913135584,
364
  "learning_rate": 9.924038765061042e-06,
365
+ "loss": 0.7833,
366
  "step": 250
367
  },
368
  {
369
  "epoch": 0.1535219747140277,
370
+ "grad_norm": 1.2647168837729537,
371
  "learning_rate": 9.91463805999504e-06,
372
+ "loss": 0.7933,
373
  "step": 255
374
  },
375
  {
376
  "epoch": 0.15653220951234195,
377
+ "grad_norm": 1.2033595276379918,
378
  "learning_rate": 9.904694071951167e-06,
379
+ "loss": 0.8004,
380
  "step": 260
381
  },
382
  {
383
  "epoch": 0.15954244431065623,
384
+ "grad_norm": 1.1708378372158956,
385
  "learning_rate": 9.894207900176074e-06,
386
+ "loss": 0.7877,
387
  "step": 265
388
  },
389
  {
390
  "epoch": 0.1625526791089705,
391
+ "grad_norm": 1.277798553181957,
392
  "learning_rate": 9.883180703851488e-06,
393
+ "loss": 0.7899,
394
  "step": 270
395
  },
396
  {
397
  "epoch": 0.16556291390728478,
398
+ "grad_norm": 1.207840548329369,
399
  "learning_rate": 9.871613701966067e-06,
400
+ "loss": 0.7885,
401
  "step": 275
402
  },
403
  {
404
  "epoch": 0.16857314870559903,
405
+ "grad_norm": 1.156274795904894,
406
  "learning_rate": 9.859508173180653e-06,
407
+ "loss": 0.7664,
408
  "step": 280
409
  },
410
  {
411
  "epoch": 0.1715833835039133,
412
+ "grad_norm": 1.164171992678941,
413
  "learning_rate": 9.846865455686915e-06,
414
+ "loss": 0.7777,
415
  "step": 285
416
  },
417
  {
418
  "epoch": 0.17459361830222758,
419
+ "grad_norm": 1.1762216211439502,
420
  "learning_rate": 9.833686947059436e-06,
421
+ "loss": 0.7727,
422
  "step": 290
423
  },
424
  {
425
  "epoch": 0.17760385310054183,
426
+ "grad_norm": 1.1617067559555396,
427
  "learning_rate": 9.819974104101198e-06,
428
+ "loss": 0.7724,
429
  "step": 295
430
  },
431
  {
432
  "epoch": 0.1806140878988561,
433
+ "grad_norm": 1.1855967777933063,
434
  "learning_rate": 9.80572844268256e-06,
435
+ "loss": 0.7645,
436
  "step": 300
437
  },
438
  {
439
  "epoch": 0.18362432269717038,
440
+ "grad_norm": 1.1944671571121344,
441
  "learning_rate": 9.790951537573686e-06,
442
  "loss": 0.7819,
443
  "step": 305
444
  },
445
  {
446
  "epoch": 0.18663455749548466,
447
+ "grad_norm": 1.1965936475386203,
448
  "learning_rate": 9.775645022270448e-06,
449
+ "loss": 0.778,
450
  "step": 310
451
  },
452
  {
453
  "epoch": 0.1896447922937989,
454
+ "grad_norm": 1.256280786074695,
455
  "learning_rate": 9.759810588813872e-06,
456
+ "loss": 0.774,
457
  "step": 315
458
  },
459
  {
460
  "epoch": 0.19265502709211318,
461
+ "grad_norm": 1.3027821404071966,
462
  "learning_rate": 9.743449987603082e-06,
463
+ "loss": 0.7682,
464
  "step": 320
465
  },
466
  {
467
  "epoch": 0.19566526189042746,
468
+ "grad_norm": 1.2406739802662574,
469
  "learning_rate": 9.726565027201813e-06,
470
+ "loss": 0.788,
471
  "step": 325
472
  },
473
  {
474
  "epoch": 0.1986754966887417,
475
+ "grad_norm": 1.1751549969003265,
476
  "learning_rate": 9.70915757413847e-06,
477
+ "loss": 0.7815,
478
  "step": 330
479
  },
480
  {
481
  "epoch": 0.20168573148705599,
482
+ "grad_norm": 1.1964609554923433,
483
  "learning_rate": 9.691229552699817e-06,
484
+ "loss": 0.7615,
485
  "step": 335
486
  },
487
  {
488
  "epoch": 0.20469596628537026,
489
+ "grad_norm": 1.2774212868740151,
490
  "learning_rate": 9.672782944718234e-06,
491
+ "loss": 0.7677,
492
  "step": 340
493
  },
494
  {
495
  "epoch": 0.20770620108368454,
496
+ "grad_norm": 1.202397374963106,
497
  "learning_rate": 9.65381978935266e-06,
498
+ "loss": 0.7671,
499
  "step": 345
500
  },
501
  {
502
  "epoch": 0.2107164358819988,
503
+ "grad_norm": 1.2159818292840359,
504
  "learning_rate": 9.634342182863163e-06,
505
+ "loss": 0.7476,
506
  "step": 350
507
  },
508
  {
509
  "epoch": 0.21372667068031306,
510
+ "grad_norm": 1.194513717936444,
511
  "learning_rate": 9.614352278379217e-06,
512
+ "loss": 0.7788,
513
  "step": 355
514
  },
515
  {
516
  "epoch": 0.21673690547862734,
517
+ "grad_norm": 1.2060005508903568,
518
  "learning_rate": 9.593852285661684e-06,
519
+ "loss": 0.7709,
520
  "step": 360
521
  },
522
  {
523
  "epoch": 0.2197471402769416,
524
+ "grad_norm": 1.203716084337312,
525
  "learning_rate": 9.572844470858537e-06,
526
+ "loss": 0.7584,
527
  "step": 365
528
  },
529
  {
530
  "epoch": 0.22275737507525586,
531
+ "grad_norm": 1.282647241669985,
532
  "learning_rate": 9.551331156254358e-06,
533
+ "loss": 0.7696,
534
  "step": 370
535
  },
536
  {
537
  "epoch": 0.22576760987357014,
538
+ "grad_norm": 1.2009722082380498,
539
  "learning_rate": 9.529314720013618e-06,
540
+ "loss": 0.7526,
541
  "step": 375
542
  },
543
  {
544
  "epoch": 0.22877784467188442,
545
+ "grad_norm": 1.2201198977296086,
546
  "learning_rate": 9.506797595917787e-06,
547
+ "loss": 0.7511,
548
  "step": 380
549
  },
550
  {
551
  "epoch": 0.23178807947019867,
552
+ "grad_norm": 1.3415385517504728,
553
  "learning_rate": 9.483782273096295e-06,
554
+ "loss": 0.7513,
555
  "step": 385
556
  },
557
  {
558
  "epoch": 0.23479831426851294,
559
+ "grad_norm": 1.26924665240572,
560
  "learning_rate": 9.460271295751373e-06,
561
+ "loss": 0.7649,
562
  "step": 390
563
  },
564
  {
565
  "epoch": 0.23780854906682722,
566
+ "grad_norm": 1.2785361560349413,
567
  "learning_rate": 9.436267262876808e-06,
568
+ "loss": 0.751,
569
  "step": 395
570
  },
571
  {
572
  "epoch": 0.2408187838651415,
573
+ "grad_norm": 1.1852007444354873,
574
  "learning_rate": 9.411772827970642e-06,
575
+ "loss": 0.7548,
576
  "step": 400
577
  },
578
  {
579
  "epoch": 0.24382901866345574,
580
+ "grad_norm": 1.1910921737694808,
581
  "learning_rate": 9.38679069874184e-06,
582
+ "loss": 0.7507,
583
  "step": 405
584
  },
585
  {
586
  "epoch": 0.24683925346177002,
587
+ "grad_norm": 1.3053941058405514,
588
  "learning_rate": 9.36132363681097e-06,
589
+ "loss": 0.7557,
590
  "step": 410
591
  },
592
  {
593
  "epoch": 0.2498494882600843,
594
+ "grad_norm": 1.2465327608331673,
595
  "learning_rate": 9.335374457404928e-06,
596
+ "loss": 0.7649,
597
  "step": 415
598
  },
599
  {
600
  "epoch": 0.25285972305839854,
601
+ "grad_norm": 1.2813369476453156,
602
  "learning_rate": 9.308946029045726e-06,
603
+ "loss": 0.7325,
604
  "step": 420
605
  },
606
  {
607
  "epoch": 0.25586995785671285,
608
+ "grad_norm": 1.2539019797830948,
609
  "learning_rate": 9.282041273233402e-06,
610
+ "loss": 0.726,
611
  "step": 425
612
  },
613
  {
614
  "epoch": 0.2588801926550271,
615
+ "grad_norm": 1.2685847502715055,
616
  "learning_rate": 9.254663164123052e-06,
617
+ "loss": 0.7372,
618
  "step": 430
619
  },
620
  {
621
  "epoch": 0.26189042745334135,
622
+ "grad_norm": 1.2404250071973326,
623
  "learning_rate": 9.226814728196072e-06,
624
+ "loss": 0.7333,
625
  "step": 435
626
  },
627
  {
628
  "epoch": 0.26490066225165565,
629
+ "grad_norm": 1.206906014688777,
630
  "learning_rate": 9.198499043925591e-06,
631
+ "loss": 0.7305,
632
  "step": 440
633
  },
634
  {
635
  "epoch": 0.2679108970499699,
636
+ "grad_norm": 1.2568238232780307,
637
  "learning_rate": 9.169719241436162e-06,
638
+ "loss": 0.7437,
639
  "step": 445
640
  },
641
  {
642
  "epoch": 0.27092113184828415,
643
+ "grad_norm": 1.201047028691787,
644
  "learning_rate": 9.14047850215775e-06,
645
+ "loss": 0.7229,
646
  "step": 450
647
  },
648
  {
649
  "epoch": 0.27393136664659845,
650
+ "grad_norm": 1.1628890533112184,
651
  "learning_rate": 9.110780058474052e-06,
652
+ "loss": 0.7252,
653
  "step": 455
654
  },
655
  {
656
  "epoch": 0.2769416014449127,
657
+ "grad_norm": 1.222552632587458,
658
  "learning_rate": 9.080627193365155e-06,
659
+ "loss": 0.7348,
660
  "step": 460
661
  },
662
  {
663
  "epoch": 0.27995183624322695,
664
+ "grad_norm": 1.204712682838897,
665
  "learning_rate": 9.050023240044649e-06,
666
+ "loss": 0.7394,
667
  "step": 465
668
  },
669
  {
670
  "epoch": 0.28296207104154125,
671
+ "grad_norm": 1.1699974198146679,
672
  "learning_rate": 9.018971581591141e-06,
673
+ "loss": 0.7172,
674
  "step": 470
675
  },
676
  {
677
  "epoch": 0.2859723058398555,
678
+ "grad_norm": 1.2143354714842383,
679
  "learning_rate": 8.987475650574289e-06,
680
+ "loss": 0.7459,
681
  "step": 475
682
  },
683
  {
684
  "epoch": 0.2889825406381698,
685
+ "grad_norm": 1.2344181327548198,
686
  "learning_rate": 8.955538928675343e-06,
687
+ "loss": 0.7217,
688
  "step": 480
689
  },
690
  {
691
  "epoch": 0.29199277543648405,
692
+ "grad_norm": 1.2200821617384021,
693
  "learning_rate": 8.923164946302274e-06,
694
+ "loss": 0.7355,
695
  "step": 485
696
  },
697
  {
698
  "epoch": 0.2950030102347983,
699
+ "grad_norm": 1.2730755552957604,
700
  "learning_rate": 8.890357282199504e-06,
701
+ "loss": 0.7407,
702
  "step": 490
703
  },
704
  {
705
  "epoch": 0.2980132450331126,
706
+ "grad_norm": 1.326965523163219,
707
  "learning_rate": 8.857119563052301e-06,
708
+ "loss": 0.7112,
709
  "step": 495
710
  },
711
  {
712
  "epoch": 0.30102347983142685,
713
+ "grad_norm": 1.2881202459959005,
714
  "learning_rate": 8.823455463085873e-06,
715
+ "loss": 0.7299,
716
  "step": 500
717
  },
718
  {
719
  "epoch": 0.3040337146297411,
720
+ "grad_norm": 1.3023027317084608,
721
  "learning_rate": 8.789368703659199e-06,
722
+ "loss": 0.7291,
723
  "step": 505
724
  },
725
  {
726
  "epoch": 0.3070439494280554,
727
+ "grad_norm": 1.192827763147033,
728
  "learning_rate": 8.754863052853658e-06,
729
+ "loss": 0.7242,
730
  "step": 510
731
  },
732
  {
733
  "epoch": 0.31005418422636966,
734
+ "grad_norm": 1.172549917556519,
735
  "learning_rate": 8.719942325056496e-06,
736
+ "loss": 0.707,
737
  "step": 515
738
  },
739
  {
740
  "epoch": 0.3130644190246839,
741
+ "grad_norm": 1.1750104521638358,
742
  "learning_rate": 8.68461038053916e-06,
743
+ "loss": 0.7152,
744
  "step": 520
745
  },
746
  {
747
  "epoch": 0.3160746538229982,
748
+ "grad_norm": 1.3380189595946483,
749
  "learning_rate": 8.648871125030576e-06,
750
+ "loss": 0.711,
751
  "step": 525
752
  },
753
  {
754
  "epoch": 0.31908488862131246,
755
+ "grad_norm": 1.226348488844478,
756
  "learning_rate": 8.612728509285395e-06,
757
+ "loss": 0.7104,
758
  "step": 530
759
  },
760
  {
761
  "epoch": 0.3220951234196267,
762
+ "grad_norm": 1.245273272201369,
763
  "learning_rate": 8.576186528647253e-06,
764
+ "loss": 0.7286,
765
  "step": 535
766
  },
767
  {
768
  "epoch": 0.325105358217941,
769
+ "grad_norm": 1.1790288496340813,
770
  "learning_rate": 8.53924922260712e-06,
771
+ "loss": 0.7075,
772
  "step": 540
773
  },
774
  {
775
  "epoch": 0.32811559301625526,
776
+ "grad_norm": 1.1914412291760281,
777
  "learning_rate": 8.501920674356755e-06,
778
+ "loss": 0.7008,
779
  "step": 545
780
  },
781
  {
782
  "epoch": 0.33112582781456956,
783
+ "grad_norm": 1.1806518773814458,
784
  "learning_rate": 8.46420501033733e-06,
785
+ "loss": 0.6934,
786
  "step": 550
787
  },
788
  {
789
  "epoch": 0.3341360626128838,
790
+ "grad_norm": 1.2723595572649768,
791
  "learning_rate": 8.42610639978329e-06,
792
+ "loss": 0.7019,
793
  "step": 555
794
  },
795
  {
796
  "epoch": 0.33714629741119806,
797
+ "grad_norm": 1.2225157243802531,
798
  "learning_rate": 8.387629054261454e-06,
799
+ "loss": 0.7002,
800
  "step": 560
801
  },
802
  {
803
  "epoch": 0.34015653220951236,
804
+ "grad_norm": 1.2003267518231666,
805
  "learning_rate": 8.348777227205462e-06,
806
+ "loss": 0.6984,
807
  "step": 565
808
  },
809
  {
810
  "epoch": 0.3431667670078266,
811
+ "grad_norm": 1.1755635510106648,
812
  "learning_rate": 8.309555213445583e-06,
813
+ "loss": 0.7123,
814
  "step": 570
815
  },
816
  {
817
  "epoch": 0.34617700180614086,
818
+ "grad_norm": 1.2065493450135722,
819
  "learning_rate": 8.269967348733947e-06,
820
+ "loss": 0.6855,
821
  "step": 575
822
  },
823
  {
824
  "epoch": 0.34918723660445516,
825
+ "grad_norm": 1.2455393618923039,
826
  "learning_rate": 8.230018009265255e-06,
827
+ "loss": 0.6971,
828
  "step": 580
829
  },
830
  {
831
  "epoch": 0.3521974714027694,
832
+ "grad_norm": 1.3331603991829393,
833
  "learning_rate": 8.189711611193012e-06,
834
+ "loss": 0.6985,
835
  "step": 585
836
  },
837
  {
838
  "epoch": 0.35520770620108366,
839
+ "grad_norm": 1.221254992942756,
840
  "learning_rate": 8.149052610141357e-06,
841
+ "loss": 0.6843,
842
  "step": 590
843
  },
844
  {
845
  "epoch": 0.35821794099939797,
846
+ "grad_norm": 1.2577154855191386,
847
  "learning_rate": 8.108045500712518e-06,
848
+ "loss": 0.7015,
849
  "step": 595
850
  },
851
  {
852
  "epoch": 0.3612281757977122,
853
+ "grad_norm": 1.2678823261096912,
854
  "learning_rate": 8.066694815989961e-06,
855
+ "loss": 0.7054,
856
  "step": 600
857
  },
858
  {
859
  "epoch": 0.36423841059602646,
860
+ "grad_norm": 1.3068755574009954,
861
  "learning_rate": 8.025005127037282e-06,
862
+ "loss": 0.6909,
863
  "step": 605
864
  },
865
  {
866
  "epoch": 0.36724864539434077,
867
+ "grad_norm": 1.25345913435044,
868
  "learning_rate": 7.982981042392907e-06,
869
+ "loss": 0.6804,
870
  "step": 610
871
  },
872
  {
873
  "epoch": 0.370258880192655,
874
+ "grad_norm": 1.2171026231986553,
875
  "learning_rate": 7.940627207560655e-06,
876
+ "loss": 0.6792,
877
  "step": 615
878
  },
879
  {
880
  "epoch": 0.3732691149909693,
881
+ "grad_norm": 1.1878268346505996,
882
  "learning_rate": 7.897948304496189e-06,
883
+ "loss": 0.7002,
884
  "step": 620
885
  },
886
  {
887
  "epoch": 0.37627934978928357,
888
+ "grad_norm": 1.2021282337780814,
889
  "learning_rate": 7.854949051089467e-06,
890
+ "loss": 0.7026,
891
  "step": 625
892
  },
893
  {
894
  "epoch": 0.3792895845875978,
895
+ "grad_norm": 1.219850402255314,
896
  "learning_rate": 7.811634200643202e-06,
897
+ "loss": 0.7081,
898
  "step": 630
899
  },
900
  {
901
  "epoch": 0.3822998193859121,
902
+ "grad_norm": 1.25066752923423,
903
  "learning_rate": 7.768008541347423e-06,
904
+ "loss": 0.6709,
905
  "step": 635
906
  },
907
  {
908
  "epoch": 0.38531005418422637,
909
+ "grad_norm": 1.233821739163781,
910
  "learning_rate": 7.72407689575016e-06,
911
+ "loss": 0.6821,
912
  "step": 640
913
  },
914
  {
915
  "epoch": 0.3883202889825406,
916
+ "grad_norm": 1.2400488427811083,
917
  "learning_rate": 7.67984412022434e-06,
918
+ "loss": 0.6797,
919
  "step": 645
920
  },
921
  {
922
  "epoch": 0.3913305237808549,
923
+ "grad_norm": 1.2744104516354757,
924
  "learning_rate": 7.635315104430959e-06,
925
+ "loss": 0.6785,
926
  "step": 650
927
  },
928
  {
929
  "epoch": 0.39434075857916917,
930
+ "grad_norm": 1.264910637088745,
931
  "learning_rate": 7.5904947707785434e-06,
932
+ "loss": 0.649,
933
  "step": 655
934
  },
935
  {
936
  "epoch": 0.3973509933774834,
937
+ "grad_norm": 1.2672765451192602,
938
  "learning_rate": 7.545388073879018e-06,
939
+ "loss": 0.6906,
940
  "step": 660
941
  },
942
  {
943
  "epoch": 0.4003612281757977,
944
+ "grad_norm": 1.2685599345911318,
945
  "learning_rate": 7.500000000000001e-06,
946
+ "loss": 0.6671,
947
  "step": 665
948
  },
949
  {
950
  "epoch": 0.40337146297411197,
951
+ "grad_norm": 1.2358020853032294,
952
  "learning_rate": 7.454335566513603e-06,
953
+ "loss": 0.6775,
954
  "step": 670
955
  },
956
  {
957
  "epoch": 0.4063816977724263,
958
+ "grad_norm": 1.316177091586594,
959
  "learning_rate": 7.408399821341787e-06,
960
+ "loss": 0.679,
961
  "step": 675
962
  },
963
  {
964
  "epoch": 0.4093919325707405,
965
+ "grad_norm": 1.1839753807419882,
966
  "learning_rate": 7.362197842398355e-06,
967
+ "loss": 0.6712,
968
  "step": 680
969
  },
970
  {
971
  "epoch": 0.4124021673690548,
972
+ "grad_norm": 1.2764557732073194,
973
  "learning_rate": 7.315734737027612e-06,
974
+ "loss": 0.6719,
975
  "step": 685
976
  },
977
  {
978
  "epoch": 0.4154124021673691,
979
+ "grad_norm": 1.2259459384980307,
980
  "learning_rate": 7.2690156414397775e-06,
981
+ "loss": 0.657,
982
  "step": 690
983
  },
984
  {
985
  "epoch": 0.4184226369656833,
986
+ "grad_norm": 1.2344897988837857,
987
  "learning_rate": 7.22204572014322e-06,
988
+ "loss": 0.6553,
989
  "step": 695
990
  },
991
  {
992
  "epoch": 0.4214328717639976,
993
+ "grad_norm": 1.2553454967603142,
994
  "learning_rate": 7.174830165373542e-06,
995
+ "loss": 0.6602,
996
  "step": 700
997
  },
998
  {
999
  "epoch": 0.4244431065623119,
1000
+ "grad_norm": 1.2370627727074957,
1001
  "learning_rate": 7.127374196519616e-06,
1002
+ "loss": 0.6566,
1003
  "step": 705
1004
  },
1005
  {
1006
  "epoch": 0.4274533413606261,
1007
+ "grad_norm": 1.23827568170562,
1008
  "learning_rate": 7.079683059546607e-06,
1009
+ "loss": 0.6566,
1010
  "step": 710
1011
  },
1012
  {
1013
  "epoch": 0.4304635761589404,
1014
+ "grad_norm": 1.2717726441044983,
1015
  "learning_rate": 7.031762026416074e-06,
1016
+ "loss": 0.6523,
1017
  "step": 715
1018
  },
1019
  {
1020
  "epoch": 0.4334738109572547,
1021
+ "grad_norm": 1.2414138030074853,
1022
  "learning_rate": 6.983616394503177e-06,
1023
+ "loss": 0.668,
1024
  "step": 720
1025
  },
1026
  {
1027
  "epoch": 0.43648404575556893,
1028
+ "grad_norm": 1.2863979258950011,
1029
  "learning_rate": 6.9352514860110876e-06,
1030
+ "loss": 0.6723,
1031
  "step": 725
1032
  },
1033
  {
1034
  "epoch": 0.4394942805538832,
1035
+ "grad_norm": 1.1998975466871244,
1036
  "learning_rate": 6.886672647382653e-06,
1037
+ "loss": 0.6496,
1038
  "step": 730
1039
  },
1040
  {
1041
  "epoch": 0.4425045153521975,
1042
+ "grad_norm": 1.2253729409328844,
1043
  "learning_rate": 6.837885248709386e-06,
1044
+ "loss": 0.6837,
1045
  "step": 735
1046
  },
1047
  {
1048
  "epoch": 0.44551475015051173,
1049
+ "grad_norm": 1.3977058259662651,
1050
  "learning_rate": 6.788894683137822e-06,
1051
+ "loss": 0.6535,
1052
  "step": 740
1053
  },
1054
  {
1055
  "epoch": 0.44852498494882603,
1056
+ "grad_norm": 1.2252881439154193,
1057
  "learning_rate": 6.739706366273346e-06,
1058
+ "loss": 0.6668,
1059
  "step": 745
1060
  },
1061
  {
1062
  "epoch": 0.4515352197471403,
1063
+ "grad_norm": 1.2685171799824646,
1064
  "learning_rate": 6.690325735581532e-06,
1065
+ "loss": 0.6408,
1066
  "step": 750
1067
  },
1068
  {
1069
  "epoch": 0.45454545454545453,
1070
+ "grad_norm": 1.2559367914781316,
1071
  "learning_rate": 6.640758249787067e-06,
1072
+ "loss": 0.6551,
1073
  "step": 755
1074
  },
1075
  {
1076
  "epoch": 0.45755568934376883,
1077
+ "grad_norm": 1.3414553027680045,
1078
  "learning_rate": 6.591009388270315e-06,
1079
+ "loss": 0.6483,
1080
  "step": 760
1081
  },
1082
  {
1083
  "epoch": 0.4605659241420831,
1084
+ "grad_norm": 1.2833745957368083,
1085
  "learning_rate": 6.54108465046161e-06,
1086
+ "loss": 0.6591,
1087
  "step": 765
1088
  },
1089
  {
1090
  "epoch": 0.46357615894039733,
1091
+ "grad_norm": 1.2801896169833944,
1092
  "learning_rate": 6.490989555233328e-06,
1093
+ "loss": 0.6481,
1094
  "step": 770
1095
  },
1096
  {
1097
  "epoch": 0.46658639373871164,
1098
+ "grad_norm": 1.2982217970316745,
1099
  "learning_rate": 6.440729640289809e-06,
1100
+ "loss": 0.6445,
1101
  "step": 775
1102
  },
1103
  {
1104
  "epoch": 0.4695966285370259,
1105
+ "grad_norm": 1.2399362763796928,
1106
  "learning_rate": 6.3903104615551956e-06,
1107
+ "loss": 0.6428,
1108
  "step": 780
1109
  },
1110
  {
1111
  "epoch": 0.47260686333534013,
1112
+ "grad_norm": 1.252116968840507,
1113
  "learning_rate": 6.3397375925592675e-06,
1114
+ "loss": 0.621,
1115
  "step": 785
1116
  },
1117
  {
1118
  "epoch": 0.47561709813365444,
1119
+ "grad_norm": 1.1634766201317972,
1120
  "learning_rate": 6.289016623821308e-06,
1121
+ "loss": 0.6396,
1122
  "step": 790
1123
  },
1124
  {
1125
  "epoch": 0.4786273329319687,
1126
+ "grad_norm": 1.2089828599308985,
1127
  "learning_rate": 6.2381531622321234e-06,
1128
+ "loss": 0.6429,
1129
  "step": 795
1130
  },
1131
  {
1132
  "epoch": 0.481637567730283,
1133
+ "grad_norm": 1.2611778575722072,
1134
  "learning_rate": 6.18715283043422e-06,
1135
+ "loss": 0.6282,
1136
  "step": 800
1137
  },
1138
  {
1139
  "epoch": 0.48464780252859724,
1140
+ "grad_norm": 1.2634822793060227,
1141
  "learning_rate": 6.136021266200271e-06,
1142
+ "loss": 0.6472,
1143
  "step": 805
1144
  },
1145
  {
1146
  "epoch": 0.4876580373269115,
1147
+ "grad_norm": 1.2731517375276795,
1148
  "learning_rate": 6.084764121809878e-06,
1149
+ "loss": 0.6187,
1150
  "step": 810
1151
  },
1152
  {
1153
  "epoch": 0.4906682721252258,
1154
+ "grad_norm": 1.2681414747392186,
1155
  "learning_rate": 6.033387063424765e-06,
1156
+ "loss": 0.6318,
1157
  "step": 815
1158
  },
1159
  {
1160
  "epoch": 0.49367850692354004,
1161
+ "grad_norm": 1.2152691698415932,
1162
  "learning_rate": 5.9818957704624046e-06,
1163
+ "loss": 0.6442,
1164
  "step": 820
1165
  },
1166
  {
1167
  "epoch": 0.4966887417218543,
1168
+ "grad_norm": 1.2163481834250993,
1169
  "learning_rate": 5.930295934968197e-06,
1170
+ "loss": 0.633,
1171
  "step": 825
1172
  },
1173
  {
1174
  "epoch": 0.4996989765201686,
1175
+ "grad_norm": 1.2644071741742902,
1176
  "learning_rate": 5.878593260986256e-06,
1177
+ "loss": 0.6333,
1178
  "step": 830
1179
  },
1180
  {
1181
  "epoch": 0.5027092113184829,
1182
+ "grad_norm": 1.2998551781398153,
1183
  "learning_rate": 5.8267934639288525e-06,
1184
+ "loss": 0.6377,
1185
  "step": 835
1186
  },
1187
  {
1188
  "epoch": 0.5057194461167971,
1189
+ "grad_norm": 1.2747330171688573,
1190
  "learning_rate": 5.77490226994462e-06,
1191
+ "loss": 0.6385,
1192
  "step": 840
1193
  },
1194
  {
1195
  "epoch": 0.5087296809151114,
1196
+ "grad_norm": 1.2147166700112426,
1197
  "learning_rate": 5.722925415285555e-06,
1198
+ "loss": 0.6335,
1199
  "step": 845
1200
  },
1201
  {
1202
  "epoch": 0.5117399157134257,
1203
+ "grad_norm": 1.2814771734916943,
1204
  "learning_rate": 5.670868645672916e-06,
1205
+ "loss": 0.6316,
1206
  "step": 850
1207
  },
1208
  {
1209
  "epoch": 0.5147501505117399,
1210
+ "grad_norm": 1.2118662457570313,
1211
  "learning_rate": 5.618737715662067e-06,
1212
+ "loss": 0.6219,
1213
  "step": 855
1214
  },
1215
  {
1216
  "epoch": 0.5177603853100542,
1217
+ "grad_norm": 1.2322127749127771,
1218
  "learning_rate": 5.566538388006351e-06,
1219
+ "loss": 0.623,
1220
  "step": 860
1221
  },
1222
  {
1223
  "epoch": 0.5207706201083685,
1224
+ "grad_norm": 1.1991462675329747,
1225
  "learning_rate": 5.514276433020044e-06,
1226
+ "loss": 0.6107,
1227
  "step": 865
1228
  },
1229
  {
1230
  "epoch": 0.5237808549066827,
1231
+ "grad_norm": 1.2149020805519655,
1232
  "learning_rate": 5.461957627940489e-06,
1233
+ "loss": 0.6191,
1234
  "step": 870
1235
  },
1236
  {
1237
  "epoch": 0.526791089704997,
1238
+ "grad_norm": 1.1877405672596075,
1239
  "learning_rate": 5.409587756289462e-06,
1240
+ "loss": 0.6112,
1241
  "step": 875
1242
  },
1243
  {
1244
  "epoch": 0.5298013245033113,
1245
+ "grad_norm": 1.1970060104579194,
1246
  "learning_rate": 5.357172607233831e-06,
1247
+ "loss": 0.6284,
1248
  "step": 880
1249
  },
1250
  {
1251
  "epoch": 0.5328115593016255,
1252
+ "grad_norm": 1.368716689528383,
1253
  "learning_rate": 5.304717974945596e-06,
1254
+ "loss": 0.6163,
1255
  "step": 885
1256
  },
1257
  {
1258
  "epoch": 0.5358217940999398,
1259
+ "grad_norm": 1.273293450405195,
1260
  "learning_rate": 5.252229657961394e-06,
1261
+ "loss": 0.6214,
1262
  "step": 890
1263
  },
1264
  {
1265
  "epoch": 0.5388320288982541,
1266
+ "grad_norm": 1.196878746141804,
1267
  "learning_rate": 5.199713458541495e-06,
1268
+ "loss": 0.6247,
1269
  "step": 895
1270
  },
1271
  {
1272
  "epoch": 0.5418422636965683,
1273
+ "grad_norm": 1.2084713164747305,
1274
  "learning_rate": 5.1471751820284e-06,
1275
+ "loss": 0.6103,
1276
  "step": 900
1277
  },
1278
  {
1279
  "epoch": 0.5448524984948826,
1280
+ "grad_norm": 1.2196639084600536,
1281
  "learning_rate": 5.094620636205096e-06,
1282
+ "loss": 0.6221,
1283
  "step": 905
1284
  },
1285
  {
1286
  "epoch": 0.5478627332931969,
1287
+ "grad_norm": 1.1988440703906922,
1288
  "learning_rate": 5.042055630653042e-06,
1289
+ "loss": 0.6085,
1290
  "step": 910
1291
  },
1292
  {
1293
  "epoch": 0.5508729680915111,
1294
+ "grad_norm": 1.26407878369998,
1295
  "learning_rate": 4.98948597610996e-06,
1296
+ "loss": 0.6114,
1297
  "step": 915
1298
  },
1299
  {
1300
  "epoch": 0.5538832028898254,
1301
+ "grad_norm": 1.2541252408565282,
1302
  "learning_rate": 4.936917483827483e-06,
1303
+ "loss": 0.6098,
1304
  "step": 920
1305
  },
1306
  {
1307
  "epoch": 0.5568934376881397,
1308
+ "grad_norm": 1.2461032064160447,
1309
  "learning_rate": 4.884355964928767e-06,
1310
+ "loss": 0.6288,
1311
  "step": 925
1312
  },
1313
  {
1314
  "epoch": 0.5599036724864539,
1315
+ "grad_norm": 1.1876409214621886,
1316
  "learning_rate": 4.831807229766101e-06,
1317
+ "loss": 0.5967,
1318
  "step": 930
1319
  },
1320
  {
1321
  "epoch": 0.5629139072847682,
1322
+ "grad_norm": 1.3653959531629782,
1323
  "learning_rate": 4.779277087278615e-06,
1324
+ "loss": 0.5981,
1325
  "step": 935
1326
  },
1327
  {
1328
  "epoch": 0.5659241420830825,
1329
+ "grad_norm": 1.259758029782513,
1330
  "learning_rate": 4.7267713443501274e-06,
1331
+ "loss": 0.6208,
1332
  "step": 940
1333
  },
1334
  {
1335
  "epoch": 0.5689343768813967,
1336
+ "grad_norm": 1.2512441621206507,
1337
  "learning_rate": 4.67429580516724e-06,
1338
+ "loss": 0.6036,
1339
  "step": 945
1340
  },
1341
  {
1342
  "epoch": 0.571944611679711,
1343
+ "grad_norm": 1.2148856832751866,
1344
  "learning_rate": 4.6218562705777185e-06,
1345
+ "loss": 0.5918,
1346
  "step": 950
1347
  },
1348
  {
1349
  "epoch": 0.5749548464780253,
1350
+ "grad_norm": 1.2618875581171025,
1351
  "learning_rate": 4.5694585374492314e-06,
1352
+ "loss": 0.5956,
1353
  "step": 955
1354
  },
1355
  {
1356
  "epoch": 0.5779650812763396,
1357
+ "grad_norm": 1.2405407377914555,
1358
  "learning_rate": 4.517108398028566e-06,
1359
+ "loss": 0.6005,
1360
  "step": 960
1361
  },
1362
  {
1363
  "epoch": 0.5809753160746538,
1364
+ "grad_norm": 1.2775566642146432,
1365
  "learning_rate": 4.464811639301314e-06,
1366
+ "loss": 0.589,
1367
  "step": 965
1368
  },
1369
  {
1370
  "epoch": 0.5839855508729681,
1371
+ "grad_norm": 1.3117899815699137,
1372
  "learning_rate": 4.412574042352156e-06,
1373
+ "loss": 0.5979,
1374
  "step": 970
1375
  },
1376
  {
1377
  "epoch": 0.5869957856712824,
1378
+ "grad_norm": 1.269362160884011,
1379
  "learning_rate": 4.360401381725806e-06,
1380
+ "loss": 0.5887,
1381
  "step": 975
1382
  },
1383
  {
1384
  "epoch": 0.5900060204695966,
1385
+ "grad_norm": 1.248523016364606,
1386
  "learning_rate": 4.308299424788667e-06,
1387
+ "loss": 0.6022,
1388
  "step": 980
1389
  },
1390
  {
1391
  "epoch": 0.5930162552679109,
1392
+ "grad_norm": 1.2366195530748443,
1393
  "learning_rate": 4.256273931091284e-06,
1394
+ "loss": 0.6,
1395
  "step": 985
1396
  },
1397
  {
1398
  "epoch": 0.5960264900662252,
1399
+ "grad_norm": 1.222474401781831,
1400
  "learning_rate": 4.204330651731662e-06,
1401
+ "loss": 0.5871,
1402
  "step": 990
1403
  },
1404
  {
1405
  "epoch": 0.5990367248645394,
1406
+ "grad_norm": 1.2424032915235261,
1407
  "learning_rate": 4.152475328719517e-06,
1408
+ "loss": 0.5783,
1409
  "step": 995
1410
  },
1411
  {
1412
  "epoch": 0.6020469596628537,
1413
+ "grad_norm": 1.2570025380680472,
1414
  "learning_rate": 4.1007136943415325e-06,
1415
+ "loss": 0.5636,
1416
  "step": 1000
1417
  },
1418
  {
1419
  "epoch": 0.605057194461168,
1420
+ "grad_norm": 1.2170323945106316,
1421
  "learning_rate": 4.049051470527692e-06,
1422
+ "loss": 0.601,
1423
  "step": 1005
1424
  },
1425
  {
1426
  "epoch": 0.6080674292594822,
1427
+ "grad_norm": 1.1958865931929052,
1428
  "learning_rate": 3.997494368218745e-06,
1429
+ "loss": 0.5878,
1430
  "step": 1010
1431
  },
1432
  {
1433
  "epoch": 0.6110776640577965,
1434
+ "grad_norm": 1.3367291697694699,
1435
  "learning_rate": 3.946048086734921e-06,
1436
+ "loss": 0.5924,
1437
  "step": 1015
1438
  },
1439
  {
1440
  "epoch": 0.6140878988561108,
1441
+ "grad_norm": 1.2820724684669875,
1442
  "learning_rate": 3.894718313145873e-06,
1443
+ "loss": 0.5847,
1444
  "step": 1020
1445
  },
1446
  {
1447
  "epoch": 0.617098133654425,
1448
+ "grad_norm": 1.2719469697058394,
1449
  "learning_rate": 3.843510721642036e-06,
1450
+ "loss": 0.5744,
1451
  "step": 1025
1452
  },
1453
  {
1454
  "epoch": 0.6201083684527393,
1455
+ "grad_norm": 1.2274652244842972,
1456
  "learning_rate": 3.7924309729073616e-06,
1457
+ "loss": 0.5751,
1458
  "step": 1030
1459
  },
1460
  {
1461
  "epoch": 0.6231186032510536,
1462
+ "grad_norm": 1.3030807337065518,
1463
  "learning_rate": 3.7414847134935716e-06,
1464
+ "loss": 0.5899,
1465
  "step": 1035
1466
  },
1467
  {
1468
  "epoch": 0.6261288380493678,
1469
+ "grad_norm": 1.1948808509249107,
1470
  "learning_rate": 3.6906775751959667e-06,
1471
+ "loss": 0.5755,
1472
  "step": 1040
1473
  },
1474
  {
1475
  "epoch": 0.6291390728476821,
1476
+ "grad_norm": 1.2576164970306511,
1477
  "learning_rate": 3.640015174430864e-06,
1478
+ "loss": 0.5682,
1479
  "step": 1045
1480
  },
1481
  {
1482
  "epoch": 0.6321493076459964,
1483
+ "grad_norm": 1.2585913656336474,
1484
  "learning_rate": 3.5895031116147355e-06,
1485
+ "loss": 0.5791,
1486
  "step": 1050
1487
  },
1488
  {
1489
  "epoch": 0.6351595424443106,
1490
+ "grad_norm": 1.303783858936202,
1491
  "learning_rate": 3.539146970545124e-06,
1492
+ "loss": 0.5789,
1493
  "step": 1055
1494
  },
1495
  {
1496
  "epoch": 0.6381697772426249,
1497
+ "grad_norm": 1.2996679794519685,
1498
  "learning_rate": 3.488952317783374e-06,
1499
+ "loss": 0.5669,
1500
  "step": 1060
1501
  },
1502
  {
1503
  "epoch": 0.6411800120409392,
1504
+ "grad_norm": 1.227591078819204,
1505
  "learning_rate": 3.438924702039301e-06,
1506
+ "loss": 0.5698,
1507
  "step": 1065
1508
  },
1509
  {
1510
  "epoch": 0.6441902468392534,
1511
+ "grad_norm": 1.2857752436654992,
1512
  "learning_rate": 3.389069653557805e-06,
1513
+ "loss": 0.5739,
1514
  "step": 1070
1515
  },
1516
  {
1517
  "epoch": 0.6472004816375677,
1518
+ "grad_norm": 1.2421495144623809,
1519
  "learning_rate": 3.3393926835075307e-06,
1520
+ "loss": 0.5788,
1521
  "step": 1075
1522
  },
1523
  {
1524
  "epoch": 0.650210716435882,
1525
+ "grad_norm": 1.3331817251764237,
1526
  "learning_rate": 3.289899283371657e-06,
1527
+ "loss": 0.5606,
1528
  "step": 1080
1529
  },
1530
  {
1531
  "epoch": 0.6532209512341962,
1532
+ "grad_norm": 1.2708137446575152,
1533
  "learning_rate": 3.240594924340835e-06,
1534
+ "loss": 0.5615,
1535
  "step": 1085
1536
  },
1537
  {
1538
  "epoch": 0.6562311860325105,
1539
+ "grad_norm": 1.2640775164953746,
1540
  "learning_rate": 3.1914850567083866e-06,
1541
+ "loss": 0.565,
1542
  "step": 1090
1543
  },
1544
  {
1545
  "epoch": 0.6592414208308248,
1546
+ "grad_norm": 1.2780803430813366,
1547
  "learning_rate": 3.1425751092678064e-06,
1548
+ "loss": 0.5508,
1549
  "step": 1095
1550
  },
1551
  {
1552
  "epoch": 0.6622516556291391,
1553
+ "grad_norm": 1.2424354197819152,
1554
  "learning_rate": 3.0938704887126425e-06,
1555
+ "loss": 0.5541,
1556
  "step": 1100
1557
  },
1558
  {
1559
  "epoch": 0.6652618904274533,
1560
+ "grad_norm": 1.2085302041635138,
1561
  "learning_rate": 3.045376579038821e-06,
1562
+ "loss": 0.5507,
1563
  "step": 1105
1564
  },
1565
  {
1566
  "epoch": 0.6682721252257676,
1567
+ "grad_norm": 1.2913791921928985,
1568
  "learning_rate": 2.9970987409494784e-06,
1569
+ "loss": 0.5751,
1570
  "step": 1110
1571
  },
1572
  {
1573
  "epoch": 0.6712823600240819,
1574
+ "grad_norm": 1.2151986171427258,
1575
  "learning_rate": 2.9490423112623646e-06,
1576
+ "loss": 0.5545,
1577
  "step": 1115
1578
  },
1579
  {
1580
  "epoch": 0.6742925948223961,
1581
+ "grad_norm": 1.285546926147462,
1582
  "learning_rate": 2.9012126023198973e-06,
1583
+ "loss": 0.5559,
1584
  "step": 1120
1585
  },
1586
  {
1587
  "epoch": 0.6773028296207104,
1588
+ "grad_norm": 1.2516039295058423,
1589
  "learning_rate": 2.853614901401909e-06,
1590
+ "loss": 0.5492,
1591
  "step": 1125
1592
  },
1593
  {
1594
  "epoch": 0.6803130644190247,
1595
+ "grad_norm": 1.2619394657656795,
1596
  "learning_rate": 2.806254470141174e-06,
1597
+ "loss": 0.5535,
1598
  "step": 1130
1599
  },
1600
  {
1601
  "epoch": 0.6833232992173389,
1602
+ "grad_norm": 1.263160410510985,
1603
  "learning_rate": 2.759136543941773e-06,
1604
+ "loss": 0.5491,
1605
  "step": 1135
1606
  },
1607
  {
1608
  "epoch": 0.6863335340156532,
1609
+ "grad_norm": 1.2808685326776417,
1610
  "learning_rate": 2.712266331400332e-06,
1611
+ "loss": 0.5355,
1612
  "step": 1140
1613
  },
1614
  {
1615
  "epoch": 0.6893437688139675,
1616
+ "grad_norm": 1.2224213775798936,
1617
  "learning_rate": 2.66564901373027e-06,
1618
+ "loss": 0.5497,
1619
  "step": 1145
1620
  },
1621
  {
1622
  "epoch": 0.6923540036122817,
1623
+ "grad_norm": 1.2417187357822892,
1624
  "learning_rate": 2.6192897441890337e-06,
1625
+ "loss": 0.5395,
1626
  "step": 1150
1627
  },
1628
  {
1629
  "epoch": 0.695364238410596,
1630
+ "grad_norm": 1.2539889085403395,
1631
  "learning_rate": 2.573193647508426e-06,
1632
+ "loss": 0.5484,
1633
  "step": 1155
1634
  },
1635
  {
1636
  "epoch": 0.6983744732089103,
1637
+ "grad_norm": 1.277361909713358,
1638
  "learning_rate": 2.5273658193281252e-06,
1639
+ "loss": 0.5359,
1640
  "step": 1160
1641
  },
1642
  {
1643
  "epoch": 0.7013847080072245,
1644
+ "grad_norm": 1.2511254070143052,
1645
  "learning_rate": 2.4818113256323745e-06,
1646
+ "loss": 0.5549,
1647
  "step": 1165
1648
  },
1649
  {
1650
  "epoch": 0.7043949428055388,
1651
+ "grad_norm": 1.3131375870134938,
1652
  "learning_rate": 2.4365352021899635e-06,
1653
+ "loss": 0.5555,
1654
  "step": 1170
1655
  },
1656
  {
1657
  "epoch": 0.7074051776038531,
1658
+ "grad_norm": 1.2825609178954607,
1659
  "learning_rate": 2.391542453997578e-06,
1660
+ "loss": 0.5452,
1661
  "step": 1175
1662
  },
1663
  {
1664
  "epoch": 0.7104154124021673,
1665
+ "grad_norm": 1.2877459534486142,
1666
  "learning_rate": 2.346838054726505e-06,
1667
+ "loss": 0.5417,
1668
  "step": 1180
1669
  },
1670
  {
1671
  "epoch": 0.7134256472004816,
1672
+ "grad_norm": 1.2529996046706589,
1673
  "learning_rate": 2.302426946172836e-06,
1674
+ "loss": 0.54,
1675
  "step": 1185
1676
  },
1677
  {
1678
  "epoch": 0.7164358819987959,
1679
+ "grad_norm": 1.2558399980367174,
1680
  "learning_rate": 2.258314037711184e-06,
1681
+ "loss": 0.5484,
1682
  "step": 1190
1683
  },
1684
  {
1685
  "epoch": 0.7194461167971101,
1686
+ "grad_norm": 1.272832955666527,
1687
  "learning_rate": 2.214504205751971e-06,
1688
+ "loss": 0.5515,
1689
  "step": 1195
1690
  },
1691
  {
1692
  "epoch": 0.7224563515954244,
1693
+ "grad_norm": 1.2469393251594179,
1694
  "learning_rate": 2.1710022932023805e-06,
1695
+ "loss": 0.5519,
1696
  "step": 1200
1697
  },
1698
  {
1699
  "epoch": 0.7254665863937387,
1700
+ "grad_norm": 1.243045028807727,
1701
  "learning_rate": 2.127813108931007e-06,
1702
+ "loss": 0.5438,
1703
  "step": 1205
1704
  },
1705
  {
1706
  "epoch": 0.7284768211920529,
1707
+ "grad_norm": 1.2114723202770783,
1708
  "learning_rate": 2.084941427236245e-06,
1709
+ "loss": 0.5435,
1710
  "step": 1210
1711
  },
1712
  {
1713
  "epoch": 0.7314870559903672,
1714
+ "grad_norm": 1.2939587849010044,
1715
  "learning_rate": 2.04239198731855e-06,
1716
+ "loss": 0.5393,
1717
  "step": 1215
1718
  },
1719
  {
1720
  "epoch": 0.7344972907886815,
1721
+ "grad_norm": 1.2378436974378162,
1722
  "learning_rate": 2.000169492756523e-06,
1723
+ "loss": 0.5455,
1724
  "step": 1220
1725
  },
1726
  {
1727
  "epoch": 0.7375075255869958,
1728
+ "grad_norm": 1.2231608996738805,
1729
  "learning_rate": 1.9582786109869713e-06,
1730
+ "loss": 0.5421,
1731
  "step": 1225
1732
  },
1733
  {
1734
  "epoch": 0.74051776038531,
1735
+ "grad_norm": 1.3275912626430932,
1736
  "learning_rate": 1.9167239727889527e-06,
1737
+ "loss": 0.5361,
1738
  "step": 1230
1739
  },
1740
  {
1741
  "epoch": 0.7435279951836243,
1742
+ "grad_norm": 1.3041604998744816,
1743
  "learning_rate": 1.875510171771865e-06,
1744
+ "loss": 0.5331,
1745
  "step": 1235
1746
  },
1747
  {
1748
  "epoch": 0.7465382299819386,
1749
+ "grad_norm": 1.279896389842292,
1750
  "learning_rate": 1.8346417638676533e-06,
1751
+ "loss": 0.5286,
1752
  "step": 1240
1753
  },
1754
  {
1755
  "epoch": 0.7495484647802528,
1756
+ "grad_norm": 1.309403322072463,
1757
  "learning_rate": 1.7941232668271863e-06,
1758
+ "loss": 0.5479,
1759
  "step": 1245
1760
  },
1761
  {
1762
  "epoch": 0.7525586995785671,
1763
+ "grad_norm": 1.2455386392310572,
1764
  "learning_rate": 1.753959159720836e-06,
1765
+ "loss": 0.5353,
1766
  "step": 1250
1767
  },
1768
  {
1769
  "epoch": 0.7555689343768814,
1770
+ "grad_norm": 1.2637146932290357,
1771
  "learning_rate": 1.7141538824433506e-06,
1772
+ "loss": 0.5349,
1773
  "step": 1255
1774
  },
1775
  {
1776
  "epoch": 0.7585791691751956,
1777
+ "grad_norm": 1.258655302158318,
1778
  "learning_rate": 1.6747118352230495e-06,
1779
+ "loss": 0.538,
1780
  "step": 1260
1781
  },
1782
  {
1783
  "epoch": 0.7615894039735099,
1784
+ "grad_norm": 1.2756271345276968,
1785
  "learning_rate": 1.6356373781354058e-06,
1786
+ "loss": 0.5167,
1787
  "step": 1265
1788
  },
1789
  {
1790
  "epoch": 0.7645996387718242,
1791
+ "grad_norm": 1.2413970049455438,
1792
  "learning_rate": 1.5969348306210692e-06,
1793
+ "loss": 0.5309,
1794
  "step": 1270
1795
  },
1796
  {
1797
  "epoch": 0.7676098735701384,
1798
+ "grad_norm": 1.3035393545030223,
1799
  "learning_rate": 1.5586084710083737e-06,
1800
+ "loss": 0.5239,
1801
  "step": 1275
1802
  },
1803
  {
1804
  "epoch": 0.7706201083684527,
1805
+ "grad_norm": 1.2294394327286127,
1806
  "learning_rate": 1.5206625360403943e-06,
1807
+ "loss": 0.5305,
1808
  "step": 1280
1809
  },
1810
  {
1811
  "epoch": 0.773630343166767,
1812
+ "grad_norm": 1.2174444367598383,
1813
  "learning_rate": 1.4831012204066114e-06,
1814
+ "loss": 0.531,
1815
  "step": 1285
1816
  },
1817
  {
1818
  "epoch": 0.7766405779650812,
1819
+ "grad_norm": 1.2366691168478938,
1820
  "learning_rate": 1.445928676279199e-06,
1821
+ "loss": 0.518,
1822
  "step": 1290
1823
  },
1824
  {
1825
  "epoch": 0.7796508127633955,
1826
+ "grad_norm": 1.264298324101515,
1827
  "learning_rate": 1.4091490128540374e-06,
1828
+ "loss": 0.5068,
1829
  "step": 1295
1830
  },
1831
  {
1832
  "epoch": 0.7826610475617098,
1833
+ "grad_norm": 1.2332308291363379,
1834
  "learning_rate": 1.3727662958964627e-06,
1835
+ "loss": 0.5203,
1836
  "step": 1300
1837
  },
1838
  {
1839
  "epoch": 0.785671282360024,
1840
+ "grad_norm": 1.2299329633898903,
1841
  "learning_rate": 1.3367845472918272e-06,
1842
+ "loss": 0.5298,
1843
  "step": 1305
1844
  },
1845
  {
1846
  "epoch": 0.7886815171583383,
1847
+ "grad_norm": 1.2504340191558994,
1848
  "learning_rate": 1.3012077446008969e-06,
1849
+ "loss": 0.5248,
1850
  "step": 1310
1851
  },
1852
  {
1853
  "epoch": 0.7916917519566526,
1854
+ "grad_norm": 1.3049347135811968,
1855
  "learning_rate": 1.266039820620159e-06,
1856
+ "loss": 0.523,
1857
  "step": 1315
1858
  },
1859
  {
1860
  "epoch": 0.7947019867549668,
1861
+ "grad_norm": 1.2179046011678003,
1862
  "learning_rate": 1.2312846629470826e-06,
1863
+ "loss": 0.4974,
1864
  "step": 1320
1865
  },
1866
  {
1867
  "epoch": 0.7977122215532811,
1868
+ "grad_norm": 1.3164892931071699,
1869
  "learning_rate": 1.1969461135503573e-06,
1870
+ "loss": 0.526,
1871
  "step": 1325
1872
  },
1873
  {
1874
  "epoch": 0.8007224563515954,
1875
+ "grad_norm": 1.271375375821384,
1876
  "learning_rate": 1.163027968345195e-06,
1877
+ "loss": 0.5161,
1878
  "step": 1330
1879
  },
1880
  {
1881
  "epoch": 0.8037326911499096,
1882
+ "grad_norm": 1.3139938511991043,
1883
  "learning_rate": 1.1295339767737125e-06,
1884
+ "loss": 0.5113,
1885
  "step": 1335
1886
  },
1887
  {
1888
  "epoch": 0.8067429259482239,
1889
+ "grad_norm": 1.286007032144441,
1890
  "learning_rate": 1.0964678413904529e-06,
1891
+ "loss": 0.5217,
1892
  "step": 1340
1893
  },
1894
  {
1895
  "epoch": 0.8097531607465382,
1896
+ "grad_norm": 1.3030418958577936,
1897
  "learning_rate": 1.0638332174530953e-06,
1898
+ "loss": 0.5326,
1899
  "step": 1345
1900
  },
1901
  {
1902
  "epoch": 0.8127633955448526,
1903
+ "grad_norm": 1.2573877263232616,
1904
  "learning_rate": 1.0316337125183817e-06,
1905
+ "loss": 0.5172,
1906
  "step": 1350
1907
  },
1908
  {
1909
  "epoch": 0.8157736303431667,
1910
+ "grad_norm": 1.3008639038029295,
1911
  "learning_rate": 9.998728860433277e-07,
1912
  "loss": 0.512,
1913
  "step": 1355
1914
  },
1915
  {
1916
  "epoch": 0.818783865141481,
1917
+ "grad_norm": 1.2333298206728438,
1918
  "learning_rate": 9.685542489917494e-07,
1919
+ "loss": 0.5047,
1920
  "step": 1360
1921
  },
1922
  {
1923
  "epoch": 0.8217940999397954,
1924
+ "grad_norm": 1.289522737500026,
1925
  "learning_rate": 9.376812634461418e-07,
1926
+ "loss": 0.5104,
1927
  "step": 1365
1928
  },
1929
  {
1930
  "epoch": 0.8248043347381095,
1931
+ "grad_norm": 1.2659811011921933,
1932
  "learning_rate": 9.072573422249692e-07,
1933
+ "loss": 0.5232,
1934
  "step": 1370
1935
  },
1936
  {
1937
  "epoch": 0.8278145695364238,
1938
+ "grad_norm": 1.22881385464476,
1939
  "learning_rate": 8.772858485054042e-07,
1940
+ "loss": 0.5199,
1941
  "step": 1375
1942
  },
1943
  {
1944
  "epoch": 0.8308248043347382,
1945
+ "grad_norm": 1.2715826823700018,
1946
  "learning_rate": 8.477700954515372e-07,
1947
+ "loss": 0.5202,
1948
  "step": 1380
1949
  },
1950
  {
1951
  "epoch": 0.8338350391330523,
1952
+ "grad_norm": 1.2503742943128184,
1953
  "learning_rate": 8.187133458481416e-07,
1954
+ "loss": 0.5285,
1955
  "step": 1385
1956
  },
1957
  {
1958
  "epoch": 0.8368452739313667,
1959
+ "grad_norm": 1.2943224950483547,
1960
  "learning_rate": 7.901188117399817e-07,
1961
+ "loss": 0.5146,
1962
  "step": 1390
1963
  },
1964
  {
1965
  "epoch": 0.839855508729681,
1966
+ "grad_norm": 1.2465983369075755,
1967
  "learning_rate": 7.619896540767435e-07,
1968
+ "loss": 0.5214,
1969
  "step": 1395
1970
  },
1971
  {
1972
  "epoch": 0.8428657435279951,
1973
+ "grad_norm": 1.2931198121674863,
1974
  "learning_rate": 7.343289823636168e-07,
1975
+ "loss": 0.5103,
1976
  "step": 1400
1977
  },
1978
  {
1979
  "epoch": 0.8458759783263095,
1980
+ "grad_norm": 1.3126713125625864,
1981
  "learning_rate": 7.0713985431755e-07,
1982
+ "loss": 0.5131,
1983
  "step": 1405
1984
  },
1985
  {
1986
  "epoch": 0.8488862131246238,
1987
+ "grad_norm": 1.2514737845776198,
1988
  "learning_rate": 6.804252755292429e-07,
1989
+ "loss": 0.4972,
1990
  "step": 1410
1991
  },
1992
  {
1993
  "epoch": 0.851896447922938,
1994
+ "grad_norm": 1.2888999488065467,
1995
  "learning_rate": 6.541881991309013e-07,
1996
+ "loss": 0.5086,
1997
  "step": 1415
1998
  },
1999
  {
2000
  "epoch": 0.8549066827212523,
2001
+ "grad_norm": 1.2929120768720084,
2002
  "learning_rate": 6.284315254697726e-07,
2003
+ "loss": 0.5131,
2004
  "step": 1420
2005
  },
2006
  {
2007
  "epoch": 0.8579169175195666,
2008
+ "grad_norm": 1.2348017732169416,
2009
  "learning_rate": 6.031581017875482e-07,
2010
+ "loss": 0.5147,
2011
  "step": 1425
2012
  },
2013
  {
2014
  "epoch": 0.8609271523178808,
2015
+ "grad_norm": 1.2685705494636663,
2016
  "learning_rate": 5.783707219056078e-07,
2017
+ "loss": 0.5092,
2018
  "step": 1430
2019
  },
2020
  {
2021
  "epoch": 0.863937387116195,
2022
+ "grad_norm": 1.2976102586906246,
2023
  "learning_rate": 5.540721259161774e-07,
2024
+ "loss": 0.5115,
2025
  "step": 1435
2026
  },
2027
  {
2028
  "epoch": 0.8669476219145094,
2029
+ "grad_norm": 1.250337429227757,
2030
  "learning_rate": 5.302649998794368e-07,
2031
+ "loss": 0.5169,
2032
  "step": 1440
2033
  },
2034
  {
2035
  "epoch": 0.8699578567128236,
2036
+ "grad_norm": 1.2461623782975149,
2037
  "learning_rate": 5.0695197552659e-07,
2038
+ "loss": 0.5078,
2039
  "step": 1445
2040
  },
2041
  {
2042
  "epoch": 0.8729680915111379,
2043
+ "grad_norm": 1.2782693319086738,
2044
  "learning_rate": 4.841356299689359e-07,
2045
+ "loss": 0.5157,
2046
  "step": 1450
2047
  },
2048
  {
2049
  "epoch": 0.8759783263094522,
2050
+ "grad_norm": 1.2663091436447071,
2051
  "learning_rate": 4.618184854129981e-07,
2052
+ "loss": 0.5094,
2053
  "step": 1455
2054
  },
2055
  {
2056
  "epoch": 0.8789885611077664,
2057
+ "grad_norm": 1.2889286164793572,
2058
  "learning_rate": 4.4000300888169753e-07,
2059
+ "loss": 0.5097,
2060
  "step": 1460
2061
  },
2062
  {
2063
  "epoch": 0.8819987959060807,
2064
+ "grad_norm": 1.270161020673374,
2065
  "learning_rate": 4.1869161194164565e-07,
2066
+ "loss": 0.5178,
2067
  "step": 1465
2068
  },
2069
  {
2070
  "epoch": 0.885009030704395,
2071
+ "grad_norm": 1.3028978532412478,
2072
  "learning_rate": 3.9788665043656083e-07,
2073
+ "loss": 0.5071,
2074
  "step": 1470
2075
  },
2076
  {
2077
  "epoch": 0.8880192655027093,
2078
+ "grad_norm": 1.2451090536200642,
2079
  "learning_rate": 3.775904242268391e-07,
2080
+ "loss": 0.5161,
2081
  "step": 1475
2082
  },
2083
  {
2084
  "epoch": 0.8910295003010235,
2085
+ "grad_norm": 1.3526909015751019,
2086
  "learning_rate": 3.578051769353219e-07,
2087
+ "loss": 0.5156,
2088
  "step": 1480
2089
  },
2090
  {
2091
  "epoch": 0.8940397350993378,
2092
+ "grad_norm": 1.349791093552624,
2093
  "learning_rate": 3.385330956992816e-07,
2094
+ "loss": 0.5163,
2095
  "step": 1485
2096
  },
2097
  {
2098
  "epoch": 0.8970499698976521,
2099
+ "grad_norm": 1.274654975609038,
2100
  "learning_rate": 3.1977631092863613e-07,
2101
+ "loss": 0.5085,
2102
  "step": 1490
2103
  },
2104
  {
2105
  "epoch": 0.9000602046959663,
2106
+ "grad_norm": 1.2723546077606147,
2107
  "learning_rate": 3.015368960704584e-07,
2108
+ "loss": 0.4992,
2109
  "step": 1495
2110
  },
2111
  {
2112
  "epoch": 0.9030704394942806,
2113
+ "grad_norm": 1.2492243057949244,
2114
  "learning_rate": 2.8381686737975867e-07,
2115
+ "loss": 0.5117,
2116
  "step": 1500
2117
  },
2118
  {
2119
  "epoch": 0.9060806742925949,
2120
+ "grad_norm": 1.2506347579447048,
2121
  "learning_rate": 2.666181836966053e-07,
2122
+ "loss": 0.5061,
2123
  "step": 1505
2124
  },
2125
  {
2126
  "epoch": 0.9090909090909091,
2127
+ "grad_norm": 1.2651285126889307,
2128
  "learning_rate": 2.4994274622958726e-07,
2129
+ "loss": 0.5117,
2130
  "step": 1510
2131
  },
2132
  {
2133
  "epoch": 0.9121011438892234,
2134
+ "grad_norm": 1.2928036851193043,
2135
  "learning_rate": 2.3379239834564526e-07,
2136
+ "loss": 0.5052,
2137
  "step": 1515
2138
  },
2139
  {
2140
  "epoch": 0.9151113786875377,
2141
+ "grad_norm": 1.327963392569787,
2142
  "learning_rate": 2.1816892536629775e-07,
2143
+ "loss": 0.505,
2144
  "step": 1520
2145
  },
2146
  {
2147
  "epoch": 0.9181216134858519,
2148
+ "grad_norm": 1.2391568358607692,
2149
  "learning_rate": 2.0307405437029027e-07,
2150
+ "loss": 0.4938,
2151
  "step": 1525
2152
  },
2153
  {
2154
  "epoch": 0.9211318482841662,
2155
+ "grad_norm": 1.2967086784362822,
2156
  "learning_rate": 1.8850945400266994e-07,
2157
+ "loss": 0.5108,
2158
  "step": 1530
2159
  },
2160
  {
2161
  "epoch": 0.9241420830824805,
2162
+ "grad_norm": 1.29545735590143,
2163
  "learning_rate": 1.7447673429033361e-07,
2164
+ "loss": 0.5032,
2165
  "step": 1535
2166
  },
2167
  {
2168
  "epoch": 0.9271523178807947,
2169
+ "grad_norm": 1.2875158001193772,
2170
  "learning_rate": 1.6097744646404457e-07,
2171
+ "loss": 0.5107,
2172
  "step": 1540
2173
  },
2174
  {
2175
  "epoch": 0.930162552679109,
2176
+ "grad_norm": 1.266148830144139,
2177
  "learning_rate": 1.4801308278695636e-07,
2178
+ "loss": 0.5074,
2179
  "step": 1545
2180
  },
2181
  {
2182
  "epoch": 0.9331727874774233,
2183
+ "grad_norm": 1.3306638459879772,
2184
  "learning_rate": 1.3558507638965158e-07,
2185
+ "loss": 0.507,
2186
  "step": 1550
2187
  },
2188
  {
2189
  "epoch": 0.9361830222757375,
2190
+ "grad_norm": 1.333069277227204,
2191
  "learning_rate": 1.2369480111171784e-07,
2192
+ "loss": 0.5015,
2193
  "step": 1555
2194
  },
2195
  {
2196
  "epoch": 0.9391932570740518,
2197
+ "grad_norm": 1.3040224501040143,
2198
  "learning_rate": 1.1234357134987717e-07,
2199
+ "loss": 0.498,
2200
  "step": 1560
2201
  },
2202
  {
2203
  "epoch": 0.9422034918723661,
2204
+ "grad_norm": 1.2723009240100882,
2205
  "learning_rate": 1.0153264191269052e-07,
2206
+ "loss": 0.522,
2207
  "step": 1565
2208
  },
2209
  {
2210
  "epoch": 0.9452137266706803,
2211
+ "grad_norm": 1.2733119108501816,
2212
  "learning_rate": 9.126320788184374e-08,
2213
+ "loss": 0.5084,
2214
  "step": 1570
2215
  },
2216
  {
2217
  "epoch": 0.9482239614689946,
2218
+ "grad_norm": 1.2479278083881131,
2219
  "learning_rate": 8.153640448003875e-08,
2220
+ "loss": 0.4962,
2221
  "step": 1575
2222
  },
2223
  {
2224
  "epoch": 0.9512341962673089,
2225
+ "grad_norm": 1.2959385578328415,
2226
  "learning_rate": 7.235330694550402e-08,
2227
+ "loss": 0.5114,
2228
  "step": 1580
2229
  },
2230
  {
2231
  "epoch": 0.9542444310656231,
2232
+ "grad_norm": 1.231634916892265,
2233
  "learning_rate": 6.371493041313126e-08,
2234
+ "loss": 0.4951,
2235
  "step": 1585
2236
  },
2237
  {
2238
  "epoch": 0.9572546658639374,
2239
+ "grad_norm": 1.27585215530384,
2240
  "learning_rate": 5.562222980225907e-08,
2241
+ "loss": 0.5071,
2242
  "step": 1590
2243
  },
2244
  {
2245
  "epoch": 0.9602649006622517,
2246
+ "grad_norm": 1.225825933770062,
2247
  "learning_rate": 4.807609971111238e-08,
2248
+ "loss": 0.4971,
2249
  "step": 1595
2250
  },
2251
  {
2252
  "epoch": 0.963275135460566,
2253
+ "grad_norm": 1.2453682011592957,
2254
  "learning_rate": 4.107737431791159e-08,
2255
+ "loss": 0.5054,
2256
  "step": 1600
2257
  },
2258
  {
2259
  "epoch": 0.9662853702588802,
2260
+ "grad_norm": 1.2857076641073053,
2261
  "learning_rate": 3.462682728865685e-08,
2262
+ "loss": 0.5043,
2263
  "step": 1605
2264
  },
2265
  {
2266
  "epoch": 0.9692956050571945,
2267
+ "grad_norm": 1.2778563737126165,
2268
  "learning_rate": 2.8725171691605934e-08,
2269
+ "loss": 0.5075,
2270
  "step": 1610
2271
  },
2272
  {
2273
  "epoch": 0.9723058398555088,
2274
+ "grad_norm": 1.2554905316985445,
2275
  "learning_rate": 2.3373059918448958e-08,
2276
+ "loss": 0.4938,
2277
  "step": 1615
2278
  },
2279
  {
2280
  "epoch": 0.975316074653823,
2281
+ "grad_norm": 1.3192898914729378,
2282
  "learning_rate": 1.8571083612188845e-08,
2283
+ "loss": 0.5088,
2284
  "step": 1620
2285
  },
2286
  {
2287
  "epoch": 0.9783263094521373,
2288
+ "grad_norm": 1.2731885499582947,
2289
  "learning_rate": 1.431977360173975e-08,
2290
+ "loss": 0.4969,
2291
  "step": 1625
2292
  },
2293
  {
2294
  "epoch": 0.9813365442504516,
2295
+ "grad_norm": 1.304095478961953,
2296
  "learning_rate": 1.0619599843249006e-08,
2297
+ "loss": 0.5139,
2298
  "step": 1630
2299
  },
2300
  {
2301
  "epoch": 0.9843467790487658,
2302
+ "grad_norm": 1.27242623092078,
2303
  "learning_rate": 7.470971368142011e-09,
2304
+ "loss": 0.5156,
2305
  "step": 1635
2306
  },
2307
  {
2308
  "epoch": 0.9873570138470801,
2309
+ "grad_norm": 1.239577403155516,
2310
  "learning_rate": 4.874236237911723e-09,
2311
+ "loss": 0.5042,
2312
  "step": 1640
2313
  },
2314
  {
2315
  "epoch": 0.9903672486453944,
2316
+ "grad_norm": 1.2559422923667112,
2317
  "learning_rate": 2.8296815056377824e-09,
2318
+ "loss": 0.5124,
2319
  "step": 1645
2320
  },
2321
  {
2322
  "epoch": 0.9933774834437086,
2323
+ "grad_norm": 1.2978012230245286,
2324
  "learning_rate": 1.3375331842574446e-09,
2325
+ "loss": 0.5151,
2326
  "step": 1650
2327
  },
2328
  {
2329
  "epoch": 0.9963877182420229,
2330
+ "grad_norm": 1.2286869775237814,
2331
  "learning_rate": 3.9795622158111945e-10,
2332
+ "loss": 0.5093,
2333
  "step": 1655
2334
  },
2335
  {
2336
  "epoch": 0.9993979530403372,
2337
+ "grad_norm": 1.2619068290294162,
2338
  "learning_rate": 1.1054482056405136e-11,
2339
+ "loss": 0.5114,
2340
  "step": 1660
2341
  },
2342
  {
2343
  "epoch": 1.0,
2344
+ "eval_runtime": 3.8298,
2345
+ "eval_samples_per_second": 2.611,
2346
+ "eval_steps_per_second": 0.783,
2347
  "step": 1661
2348
  },
2349
  {
2350
  "epoch": 1.0,
2351
  "step": 1661,
2352
  "total_flos": 430711668473856.0,
2353
+ "train_loss": 0.669738974236064,
2354
+ "train_runtime": 16716.2181,
2355
+ "train_samples_per_second": 1.59,
2356
+ "train_steps_per_second": 0.099
2357
  }
2358
  ],
2359
  "logging_steps": 5,