{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 385,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 39.25,
      "learning_rate": 1.282051282051282e-07,
      "logits/chosen": 88.18099975585938,
      "logits/rejected": 88.25153350830078,
      "logps/chosen": -29.073104858398438,
      "logps/rejected": -26.25731658935547,
      "loss": 0.6931,
      "rewards/accuracies": 0.0,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/rejected": 0.0,
      "step": 1
    },
    {
      "epoch": 0.03,
      "grad_norm": 37.75,
      "learning_rate": 1.282051282051282e-06,
      "logits/chosen": 81.07136535644531,
      "logits/rejected": 80.77804565429688,
      "logps/chosen": -34.25458526611328,
      "logps/rejected": -33.03440475463867,
      "loss": 0.699,
      "rewards/accuracies": 0.4444444477558136,
      "rewards/chosen": -0.007714875973761082,
      "rewards/margins": 0.03788409009575844,
      "rewards/rejected": -0.045598965138196945,
      "step": 10
    },
    {
      "epoch": 0.05,
      "grad_norm": 26.25,
      "learning_rate": 2.564102564102564e-06,
      "logits/chosen": 80.65422058105469,
      "logits/rejected": 80.54401397705078,
      "logps/chosen": -33.63849639892578,
      "logps/rejected": -30.794116973876953,
      "loss": 0.708,
      "rewards/accuracies": 0.512499988079071,
      "rewards/chosen": 0.030845394358038902,
      "rewards/margins": 0.04082341492176056,
      "rewards/rejected": -0.009978031739592552,
      "step": 20
    },
    {
      "epoch": 0.08,
      "grad_norm": 38.25,
      "learning_rate": 3.846153846153847e-06,
      "logits/chosen": 82.5073013305664,
      "logits/rejected": 82.5381088256836,
      "logps/chosen": -33.88646697998047,
      "logps/rejected": -31.181421279907227,
      "loss": 0.7746,
      "rewards/accuracies": 0.44999998807907104,
      "rewards/chosen": 0.07581041753292084,
      "rewards/margins": -0.06963472068309784,
      "rewards/rejected": 0.14544512331485748,
      "step": 30
    },
    {
      "epoch": 0.1,
      "grad_norm": 31.625,
      "learning_rate": 4.999896948438434e-06,
      "logits/chosen": 81.06532287597656,
      "logits/rejected": 81.06108093261719,
      "logps/chosen": -32.81906509399414,
      "logps/rejected": -33.26140594482422,
      "loss": 0.6847,
      "rewards/accuracies": 0.574999988079071,
      "rewards/chosen": 0.21299926936626434,
      "rewards/margins": 0.14872975647449493,
      "rewards/rejected": 0.0642695277929306,
      "step": 40
    },
    {
      "epoch": 0.13,
      "grad_norm": 23.0,
      "learning_rate": 4.987541037542187e-06,
      "logits/chosen": 78.69737243652344,
      "logits/rejected": 78.7103500366211,
      "logps/chosen": -30.65850257873535,
      "logps/rejected": -30.81766128540039,
      "loss": 0.6962,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": 0.3280490040779114,
      "rewards/margins": 0.17467446625232697,
      "rewards/rejected": 0.1533745527267456,
      "step": 50
    },
    {
      "epoch": 0.16,
      "grad_norm": 31.625,
      "learning_rate": 4.954691471941119e-06,
      "logits/chosen": 83.20633697509766,
      "logits/rejected": 83.25883483886719,
      "logps/chosen": -30.961681365966797,
      "logps/rejected": -29.538171768188477,
      "loss": 0.703,
      "rewards/accuracies": 0.550000011920929,
      "rewards/chosen": 0.12808682024478912,
      "rewards/margins": 0.09667714685201645,
      "rewards/rejected": 0.03140967711806297,
      "step": 60
    },
    {
      "epoch": 0.18,
      "grad_norm": 53.25,
      "learning_rate": 4.901618883413549e-06,
      "logits/chosen": 83.81951141357422,
      "logits/rejected": 83.84638977050781,
      "logps/chosen": -30.67291259765625,
      "logps/rejected": -33.11872482299805,
      "loss": 0.755,
      "rewards/accuracies": 0.4749999940395355,
      "rewards/chosen": -0.026334354653954506,
      "rewards/margins": 0.02227923832833767,
      "rewards/rejected": -0.04861358925700188,
      "step": 70
    },
    {
      "epoch": 0.21,
      "grad_norm": 31.75,
      "learning_rate": 4.828760511501322e-06,
      "logits/chosen": 81.4664306640625,
      "logits/rejected": 81.44920349121094,
      "logps/chosen": -31.316049575805664,
      "logps/rejected": -31.0085391998291,
      "loss": 0.6446,
      "rewards/accuracies": 0.550000011920929,
      "rewards/chosen": 0.11333731561899185,
      "rewards/margins": 0.2638704478740692,
      "rewards/rejected": -0.15053315460681915,
      "step": 80
    },
    {
      "epoch": 0.23,
      "grad_norm": 37.0,
      "learning_rate": 4.7367166013034295e-06,
      "logits/chosen": 78.19766998291016,
      "logits/rejected": 78.16535186767578,
      "logps/chosen": -32.48051071166992,
      "logps/rejected": -31.223648071289062,
      "loss": 0.6567,
      "rewards/accuracies": 0.6000000238418579,
      "rewards/chosen": 0.09460089355707169,
      "rewards/margins": 0.2579067349433899,
      "rewards/rejected": -0.1633058488368988,
      "step": 90
    },
    {
      "epoch": 0.26,
      "grad_norm": 31.5,
      "learning_rate": 4.626245458345211e-06,
      "logits/chosen": 83.43191528320312,
      "logits/rejected": 83.45047760009766,
      "logps/chosen": -34.02558135986328,
      "logps/rejected": -31.787883758544922,
      "loss": 0.6845,
      "rewards/accuracies": 0.574999988079071,
      "rewards/chosen": 0.16764816641807556,
      "rewards/margins": 0.1900513470172882,
      "rewards/rejected": -0.022403212264180183,
      "step": 100
    },
    {
      "epoch": 0.26,
      "eval_logits/chosen": 98.71414947509766,
      "eval_logits/rejected": 98.70475769042969,
      "eval_logps/chosen": -32.44282531738281,
      "eval_logps/rejected": -36.040138244628906,
      "eval_loss": 0.7398820519447327,
      "eval_rewards/accuracies": 0.5245016813278198,
      "eval_rewards/chosen": 0.00021068855130579323,
      "eval_rewards/margins": 0.04437926039099693,
      "eval_rewards/rejected": -0.04416857287287712,
      "eval_runtime": 104.2075,
      "eval_samples_per_second": 3.292,
      "eval_steps_per_second": 0.413,
      "step": 100
    },
    {
      "epoch": 0.29,
      "grad_norm": 40.25,
      "learning_rate": 4.498257201263691e-06,
      "logits/chosen": 83.59847259521484,
      "logits/rejected": 83.49092102050781,
      "logps/chosen": -32.43052673339844,
      "logps/rejected": -32.78325271606445,
      "loss": 0.6135,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": 0.3553674817085266,
      "rewards/margins": 0.43178611993789673,
      "rewards/rejected": -0.07641863822937012,
      "step": 110
    },
    {
      "epoch": 0.31,
      "grad_norm": 46.5,
      "learning_rate": 4.353806263777678e-06,
      "logits/chosen": 83.7637710571289,
      "logits/rejected": 83.87000274658203,
      "logps/chosen": -28.259990692138672,
      "logps/rejected": -35.35393524169922,
      "loss": 0.6375,
      "rewards/accuracies": 0.612500011920929,
      "rewards/chosen": 0.40175461769104004,
      "rewards/margins": 0.33862805366516113,
      "rewards/rejected": 0.06312654912471771,
      "step": 120
    },
    {
      "epoch": 0.34,
      "grad_norm": 24.875,
      "learning_rate": 4.1940827077152755e-06,
      "logits/chosen": 80.89453125,
      "logits/rejected": 80.9158706665039,
      "logps/chosen": -30.432043075561523,
      "logps/rejected": -32.080535888671875,
      "loss": 0.6294,
      "rewards/accuracies": 0.675000011920929,
      "rewards/chosen": 0.2851874530315399,
      "rewards/margins": 0.3745357096195221,
      "rewards/rejected": -0.08934825658798218,
      "step": 130
    },
    {
      "epoch": 0.36,
      "grad_norm": 25.5,
      "learning_rate": 4.0204024186666215e-06,
      "logits/chosen": 82.0683822631836,
      "logits/rejected": 82.07270812988281,
      "logps/chosen": -27.02596092224121,
      "logps/rejected": -33.121150970458984,
      "loss": 0.5365,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.2528177499771118,
      "rewards/margins": 0.6714814305305481,
      "rewards/rejected": -0.4186636805534363,
      "step": 140
    },
    {
      "epoch": 0.39,
      "grad_norm": 25.375,
      "learning_rate": 3.834196265035119e-06,
      "logits/chosen": 80.59815979003906,
      "logits/rejected": 80.57023620605469,
      "logps/chosen": -28.871845245361328,
      "logps/rejected": -33.09119415283203,
      "loss": 0.5456,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": 0.31036004424095154,
      "rewards/margins": 0.6251744627952576,
      "rewards/rejected": -0.3148145079612732,
      "step": 150
    },
    {
      "epoch": 0.42,
      "grad_norm": 44.25,
      "learning_rate": 3.636998309800573e-06,
      "logits/chosen": 82.46113586425781,
      "logits/rejected": 82.46646118164062,
      "logps/chosen": -33.629737854003906,
      "logps/rejected": -30.432525634765625,
      "loss": 0.6101,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.30420786142349243,
      "rewards/margins": 0.5921996235847473,
      "rewards/rejected": -0.2879917025566101,
      "step": 160
    },
    {
      "epoch": 0.44,
      "grad_norm": 33.0,
      "learning_rate": 3.4304331721118078e-06,
      "logits/chosen": 83.26214599609375,
      "logits/rejected": 83.21092224121094,
      "logps/chosen": -30.77018165588379,
      "logps/rejected": -32.57013702392578,
      "loss": 0.573,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": 0.2934645712375641,
      "rewards/margins": 0.6235076189041138,
      "rewards/rejected": -0.33004307746887207,
      "step": 170
    },
    {
      "epoch": 0.47,
      "grad_norm": 27.125,
      "learning_rate": 3.2162026428305436e-06,
      "logits/chosen": 80.83445739746094,
      "logits/rejected": 80.81375885009766,
      "logps/chosen": -30.401935577392578,
      "logps/rejected": -31.623117446899414,
      "loss": 0.5116,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.4771292805671692,
      "rewards/margins": 0.7566738724708557,
      "rewards/rejected": -0.2795446515083313,
      "step": 180
    },
    {
      "epoch": 0.49,
      "grad_norm": 14.0,
      "learning_rate": 2.996071664294641e-06,
      "logits/chosen": 82.55574035644531,
      "logits/rejected": 82.5384521484375,
      "logps/chosen": -30.206974029541016,
      "logps/rejected": -30.71441078186035,
      "loss": 0.6219,
      "rewards/accuracies": 0.6499999761581421,
      "rewards/chosen": 0.3356670141220093,
      "rewards/margins": 0.4834977686405182,
      "rewards/rejected": -0.14783072471618652,
      "step": 190
    },
    {
      "epoch": 0.52,
      "grad_norm": 15.375,
      "learning_rate": 2.7718537898066833e-06,
      "logits/chosen": 78.06065368652344,
      "logits/rejected": 78.00971984863281,
      "logps/chosen": -33.789581298828125,
      "logps/rejected": -32.68096923828125,
      "loss": 0.6115,
      "rewards/accuracies": 0.6625000238418579,
      "rewards/chosen": 0.577893853187561,
      "rewards/margins": 0.690432071685791,
      "rewards/rejected": -0.11253812164068222,
      "step": 200
    },
    {
      "epoch": 0.52,
      "eval_logits/chosen": 98.62848663330078,
      "eval_logits/rejected": 98.60428619384766,
      "eval_logps/chosen": -32.65570068359375,
      "eval_logps/rejected": -36.321441650390625,
      "eval_loss": 0.7252821922302246,
      "eval_rewards/accuracies": 0.530315637588501,
      "eval_rewards/chosen": -0.12751542031764984,
      "eval_rewards/margins": 0.08543363958597183,
      "eval_rewards/rejected": -0.2129490226507187,
      "eval_runtime": 103.8957,
      "eval_samples_per_second": 3.301,
      "eval_steps_per_second": 0.414,
      "step": 200
    },
    {
      "epoch": 0.55,
      "grad_norm": 52.0,
      "learning_rate": 2.5453962426402006e-06,
      "logits/chosen": 80.63914489746094,
      "logits/rejected": 80.54652404785156,
      "logps/chosen": -33.34014129638672,
      "logps/rejected": -35.32052230834961,
      "loss": 0.5935,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.3633476793766022,
      "rewards/margins": 0.5640031099319458,
      "rewards/rejected": -0.20065537095069885,
      "step": 210
    },
    {
      "epoch": 0.57,
      "grad_norm": 19.625,
      "learning_rate": 2.3185646976551794e-06,
      "logits/chosen": 82.76437377929688,
      "logits/rejected": 82.84717559814453,
      "logps/chosen": -31.025707244873047,
      "logps/rejected": -31.30951499938965,
      "loss": 0.5027,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": 0.553946852684021,
      "rewards/margins": 0.9022806286811829,
      "rewards/rejected": -0.3483339250087738,
      "step": 220
    },
    {
      "epoch": 0.6,
      "grad_norm": 32.75,
      "learning_rate": 2.0932279108998323e-06,
      "logits/chosen": 79.89958190917969,
      "logits/rejected": 79.95211791992188,
      "logps/chosen": -32.34553146362305,
      "logps/rejected": -34.391754150390625,
      "loss": 0.6272,
      "rewards/accuracies": 0.625,
      "rewards/chosen": 0.2761251628398895,
      "rewards/margins": 0.5038853287696838,
      "rewards/rejected": -0.2277601659297943,
      "step": 230
    },
    {
      "epoch": 0.62,
      "grad_norm": 35.5,
      "learning_rate": 1.8712423238279358e-06,
      "logits/chosen": 82.25331115722656,
      "logits/rejected": 82.53690338134766,
      "logps/chosen": -30.6766357421875,
      "logps/rejected": -31.96030044555664,
      "loss": 0.4539,
      "rewards/accuracies": 0.8125,
      "rewards/chosen": 0.6068586111068726,
      "rewards/margins": 0.8638145327568054,
      "rewards/rejected": -0.25695592164993286,
      "step": 240
    },
    {
      "epoch": 0.65,
      "grad_norm": 30.5,
      "learning_rate": 1.6544367689701824e-06,
      "logits/chosen": 80.93089294433594,
      "logits/rejected": 80.99276733398438,
      "logps/chosen": -27.04372787475586,
      "logps/rejected": -30.084264755249023,
      "loss": 0.6593,
      "rewards/accuracies": 0.574999988079071,
      "rewards/chosen": 0.3313008248806,
      "rewards/margins": 0.441417396068573,
      "rewards/rejected": -0.11011654138565063,
      "step": 250
    },
    {
      "epoch": 0.68,
      "grad_norm": 29.125,
      "learning_rate": 1.4445974030621963e-06,
      "logits/chosen": 78.20941162109375,
      "logits/rejected": 78.33964538574219,
      "logps/chosen": -30.433767318725586,
      "logps/rejected": -36.57436752319336,
      "loss": 0.5,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.6763362884521484,
      "rewards/margins": 0.9599828720092773,
      "rewards/rejected": -0.28364673256874084,
      "step": 260
    },
    {
      "epoch": 0.7,
      "grad_norm": 21.5,
      "learning_rate": 1.243452991757889e-06,
      "logits/chosen": 77.5750503540039,
      "logits/rejected": 77.60489654541016,
      "logps/chosen": -30.800561904907227,
      "logps/rejected": -31.87221908569336,
      "loss": 0.4973,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": 0.5870199203491211,
      "rewards/margins": 0.807098388671875,
      "rewards/rejected": -0.2200784683227539,
      "step": 270
    },
    {
      "epoch": 0.73,
      "grad_norm": 33.25,
      "learning_rate": 1.0526606671603523e-06,
      "logits/chosen": 80.28849029541016,
      "logits/rejected": 80.06718444824219,
      "logps/chosen": -31.078380584716797,
      "logps/rejected": -29.8966007232666,
      "loss": 0.5973,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.4389079213142395,
      "rewards/margins": 0.5766692757606506,
      "rewards/rejected": -0.13776138424873352,
      "step": 280
    },
    {
      "epoch": 0.75,
      "grad_norm": 17.75,
      "learning_rate": 8.737922755071455e-07,
      "logits/chosen": 80.41847229003906,
      "logits/rejected": 80.33303833007812,
      "logps/chosen": -32.99018478393555,
      "logps/rejected": -32.6365966796875,
      "loss": 0.4458,
      "rewards/accuracies": 0.762499988079071,
      "rewards/chosen": 0.6684367656707764,
      "rewards/margins": 1.0402761697769165,
      "rewards/rejected": -0.37183937430381775,
      "step": 290
    },
    {
      "epoch": 0.78,
      "grad_norm": 34.25,
      "learning_rate": 7.08321427484816e-07,
      "logits/chosen": 76.02632141113281,
      "logits/rejected": 76.11949920654297,
      "logps/chosen": -32.25402069091797,
      "logps/rejected": -29.283954620361328,
      "loss": 0.5545,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": 0.6910130381584167,
      "rewards/margins": 0.8001711964607239,
      "rewards/rejected": -0.10915807634592056,
      "step": 300
    },
    {
      "epoch": 0.78,
      "eval_logits/chosen": 98.64820861816406,
      "eval_logits/rejected": 98.62397003173828,
      "eval_logps/chosen": -32.62586212158203,
      "eval_logps/rejected": -36.32143783569336,
      "eval_loss": 0.7248644828796387,
      "eval_rewards/accuracies": 0.5282392501831055,
      "eval_rewards/chosen": -0.10961288958787918,
      "eval_rewards/margins": 0.10333485901355743,
      "eval_rewards/rejected": -0.2129477560520172,
      "eval_runtime": 104.0116,
      "eval_samples_per_second": 3.298,
      "eval_steps_per_second": 0.413,
      "step": 300
    },
    {
      "epoch": 0.81,
      "grad_norm": 27.625,
      "learning_rate": 5.576113578589035e-07,
      "logits/chosen": 83.13574981689453,
      "logits/rejected": 83.16615295410156,
      "logps/chosen": -29.959243774414062,
      "logps/rejected": -32.55767059326172,
      "loss": 0.5265,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.542574405670166,
      "rewards/margins": 0.7573197484016418,
      "rewards/rejected": -0.21474528312683105,
      "step": 310
    },
    {
      "epoch": 0.83,
      "grad_norm": 21.625,
      "learning_rate": 4.229036944380913e-07,
      "logits/chosen": 80.65809631347656,
      "logits/rejected": 80.65727233886719,
      "logps/chosen": -30.505443572998047,
      "logps/rejected": -29.11099624633789,
      "loss": 0.5087,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.6558700799942017,
      "rewards/margins": 0.7707726359367371,
      "rewards/rejected": -0.11490253359079361,
      "step": 320
    },
    {
      "epoch": 0.86,
      "grad_norm": 19.25,
      "learning_rate": 3.053082288996112e-07,
      "logits/chosen": 77.81159210205078,
      "logits/rejected": 77.86034393310547,
      "logps/chosen": -29.130138397216797,
      "logps/rejected": -33.010986328125,
      "loss": 0.4483,
      "rewards/accuracies": 0.75,
      "rewards/chosen": 0.7334806323051453,
      "rewards/margins": 0.9591558575630188,
      "rewards/rejected": -0.22567513585090637,
      "step": 330
    },
    {
      "epoch": 0.88,
      "grad_norm": 41.75,
      "learning_rate": 2.0579377374915805e-07,
      "logits/chosen": 82.1180648803711,
      "logits/rejected": 82.14155578613281,
      "logps/chosen": -32.119606018066406,
      "logps/rejected": -33.77212905883789,
      "loss": 0.5073,
      "rewards/accuracies": 0.737500011920929,
      "rewards/chosen": 0.6555252075195312,
      "rewards/margins": 0.8975871801376343,
      "rewards/rejected": -0.24206197261810303,
      "step": 340
    },
    {
      "epoch": 0.91,
      "grad_norm": 17.25,
      "learning_rate": 1.2518018074041684e-07,
      "logits/chosen": 81.12958526611328,
      "logits/rejected": 81.1399154663086,
      "logps/chosen": -32.4399299621582,
      "logps/rejected": -33.30702590942383,
      "loss": 0.5483,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.7222862839698792,
      "rewards/margins": 0.8514213562011719,
      "rewards/rejected": -0.12913502752780914,
      "step": 350
    },
    {
      "epoch": 0.94,
      "grad_norm": 24.875,
      "learning_rate": 6.41315865106129e-08,
      "logits/chosen": 82.61198425292969,
      "logits/rejected": 82.64558410644531,
      "logps/chosen": -28.419490814208984,
      "logps/rejected": -31.76764488220215,
      "loss": 0.5254,
      "rewards/accuracies": 0.6875,
      "rewards/chosen": 0.6796320080757141,
      "rewards/margins": 0.7503107786178589,
      "rewards/rejected": -0.07067875564098358,
      "step": 360
    },
    {
      "epoch": 0.96,
      "grad_norm": 30.25,
      "learning_rate": 2.3150941078050325e-08,
      "logits/chosen": 82.08049774169922,
      "logits/rejected": 82.0997543334961,
      "logps/chosen": -31.871307373046875,
      "logps/rejected": -35.636024475097656,
      "loss": 0.5176,
      "rewards/accuracies": 0.7250000238418579,
      "rewards/chosen": 0.6029146909713745,
      "rewards/margins": 0.9189049601554871,
      "rewards/rejected": -0.31599029898643494,
      "step": 370
    },
    {
      "epoch": 0.99,
      "grad_norm": 31.875,
      "learning_rate": 2.575864278703266e-09,
      "logits/chosen": 75.98027038574219,
      "logits/rejected": 75.85136413574219,
      "logps/chosen": -29.75612449645996,
      "logps/rejected": -28.387653350830078,
      "loss": 0.5513,
      "rewards/accuracies": 0.699999988079071,
      "rewards/chosen": 0.49200135469436646,
      "rewards/margins": 0.6282498836517334,
      "rewards/rejected": -0.13624855875968933,
      "step": 380
    },
    {
      "epoch": 1.0,
      "step": 385,
      "total_flos": 0.0,
      "train_loss": 0.5886486524111265,
      "train_runtime": 2556.9439,
      "train_samples_per_second": 1.204,
      "train_steps_per_second": 0.151
    }
  ],
  "logging_steps": 10,
  "max_steps": 385,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}