yiran-wang3 commited on
Commit
f607847
1 Parent(s): 1cc0f49

End of training

Browse files
Files changed (4) hide show
  1. all_results.json +4 -4
  2. config.json +1 -1
  3. train_results.json +4 -4
  4. trainer_state.json +313 -313
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 0.0,
4
- "train_loss": 0.4979530750094233,
5
- "train_runtime": 139.1789,
6
  "train_samples": 2345,
7
- "train_samples_per_second": 16.849,
8
- "train_steps_per_second": 0.266
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.4980396749200048,
5
+ "train_runtime": 139.9378,
6
  "train_samples": 2345,
7
+ "train_samples_per_second": 16.757,
8
+ "train_steps_per_second": 0.264
9
  }
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.45.0",
26
- "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
 
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.45.0",
26
+ "use_cache": true,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 0.0,
4
- "train_loss": 0.4979530750094233,
5
- "train_runtime": 139.1789,
6
  "train_samples": 2345,
7
- "train_samples_per_second": 16.849,
8
- "train_steps_per_second": 0.266
9
  }
 
1
  {
2
  "epoch": 1.0,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.4980396749200048,
5
+ "train_runtime": 139.9378,
6
  "train_samples": 2345,
7
+ "train_samples_per_second": 16.757,
8
+ "train_steps_per_second": 0.264
9
  }
trainer_state.json CHANGED
@@ -289,7 +289,7 @@
289
  "debug/reference_chosen_logps": -165.61618041992188,
290
  "debug/reference_rejected_logps": -139.546142578125,
291
  "epoch": 0.3783783783783784,
292
- "grad_norm": 3.271576173715924,
293
  "learning_rate": 1e-06,
294
  "logits/chosen": -3.4804084300994873,
295
  "logits/rejected": -3.5207834243774414,
@@ -303,496 +303,496 @@
303
  "step": 14
304
  },
305
  {
306
- "debug/policy_chosen_logits": -3.371809244155884,
307
- "debug/policy_chosen_logps": -138.2847900390625,
308
- "debug/policy_rejected_logits": -3.2725796699523926,
309
- "debug/policy_rejected_logps": -146.83644104003906,
310
  "debug/reference_chosen_logps": -138.04966735839844,
311
  "debug/reference_rejected_logps": -146.4534454345703,
312
  "epoch": 0.40540540540540543,
313
- "grad_norm": 3.5549880331732084,
314
  "learning_rate": 1e-06,
315
- "logits/chosen": -3.371809244155884,
316
- "logits/rejected": -3.2725796699523926,
317
- "logps/chosen": -138.2847900390625,
318
- "logps/rejected": -146.83644104003906,
319
- "loss": 0.4978,
320
  "rewards/accuracies": 0.5,
321
- "rewards/chosen": -0.002351150382310152,
322
- "rewards/margins": 0.0014788818079978228,
323
- "rewards/rejected": -0.0038300324231386185,
324
  "step": 15
325
  },
326
  {
327
- "debug/policy_chosen_logits": -3.2811360359191895,
328
- "debug/policy_chosen_logps": -145.2406005859375,
329
- "debug/policy_rejected_logits": -3.2666900157928467,
330
- "debug/policy_rejected_logps": -136.08499145507812,
331
  "debug/reference_chosen_logps": -145.38519287109375,
332
  "debug/reference_rejected_logps": -136.1790313720703,
333
  "epoch": 0.43243243243243246,
334
- "grad_norm": 3.390241199656903,
335
  "learning_rate": 1e-06,
336
- "logits/chosen": -3.2811360359191895,
337
- "logits/rejected": -3.2666900157928467,
338
- "logps/chosen": -145.2406005859375,
339
- "logps/rejected": -136.08499145507812,
340
- "loss": 0.4992,
341
- "rewards/accuracies": 0.375,
342
- "rewards/chosen": 0.0014458559453487396,
343
- "rewards/margins": 0.0005054186331108212,
344
- "rewards/rejected": 0.0009404371376149356,
345
  "step": 16
346
  },
347
  {
348
- "debug/policy_chosen_logits": -3.4209792613983154,
349
- "debug/policy_chosen_logps": -154.63943481445312,
350
- "debug/policy_rejected_logits": -3.3486948013305664,
351
- "debug/policy_rejected_logps": -133.35781860351562,
352
  "debug/reference_chosen_logps": -154.70895385742188,
353
  "debug/reference_rejected_logps": -133.4116668701172,
354
  "epoch": 0.4594594594594595,
355
- "grad_norm": 3.6742049248327437,
356
  "learning_rate": 1e-06,
357
- "logits/chosen": -3.4209792613983154,
358
- "logits/rejected": -3.3486948013305664,
359
- "logps/chosen": -154.63943481445312,
360
- "logps/rejected": -133.35781860351562,
361
- "loss": 0.4989,
362
- "rewards/accuracies": 0.5,
363
- "rewards/chosen": 0.0006950664683245122,
364
- "rewards/margins": 0.00015656478353776038,
365
- "rewards/rejected": 0.0005385016556829214,
366
  "step": 17
367
  },
368
  {
369
- "debug/policy_chosen_logits": -3.298342704772949,
370
- "debug/policy_chosen_logps": -146.52044677734375,
371
- "debug/policy_rejected_logits": -3.3201894760131836,
372
- "debug/policy_rejected_logps": -156.97808837890625,
373
  "debug/reference_chosen_logps": -146.34840393066406,
374
  "debug/reference_rejected_logps": -156.84378051757812,
375
  "epoch": 0.4864864864864865,
376
- "grad_norm": 3.5194962148324103,
377
  "learning_rate": 1e-06,
378
- "logits/chosen": -3.298342704772949,
379
- "logits/rejected": -3.3201894760131836,
380
- "logps/chosen": -146.52044677734375,
381
- "logps/rejected": -156.97808837890625,
382
- "loss": 0.4973,
383
- "rewards/accuracies": 0.5,
384
- "rewards/chosen": -0.0017202282324433327,
385
- "rewards/margins": -0.0003771304036490619,
386
- "rewards/rejected": -0.001343097654171288,
387
  "step": 18
388
  },
389
  {
390
- "debug/policy_chosen_logits": -3.3400371074676514,
391
- "debug/policy_chosen_logps": -132.01437377929688,
392
- "debug/policy_rejected_logits": -3.36023211479187,
393
- "debug/policy_rejected_logps": -148.31887817382812,
394
  "debug/reference_chosen_logps": -131.88931274414062,
395
  "debug/reference_rejected_logps": -148.13345336914062,
396
  "epoch": 0.5135135135135135,
397
- "grad_norm": 3.61313418435521,
398
  "learning_rate": 1e-06,
399
- "logits/chosen": -3.3400371074676514,
400
- "logits/rejected": -3.36023211479187,
401
- "logps/chosen": -132.01437377929688,
402
- "logps/rejected": -148.31887817382812,
403
- "loss": 0.498,
404
- "rewards/accuracies": 0.5,
405
- "rewards/chosen": -0.00125067715998739,
406
- "rewards/margins": 0.0006034375401213765,
407
- "rewards/rejected": -0.001854114467278123,
408
  "step": 19
409
  },
410
  {
411
- "debug/policy_chosen_logits": -3.2641985416412354,
412
- "debug/policy_chosen_logps": -134.37350463867188,
413
- "debug/policy_rejected_logits": -3.255669355392456,
414
- "debug/policy_rejected_logps": -124.11454772949219,
415
  "debug/reference_chosen_logps": -134.53549194335938,
416
  "debug/reference_rejected_logps": -123.25413513183594,
417
  "epoch": 0.5405405405405406,
418
- "grad_norm": 3.4240940627433663,
419
  "learning_rate": 1e-06,
420
- "logits/chosen": -3.2641985416412354,
421
- "logits/rejected": -3.255669355392456,
422
- "logps/chosen": -134.37350463867188,
423
- "logps/rejected": -124.11454772949219,
424
  "loss": 0.4969,
425
- "rewards/accuracies": 0.5,
426
- "rewards/chosen": 0.0016198731027543545,
427
- "rewards/margins": 0.010224045254290104,
428
- "rewards/rejected": -0.008604173548519611,
429
  "step": 20
430
  },
431
  {
432
- "debug/policy_chosen_logits": -3.389690637588501,
433
- "debug/policy_chosen_logps": -144.95230102539062,
434
- "debug/policy_rejected_logits": -3.3237528800964355,
435
- "debug/policy_rejected_logps": -140.4608917236328,
436
  "debug/reference_chosen_logps": -145.19155883789062,
437
  "debug/reference_rejected_logps": -139.9624481201172,
438
  "epoch": 0.5675675675675675,
439
- "grad_norm": 3.5636895297483804,
440
  "learning_rate": 1e-06,
441
- "logits/chosen": -3.389690637588501,
442
- "logits/rejected": -3.3237528800964355,
443
- "logps/chosen": -144.95230102539062,
444
- "logps/rejected": -140.4608917236328,
445
- "loss": 0.4972,
446
- "rewards/accuracies": 0.375,
447
- "rewards/chosen": 0.0023925015702843666,
448
- "rewards/margins": 0.007377022877335548,
449
- "rewards/rejected": -0.004984522238373756,
450
  "step": 21
451
  },
452
  {
453
- "debug/policy_chosen_logits": -3.2673821449279785,
454
- "debug/policy_chosen_logps": -159.7883758544922,
455
- "debug/policy_rejected_logits": -3.2593047618865967,
456
- "debug/policy_rejected_logps": -141.72947692871094,
457
  "debug/reference_chosen_logps": -159.80300903320312,
458
  "debug/reference_rejected_logps": -142.28695678710938,
459
  "epoch": 0.5945945945945946,
460
- "grad_norm": 3.8014985351330988,
461
  "learning_rate": 1e-06,
462
- "logits/chosen": -3.2673821449279785,
463
- "logits/rejected": -3.2593047618865967,
464
- "logps/chosen": -159.7883758544922,
465
- "logps/rejected": -141.72947692871094,
466
- "loss": 0.499,
467
  "rewards/accuracies": 0.125,
468
- "rewards/chosen": 0.00014627468772232533,
469
- "rewards/margins": -0.005428638309240341,
470
- "rewards/rejected": 0.00557491322979331,
471
  "step": 22
472
  },
473
  {
474
- "debug/policy_chosen_logits": -3.3076963424682617,
475
- "debug/policy_chosen_logps": -147.4810791015625,
476
- "debug/policy_rejected_logits": -3.3063511848449707,
477
- "debug/policy_rejected_logps": -155.9129638671875,
478
  "debug/reference_chosen_logps": -147.6890869140625,
479
  "debug/reference_rejected_logps": -155.93359375,
480
  "epoch": 0.6216216216216216,
481
- "grad_norm": 3.5943677168048143,
482
  "learning_rate": 1e-06,
483
- "logits/chosen": -3.3076963424682617,
484
- "logits/rejected": -3.3063511848449707,
485
- "logps/chosen": -147.4810791015625,
486
- "logps/rejected": -155.9129638671875,
487
- "loss": 0.499,
488
- "rewards/accuracies": 0.625,
489
- "rewards/chosen": 0.0020801923237740993,
490
- "rewards/margins": 0.001873721950687468,
491
- "rewards/rejected": 0.00020647048950195312,
492
  "step": 23
493
  },
494
  {
495
- "debug/policy_chosen_logits": -3.3162076473236084,
496
- "debug/policy_chosen_logps": -146.12327575683594,
497
- "debug/policy_rejected_logits": -3.2808167934417725,
498
- "debug/policy_rejected_logps": -150.74459838867188,
499
  "debug/reference_chosen_logps": -146.20852661132812,
500
  "debug/reference_rejected_logps": -151.0212860107422,
501
  "epoch": 0.6486486486486487,
502
- "grad_norm": 3.490255067904084,
503
  "learning_rate": 1e-06,
504
- "logits/chosen": -3.3162076473236084,
505
- "logits/rejected": -3.2808167934417725,
506
- "logps/chosen": -146.12327575683594,
507
- "logps/rejected": -150.74459838867188,
508
  "loss": 0.4979,
509
- "rewards/accuracies": 0.75,
510
- "rewards/chosen": 0.0008525657467544079,
511
- "rewards/margins": -0.0019143482204526663,
512
- "rewards/rejected": 0.002766914200037718,
513
  "step": 24
514
  },
515
  {
516
- "debug/policy_chosen_logits": -3.341583490371704,
517
- "debug/policy_chosen_logps": -145.74295043945312,
518
- "debug/policy_rejected_logits": -3.2622642517089844,
519
- "debug/policy_rejected_logps": -148.95095825195312,
520
  "debug/reference_chosen_logps": -145.28700256347656,
521
  "debug/reference_rejected_logps": -148.38320922851562,
522
  "epoch": 0.6756756756756757,
523
- "grad_norm": 3.4930826080691433,
524
  "learning_rate": 1e-06,
525
- "logits/chosen": -3.341583490371704,
526
- "logits/rejected": -3.2622642517089844,
527
- "logps/chosen": -145.74295043945312,
528
- "logps/rejected": -148.95095825195312,
529
- "loss": 0.4992,
530
  "rewards/accuracies": 0.375,
531
- "rewards/chosen": -0.0045594689436256886,
532
- "rewards/margins": 0.001117925625294447,
533
- "rewards/rejected": -0.0056773945689201355,
534
  "step": 25
535
  },
536
  {
537
- "debug/policy_chosen_logits": -3.3710758686065674,
538
- "debug/policy_chosen_logps": -134.76580810546875,
539
- "debug/policy_rejected_logits": -3.268131971359253,
540
- "debug/policy_rejected_logps": -164.87716674804688,
541
  "debug/reference_chosen_logps": -134.9357452392578,
542
  "debug/reference_rejected_logps": -164.06057739257812,
543
  "epoch": 0.7027027027027027,
544
- "grad_norm": 3.5324806343379795,
545
  "learning_rate": 1e-06,
546
- "logits/chosen": -3.3710758686065674,
547
- "logits/rejected": -3.268131971359253,
548
- "logps/chosen": -134.76580810546875,
549
- "logps/rejected": -164.87716674804688,
550
- "loss": 0.4961,
551
  "rewards/accuracies": 0.875,
552
- "rewards/chosen": 0.0016993426252156496,
553
- "rewards/margins": 0.009865359403192997,
554
- "rewards/rejected": -0.008166017010807991,
555
  "step": 26
556
  },
557
  {
558
- "debug/policy_chosen_logits": -3.276531934738159,
559
- "debug/policy_chosen_logps": -128.13336181640625,
560
- "debug/policy_rejected_logits": -3.2201032638549805,
561
- "debug/policy_rejected_logps": -148.01954650878906,
562
  "debug/reference_chosen_logps": -127.79987335205078,
563
  "debug/reference_rejected_logps": -148.602783203125,
564
  "epoch": 0.7297297297297297,
565
- "grad_norm": 3.5504923092772165,
566
  "learning_rate": 1e-06,
567
- "logits/chosen": -3.276531934738159,
568
- "logits/rejected": -3.2201032638549805,
569
- "logps/chosen": -128.13336181640625,
570
- "logps/rejected": -148.01954650878906,
571
- "loss": 0.4978,
572
  "rewards/accuracies": 0.25,
573
- "rewards/chosen": -0.0033348274882882833,
574
- "rewards/margins": -0.009167090058326721,
575
- "rewards/rejected": 0.005832261871546507,
576
  "step": 27
577
  },
578
  {
579
- "debug/policy_chosen_logits": -3.3670849800109863,
580
- "debug/policy_chosen_logps": -142.864013671875,
581
- "debug/policy_rejected_logits": -3.314009428024292,
582
- "debug/policy_rejected_logps": -159.29888916015625,
583
  "debug/reference_chosen_logps": -143.54417419433594,
584
  "debug/reference_rejected_logps": -159.19894409179688,
585
  "epoch": 0.7567567567567568,
586
- "grad_norm": 3.6333481085660533,
587
  "learning_rate": 1e-06,
588
- "logits/chosen": -3.3670849800109863,
589
- "logits/rejected": -3.314009428024292,
590
- "logps/chosen": -142.864013671875,
591
- "logps/rejected": -159.29888916015625,
592
- "loss": 0.4946,
593
- "rewards/accuracies": 0.875,
594
- "rewards/chosen": 0.006801585666835308,
595
- "rewards/margins": 0.007801036350429058,
596
- "rewards/rejected": -0.0009994508000090718,
597
  "step": 28
598
  },
599
  {
600
- "debug/policy_chosen_logits": -3.4510252475738525,
601
- "debug/policy_chosen_logps": -144.7528839111328,
602
- "debug/policy_rejected_logits": -3.4992246627807617,
603
- "debug/policy_rejected_logps": -129.56163024902344,
604
  "debug/reference_chosen_logps": -144.27435302734375,
605
  "debug/reference_rejected_logps": -128.7982177734375,
606
  "epoch": 0.7837837837837838,
607
- "grad_norm": 3.519707420288352,
608
  "learning_rate": 1e-06,
609
- "logits/chosen": -3.4510252475738525,
610
- "logits/rejected": -3.4992246627807617,
611
- "logps/chosen": -144.7528839111328,
612
- "logps/rejected": -129.56163024902344,
613
- "loss": 0.4917,
614
  "rewards/accuracies": 0.625,
615
- "rewards/chosen": -0.0047853561118245125,
616
- "rewards/margins": 0.0028487201780080795,
617
- "rewards/rejected": -0.007634077221155167,
618
  "step": 29
619
  },
620
  {
621
- "debug/policy_chosen_logits": -3.442349910736084,
622
- "debug/policy_chosen_logps": -129.01779174804688,
623
- "debug/policy_rejected_logits": -3.355781078338623,
624
- "debug/policy_rejected_logps": -125.97911071777344,
625
  "debug/reference_chosen_logps": -129.1351318359375,
626
  "debug/reference_rejected_logps": -125.61131286621094,
627
  "epoch": 0.8108108108108109,
628
- "grad_norm": 3.436282122029211,
629
  "learning_rate": 1e-06,
630
- "logits/chosen": -3.442349910736084,
631
- "logits/rejected": -3.355781078338623,
632
- "logps/chosen": -129.01779174804688,
633
- "logps/rejected": -125.97911071777344,
634
- "loss": 0.4957,
635
  "rewards/accuracies": 0.375,
636
- "rewards/chosen": 0.0011734389699995518,
637
- "rewards/margins": 0.004851359874010086,
638
- "rewards/rejected": -0.003677921136841178,
639
  "step": 30
640
  },
641
  {
642
- "debug/policy_chosen_logits": -3.46421480178833,
643
- "debug/policy_chosen_logps": -148.1017608642578,
644
- "debug/policy_rejected_logits": -3.4284045696258545,
645
- "debug/policy_rejected_logps": -155.14401245117188,
646
  "debug/reference_chosen_logps": -147.857666015625,
647
  "debug/reference_rejected_logps": -153.7999267578125,
648
  "epoch": 0.8378378378378378,
649
- "grad_norm": 3.573380219675295,
650
  "learning_rate": 1e-06,
651
- "logits/chosen": -3.46421480178833,
652
- "logits/rejected": -3.4284045696258545,
653
- "logps/chosen": -148.1017608642578,
654
- "logps/rejected": -155.14401245117188,
655
  "loss": 0.4994,
656
  "rewards/accuracies": 0.5,
657
- "rewards/chosen": -0.002440805546939373,
658
- "rewards/margins": 0.010999973863363266,
659
- "rewards/rejected": -0.013440780341625214,
660
  "step": 31
661
  },
662
  {
663
- "debug/policy_chosen_logits": -3.296452522277832,
664
- "debug/policy_chosen_logps": -146.0049285888672,
665
- "debug/policy_rejected_logits": -3.3844947814941406,
666
- "debug/policy_rejected_logps": -145.01739501953125,
667
  "debug/reference_chosen_logps": -146.03688049316406,
668
  "debug/reference_rejected_logps": -144.96478271484375,
669
  "epoch": 0.8648648648648649,
670
- "grad_norm": 3.7844677501283552,
671
  "learning_rate": 1e-06,
672
- "logits/chosen": -3.296452522277832,
673
- "logits/rejected": -3.3844947814941406,
674
- "logps/chosen": -146.0049285888672,
675
- "logps/rejected": -145.01739501953125,
676
- "loss": 0.4983,
677
- "rewards/accuracies": 0.625,
678
- "rewards/chosen": 0.0003195476019755006,
679
- "rewards/margins": 0.0008457758231088519,
680
- "rewards/rejected": -0.000526227755472064,
681
  "step": 32
682
  },
683
  {
684
- "debug/policy_chosen_logits": -3.393280506134033,
685
- "debug/policy_chosen_logps": -135.49090576171875,
686
- "debug/policy_rejected_logits": -3.4107859134674072,
687
- "debug/policy_rejected_logps": -126.17267608642578,
688
  "debug/reference_chosen_logps": -135.9659423828125,
689
  "debug/reference_rejected_logps": -123.26311492919922,
690
  "epoch": 0.8918918918918919,
691
- "grad_norm": 3.6987210522943323,
692
  "learning_rate": 1e-06,
693
- "logits/chosen": -3.393280506134033,
694
- "logits/rejected": -3.4107859134674072,
695
- "logps/chosen": -135.49090576171875,
696
- "logps/rejected": -126.17267608642578,
697
- "loss": 0.4982,
698
  "rewards/accuracies": 0.5,
699
- "rewards/chosen": 0.00475044222548604,
700
- "rewards/margins": 0.03384600579738617,
701
- "rewards/rejected": -0.029095562174916267,
702
  "step": 33
703
  },
704
  {
705
- "debug/policy_chosen_logits": -3.3232836723327637,
706
- "debug/policy_chosen_logps": -159.23611450195312,
707
- "debug/policy_rejected_logits": -3.3686249256134033,
708
- "debug/policy_rejected_logps": -165.2159423828125,
709
  "debug/reference_chosen_logps": -158.15670776367188,
710
  "debug/reference_rejected_logps": -164.94976806640625,
711
  "epoch": 0.918918918918919,
712
- "grad_norm": 3.553957133112115,
713
  "learning_rate": 1e-06,
714
- "logits/chosen": -3.3232836723327637,
715
- "logits/rejected": -3.3686249256134033,
716
- "logps/chosen": -159.23611450195312,
717
- "logps/rejected": -165.2159423828125,
718
- "loss": 0.4998,
719
  "rewards/accuracies": 0.25,
720
- "rewards/chosen": -0.010794105939567089,
721
- "rewards/margins": -0.008132400922477245,
722
- "rewards/rejected": -0.0026617045514285564,
723
  "step": 34
724
  },
725
  {
726
- "debug/policy_chosen_logits": -3.3675332069396973,
727
- "debug/policy_chosen_logps": -129.2323455810547,
728
- "debug/policy_rejected_logits": -3.39815092086792,
729
- "debug/policy_rejected_logps": -157.40768432617188,
730
  "debug/reference_chosen_logps": -129.46514892578125,
731
  "debug/reference_rejected_logps": -157.58755493164062,
732
  "epoch": 0.9459459459459459,
733
- "grad_norm": 3.6687608874400897,
734
  "learning_rate": 1e-06,
735
- "logits/chosen": -3.3675332069396973,
736
- "logits/rejected": -3.39815092086792,
737
- "logps/chosen": -129.2323455810547,
738
- "logps/rejected": -157.40768432617188,
739
- "loss": 0.4972,
740
- "rewards/accuracies": 0.625,
741
- "rewards/chosen": 0.0023280431050807238,
742
- "rewards/margins": 0.0005296231247484684,
743
- "rewards/rejected": 0.0017984198639169335,
744
  "step": 35
745
  },
746
  {
747
- "debug/policy_chosen_logits": -3.2530903816223145,
748
- "debug/policy_chosen_logps": -139.84768676757812,
749
- "debug/policy_rejected_logits": -3.380523204803467,
750
- "debug/policy_rejected_logps": -127.18656921386719,
751
  "debug/reference_chosen_logps": -140.35733032226562,
752
  "debug/reference_rejected_logps": -127.09129333496094,
753
  "epoch": 0.972972972972973,
754
- "grad_norm": 3.747493235351115,
755
  "learning_rate": 1e-06,
756
- "logits/chosen": -3.2530903816223145,
757
- "logits/rejected": -3.380523204803467,
758
- "logps/chosen": -139.84768676757812,
759
- "logps/rejected": -127.18656921386719,
760
- "loss": 0.4949,
761
- "rewards/accuracies": 0.75,
762
- "rewards/chosen": 0.005096264183521271,
763
- "rewards/margins": 0.006049060728400946,
764
- "rewards/rejected": -0.0009527970105409622,
765
  "step": 36
766
  },
767
  {
768
- "debug/policy_chosen_logits": -3.1672980785369873,
769
- "debug/policy_chosen_logps": -165.81317138671875,
770
- "debug/policy_rejected_logits": -3.1891133785247803,
771
- "debug/policy_rejected_logps": -135.89328002929688,
772
  "debug/reference_chosen_logps": -164.97256469726562,
773
  "debug/reference_rejected_logps": -135.76358032226562,
774
  "epoch": 1.0,
775
- "grad_norm": 3.5111470282705084,
776
  "learning_rate": 1e-06,
777
- "logits/chosen": -3.1672980785369873,
778
- "logits/rejected": -3.1891133785247803,
779
- "logps/chosen": -165.81317138671875,
780
- "logps/rejected": -135.89328002929688,
781
- "loss": 0.4809,
782
  "rewards/accuracies": 0.375,
783
- "rewards/chosen": -0.00840616226196289,
784
- "rewards/margins": -0.007109222002327442,
785
- "rewards/rejected": -0.0012969397939741611,
786
  "step": 37
787
  },
788
  {
789
  "epoch": 1.0,
790
  "step": 37,
791
  "total_flos": 0.0,
792
- "train_loss": 0.4979530750094233,
793
- "train_runtime": 139.1789,
794
- "train_samples_per_second": 16.849,
795
- "train_steps_per_second": 0.266
796
  }
797
  ],
798
  "logging_steps": 1,
 
289
  "debug/reference_chosen_logps": -165.61618041992188,
290
  "debug/reference_rejected_logps": -139.546142578125,
291
  "epoch": 0.3783783783783784,
292
+ "grad_norm": 3.2715325631312377,
293
  "learning_rate": 1e-06,
294
  "logits/chosen": -3.4804084300994873,
295
  "logits/rejected": -3.5207834243774414,
 
303
  "step": 14
304
  },
305
  {
306
+ "debug/policy_chosen_logits": -3.3723671436309814,
307
+ "debug/policy_chosen_logps": -138.40748596191406,
308
+ "debug/policy_rejected_logits": -3.2726492881774902,
309
+ "debug/policy_rejected_logps": -146.87710571289062,
310
  "debug/reference_chosen_logps": -138.04966735839844,
311
  "debug/reference_rejected_logps": -146.4534454345703,
312
  "epoch": 0.40540540540540543,
313
+ "grad_norm": 3.5479335407438923,
314
  "learning_rate": 1e-06,
315
+ "logits/chosen": -3.3723671436309814,
316
+ "logits/rejected": -3.2726492881774902,
317
+ "logps/chosen": -138.40748596191406,
318
+ "logps/rejected": -146.87710571289062,
319
+ "loss": 0.4971,
320
  "rewards/accuracies": 0.5,
321
+ "rewards/chosen": -0.003578138304874301,
322
+ "rewards/margins": 0.0006584453512914479,
323
+ "rewards/rejected": -0.004236583597958088,
324
  "step": 15
325
  },
326
  {
327
+ "debug/policy_chosen_logits": -3.280038356781006,
328
+ "debug/policy_chosen_logps": -144.99691772460938,
329
+ "debug/policy_rejected_logits": -3.265355110168457,
330
+ "debug/policy_rejected_logps": -136.1260223388672,
331
  "debug/reference_chosen_logps": -145.38519287109375,
332
  "debug/reference_rejected_logps": -136.1790313720703,
333
  "epoch": 0.43243243243243246,
334
+ "grad_norm": 3.3905692790559057,
335
  "learning_rate": 1e-06,
336
+ "logits/chosen": -3.280038356781006,
337
+ "logits/rejected": -3.265355110168457,
338
+ "logps/chosen": -144.99691772460938,
339
+ "logps/rejected": -136.1260223388672,
340
+ "loss": 0.4987,
341
+ "rewards/accuracies": 0.625,
342
+ "rewards/chosen": 0.0038827608805149794,
343
+ "rewards/margins": 0.0033527181949466467,
344
+ "rewards/rejected": 0.0005300427437759936,
345
  "step": 16
346
  },
347
  {
348
+ "debug/policy_chosen_logits": -3.4201347827911377,
349
+ "debug/policy_chosen_logps": -154.7392578125,
350
+ "debug/policy_rejected_logits": -3.348952293395996,
351
+ "debug/policy_rejected_logps": -133.3173828125,
352
  "debug/reference_chosen_logps": -154.70895385742188,
353
  "debug/reference_rejected_logps": -133.4116668701172,
354
  "epoch": 0.4594594594594595,
355
+ "grad_norm": 3.6801385333166086,
356
  "learning_rate": 1e-06,
357
+ "logits/chosen": -3.4201347827911377,
358
+ "logits/rejected": -3.348952293395996,
359
+ "logps/chosen": -154.7392578125,
360
+ "logps/rejected": -133.3173828125,
361
+ "loss": 0.4999,
362
+ "rewards/accuracies": 0.375,
363
+ "rewards/chosen": -0.0003031063242815435,
364
+ "rewards/margins": -0.0012458800338208675,
365
+ "rewards/rejected": 0.0009427738841623068,
366
  "step": 17
367
  },
368
  {
369
+ "debug/policy_chosen_logits": -3.2982780933380127,
370
+ "debug/policy_chosen_logps": -146.35263061523438,
371
+ "debug/policy_rejected_logits": -3.3202931880950928,
372
+ "debug/policy_rejected_logps": -156.7283172607422,
373
  "debug/reference_chosen_logps": -146.34840393066406,
374
  "debug/reference_rejected_logps": -156.84378051757812,
375
  "epoch": 0.4864864864864865,
376
+ "grad_norm": 3.5196394680278327,
377
  "learning_rate": 1e-06,
378
+ "logits/chosen": -3.2982780933380127,
379
+ "logits/rejected": -3.3202931880950928,
380
+ "logps/chosen": -146.35263061523438,
381
+ "logps/rejected": -156.7283172607422,
382
+ "loss": 0.4974,
383
+ "rewards/accuracies": 0.375,
384
+ "rewards/chosen": -4.220963455736637e-05,
385
+ "rewards/margins": -0.0011968993349000812,
386
+ "rewards/rejected": 0.0011546898167580366,
387
  "step": 18
388
  },
389
  {
390
+ "debug/policy_chosen_logits": -3.3398616313934326,
391
+ "debug/policy_chosen_logps": -131.99713134765625,
392
+ "debug/policy_rejected_logits": -3.3604841232299805,
393
+ "debug/policy_rejected_logps": -148.30227661132812,
394
  "debug/reference_chosen_logps": -131.88931274414062,
395
  "debug/reference_rejected_logps": -148.13345336914062,
396
  "epoch": 0.5135135135135135,
397
+ "grad_norm": 3.609790844356815,
398
  "learning_rate": 1e-06,
399
+ "logits/chosen": -3.3398616313934326,
400
+ "logits/rejected": -3.3604841232299805,
401
+ "logps/chosen": -131.99713134765625,
402
+ "logps/rejected": -148.30227661132812,
403
+ "loss": 0.4981,
404
+ "rewards/accuracies": 0.375,
405
+ "rewards/chosen": -0.0010780429001897573,
406
+ "rewards/margins": 0.0006100271129980683,
407
+ "rewards/rejected": -0.0016880702460184693,
408
  "step": 19
409
  },
410
  {
411
+ "debug/policy_chosen_logits": -3.2643423080444336,
412
+ "debug/policy_chosen_logps": -134.54617309570312,
413
+ "debug/policy_rejected_logits": -3.255173921585083,
414
+ "debug/policy_rejected_logps": -124.35269165039062,
415
  "debug/reference_chosen_logps": -134.53549194335938,
416
  "debug/reference_rejected_logps": -123.25413513183594,
417
  "epoch": 0.5405405405405406,
418
+ "grad_norm": 3.4308218789715035,
419
  "learning_rate": 1e-06,
420
+ "logits/chosen": -3.2643423080444336,
421
+ "logits/rejected": -3.255173921585083,
422
+ "logps/chosen": -134.54617309570312,
423
+ "logps/rejected": -124.35269165039062,
424
  "loss": 0.4969,
425
+ "rewards/accuracies": 0.375,
426
+ "rewards/chosen": -0.00010671629570424557,
427
+ "rewards/margins": 0.010878859087824821,
428
+ "rewards/rejected": -0.010985574685037136,
429
  "step": 20
430
  },
431
  {
432
+ "debug/policy_chosen_logits": -3.39021635055542,
433
+ "debug/policy_chosen_logps": -145.20547485351562,
434
+ "debug/policy_rejected_logits": -3.324305295944214,
435
+ "debug/policy_rejected_logps": -140.68399047851562,
436
  "debug/reference_chosen_logps": -145.19155883789062,
437
  "debug/reference_rejected_logps": -139.9624481201172,
438
  "epoch": 0.5675675675675675,
439
+ "grad_norm": 3.572312158938155,
440
  "learning_rate": 1e-06,
441
+ "logits/chosen": -3.39021635055542,
442
+ "logits/rejected": -3.324305295944214,
443
+ "logps/chosen": -145.20547485351562,
444
+ "logps/rejected": -140.68399047851562,
445
+ "loss": 0.4975,
446
+ "rewards/accuracies": 0.5,
447
+ "rewards/chosen": -0.00013918871991336346,
448
+ "rewards/margins": 0.007076186593621969,
449
+ "rewards/rejected": -0.007215375080704689,
450
  "step": 21
451
  },
452
  {
453
+ "debug/policy_chosen_logits": -3.267886161804199,
454
+ "debug/policy_chosen_logps": -160.2618865966797,
455
+ "debug/policy_rejected_logits": -3.2591264247894287,
456
+ "debug/policy_rejected_logps": -141.91876220703125,
457
  "debug/reference_chosen_logps": -159.80300903320312,
458
  "debug/reference_rejected_logps": -142.28695678710938,
459
  "epoch": 0.5945945945945946,
460
+ "grad_norm": 3.81196901542863,
461
  "learning_rate": 1e-06,
462
+ "logits/chosen": -3.267886161804199,
463
+ "logits/rejected": -3.2591264247894287,
464
+ "logps/chosen": -160.2618865966797,
465
+ "logps/rejected": -141.91876220703125,
466
+ "loss": 0.4995,
467
  "rewards/accuracies": 0.125,
468
+ "rewards/chosen": -0.004588775336742401,
469
+ "rewards/margins": -0.008270883932709694,
470
+ "rewards/rejected": 0.0036821081303060055,
471
  "step": 22
472
  },
473
  {
474
+ "debug/policy_chosen_logits": -3.3076066970825195,
475
+ "debug/policy_chosen_logps": -147.66110229492188,
476
+ "debug/policy_rejected_logits": -3.3076066970825195,
477
+ "debug/policy_rejected_logps": -156.1669464111328,
478
  "debug/reference_chosen_logps": -147.6890869140625,
479
  "debug/reference_rejected_logps": -155.93359375,
480
  "epoch": 0.6216216216216216,
481
+ "grad_norm": 3.584513321934443,
482
  "learning_rate": 1e-06,
483
+ "logits/chosen": -3.3076066970825195,
484
+ "logits/rejected": -3.3076066970825195,
485
+ "logps/chosen": -147.66110229492188,
486
+ "logps/rejected": -156.1669464111328,
487
+ "loss": 0.4981,
488
+ "rewards/accuracies": 0.5,
489
+ "rewards/chosen": 0.0002799700014293194,
490
+ "rewards/margins": 0.002613373100757599,
491
+ "rewards/rejected": -0.002333402633666992,
492
  "step": 23
493
  },
494
  {
495
+ "debug/policy_chosen_logits": -3.3157832622528076,
496
+ "debug/policy_chosen_logps": -145.93922424316406,
497
+ "debug/policy_rejected_logits": -3.2806382179260254,
498
+ "debug/policy_rejected_logps": -150.69491577148438,
499
  "debug/reference_chosen_logps": -146.20852661132812,
500
  "debug/reference_rejected_logps": -151.0212860107422,
501
  "epoch": 0.6486486486486487,
502
+ "grad_norm": 3.4973421303279575,
503
  "learning_rate": 1e-06,
504
+ "logits/chosen": -3.3157832622528076,
505
+ "logits/rejected": -3.2806382179260254,
506
+ "logps/chosen": -145.93922424316406,
507
+ "logps/rejected": -150.69491577148438,
508
  "loss": 0.4979,
509
+ "rewards/accuracies": 0.625,
510
+ "rewards/chosen": 0.0026929855812340975,
511
+ "rewards/margins": -0.0005707358941435814,
512
+ "rewards/rejected": 0.0032637212425470352,
513
  "step": 24
514
  },
515
  {
516
+ "debug/policy_chosen_logits": -3.340818405151367,
517
+ "debug/policy_chosen_logps": -145.6500244140625,
518
+ "debug/policy_rejected_logits": -3.2608158588409424,
519
+ "debug/policy_rejected_logps": -148.91769409179688,
520
  "debug/reference_chosen_logps": -145.28700256347656,
521
  "debug/reference_rejected_logps": -148.38320922851562,
522
  "epoch": 0.6756756756756757,
523
+ "grad_norm": 3.493835228930821,
524
  "learning_rate": 1e-06,
525
+ "logits/chosen": -3.340818405151367,
526
+ "logits/rejected": -3.2608158588409424,
527
+ "logps/chosen": -145.6500244140625,
528
+ "logps/rejected": -148.91769409179688,
529
+ "loss": 0.4997,
530
  "rewards/accuracies": 0.375,
531
+ "rewards/chosen": -0.003630065592005849,
532
+ "rewards/margins": 0.0017146589234471321,
533
+ "rewards/rejected": -0.005344724282622337,
534
  "step": 25
535
  },
536
  {
537
+ "debug/policy_chosen_logits": -3.3708367347717285,
538
+ "debug/policy_chosen_logps": -134.6257781982422,
539
+ "debug/policy_rejected_logits": -3.2685859203338623,
540
+ "debug/policy_rejected_logps": -164.87184143066406,
541
  "debug/reference_chosen_logps": -134.9357452392578,
542
  "debug/reference_rejected_logps": -164.06057739257812,
543
  "epoch": 0.7027027027027027,
544
+ "grad_norm": 3.5262056978807492,
545
  "learning_rate": 1e-06,
546
+ "logits/chosen": -3.3708367347717285,
547
+ "logits/rejected": -3.2685859203338623,
548
+ "logps/chosen": -134.6257781982422,
549
+ "logps/rejected": -164.87184143066406,
550
+ "loss": 0.4954,
551
  "rewards/accuracies": 0.875,
552
+ "rewards/chosen": 0.0030995942652225494,
553
+ "rewards/margins": 0.011212329380214214,
554
+ "rewards/rejected": -0.008112735114991665,
555
  "step": 26
556
  },
557
  {
558
+ "debug/policy_chosen_logits": -3.276146650314331,
559
+ "debug/policy_chosen_logps": -127.90383911132812,
560
+ "debug/policy_rejected_logits": -3.2199714183807373,
561
+ "debug/policy_rejected_logps": -148.08279418945312,
562
  "debug/reference_chosen_logps": -127.79987335205078,
563
  "debug/reference_rejected_logps": -148.602783203125,
564
  "epoch": 0.7297297297297297,
565
+ "grad_norm": 3.5419489962048973,
566
  "learning_rate": 1e-06,
567
+ "logits/chosen": -3.276146650314331,
568
+ "logits/rejected": -3.2199714183807373,
569
+ "logps/chosen": -127.90383911132812,
570
+ "logps/rejected": -148.08279418945312,
571
+ "loss": 0.4972,
572
  "rewards/accuracies": 0.25,
573
+ "rewards/chosen": -0.0010396192083135247,
574
+ "rewards/margins": -0.006239423528313637,
575
+ "rewards/rejected": 0.005199803970754147,
576
  "step": 27
577
  },
578
  {
579
+ "debug/policy_chosen_logits": -3.3671457767486572,
580
+ "debug/policy_chosen_logps": -142.98072814941406,
581
+ "debug/policy_rejected_logits": -3.313788652420044,
582
+ "debug/policy_rejected_logps": -159.15054321289062,
583
  "debug/reference_chosen_logps": -143.54417419433594,
584
  "debug/reference_rejected_logps": -159.19894409179688,
585
  "epoch": 0.7567567567567568,
586
+ "grad_norm": 3.6334386387375948,
587
  "learning_rate": 1e-06,
588
+ "logits/chosen": -3.3671457767486572,
589
+ "logits/rejected": -3.313788652420044,
590
+ "logps/chosen": -142.98072814941406,
591
+ "logps/rejected": -159.15054321289062,
592
+ "loss": 0.4957,
593
+ "rewards/accuracies": 0.75,
594
+ "rewards/chosen": 0.005634431727230549,
595
+ "rewards/margins": 0.005150537472218275,
596
+ "rewards/rejected": 0.0004838943714275956,
597
  "step": 28
598
  },
599
  {
600
+ "debug/policy_chosen_logits": -3.451542377471924,
601
+ "debug/policy_chosen_logps": -144.62733459472656,
602
+ "debug/policy_rejected_logits": -3.500192403793335,
603
+ "debug/policy_rejected_logps": -129.41226196289062,
604
  "debug/reference_chosen_logps": -144.27435302734375,
605
  "debug/reference_rejected_logps": -128.7982177734375,
606
  "epoch": 0.7837837837837838,
607
+ "grad_norm": 3.525191396719062,
608
  "learning_rate": 1e-06,
609
+ "logits/chosen": -3.451542377471924,
610
+ "logits/rejected": -3.500192403793335,
611
+ "logps/chosen": -144.62733459472656,
612
+ "logps/rejected": -129.41226196289062,
613
+ "loss": 0.4925,
614
  "rewards/accuracies": 0.625,
615
+ "rewards/chosen": -0.003529853653162718,
616
+ "rewards/margins": 0.0026106643490493298,
617
+ "rewards/rejected": -0.006140518002212048,
618
  "step": 29
619
  },
620
  {
621
+ "debug/policy_chosen_logits": -3.4419167041778564,
622
+ "debug/policy_chosen_logps": -128.94613647460938,
623
+ "debug/policy_rejected_logits": -3.3551814556121826,
624
+ "debug/policy_rejected_logps": -125.87129974365234,
625
  "debug/reference_chosen_logps": -129.1351318359375,
626
  "debug/reference_rejected_logps": -125.61131286621094,
627
  "epoch": 0.8108108108108109,
628
+ "grad_norm": 3.4357174715493812,
629
  "learning_rate": 1e-06,
630
+ "logits/chosen": -3.4419167041778564,
631
+ "logits/rejected": -3.3551814556121826,
632
+ "logps/chosen": -128.94613647460938,
633
+ "logps/rejected": -125.87129974365234,
634
+ "loss": 0.4956,
635
  "rewards/accuracies": 0.375,
636
+ "rewards/chosen": 0.001890067826025188,
637
+ "rewards/margins": 0.00448994617909193,
638
+ "rewards/rejected": -0.0025998782366514206,
639
  "step": 30
640
  },
641
  {
642
+ "debug/policy_chosen_logits": -3.464578628540039,
643
+ "debug/policy_chosen_logps": -148.08114624023438,
644
+ "debug/policy_rejected_logits": -3.42940354347229,
645
+ "debug/policy_rejected_logps": -155.03309631347656,
646
  "debug/reference_chosen_logps": -147.857666015625,
647
  "debug/reference_rejected_logps": -153.7999267578125,
648
  "epoch": 0.8378378378378378,
649
+ "grad_norm": 3.569444014406672,
650
  "learning_rate": 1e-06,
651
+ "logits/chosen": -3.464578628540039,
652
+ "logits/rejected": -3.42940354347229,
653
+ "logps/chosen": -148.08114624023438,
654
+ "logps/rejected": -155.03309631347656,
655
  "loss": 0.4994,
656
  "rewards/accuracies": 0.5,
657
+ "rewards/chosen": -0.0022347732447087765,
658
+ "rewards/margins": 0.010096902027726173,
659
+ "rewards/rejected": -0.012331675738096237,
660
  "step": 31
661
  },
662
  {
663
+ "debug/policy_chosen_logits": -3.2954604625701904,
664
+ "debug/policy_chosen_logps": -146.03802490234375,
665
+ "debug/policy_rejected_logits": -3.384243965148926,
666
+ "debug/policy_rejected_logps": -145.24456787109375,
667
  "debug/reference_chosen_logps": -146.03688049316406,
668
  "debug/reference_rejected_logps": -144.96478271484375,
669
  "epoch": 0.8648648648648649,
670
+ "grad_norm": 3.794862233420595,
671
  "learning_rate": 1e-06,
672
+ "logits/chosen": -3.2954604625701904,
673
+ "logits/rejected": -3.384243965148926,
674
+ "logps/chosen": -146.03802490234375,
675
+ "logps/rejected": -145.24456787109375,
676
+ "loss": 0.4995,
677
+ "rewards/accuracies": 0.5,
678
+ "rewards/chosen": -1.1482159607112408e-05,
679
+ "rewards/margins": 0.002786235883831978,
680
+ "rewards/rejected": -0.0027977179270237684,
681
  "step": 32
682
  },
683
  {
684
+ "debug/policy_chosen_logits": -3.393167734146118,
685
+ "debug/policy_chosen_logps": -135.54547119140625,
686
+ "debug/policy_rejected_logits": -3.4107766151428223,
687
+ "debug/policy_rejected_logps": -126.21063995361328,
688
  "debug/reference_chosen_logps": -135.9659423828125,
689
  "debug/reference_rejected_logps": -123.26311492919922,
690
  "epoch": 0.8918918918918919,
691
+ "grad_norm": 3.7044090203550195,
692
  "learning_rate": 1e-06,
693
+ "logits/chosen": -3.393167734146118,
694
+ "logits/rejected": -3.4107766151428223,
695
+ "logps/chosen": -135.54547119140625,
696
+ "logps/rejected": -126.21063995361328,
697
+ "loss": 0.4985,
698
  "rewards/accuracies": 0.5,
699
+ "rewards/chosen": 0.004204845521599054,
700
+ "rewards/margins": 0.03368005529046059,
701
+ "rewards/rejected": -0.02947521209716797,
702
  "step": 33
703
  },
704
  {
705
+ "debug/policy_chosen_logits": -3.3235180377960205,
706
+ "debug/policy_chosen_logps": -159.286865234375,
707
+ "debug/policy_rejected_logits": -3.369253635406494,
708
+ "debug/policy_rejected_logps": -165.31944274902344,
709
  "debug/reference_chosen_logps": -158.15670776367188,
710
  "debug/reference_rejected_logps": -164.94976806640625,
711
  "epoch": 0.918918918918919,
712
+ "grad_norm": 3.550860802017907,
713
  "learning_rate": 1e-06,
714
+ "logits/chosen": -3.3235180377960205,
715
+ "logits/rejected": -3.369253635406494,
716
+ "logps/chosen": -159.286865234375,
717
+ "logps/rejected": -165.31944274902344,
718
+ "loss": 0.4997,
719
  "rewards/accuracies": 0.25,
720
+ "rewards/chosen": -0.011301536113023758,
721
+ "rewards/margins": -0.007604693062603474,
722
+ "rewards/rejected": -0.0036968423519283533,
723
  "step": 34
724
  },
725
  {
726
+ "debug/policy_chosen_logits": -3.367034673690796,
727
+ "debug/policy_chosen_logps": -129.437744140625,
728
+ "debug/policy_rejected_logits": -3.396902561187744,
729
+ "debug/policy_rejected_logps": -157.3302001953125,
730
  "debug/reference_chosen_logps": -129.46514892578125,
731
  "debug/reference_rejected_logps": -157.58755493164062,
732
  "epoch": 0.9459459459459459,
733
+ "grad_norm": 3.6785421821803044,
734
  "learning_rate": 1e-06,
735
+ "logits/chosen": -3.367034673690796,
736
+ "logits/rejected": -3.396902561187744,
737
+ "logps/chosen": -129.437744140625,
738
+ "logps/rejected": -157.3302001953125,
739
+ "loss": 0.498,
740
+ "rewards/accuracies": 0.5,
741
+ "rewards/chosen": 0.00027409568428993225,
742
+ "rewards/margins": -0.002299328101798892,
743
+ "rewards/rejected": 0.002573424018919468,
744
  "step": 35
745
  },
746
  {
747
+ "debug/policy_chosen_logits": -3.252328634262085,
748
+ "debug/policy_chosen_logps": -139.81039428710938,
749
+ "debug/policy_rejected_logits": -3.3797786235809326,
750
+ "debug/policy_rejected_logps": -127.12653350830078,
751
  "debug/reference_chosen_logps": -140.35733032226562,
752
  "debug/reference_rejected_logps": -127.09129333496094,
753
  "epoch": 0.972972972972973,
754
+ "grad_norm": 3.7364723193101446,
755
  "learning_rate": 1e-06,
756
+ "logits/chosen": -3.252328634262085,
757
+ "logits/rejected": -3.3797786235809326,
758
+ "logps/chosen": -139.81039428710938,
759
+ "logps/rejected": -127.12653350830078,
760
+ "loss": 0.4951,
761
+ "rewards/accuracies": 0.625,
762
+ "rewards/chosen": 0.005469245836138725,
763
+ "rewards/margins": 0.0058215707540512085,
764
+ "rewards/rejected": -0.0003523253835737705,
765
  "step": 36
766
  },
767
  {
768
+ "debug/policy_chosen_logits": -3.166999101638794,
769
+ "debug/policy_chosen_logps": -165.9640350341797,
770
+ "debug/policy_rejected_logits": -3.1885428428649902,
771
+ "debug/policy_rejected_logps": -135.82620239257812,
772
  "debug/reference_chosen_logps": -164.97256469726562,
773
  "debug/reference_rejected_logps": -135.76358032226562,
774
  "epoch": 1.0,
775
+ "grad_norm": 3.5142818885997116,
776
  "learning_rate": 1e-06,
777
+ "logits/chosen": -3.166999101638794,
778
+ "logits/rejected": -3.1885428428649902,
779
+ "logps/chosen": -165.9640350341797,
780
+ "logps/rejected": -135.82620239257812,
781
+ "loss": 0.481,
782
  "rewards/accuracies": 0.375,
783
+ "rewards/chosen": -0.009914855472743511,
784
+ "rewards/margins": -0.009288596920669079,
785
+ "rewards/rejected": -0.0006262586684897542,
786
  "step": 37
787
  },
788
  {
789
  "epoch": 1.0,
790
  "step": 37,
791
  "total_flos": 0.0,
792
+ "train_loss": 0.4980396749200048,
793
+ "train_runtime": 139.9378,
794
+ "train_samples_per_second": 16.757,
795
+ "train_steps_per_second": 0.264
796
  }
797
  ],
798
  "logging_steps": 1,