martimfasantos committed on
Commit
5ceb57d
1 Parent(s): 2df4a24

Model save

Browse files
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: martimfasantos/tinyllama-1.1b-mt-sft-full_sardine
4
+ tags:
5
+ - trl
6
+ - dpo
7
+ - generated_from_trainer
8
+ model-index:
9
+ - name: tinyllama-1.1b-mt-dpo-full_LR1e-7_BS32_rmsprop_3epochs_sft_sardine_dpo_sardine
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ # tinyllama-1.1b-mt-dpo-full_LR1e-7_BS32_rmsprop_3epochs_sft_sardine_dpo_sardine
17
+
18
+ This model is a fine-tuned version of [martimfasantos/tinyllama-1.1b-mt-sft-full_sardine](https://huggingface.co/martimfasantos/tinyllama-1.1b-mt-sft-full_sardine) on an unknown dataset.
19
+
20
+ ## Model description
21
+
22
+ More information needed
23
+
24
+ ## Intended uses & limitations
25
+
26
+ More information needed
27
+
28
+ ## Training and evaluation data
29
+
30
+ More information needed
31
+
32
+ ## Training procedure
33
+
34
+ ### Training hyperparameters
35
+
36
+ The following hyperparameters were used during training:
37
+ - learning_rate: 1e-07
38
+ - train_batch_size: 1
39
+ - eval_batch_size: 4
40
+ - seed: 42
41
+ - distributed_type: multi-GPU
42
+ - gradient_accumulation_steps: 32
43
+ - total_train_batch_size: 32
44
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
+ - lr_scheduler_type: cosine
46
+ - lr_scheduler_warmup_ratio: 0.1
47
+ - num_epochs: 3
48
+
49
+ ### Training results
50
+
51
+
52
+
53
+ ### Framework versions
54
+
55
+ - Transformers 4.41.2
56
+ - Pytorch 2.1.2
57
+ - Datasets 2.20.0
58
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.99582225598177,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.546259533964032,
5
+ "train_runtime": 14357.5178,
6
+ "train_samples": 15798,
7
+ "train_samples_per_second": 3.301,
8
+ "train_steps_per_second": 0.103
9
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "max_length": 2048,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.41.2"
7
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb34c21731cbef6f14db1aaca2c67636bfffdff97be612aea684e2c4f124fceb
3
  size 4400216536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abb95a093f7cad689746e2609d0e31b0d361e9babceb97639c28351392e03e66
3
  size 4400216536
runs/Jul20_19-21-30_poseidon/events.out.tfevents.1721504133.poseidon.1544871.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7f03189880ab5f49dc49a4f55f0ba66357cb64aa889ec9909a59deafeb84182
3
- size 102196
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98dca213b6e6ecb32853dd3f87d43849944ad376e096e114dcdb1401c87c5b44
3
+ size 107366
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.99582225598177,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.546259533964032,
5
+ "train_runtime": 14357.5178,
6
+ "train_samples": 15798,
7
+ "train_samples_per_second": 3.301,
8
+ "train_steps_per_second": 0.103
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,2262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.99582225598177,
5
+ "eval_steps": 800,
6
+ "global_step": 1479,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.002025572857323712,
13
+ "grad_norm": 31.013870239257812,
14
+ "learning_rate": 6.756756756756757e-10,
15
+ "logits/chosen": -2.5177597999572754,
16
+ "logits/rejected": -2.4276583194732666,
17
+ "logps/chosen": -79.6932373046875,
18
+ "logps/rejected": -86.58649444580078,
19
+ "loss": 0.6929,
20
+ "rewards/accuracies": 0.03125,
21
+ "rewards/chosen": -0.0008372783777303994,
22
+ "rewards/margins": 0.00045527220936492085,
23
+ "rewards/rejected": -0.0012925505870953202,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.020255728573237118,
28
+ "grad_norm": 28.71278190612793,
29
+ "learning_rate": 6.756756756756757e-09,
30
+ "logits/chosen": -2.587923526763916,
31
+ "logits/rejected": -2.421647787094116,
32
+ "logps/chosen": -72.02790069580078,
33
+ "logps/rejected": -68.7666015625,
34
+ "loss": 0.6928,
35
+ "rewards/accuracies": 0.4479166567325592,
36
+ "rewards/chosen": -0.000452535372460261,
37
+ "rewards/margins": 0.0008923111017793417,
38
+ "rewards/rejected": -0.001344846561551094,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 0.040511457146474236,
43
+ "grad_norm": 24.486814498901367,
44
+ "learning_rate": 1.3513513513513514e-08,
45
+ "logits/chosen": -2.5588934421539307,
46
+ "logits/rejected": -2.3707621097564697,
47
+ "logps/chosen": -77.4730453491211,
48
+ "logps/rejected": -71.17650604248047,
49
+ "loss": 0.693,
50
+ "rewards/accuracies": 0.49687498807907104,
51
+ "rewards/chosen": 0.0015422820579260588,
52
+ "rewards/margins": 0.00048240157775580883,
53
+ "rewards/rejected": 0.0010598807130008936,
54
+ "step": 20
55
+ },
56
+ {
57
+ "epoch": 0.060767185719711354,
58
+ "grad_norm": 26.952857971191406,
59
+ "learning_rate": 2.027027027027027e-08,
60
+ "logits/chosen": -2.5552210807800293,
61
+ "logits/rejected": -2.3964200019836426,
62
+ "logps/chosen": -75.58769226074219,
63
+ "logps/rejected": -74.38423156738281,
64
+ "loss": 0.6941,
65
+ "rewards/accuracies": 0.4625000059604645,
66
+ "rewards/chosen": -0.00154799222946167,
67
+ "rewards/margins": -0.0016709610354155302,
68
+ "rewards/rejected": 0.0001229687622981146,
69
+ "step": 30
70
+ },
71
+ {
72
+ "epoch": 0.08102291429294847,
73
+ "grad_norm": 30.491893768310547,
74
+ "learning_rate": 2.7027027027027028e-08,
75
+ "logits/chosen": -2.538985013961792,
76
+ "logits/rejected": -2.3956074714660645,
77
+ "logps/chosen": -84.64269256591797,
78
+ "logps/rejected": -82.15937042236328,
79
+ "loss": 0.6927,
80
+ "rewards/accuracies": 0.543749988079071,
81
+ "rewards/chosen": 0.0005608886131085455,
82
+ "rewards/margins": 0.0012397856917232275,
83
+ "rewards/rejected": -0.000678897020407021,
84
+ "step": 40
85
+ },
86
+ {
87
+ "epoch": 0.1012786428661856,
88
+ "grad_norm": 27.152774810791016,
89
+ "learning_rate": 3.378378378378378e-08,
90
+ "logits/chosen": -2.515413522720337,
91
+ "logits/rejected": -2.358457565307617,
92
+ "logps/chosen": -81.1507568359375,
93
+ "logps/rejected": -78.68826293945312,
94
+ "loss": 0.6921,
95
+ "rewards/accuracies": 0.53125,
96
+ "rewards/chosen": -0.0013986782869324088,
97
+ "rewards/margins": 0.002325823763385415,
98
+ "rewards/rejected": -0.0037245028652250767,
99
+ "step": 50
100
+ },
101
+ {
102
+ "epoch": 0.12153437143942271,
103
+ "grad_norm": 27.534177780151367,
104
+ "learning_rate": 4.054054054054054e-08,
105
+ "logits/chosen": -2.520850419998169,
106
+ "logits/rejected": -2.3658010959625244,
107
+ "logps/chosen": -78.13814544677734,
108
+ "logps/rejected": -75.04551696777344,
109
+ "loss": 0.6938,
110
+ "rewards/accuracies": 0.518750011920929,
111
+ "rewards/chosen": -0.0025924122892320156,
112
+ "rewards/margins": -0.0010297519620507956,
113
+ "rewards/rejected": -0.00156266032718122,
114
+ "step": 60
115
+ },
116
+ {
117
+ "epoch": 0.14179010001265982,
118
+ "grad_norm": 30.60668182373047,
119
+ "learning_rate": 4.72972972972973e-08,
120
+ "logits/chosen": -2.5382590293884277,
121
+ "logits/rejected": -2.37661075592041,
122
+ "logps/chosen": -83.97273254394531,
123
+ "logps/rejected": -80.8182373046875,
124
+ "loss": 0.6926,
125
+ "rewards/accuracies": 0.515625,
126
+ "rewards/chosen": -0.003294873284175992,
127
+ "rewards/margins": 0.0013795426348224282,
128
+ "rewards/rejected": -0.004674416035413742,
129
+ "step": 70
130
+ },
131
+ {
132
+ "epoch": 0.16204582858589694,
133
+ "grad_norm": 27.501964569091797,
134
+ "learning_rate": 5.4054054054054056e-08,
135
+ "logits/chosen": -2.4654035568237305,
136
+ "logits/rejected": -2.3297314643859863,
137
+ "logps/chosen": -75.83648681640625,
138
+ "logps/rejected": -76.66287994384766,
139
+ "loss": 0.6902,
140
+ "rewards/accuracies": 0.578125,
141
+ "rewards/chosen": -0.0013893753057345748,
142
+ "rewards/margins": 0.006102095358073711,
143
+ "rewards/rejected": -0.007491470314562321,
144
+ "step": 80
145
+ },
146
+ {
147
+ "epoch": 0.18230155715913407,
148
+ "grad_norm": 24.944175720214844,
149
+ "learning_rate": 6.081081081081081e-08,
150
+ "logits/chosen": -2.5287299156188965,
151
+ "logits/rejected": -2.379664659500122,
152
+ "logps/chosen": -86.45475769042969,
153
+ "logps/rejected": -79.61102294921875,
154
+ "loss": 0.6904,
155
+ "rewards/accuracies": 0.578125,
156
+ "rewards/chosen": -0.0014329934492707253,
157
+ "rewards/margins": 0.005873243790119886,
158
+ "rewards/rejected": -0.007306237705051899,
159
+ "step": 90
160
+ },
161
+ {
162
+ "epoch": 0.2025572857323712,
163
+ "grad_norm": 29.129093170166016,
164
+ "learning_rate": 6.756756756756756e-08,
165
+ "logits/chosen": -2.5153121948242188,
166
+ "logits/rejected": -2.361551523208618,
167
+ "logps/chosen": -84.0345230102539,
168
+ "logps/rejected": -78.61013793945312,
169
+ "loss": 0.6881,
170
+ "rewards/accuracies": 0.625,
171
+ "rewards/chosen": -0.0028929836116731167,
172
+ "rewards/margins": 0.010393200442194939,
173
+ "rewards/rejected": -0.013286183588206768,
174
+ "step": 100
175
+ },
176
+ {
177
+ "epoch": 0.2228130143056083,
178
+ "grad_norm": 28.691940307617188,
179
+ "learning_rate": 7.432432432432432e-08,
180
+ "logits/chosen": -2.546154260635376,
181
+ "logits/rejected": -2.389882802963257,
182
+ "logps/chosen": -74.24641418457031,
183
+ "logps/rejected": -72.99244689941406,
184
+ "loss": 0.6872,
185
+ "rewards/accuracies": 0.609375,
186
+ "rewards/chosen": -0.003837780561298132,
187
+ "rewards/margins": 0.012213540263473988,
188
+ "rewards/rejected": -0.016051320359110832,
189
+ "step": 110
190
+ },
191
+ {
192
+ "epoch": 0.24306874287884542,
193
+ "grad_norm": 26.047407150268555,
194
+ "learning_rate": 8.108108108108108e-08,
195
+ "logits/chosen": -2.530447006225586,
196
+ "logits/rejected": -2.3604226112365723,
197
+ "logps/chosen": -79.45042419433594,
198
+ "logps/rejected": -75.46896362304688,
199
+ "loss": 0.6834,
200
+ "rewards/accuracies": 0.6781250238418579,
201
+ "rewards/chosen": -0.004467087332159281,
202
+ "rewards/margins": 0.020118705928325653,
203
+ "rewards/rejected": -0.024585790932178497,
204
+ "step": 120
205
+ },
206
+ {
207
+ "epoch": 0.26332447145208254,
208
+ "grad_norm": 30.345191955566406,
209
+ "learning_rate": 8.783783783783784e-08,
210
+ "logits/chosen": -2.4959208965301514,
211
+ "logits/rejected": -2.344454526901245,
212
+ "logps/chosen": -86.02290344238281,
213
+ "logps/rejected": -81.23602294921875,
214
+ "loss": 0.6824,
215
+ "rewards/accuracies": 0.6937500238418579,
216
+ "rewards/chosen": -0.006714115384966135,
217
+ "rewards/margins": 0.02207200787961483,
218
+ "rewards/rejected": -0.028786126524209976,
219
+ "step": 130
220
+ },
221
+ {
222
+ "epoch": 0.28358020002531964,
223
+ "grad_norm": 23.314868927001953,
224
+ "learning_rate": 9.45945945945946e-08,
225
+ "logits/chosen": -2.5607352256774902,
226
+ "logits/rejected": -2.394366502761841,
227
+ "logps/chosen": -72.60206604003906,
228
+ "logps/rejected": -67.85148620605469,
229
+ "loss": 0.6799,
230
+ "rewards/accuracies": 0.6875,
231
+ "rewards/chosen": -0.008559314534068108,
232
+ "rewards/margins": 0.027249369770288467,
233
+ "rewards/rejected": -0.035808682441711426,
234
+ "step": 140
235
+ },
236
+ {
237
+ "epoch": 0.3038359285985568,
238
+ "grad_norm": 27.51814079284668,
239
+ "learning_rate": 9.999944288759615e-08,
240
+ "logits/chosen": -2.5597286224365234,
241
+ "logits/rejected": -2.4156954288482666,
242
+ "logps/chosen": -74.42972564697266,
243
+ "logps/rejected": -70.92676544189453,
244
+ "loss": 0.6768,
245
+ "rewards/accuracies": 0.675000011920929,
246
+ "rewards/chosen": -0.00905610155314207,
247
+ "rewards/margins": 0.03394917771220207,
248
+ "rewards/rejected": -0.043005283921957016,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.3240916571717939,
253
+ "grad_norm": 30.097389221191406,
254
+ "learning_rate": 9.99799452570021e-08,
255
+ "logits/chosen": -2.508636236190796,
256
+ "logits/rejected": -2.3848562240600586,
257
+ "logps/chosen": -77.739013671875,
258
+ "logps/rejected": -73.92839813232422,
259
+ "loss": 0.6753,
260
+ "rewards/accuracies": 0.675000011920929,
261
+ "rewards/chosen": -0.017720907926559448,
262
+ "rewards/margins": 0.037171002477407455,
263
+ "rewards/rejected": -0.054891906678676605,
264
+ "step": 160
265
+ },
266
+ {
267
+ "epoch": 0.34434738574503104,
268
+ "grad_norm": 32.0848503112793,
269
+ "learning_rate": 9.993260441994115e-08,
270
+ "logits/chosen": -2.5097594261169434,
271
+ "logits/rejected": -2.3447771072387695,
272
+ "logps/chosen": -81.94526672363281,
273
+ "logps/rejected": -78.39651489257812,
274
+ "loss": 0.6634,
275
+ "rewards/accuracies": 0.7406250238418579,
276
+ "rewards/chosen": -0.013811466284096241,
277
+ "rewards/margins": 0.06231771036982536,
278
+ "rewards/rejected": -0.07612917572259903,
279
+ "step": 170
280
+ },
281
+ {
282
+ "epoch": 0.36460311431826814,
283
+ "grad_norm": 29.288728713989258,
284
+ "learning_rate": 9.985744674940535e-08,
285
+ "logits/chosen": -2.5364279747009277,
286
+ "logits/rejected": -2.354965925216675,
287
+ "logps/chosen": -80.46150207519531,
288
+ "logps/rejected": -75.10428619384766,
289
+ "loss": 0.6577,
290
+ "rewards/accuracies": 0.7406250238418579,
291
+ "rewards/chosen": -0.0209208894520998,
292
+ "rewards/margins": 0.07542888820171356,
293
+ "rewards/rejected": -0.09634977579116821,
294
+ "step": 180
295
+ },
296
+ {
297
+ "epoch": 0.38485884289150524,
298
+ "grad_norm": 27.00816535949707,
299
+ "learning_rate": 9.975451411479911e-08,
300
+ "logits/chosen": -2.499474048614502,
301
+ "logits/rejected": -2.337350606918335,
302
+ "logps/chosen": -78.61238098144531,
303
+ "logps/rejected": -78.64549255371094,
304
+ "loss": 0.657,
305
+ "rewards/accuracies": 0.706250011920929,
306
+ "rewards/chosen": -0.02487110160291195,
307
+ "rewards/margins": 0.07822562754154205,
308
+ "rewards/rejected": -0.10309673845767975,
309
+ "step": 190
310
+ },
311
+ {
312
+ "epoch": 0.4051145714647424,
313
+ "grad_norm": 27.34921646118164,
314
+ "learning_rate": 9.962386385861412e-08,
315
+ "logits/chosen": -2.50087308883667,
316
+ "logits/rejected": -2.360152006149292,
317
+ "logps/chosen": -76.67208862304688,
318
+ "logps/rejected": -78.57847595214844,
319
+ "loss": 0.652,
320
+ "rewards/accuracies": 0.746874988079071,
321
+ "rewards/chosen": -0.038105227053165436,
322
+ "rewards/margins": 0.08875634521245956,
323
+ "rewards/rejected": -0.126861572265625,
324
+ "step": 200
325
+ },
326
+ {
327
+ "epoch": 0.4253703000379795,
328
+ "grad_norm": 25.707185745239258,
329
+ "learning_rate": 9.946556876448468e-08,
330
+ "logits/chosen": -2.4654879570007324,
331
+ "logits/rejected": -2.312530994415283,
332
+ "logps/chosen": -78.15449523925781,
333
+ "logps/rejected": -77.42134857177734,
334
+ "loss": 0.6439,
335
+ "rewards/accuracies": 0.7124999761581421,
336
+ "rewards/chosen": -0.03906805440783501,
337
+ "rewards/margins": 0.10793592780828476,
338
+ "rewards/rejected": -0.14700399339199066,
339
+ "step": 210
340
+ },
341
+ {
342
+ "epoch": 0.4456260286112166,
343
+ "grad_norm": 27.729816436767578,
344
+ "learning_rate": 9.927971701664084e-08,
345
+ "logits/chosen": -2.4674429893493652,
346
+ "logits/rejected": -2.3009190559387207,
347
+ "logps/chosen": -75.07694244384766,
348
+ "logps/rejected": -75.41253662109375,
349
+ "loss": 0.6417,
350
+ "rewards/accuracies": 0.7593749761581421,
351
+ "rewards/chosen": -0.0388757549226284,
352
+ "rewards/margins": 0.1117323786020279,
353
+ "rewards/rejected": -0.1506081372499466,
354
+ "step": 220
355
+ },
356
+ {
357
+ "epoch": 0.46588175718445374,
358
+ "grad_norm": 24.861696243286133,
359
+ "learning_rate": 9.906641215078196e-08,
360
+ "logits/chosen": -2.462665557861328,
361
+ "logits/rejected": -2.309985876083374,
362
+ "logps/chosen": -77.72923278808594,
363
+ "logps/rejected": -75.81291961669922,
364
+ "loss": 0.6384,
365
+ "rewards/accuracies": 0.699999988079071,
366
+ "rewards/chosen": -0.052964676171541214,
367
+ "rewards/margins": 0.12125100940465927,
368
+ "rewards/rejected": -0.1742156744003296,
369
+ "step": 230
370
+ },
371
+ {
372
+ "epoch": 0.48613748575769083,
373
+ "grad_norm": 30.613037109375,
374
+ "learning_rate": 9.882577299639835e-08,
375
+ "logits/chosen": -2.4711391925811768,
376
+ "logits/rejected": -2.329216957092285,
377
+ "logps/chosen": -80.07206726074219,
378
+ "logps/rejected": -80.30625915527344,
379
+ "loss": 0.6333,
380
+ "rewards/accuracies": 0.7406250238418579,
381
+ "rewards/chosen": -0.0626656636595726,
382
+ "rewards/margins": 0.13491353392601013,
383
+ "rewards/rejected": -0.19757920503616333,
384
+ "step": 240
385
+ },
386
+ {
387
+ "epoch": 0.506393214330928,
388
+ "grad_norm": 26.08587074279785,
389
+ "learning_rate": 9.85579336105728e-08,
390
+ "logits/chosen": -2.443732738494873,
391
+ "logits/rejected": -2.2919225692749023,
392
+ "logps/chosen": -81.61358642578125,
393
+ "logps/rejected": -77.46446990966797,
394
+ "loss": 0.6342,
395
+ "rewards/accuracies": 0.7281249761581421,
396
+ "rewards/chosen": -0.05705754831433296,
397
+ "rewards/margins": 0.13192656636238098,
398
+ "rewards/rejected": -0.18898411095142365,
399
+ "step": 250
400
+ },
401
+ {
402
+ "epoch": 0.5266489429041651,
403
+ "grad_norm": 24.169845581054688,
404
+ "learning_rate": 9.826304320329907e-08,
405
+ "logits/chosen": -2.478874683380127,
406
+ "logits/rejected": -2.297999382019043,
407
+ "logps/chosen": -83.48451232910156,
408
+ "logps/rejected": -78.53952026367188,
409
+ "loss": 0.6226,
410
+ "rewards/accuracies": 0.7093750238418579,
411
+ "rewards/chosen": -0.07330447435379028,
412
+ "rewards/margins": 0.16096489131450653,
413
+ "rewards/rejected": -0.23426935076713562,
414
+ "step": 260
415
+ },
416
+ {
417
+ "epoch": 0.5469046714774022,
418
+ "grad_norm": 26.509349822998047,
419
+ "learning_rate": 9.794126605435884e-08,
420
+ "logits/chosen": -2.452291488647461,
421
+ "logits/rejected": -2.2766990661621094,
422
+ "logps/chosen": -83.7742691040039,
423
+ "logps/rejected": -80.86663818359375,
424
+ "loss": 0.6085,
425
+ "rewards/accuracies": 0.731249988079071,
426
+ "rewards/chosen": -0.08826017379760742,
427
+ "rewards/margins": 0.20091946423053741,
428
+ "rewards/rejected": -0.28917962312698364,
429
+ "step": 270
430
+ },
431
+ {
432
+ "epoch": 0.5671604000506393,
433
+ "grad_norm": 23.097434997558594,
434
+ "learning_rate": 9.759278142180347e-08,
435
+ "logits/chosen": -2.4537911415100098,
436
+ "logits/rejected": -2.291194438934326,
437
+ "logps/chosen": -77.87368774414062,
438
+ "logps/rejected": -77.5306625366211,
439
+ "loss": 0.6047,
440
+ "rewards/accuracies": 0.762499988079071,
441
+ "rewards/chosen": -0.08215166628360748,
442
+ "rewards/margins": 0.20681920647621155,
443
+ "rewards/rejected": -0.28897085785865784,
444
+ "step": 280
445
+ },
446
+ {
447
+ "epoch": 0.5874161286238765,
448
+ "grad_norm": 25.596263885498047,
449
+ "learning_rate": 9.72177834420916e-08,
450
+ "logits/chosen": -2.4155325889587402,
451
+ "logits/rejected": -2.2689411640167236,
452
+ "logps/chosen": -84.03662109375,
453
+ "logps/rejected": -83.03952026367188,
454
+ "loss": 0.6053,
455
+ "rewards/accuracies": 0.7250000238418579,
456
+ "rewards/chosen": -0.12551462650299072,
457
+ "rewards/margins": 0.20985326170921326,
458
+ "rewards/rejected": -0.33536791801452637,
459
+ "step": 290
460
+ },
461
+ {
462
+ "epoch": 0.6076718571971136,
463
+ "grad_norm": 32.68680191040039,
464
+ "learning_rate": 9.68164810219381e-08,
465
+ "logits/chosen": -2.4283127784729004,
466
+ "logits/rejected": -2.3022093772888184,
467
+ "logps/chosen": -74.84422302246094,
468
+ "logps/rejected": -76.46062469482422,
469
+ "loss": 0.6136,
470
+ "rewards/accuracies": 0.7093750238418579,
471
+ "rewards/chosen": -0.13987191021442413,
472
+ "rewards/margins": 0.19149479269981384,
473
+ "rewards/rejected": -0.33136671781539917,
474
+ "step": 300
475
+ },
476
+ {
477
+ "epoch": 0.6279275857703507,
478
+ "grad_norm": 26.36361312866211,
479
+ "learning_rate": 9.638909772193478e-08,
480
+ "logits/chosen": -2.3842902183532715,
481
+ "logits/rejected": -2.242034435272217,
482
+ "logps/chosen": -85.19241333007812,
483
+ "logps/rejected": -81.81166076660156,
484
+ "loss": 0.6018,
485
+ "rewards/accuracies": 0.753125011920929,
486
+ "rewards/chosen": -0.12717826664447784,
487
+ "rewards/margins": 0.21827277541160583,
488
+ "rewards/rejected": -0.3454510569572449,
489
+ "step": 310
490
+ },
491
+ {
492
+ "epoch": 0.6481833143435878,
493
+ "grad_norm": 26.719867706298828,
494
+ "learning_rate": 9.593587163200753e-08,
495
+ "logits/chosen": -2.4053542613983154,
496
+ "logits/rejected": -2.277993679046631,
497
+ "logps/chosen": -82.21893310546875,
498
+ "logps/rejected": -82.72879791259766,
499
+ "loss": 0.5946,
500
+ "rewards/accuracies": 0.765625,
501
+ "rewards/chosen": -0.14859510958194733,
502
+ "rewards/margins": 0.2397138774394989,
503
+ "rewards/rejected": -0.38830894231796265,
504
+ "step": 320
505
+ },
506
+ {
507
+ "epoch": 0.6684390429168249,
508
+ "grad_norm": 29.379661560058594,
509
+ "learning_rate": 9.545705523877943e-08,
510
+ "logits/chosen": -2.39337420463562,
511
+ "logits/rejected": -2.243393659591675,
512
+ "logps/chosen": -89.27748107910156,
513
+ "logps/rejected": -88.36190795898438,
514
+ "loss": 0.583,
515
+ "rewards/accuracies": 0.7593749761581421,
516
+ "rewards/chosen": -0.15845103561878204,
517
+ "rewards/margins": 0.27531546354293823,
518
+ "rewards/rejected": -0.43376651406288147,
519
+ "step": 330
520
+ },
521
+ {
522
+ "epoch": 0.6886947714900621,
523
+ "grad_norm": 29.51100730895996,
524
+ "learning_rate": 9.495291528491348e-08,
525
+ "logits/chosen": -2.4061636924743652,
526
+ "logits/rejected": -2.2737927436828613,
527
+ "logps/chosen": -77.5340805053711,
528
+ "logps/rejected": -78.92658996582031,
529
+ "loss": 0.6076,
530
+ "rewards/accuracies": 0.715624988079071,
531
+ "rewards/chosen": -0.18812718987464905,
532
+ "rewards/margins": 0.22010421752929688,
533
+ "rewards/rejected": -0.4082314372062683,
534
+ "step": 340
535
+ },
536
+ {
537
+ "epoch": 0.7089505000632992,
538
+ "grad_norm": 28.85425567626953,
539
+ "learning_rate": 9.442373262051371e-08,
540
+ "logits/chosen": -2.3706448078155518,
541
+ "logits/rejected": -2.207597017288208,
542
+ "logps/chosen": -84.70035552978516,
543
+ "logps/rejected": -79.90083312988281,
544
+ "loss": 0.596,
545
+ "rewards/accuracies": 0.7093750238418579,
546
+ "rewards/chosen": -0.1977623850107193,
547
+ "rewards/margins": 0.24996185302734375,
548
+ "rewards/rejected": -0.44772419333457947,
549
+ "step": 350
550
+ },
551
+ {
552
+ "epoch": 0.7292062286365363,
553
+ "grad_norm": 28.540573120117188,
554
+ "learning_rate": 9.386980204666698e-08,
555
+ "logits/chosen": -2.369175910949707,
556
+ "logits/rejected": -2.214489459991455,
557
+ "logps/chosen": -80.21549224853516,
558
+ "logps/rejected": -79.48722839355469,
559
+ "loss": 0.5941,
560
+ "rewards/accuracies": 0.7281249761581421,
561
+ "rewards/chosen": -0.21811044216156006,
562
+ "rewards/margins": 0.2636975646018982,
563
+ "rewards/rejected": -0.48180800676345825,
564
+ "step": 360
565
+ },
566
+ {
567
+ "epoch": 0.7494619572097734,
568
+ "grad_norm": 32.249847412109375,
569
+ "learning_rate": 9.3291432151213e-08,
570
+ "logits/chosen": -2.3587095737457275,
571
+ "logits/rejected": -2.218735456466675,
572
+ "logps/chosen": -85.94621276855469,
573
+ "logps/rejected": -85.6765365600586,
574
+ "loss": 0.5842,
575
+ "rewards/accuracies": 0.746874988079071,
576
+ "rewards/chosen": -0.21563701331615448,
577
+ "rewards/margins": 0.27373427152633667,
578
+ "rewards/rejected": -0.48937129974365234,
579
+ "step": 370
580
+ },
581
+ {
582
+ "epoch": 0.7697176857830105,
583
+ "grad_norm": 26.788440704345703,
584
+ "learning_rate": 9.26889451368339e-08,
585
+ "logits/chosen": -2.368450164794922,
586
+ "logits/rejected": -2.2382750511169434,
587
+ "logps/chosen": -84.80735778808594,
588
+ "logps/rejected": -83.58937072753906,
589
+ "loss": 0.5836,
590
+ "rewards/accuracies": 0.734375,
591
+ "rewards/chosen": -0.22893838584423065,
592
+ "rewards/margins": 0.27746590971946716,
593
+ "rewards/rejected": -0.5064042806625366,
594
+ "step": 380
595
+ },
596
+ {
597
+ "epoch": 0.7899734143562476,
598
+ "grad_norm": 26.557083129882812,
599
+ "learning_rate": 9.206267664155906e-08,
600
+ "logits/chosen": -2.389660120010376,
601
+ "logits/rejected": -2.2198188304901123,
602
+ "logps/chosen": -80.32451629638672,
603
+ "logps/rejected": -81.00599670410156,
604
+ "loss": 0.5576,
605
+ "rewards/accuracies": 0.753125011920929,
606
+ "rewards/chosen": -0.22655515372753143,
607
+ "rewards/margins": 0.3575701117515564,
608
+ "rewards/rejected": -0.584125280380249,
609
+ "step": 390
610
+ },
611
+ {
612
+ "epoch": 0.8102291429294848,
613
+ "grad_norm": 22.922771453857422,
614
+ "learning_rate": 9.141297555178535e-08,
615
+ "logits/chosen": -2.4124135971069336,
616
+ "logits/rejected": -2.25138521194458,
617
+ "logps/chosen": -73.63737487792969,
618
+ "logps/rejected": -74.43919372558594,
619
+ "loss": 0.5756,
620
+ "rewards/accuracies": 0.734375,
621
+ "rewards/chosen": -0.2471962869167328,
622
+ "rewards/margins": 0.31536445021629333,
623
+ "rewards/rejected": -0.5625607371330261,
624
+ "step": 400
625
+ },
626
+ {
627
+ "epoch": 0.8304848715027219,
628
+ "grad_norm": 27.076038360595703,
629
+ "learning_rate": 9.074020380791693e-08,
630
+ "logits/chosen": -2.387418270111084,
631
+ "logits/rejected": -2.233450412750244,
632
+ "logps/chosen": -75.89783477783203,
633
+ "logps/rejected": -77.44602966308594,
634
+ "loss": 0.5727,
635
+ "rewards/accuracies": 0.706250011920929,
636
+ "rewards/chosen": -0.2607758641242981,
637
+ "rewards/margins": 0.3421139121055603,
638
+ "rewards/rejected": -0.6028897762298584,
639
+ "step": 410
640
+ },
641
+ {
642
+ "epoch": 0.850740600075959,
643
+ "grad_norm": 28.83265495300293,
644
+ "learning_rate": 9.004473620273263e-08,
645
+ "logits/chosen": -2.3343796730041504,
646
+ "logits/rejected": -2.207730293273926,
647
+ "logps/chosen": -80.99537658691406,
648
+ "logps/rejected": -83.44149017333984,
649
+ "loss": 0.5767,
650
+ "rewards/accuracies": 0.7124999761581421,
651
+ "rewards/chosen": -0.2705189883708954,
652
+ "rewards/margins": 0.32222992181777954,
653
+ "rewards/rejected": -0.5927489399909973,
654
+ "step": 420
655
+ },
656
+ {
657
+ "epoch": 0.8709963286491961,
658
+ "grad_norm": 29.984909057617188,
659
+ "learning_rate": 8.932696017259361e-08,
660
+ "logits/chosen": -2.3199007511138916,
661
+ "logits/rejected": -2.1576333045959473,
662
+ "logps/chosen": -85.59019470214844,
663
+ "logps/rejected": -84.45293426513672,
664
+ "loss": 0.5712,
665
+ "rewards/accuracies": 0.7437499761581421,
666
+ "rewards/chosen": -0.25998255610466003,
667
+ "rewards/margins": 0.3359551429748535,
668
+ "rewards/rejected": -0.5959377288818359,
669
+ "step": 430
670
+ },
671
+ {
672
+ "epoch": 0.8912520572224332,
673
+ "grad_norm": 25.058645248413086,
674
+ "learning_rate": 8.858727558160743e-08,
675
+ "logits/chosen": -2.3427436351776123,
676
+ "logits/rejected": -2.1888678073883057,
677
+ "logps/chosen": -82.66050720214844,
678
+ "logps/rejected": -82.61741638183594,
679
+ "loss": 0.5589,
680
+ "rewards/accuracies": 0.762499988079071,
681
+ "rewards/chosen": -0.26411646604537964,
682
+ "rewards/margins": 0.3582358956336975,
683
+ "rewards/rejected": -0.6223524212837219,
684
+ "step": 440
685
+ },
686
+ {
687
+ "epoch": 0.9115077857956704,
688
+ "grad_norm": 27.469467163085938,
689
+ "learning_rate": 8.782609449886861e-08,
690
+ "logits/chosen": -2.325899839401245,
691
+ "logits/rejected": -2.1819796562194824,
692
+ "logps/chosen": -82.68738555908203,
693
+ "logps/rejected": -82.69558715820312,
694
+ "loss": 0.5571,
695
+ "rewards/accuracies": 0.753125011920929,
696
+ "rewards/chosen": -0.27801764011383057,
697
+ "rewards/margins": 0.37557533383369446,
698
+ "rewards/rejected": -0.6535929441452026,
699
+ "step": 450
700
+ },
701
+ {
702
+ "epoch": 0.9317635143689075,
703
+ "grad_norm": 25.38697052001953,
704
+ "learning_rate": 8.704384096890013e-08,
705
+ "logits/chosen": -2.3276028633117676,
706
+ "logits/rejected": -2.1806609630584717,
707
+ "logps/chosen": -84.02021789550781,
708
+ "logps/rejected": -83.25233459472656,
709
+ "loss": 0.5403,
710
+ "rewards/accuracies": 0.784375011920929,
711
+ "rewards/chosen": -0.2819129526615143,
712
+ "rewards/margins": 0.41642332077026367,
713
+ "rewards/rejected": -0.6983363032341003,
714
+ "step": 460
715
+ },
716
+ {
717
+ "epoch": 0.9520192429421446,
718
+ "grad_norm": 32.92365264892578,
719
+ "learning_rate": 8.62409507754235e-08,
720
+ "logits/chosen": -2.2575485706329346,
721
+ "logits/rejected": -2.1456449031829834,
722
+ "logps/chosen": -87.51969909667969,
723
+ "logps/rejected": -87.74186706542969,
724
+ "loss": 0.5594,
725
+ "rewards/accuracies": 0.7406250238418579,
726
+ "rewards/chosen": -0.3425014913082123,
727
+ "rewards/margins": 0.37962430715560913,
728
+ "rewards/rejected": -0.7221258878707886,
729
+ "step": 470
730
+ },
731
+ {
732
+ "epoch": 0.9722749715153817,
733
+ "grad_norm": 29.529056549072266,
734
+ "learning_rate": 8.541787119858902e-08,
735
+ "logits/chosen": -2.302694082260132,
736
+ "logits/rejected": -2.162090539932251,
737
+ "logps/chosen": -79.02600860595703,
738
+ "logps/rejected": -78.34095764160156,
739
+ "loss": 0.5721,
740
+ "rewards/accuracies": 0.734375,
741
+ "rewards/chosen": -0.35388362407684326,
742
+ "rewards/margins": 0.3509567975997925,
743
+ "rewards/rejected": -0.7048403024673462,
744
+ "step": 480
745
+ },
746
+ {
747
+ "epoch": 0.9925307000886188,
748
+ "grad_norm": 23.92174530029297,
749
+ "learning_rate": 8.457506076580162e-08,
750
+ "logits/chosen": -2.3030953407287598,
751
+ "logits/rejected": -2.158973217010498,
752
+ "logps/chosen": -81.10468292236328,
753
+ "logps/rejected": -83.77845764160156,
754
+ "loss": 0.547,
755
+ "rewards/accuracies": 0.753125011920929,
756
+ "rewards/chosen": -0.3333788514137268,
757
+ "rewards/margins": 0.4156631529331207,
758
+ "rewards/rejected": -0.7490419745445251,
759
+ "step": 490
760
+ },
761
+ {
762
+ "epoch": 1.012786428661856,
763
+ "grad_norm": 30.66814613342285,
764
+ "learning_rate": 8.371298899628089e-08,
765
+ "logits/chosen": -2.2549185752868652,
766
+ "logits/rejected": -2.122537612915039,
767
+ "logps/chosen": -83.01513671875,
768
+ "logps/rejected": -86.776123046875,
769
+ "loss": 0.5316,
770
+ "rewards/accuracies": 0.778124988079071,
771
+ "rewards/chosen": -0.380868136882782,
772
+ "rewards/margins": 0.471442848443985,
773
+ "rewards/rejected": -0.8523109555244446,
774
+ "step": 500
775
+ },
776
+ {
777
+ "epoch": 1.033042157235093,
778
+ "grad_norm": 26.109542846679688,
779
+ "learning_rate": 8.28321361394978e-08,
780
+ "logits/chosen": -2.2775070667266846,
781
+ "logits/rejected": -2.1255202293395996,
782
+ "logps/chosen": -81.81797790527344,
783
+ "logps/rejected": -84.06095123291016,
784
+ "loss": 0.5349,
785
+ "rewards/accuracies": 0.768750011920929,
786
+ "rewards/chosen": -0.3651012182235718,
787
+ "rewards/margins": 0.4572354853153229,
788
+ "rewards/rejected": -0.8223366737365723,
789
+ "step": 510
790
+ },
791
+ {
792
+ "epoch": 1.0532978858083302,
793
+ "grad_norm": 28.901084899902344,
794
+ "learning_rate": 8.193299290763362e-08,
795
+ "logits/chosen": -2.2764482498168945,
796
+ "logits/rejected": -2.128359317779541,
797
+ "logps/chosen": -83.87080383300781,
798
+ "logps/rejected": -83.10397338867188,
799
+ "loss": 0.5413,
800
+ "rewards/accuracies": 0.762499988079071,
801
+ "rewards/chosen": -0.38391047716140747,
802
+ "rewards/margins": 0.43836045265197754,
803
+ "rewards/rejected": -0.8222709894180298,
804
+ "step": 520
805
+ },
806
+ {
807
+ "epoch": 1.0735536143815674,
808
+ "grad_norm": 26.954652786254883,
809
+ "learning_rate": 8.101606020221038e-08,
810
+ "logits/chosen": -2.26556134223938,
811
+ "logits/rejected": -2.12338924407959,
812
+ "logps/chosen": -85.02649688720703,
813
+ "logps/rejected": -85.88096618652344,
814
+ "loss": 0.5519,
815
+ "rewards/accuracies": 0.7593749761581421,
816
+ "rewards/chosen": -0.4198201298713684,
817
+ "rewards/margins": 0.4140090048313141,
818
+ "rewards/rejected": -0.8338291049003601,
819
+ "step": 530
820
+ },
821
+ {
822
+ "epoch": 1.0938093429548044,
823
+ "grad_norm": 27.471698760986328,
824
+ "learning_rate": 8.008184883504472e-08,
825
+ "logits/chosen": -2.285780668258667,
826
+ "logits/rejected": -2.15956449508667,
827
+ "logps/chosen": -91.95730590820312,
828
+ "logps/rejected": -89.56153869628906,
829
+ "loss": 0.5388,
830
+ "rewards/accuracies": 0.778124988079071,
831
+ "rewards/chosen": -0.37540143728256226,
832
+ "rewards/margins": 0.4454117715358734,
833
+ "rewards/rejected": -0.8208131790161133,
834
+ "step": 540
835
+ },
836
+ {
837
+ "epoch": 1.1140650715280416,
838
+ "grad_norm": 27.839784622192383,
839
+ "learning_rate": 7.913087924368102e-08,
840
+ "logits/chosen": -2.272618055343628,
841
+ "logits/rejected": -2.146136522293091,
842
+ "logps/chosen": -82.60453033447266,
843
+ "logps/rejected": -84.6351547241211,
844
+ "loss": 0.5466,
845
+ "rewards/accuracies": 0.768750011920929,
846
+ "rewards/chosen": -0.3854660391807556,
847
+ "rewards/margins": 0.4508994221687317,
848
+ "rewards/rejected": -0.8363655209541321,
849
+ "step": 550
850
+ },
851
+ {
852
+ "epoch": 1.1343208001012786,
853
+ "grad_norm": 29.220504760742188,
854
+ "learning_rate": 7.816368120146224e-08,
855
+ "logits/chosen": -2.2264904975891113,
856
+ "logits/rejected": -2.1163620948791504,
857
+ "logps/chosen": -80.90209197998047,
858
+ "logps/rejected": -84.96456909179688,
859
+ "loss": 0.5371,
860
+ "rewards/accuracies": 0.778124988079071,
861
+ "rewards/chosen": -0.3850155472755432,
862
+ "rewards/margins": 0.47849899530410767,
863
+ "rewards/rejected": -0.8635146021842957,
864
+ "step": 560
865
+ },
866
+ {
867
+ "epoch": 1.1545765286745158,
868
+ "grad_norm": 24.049087524414062,
869
+ "learning_rate": 7.718079352239955e-08,
870
+ "logits/chosen": -2.2715275287628174,
871
+ "logits/rejected": -2.095773220062256,
872
+ "logps/chosen": -83.15594482421875,
873
+ "logps/rejected": -83.11366271972656,
874
+ "loss": 0.5244,
875
+ "rewards/accuracies": 0.778124988079071,
876
+ "rewards/chosen": -0.38344329595565796,
877
+ "rewards/margins": 0.4991793632507324,
878
+ "rewards/rejected": -0.8826227188110352,
879
+ "step": 570
880
+ },
881
+ {
882
+ "epoch": 1.174832257247753,
883
+ "grad_norm": 32.831993103027344,
884
+ "learning_rate": 7.618276376100587e-08,
885
+ "logits/chosen": -2.2670161724090576,
886
+ "logits/rejected": -2.106199264526367,
887
+ "logps/chosen": -81.29884338378906,
888
+ "logps/rejected": -82.74469757080078,
889
+ "loss": 0.5303,
890
+ "rewards/accuracies": 0.762499988079071,
891
+ "rewards/chosen": -0.4227599501609802,
892
+ "rewards/margins": 0.4853205680847168,
893
+ "rewards/rejected": -0.9080804586410522,
894
+ "step": 580
895
+ },
896
+ {
897
+ "epoch": 1.19508798582099,
898
+ "grad_norm": 26.217998504638672,
899
+ "learning_rate": 7.517014790726011e-08,
900
+ "logits/chosen": -2.2205467224121094,
901
+ "logits/rejected": -2.0953052043914795,
902
+ "logps/chosen": -83.64337921142578,
903
+ "logps/rejected": -86.88542938232422,
904
+ "loss": 0.5513,
905
+ "rewards/accuracies": 0.734375,
906
+ "rewards/chosen": -0.43212181329727173,
907
+ "rewards/margins": 0.44605493545532227,
908
+ "rewards/rejected": -0.8781768083572388,
909
+ "step": 590
910
+ },
911
+ {
912
+ "epoch": 1.2153437143942272,
913
+ "grad_norm": 22.538532257080078,
914
+ "learning_rate": 7.414351007687187e-08,
915
+ "logits/chosen": -2.205540418624878,
916
+ "logits/rejected": -2.0615344047546387,
917
+ "logps/chosen": -88.0082015991211,
918
+ "logps/rejected": -85.7721176147461,
919
+ "loss": 0.5173,
920
+ "rewards/accuracies": 0.793749988079071,
921
+ "rewards/chosen": -0.4331473708152771,
922
+ "rewards/margins": 0.5372229814529419,
923
+ "rewards/rejected": -0.9703702926635742,
924
+ "step": 600
925
+ },
926
+ {
927
+ "epoch": 1.2355994429674642,
928
+ "grad_norm": 29.271059036254883,
929
+ "learning_rate": 7.310342219701981e-08,
930
+ "logits/chosen": -2.2468390464782715,
931
+ "logits/rejected": -2.107861280441284,
932
+ "logps/chosen": -82.70673370361328,
933
+ "logps/rejected": -83.1991958618164,
934
+ "loss": 0.5279,
935
+ "rewards/accuracies": 0.75,
936
+ "rewards/chosen": -0.4486440122127533,
937
+ "rewards/margins": 0.5201537013053894,
938
+ "rewards/rejected": -0.9687976837158203,
939
+ "step": 610
940
+ },
941
+ {
942
+ "epoch": 1.2558551715407014,
943
+ "grad_norm": 26.889476776123047,
944
+ "learning_rate": 7.205046368773794e-08,
945
+ "logits/chosen": -2.1630682945251465,
946
+ "logits/rejected": -2.018644332885742,
947
+ "logps/chosen": -86.0914535522461,
948
+ "logps/rejected": -87.1644515991211,
949
+ "loss": 0.5364,
950
+ "rewards/accuracies": 0.731249988079071,
951
+ "rewards/chosen": -0.48391470313072205,
952
+ "rewards/margins": 0.474750280380249,
953
+ "rewards/rejected": -0.9586650133132935,
954
+ "step": 620
955
+ },
956
+ {
957
+ "epoch": 1.2761109001139386,
958
+ "grad_norm": 32.43236541748047,
959
+ "learning_rate": 7.098522113912808e-08,
960
+ "logits/chosen": -2.2398154735565186,
961
+ "logits/rejected": -2.0971333980560303,
962
+ "logps/chosen": -81.19099426269531,
963
+ "logps/rejected": -82.07807922363281,
964
+ "loss": 0.5372,
965
+ "rewards/accuracies": 0.753125011920929,
966
+ "rewards/chosen": -0.441434383392334,
967
+ "rewards/margins": 0.5117905735969543,
968
+ "rewards/rejected": -0.9532249569892883,
969
+ "step": 630
970
+ },
971
+ {
972
+ "epoch": 1.2963666286871756,
973
+ "grad_norm": 26.83734130859375,
974
+ "learning_rate": 6.990828798457764e-08,
975
+ "logits/chosen": -2.2109534740448,
976
+ "logits/rejected": -2.083139657974243,
977
+ "logps/chosen": -80.3914566040039,
978
+ "logps/rejected": -87.99849700927734,
979
+ "loss": 0.5314,
980
+ "rewards/accuracies": 0.765625,
981
+ "rewards/chosen": -0.4658758044242859,
982
+ "rewards/margins": 0.5169892311096191,
983
+ "rewards/rejected": -0.9828651547431946,
984
+ "step": 640
985
+ },
986
+ {
987
+ "epoch": 1.3166223572604128,
988
+ "grad_norm": 25.39097785949707,
989
+ "learning_rate": 6.882026417016541e-08,
990
+ "logits/chosen": -2.230027437210083,
991
+ "logits/rejected": -2.100419759750366,
992
+ "logps/chosen": -82.01457214355469,
993
+ "logps/rejected": -83.42109680175781,
994
+ "loss": 0.5346,
995
+ "rewards/accuracies": 0.746874988079071,
996
+ "rewards/chosen": -0.4833486080169678,
997
+ "rewards/margins": 0.4825879633426666,
998
+ "rewards/rejected": -0.9659366607666016,
999
+ "step": 650
1000
+ },
1001
+ {
1002
+ "epoch": 1.3368780858336498,
1003
+ "grad_norm": 30.31522560119629,
1004
+ "learning_rate": 6.772175582043889e-08,
1005
+ "logits/chosen": -2.1776843070983887,
1006
+ "logits/rejected": -2.039802074432373,
1007
+ "logps/chosen": -83.42335510253906,
1008
+ "logps/rejected": -88.11833953857422,
1009
+ "loss": 0.4909,
1010
+ "rewards/accuracies": 0.828125,
1011
+ "rewards/chosen": -0.45507392287254333,
1012
+ "rewards/margins": 0.6273630857467651,
1013
+ "rewards/rejected": -1.0824369192123413,
1014
+ "step": 660
1015
+ },
1016
+ {
1017
+ "epoch": 1.357133814406887,
1018
+ "grad_norm": 34.97041702270508,
1019
+ "learning_rate": 6.661337490075003e-08,
1020
+ "logits/chosen": -2.2355475425720215,
1021
+ "logits/rejected": -2.095804452896118,
1022
+ "logps/chosen": -83.29669952392578,
1023
+ "logps/rejected": -84.55702209472656,
1024
+ "loss": 0.5307,
1025
+ "rewards/accuracies": 0.753125011920929,
1026
+ "rewards/chosen": -0.5031043291091919,
1027
+ "rewards/margins": 0.5249863862991333,
1028
+ "rewards/rejected": -1.0280907154083252,
1029
+ "step": 670
1030
+ },
1031
+ {
1032
+ "epoch": 1.377389542980124,
1033
+ "grad_norm": 26.33791160583496,
1034
+ "learning_rate": 6.549573887633676e-08,
1035
+ "logits/chosen": -2.1734795570373535,
1036
+ "logits/rejected": -2.028352737426758,
1037
+ "logps/chosen": -83.61205291748047,
1038
+ "logps/rejected": -84.9587173461914,
1039
+ "loss": 0.4813,
1040
+ "rewards/accuracies": 0.8125,
1041
+ "rewards/chosen": -0.46062904596328735,
1042
+ "rewards/margins": 0.6703583002090454,
1043
+ "rewards/rejected": -1.1309874057769775,
1044
+ "step": 680
1045
+ },
1046
+ {
1047
+ "epoch": 1.3976452715533612,
1048
+ "grad_norm": 35.68913269042969,
1049
+ "learning_rate": 6.436947036834086e-08,
1050
+ "logits/chosen": -2.1777005195617676,
1051
+ "logits/rejected": -2.054405689239502,
1052
+ "logps/chosen": -83.69397735595703,
1053
+ "logps/rejected": -86.34680938720703,
1054
+ "loss": 0.5313,
1055
+ "rewards/accuracies": 0.765625,
1056
+ "rewards/chosen": -0.518213152885437,
1057
+ "rewards/margins": 0.5348538756370544,
1058
+ "rewards/rejected": -1.0530669689178467,
1059
+ "step": 690
1060
+ },
1061
+ {
1062
+ "epoch": 1.4179010001265984,
1063
+ "grad_norm": 30.295581817626953,
1064
+ "learning_rate": 6.323519680695349e-08,
1065
+ "logits/chosen": -2.1419105529785156,
1066
+ "logits/rejected": -1.9936020374298096,
1067
+ "logps/chosen": -90.16146087646484,
1068
+ "logps/rejected": -89.37017822265625,
1069
+ "loss": 0.522,
1070
+ "rewards/accuracies": 0.762499988079071,
1071
+ "rewards/chosen": -0.5005131363868713,
1072
+ "rewards/margins": 0.5668459534645081,
1073
+ "rewards/rejected": -1.067359209060669,
1074
+ "step": 700
1075
+ },
1076
+ {
1077
+ "epoch": 1.4381567286998354,
1078
+ "grad_norm": 26.36173439025879,
1079
+ "learning_rate": 6.209355008188152e-08,
1080
+ "logits/chosen": -2.1437783241271973,
1081
+ "logits/rejected": -2.0539603233337402,
1082
+ "logps/chosen": -89.0562973022461,
1083
+ "logps/rejected": -92.04231262207031,
1084
+ "loss": 0.5409,
1085
+ "rewards/accuracies": 0.7437499761581421,
1086
+ "rewards/chosen": -0.5396249294281006,
1087
+ "rewards/margins": 0.5132231116294861,
1088
+ "rewards/rejected": -1.0528481006622314,
1089
+ "step": 710
1090
+ },
1091
+ {
1092
+ "epoch": 1.4584124572730726,
1093
+ "grad_norm": 27.820674896240234,
1094
+ "learning_rate": 6.094516619032975e-08,
1095
+ "logits/chosen": -2.1499810218811035,
1096
+ "logits/rejected": -2.025269031524658,
1097
+ "logps/chosen": -83.47040557861328,
1098
+ "logps/rejected": -86.60690307617188,
1099
+ "loss": 0.5196,
1100
+ "rewards/accuracies": 0.7562500238418579,
1101
+ "rewards/chosen": -0.49484682083129883,
1102
+ "rewards/margins": 0.5879716277122498,
1103
+ "rewards/rejected": -1.0828183889389038,
1104
+ "step": 720
1105
+ },
1106
+ {
1107
+ "epoch": 1.4786681858463098,
1108
+ "grad_norm": 24.657302856445312,
1109
+ "learning_rate": 5.979068488269468e-08,
1110
+ "logits/chosen": -2.1996073722839355,
1111
+ "logits/rejected": -2.0537047386169434,
1112
+ "logps/chosen": -86.91001892089844,
1113
+ "logps/rejected": -89.12824249267578,
1114
+ "loss": 0.4914,
1115
+ "rewards/accuracies": 0.8062499761581421,
1116
+ "rewards/chosen": -0.542976975440979,
1117
+ "rewards/margins": 0.6267004013061523,
1118
+ "rewards/rejected": -1.169677495956421,
1119
+ "step": 730
1120
+ },
1121
+ {
1122
+ "epoch": 1.4989239144195468,
1123
+ "grad_norm": 28.313621520996094,
1124
+ "learning_rate": 5.8630749306167556e-08,
1125
+ "logits/chosen": -2.1813175678253174,
1126
+ "logits/rejected": -2.0757999420166016,
1127
+ "logps/chosen": -83.51826477050781,
1128
+ "logps/rejected": -89.63159942626953,
1129
+ "loss": 0.5115,
1130
+ "rewards/accuracies": 0.7406250238418579,
1131
+ "rewards/chosen": -0.515524685382843,
1132
+ "rewards/margins": 0.594115674495697,
1133
+ "rewards/rejected": -1.1096404790878296,
1134
+ "step": 740
1135
+ },
1136
+ {
1137
+ "epoch": 1.5191796429927837,
1138
+ "grad_norm": 29.172090530395508,
1139
+ "learning_rate": 5.7466005646445095e-08,
1140
+ "logits/chosen": -2.1559250354766846,
1141
+ "logits/rejected": -2.0057528018951416,
1142
+ "logps/chosen": -83.4648208618164,
1143
+ "logps/rejected": -85.3985366821289,
1144
+ "loss": 0.4947,
1145
+ "rewards/accuracies": 0.765625,
1146
+ "rewards/chosen": -0.5010935068130493,
1147
+ "rewards/margins": 0.6447556018829346,
1148
+ "rewards/rejected": -1.1458488702774048,
1149
+ "step": 750
1150
+ },
1151
+ {
1152
+ "epoch": 1.5394353715660212,
1153
+ "grad_norm": 30.766334533691406,
1154
+ "learning_rate": 5.6297102767747325e-08,
1155
+ "logits/chosen": -2.1724162101745605,
1156
+ "logits/rejected": -2.0574355125427246,
1157
+ "logps/chosen": -90.57199096679688,
1158
+ "logps/rejected": -92.65487670898438,
1159
+ "loss": 0.5309,
1160
+ "rewards/accuracies": 0.737500011920929,
1161
+ "rewards/chosen": -0.5361162424087524,
1162
+ "rewards/margins": 0.5525388121604919,
1163
+ "rewards/rejected": -1.0886551141738892,
1164
+ "step": 760
1165
+ },
1166
+ {
1167
+ "epoch": 1.5596911001392582,
1168
+ "grad_norm": 32.11360549926758,
1169
+ "learning_rate": 5.512469185134354e-08,
1170
+ "logits/chosen": -2.1918747425079346,
1171
+ "logits/rejected": -2.054835796356201,
1172
+ "logps/chosen": -84.68449401855469,
1173
+ "logps/rejected": -87.59661865234375,
1174
+ "loss": 0.4972,
1175
+ "rewards/accuracies": 0.778124988079071,
1176
+ "rewards/chosen": -0.5611705780029297,
1177
+ "rewards/margins": 0.6253485083580017,
1178
+ "rewards/rejected": -1.1865190267562866,
1179
+ "step": 770
1180
+ },
1181
+ {
1182
+ "epoch": 1.5799468287124951,
1183
+ "grad_norm": 28.335988998413086,
1184
+ "learning_rate": 5.394942603278726e-08,
1185
+ "logits/chosen": -2.1388983726501465,
1186
+ "logits/rejected": -2.0282931327819824,
1187
+ "logps/chosen": -89.77628326416016,
1188
+ "logps/rejected": -90.62747955322266,
1189
+ "loss": 0.5258,
1190
+ "rewards/accuracies": 0.75,
1191
+ "rewards/chosen": -0.524773895740509,
1192
+ "rewards/margins": 0.5710369944572449,
1193
+ "rewards/rejected": -1.095810890197754,
1194
+ "step": 780
1195
+ },
1196
+ {
1197
+ "epoch": 1.6002025572857324,
1198
+ "grad_norm": 25.966876983642578,
1199
+ "learning_rate": 5.277196003806249e-08,
1200
+ "logits/chosen": -2.2010576725006104,
1201
+ "logits/rejected": -2.0596330165863037,
1202
+ "logps/chosen": -79.80914306640625,
1203
+ "logps/rejected": -82.98551940917969,
1204
+ "loss": 0.4784,
1205
+ "rewards/accuracies": 0.793749988079071,
1206
+ "rewards/chosen": -0.4947393536567688,
1207
+ "rewards/margins": 0.6838704347610474,
1208
+ "rewards/rejected": -1.1786099672317505,
1209
+ "step": 790
1210
+ },
1211
+ {
1212
+ "epoch": 1.6204582858589696,
1213
+ "grad_norm": 29.524734497070312,
1214
+ "learning_rate": 5.1592949818844046e-08,
1215
+ "logits/chosen": -2.1284611225128174,
1216
+ "logits/rejected": -2.0030832290649414,
1217
+ "logps/chosen": -86.19153594970703,
1218
+ "logps/rejected": -87.28998565673828,
1219
+ "loss": 0.5126,
1220
+ "rewards/accuracies": 0.753125011920929,
1221
+ "rewards/chosen": -0.6002845764160156,
1222
+ "rewards/margins": 0.6240141987800598,
1223
+ "rewards/rejected": -1.2242988348007202,
1224
+ "step": 800
1225
+ },
1226
+ {
1227
+ "epoch": 1.6407140144322065,
1228
+ "grad_norm": 25.266782760620117,
1229
+ "learning_rate": 5.0413052187075054e-08,
1230
+ "logits/chosen": -2.168487787246704,
1231
+ "logits/rejected": -2.0182714462280273,
1232
+ "logps/chosen": -79.49079132080078,
1233
+ "logps/rejected": -81.5199203491211,
1234
+ "loss": 0.5011,
1235
+ "rewards/accuracies": 0.778124988079071,
1236
+ "rewards/chosen": -0.48622050881385803,
1237
+ "rewards/margins": 0.6199517846107483,
1238
+ "rewards/rejected": -1.1061723232269287,
1239
+ "step": 810
1240
+ },
1241
+ {
1242
+ "epoch": 1.6609697430054438,
1243
+ "grad_norm": 26.315034866333008,
1244
+ "learning_rate": 4.9232924449065095e-08,
1245
+ "logits/chosen": -2.1593496799468994,
1246
+ "logits/rejected": -2.030149459838867,
1247
+ "logps/chosen": -81.82089233398438,
1248
+ "logps/rejected": -89.67662811279297,
1249
+ "loss": 0.5042,
1250
+ "rewards/accuracies": 0.778124988079071,
1251
+ "rewards/chosen": -0.5738162994384766,
1252
+ "rewards/margins": 0.6879658102989197,
1253
+ "rewards/rejected": -1.261782169342041,
1254
+ "step": 820
1255
+ },
1256
+ {
1257
+ "epoch": 1.681225471578681,
1258
+ "grad_norm": 33.470664978027344,
1259
+ "learning_rate": 4.8053224039313114e-08,
1260
+ "logits/chosen": -2.1270673274993896,
1261
+ "logits/rejected": -2.012338638305664,
1262
+ "logps/chosen": -86.01063537597656,
1263
+ "logps/rejected": -85.80992889404297,
1264
+ "loss": 0.5459,
1265
+ "rewards/accuracies": 0.715624988079071,
1266
+ "rewards/chosen": -0.535088837146759,
1267
+ "rewards/margins": 0.49604400992393494,
1268
+ "rewards/rejected": -1.0311328172683716,
1269
+ "step": 830
1270
+ },
1271
+ {
1272
+ "epoch": 1.701481200151918,
1273
+ "grad_norm": 20.869911193847656,
1274
+ "learning_rate": 4.687460815425878e-08,
1275
+ "logits/chosen": -2.157341480255127,
1276
+ "logits/rejected": -2.007072925567627,
1277
+ "logps/chosen": -80.98677825927734,
1278
+ "logps/rejected": -83.40042114257812,
1279
+ "loss": 0.4965,
1280
+ "rewards/accuracies": 0.793749988079071,
1281
+ "rewards/chosen": -0.568179190158844,
1282
+ "rewards/margins": 0.660231351852417,
1283
+ "rewards/rejected": -1.2284104824066162,
1284
+ "step": 840
1285
+ },
1286
+ {
1287
+ "epoch": 1.721736928725155,
1288
+ "grad_norm": 23.50938606262207,
1289
+ "learning_rate": 4.5697733386166524e-08,
1290
+ "logits/chosen": -2.1210384368896484,
1291
+ "logits/rejected": -1.9905798435211182,
1292
+ "logps/chosen": -87.40711975097656,
1293
+ "logps/rejected": -86.93902587890625,
1294
+ "loss": 0.5181,
1295
+ "rewards/accuracies": 0.78125,
1296
+ "rewards/chosen": -0.5617082715034485,
1297
+ "rewards/margins": 0.592042088508606,
1298
+ "rewards/rejected": -1.1537501811981201,
1299
+ "step": 850
1300
+ },
1301
+ {
1302
+ "epoch": 1.7419926572983924,
1303
+ "grad_norm": 27.28333282470703,
1304
+ "learning_rate": 4.4523255357346187e-08,
1305
+ "logits/chosen": -2.1478943824768066,
1306
+ "logits/rejected": -2.024747371673584,
1307
+ "logps/chosen": -84.65662384033203,
1308
+ "logps/rejected": -88.84449005126953,
1309
+ "loss": 0.4993,
1310
+ "rewards/accuracies": 0.800000011920929,
1311
+ "rewards/chosen": -0.5495315790176392,
1312
+ "rewards/margins": 0.6314437985420227,
1313
+ "rewards/rejected": -1.1809751987457275,
1314
+ "step": 860
1315
+ },
1316
+ {
1317
+ "epoch": 1.7622483858716294,
1318
+ "grad_norm": 25.31683349609375,
1319
+ "learning_rate": 4.335182835491387e-08,
1320
+ "logits/chosen": -2.1592297554016113,
1321
+ "logits/rejected": -2.031510829925537,
1322
+ "logps/chosen": -83.72755432128906,
1323
+ "logps/rejected": -90.34858703613281,
1324
+ "loss": 0.5023,
1325
+ "rewards/accuracies": 0.7718750238418579,
1326
+ "rewards/chosen": -0.6199240684509277,
1327
+ "rewards/margins": 0.634971022605896,
1328
+ "rewards/rejected": -1.2548949718475342,
1329
+ "step": 870
1330
+ },
1331
+ {
1332
+ "epoch": 1.7825041144448663,
1333
+ "grad_norm": 29.31256103515625,
1334
+ "learning_rate": 4.218410496629684e-08,
1335
+ "logits/chosen": -2.1241517066955566,
1336
+ "logits/rejected": -1.9871305227279663,
1337
+ "logps/chosen": -76.77335357666016,
1338
+ "logps/rejected": -80.37085723876953,
1339
+ "loss": 0.4854,
1340
+ "rewards/accuracies": 0.7875000238418579,
1341
+ "rewards/chosen": -0.5658852458000183,
1342
+ "rewards/margins": 0.6777737736701965,
1343
+ "rewards/rejected": -1.2436590194702148,
1344
+ "step": 880
1345
+ },
1346
+ {
1347
+ "epoch": 1.8027598430181035,
1348
+ "grad_norm": 29.485347747802734,
1349
+ "learning_rate": 4.102073571568516e-08,
1350
+ "logits/chosen": -2.1224985122680664,
1351
+ "logits/rejected": -1.993857741355896,
1352
+ "logps/chosen": -86.9496078491211,
1353
+ "logps/rejected": -88.30699157714844,
1354
+ "loss": 0.506,
1355
+ "rewards/accuracies": 0.7718750238418579,
1356
+ "rewards/chosen": -0.5687106847763062,
1357
+ "rewards/margins": 0.6147977709770203,
1358
+ "rewards/rejected": -1.1835086345672607,
1359
+ "step": 890
1360
+ },
1361
+ {
1362
+ "epoch": 1.8230155715913408,
1363
+ "grad_norm": 27.672901153564453,
1364
+ "learning_rate": 3.986236870163262e-08,
1365
+ "logits/chosen": -2.1112308502197266,
1366
+ "logits/rejected": -1.99734628200531,
1367
+ "logps/chosen": -85.15098571777344,
1368
+ "logps/rejected": -90.8910140991211,
1369
+ "loss": 0.5078,
1370
+ "rewards/accuracies": 0.7749999761581421,
1371
+ "rewards/chosen": -0.5743888020515442,
1372
+ "rewards/margins": 0.6254476308822632,
1373
+ "rewards/rejected": -1.1998364925384521,
1374
+ "step": 900
1375
+ },
1376
+ {
1377
+ "epoch": 1.8432713001645777,
1378
+ "grad_norm": 28.670635223388672,
1379
+ "learning_rate": 3.870964923600923e-08,
1380
+ "logits/chosen": -2.088013172149658,
1381
+ "logits/rejected": -1.9703779220581055,
1382
+ "logps/chosen": -83.16795349121094,
1383
+ "logps/rejected": -86.55892181396484,
1384
+ "loss": 0.496,
1385
+ "rewards/accuracies": 0.762499988079071,
1386
+ "rewards/chosen": -0.5139526724815369,
1387
+ "rewards/margins": 0.6616466045379639,
1388
+ "rewards/rejected": -1.1755993366241455,
1389
+ "step": 910
1390
+ },
1391
+ {
1392
+ "epoch": 1.863527028737815,
1393
+ "grad_norm": 30.791370391845703,
1394
+ "learning_rate": 3.756321948450599e-08,
1395
+ "logits/chosen": -2.120954751968384,
1396
+ "logits/rejected": -1.9975354671478271,
1397
+ "logps/chosen": -84.26751708984375,
1398
+ "logps/rejected": -86.91252899169922,
1399
+ "loss": 0.5453,
1400
+ "rewards/accuracies": 0.7437499761581421,
1401
+ "rewards/chosen": -0.6508383750915527,
1402
+ "rewards/margins": 0.5812313556671143,
1403
+ "rewards/rejected": -1.232069730758667,
1404
+ "step": 920
1405
+ },
1406
+ {
1407
+ "epoch": 1.8837827573110522,
1408
+ "grad_norm": 33.708187103271484,
1409
+ "learning_rate": 3.642371810889222e-08,
1410
+ "logits/chosen": -2.092048168182373,
1411
+ "logits/rejected": -1.9911048412322998,
1412
+ "logps/chosen": -84.46561431884766,
1413
+ "logps/rejected": -88.94342041015625,
1414
+ "loss": 0.5404,
1415
+ "rewards/accuracies": 0.7124999761581421,
1416
+ "rewards/chosen": -0.5762837529182434,
1417
+ "rewards/margins": 0.5255244970321655,
1418
+ "rewards/rejected": -1.1018081903457642,
1419
+ "step": 930
1420
+ },
1421
+ {
1422
+ "epoch": 1.9040384858842891,
1423
+ "grad_norm": 25.534833908081055,
1424
+ "learning_rate": 3.529177991122518e-08,
1425
+ "logits/chosen": -2.066344738006592,
1426
+ "logits/rejected": -1.9486182928085327,
1427
+ "logps/chosen": -91.85676574707031,
1428
+ "logps/rejected": -94.47185516357422,
1429
+ "loss": 0.493,
1430
+ "rewards/accuracies": 0.7749999761581421,
1431
+ "rewards/chosen": -0.5671579837799072,
1432
+ "rewards/margins": 0.6500786542892456,
1433
+ "rewards/rejected": -1.2172366380691528,
1434
+ "step": 940
1435
+ },
1436
+ {
1437
+ "epoch": 1.9242942144575261,
1438
+ "grad_norm": 26.457944869995117,
1439
+ "learning_rate": 3.416803548020969e-08,
1440
+ "logits/chosen": -2.115591049194336,
1441
+ "logits/rejected": -1.9885094165802002,
1442
+ "logps/chosen": -89.46985626220703,
1443
+ "logps/rejected": -93.4120864868164,
1444
+ "loss": 0.5111,
1445
+ "rewards/accuracies": 0.753125011920929,
1446
+ "rewards/chosen": -0.5443202257156372,
1447
+ "rewards/margins": 0.6258162260055542,
1448
+ "rewards/rejected": -1.1701364517211914,
1449
+ "step": 950
1450
+ },
1451
+ {
1452
+ "epoch": 1.9445499430307633,
1453
+ "grad_norm": 27.294607162475586,
1454
+ "learning_rate": 3.305311083990496e-08,
1455
+ "logits/chosen": -2.1644487380981445,
1456
+ "logits/rejected": -2.040801525115967,
1457
+ "logps/chosen": -76.92610931396484,
1458
+ "logps/rejected": -82.45623016357422,
1459
+ "loss": 0.5196,
1460
+ "rewards/accuracies": 0.734375,
1461
+ "rewards/chosen": -0.6151694655418396,
1462
+ "rewards/margins": 0.5893052220344543,
1463
+ "rewards/rejected": -1.204474687576294,
1464
+ "step": 960
1465
+ },
1466
+ {
1467
+ "epoch": 1.9648056716040005,
1468
+ "grad_norm": 27.27229881286621,
1469
+ "learning_rate": 3.194762710097436e-08,
1470
+ "logits/chosen": -2.1350479125976562,
1471
+ "logits/rejected": -2.030987501144409,
1472
+ "logps/chosen": -80.64533996582031,
1473
+ "logps/rejected": -85.28868103027344,
1474
+ "loss": 0.5371,
1475
+ "rewards/accuracies": 0.7281249761581421,
1476
+ "rewards/chosen": -0.5596300363540649,
1477
+ "rewards/margins": 0.5202323794364929,
1478
+ "rewards/rejected": -1.0798624753952026,
1479
+ "step": 970
1480
+ },
1481
+ {
1482
+ "epoch": 1.9850614001772375,
1483
+ "grad_norm": 24.55060386657715,
1484
+ "learning_rate": 3.0852200114672453e-08,
1485
+ "logits/chosen": -2.127375602722168,
1486
+ "logits/rejected": -1.991199254989624,
1487
+ "logps/chosen": -85.77812957763672,
1488
+ "logps/rejected": -89.46642303466797,
1489
+ "loss": 0.5081,
1490
+ "rewards/accuracies": 0.7749999761581421,
1491
+ "rewards/chosen": -0.5709745287895203,
1492
+ "rewards/margins": 0.6586155891418457,
1493
+ "rewards/rejected": -1.2295901775360107,
1494
+ "step": 980
1495
+ },
1496
+ {
1497
+ "epoch": 2.005317128750475,
1498
+ "grad_norm": 25.645376205444336,
1499
+ "learning_rate": 2.976744012976189e-08,
1500
+ "logits/chosen": -2.1159276962280273,
1501
+ "logits/rejected": -1.9866254329681396,
1502
+ "logps/chosen": -87.69465637207031,
1503
+ "logps/rejected": -88.93052673339844,
1504
+ "loss": 0.4894,
1505
+ "rewards/accuracies": 0.800000011920929,
1506
+ "rewards/chosen": -0.5695109367370605,
1507
+ "rewards/margins": 0.6490300893783569,
1508
+ "rewards/rejected": -1.218540906906128,
1509
+ "step": 990
1510
+ },
1511
+ {
1512
+ "epoch": 2.025572857323712,
1513
+ "grad_norm": 27.878236770629883,
1514
+ "learning_rate": 2.8693951452551307e-08,
1515
+ "logits/chosen": -2.0782949924468994,
1516
+ "logits/rejected": -1.9823192358016968,
1517
+ "logps/chosen": -79.88198852539062,
1518
+ "logps/rejected": -86.08316802978516,
1519
+ "loss": 0.5315,
1520
+ "rewards/accuracies": 0.7437499761581421,
1521
+ "rewards/chosen": -0.5980249643325806,
1522
+ "rewards/margins": 0.5769001245498657,
1523
+ "rewards/rejected": -1.1749250888824463,
1524
+ "step": 1000
1525
+ },
1526
+ {
1527
+ "epoch": 2.045828585896949,
1528
+ "grad_norm": 28.799379348754883,
1529
+ "learning_rate": 2.7632332110243967e-08,
1530
+ "logits/chosen": -2.0895416736602783,
1531
+ "logits/rejected": -1.975619912147522,
1532
+ "logps/chosen": -86.46625518798828,
1533
+ "logps/rejected": -90.41603088378906,
1534
+ "loss": 0.4972,
1535
+ "rewards/accuracies": 0.793749988079071,
1536
+ "rewards/chosen": -0.5257114171981812,
1537
+ "rewards/margins": 0.6659296751022339,
1538
+ "rewards/rejected": -1.1916412115097046,
1539
+ "step": 1010
1540
+ },
1541
+ {
1542
+ "epoch": 2.066084314470186,
1543
+ "grad_norm": 48.88608169555664,
1544
+ "learning_rate": 2.658317351778412e-08,
1545
+ "logits/chosen": -2.099612236022949,
1546
+ "logits/rejected": -1.9862468242645264,
1547
+ "logps/chosen": -86.78905487060547,
1548
+ "logps/rejected": -92.29573059082031,
1549
+ "loss": 0.4904,
1550
+ "rewards/accuracies": 0.762499988079071,
1551
+ "rewards/chosen": -0.564150869846344,
1552
+ "rewards/margins": 0.7153151631355286,
1553
+ "rewards/rejected": -1.279465913772583,
1554
+ "step": 1020
1555
+ },
1556
+ {
1557
+ "epoch": 2.0863400430434234,
1558
+ "grad_norm": 24.239578247070312,
1559
+ "learning_rate": 2.554706014838705e-08,
1560
+ "logits/chosen": -2.1574556827545166,
1561
+ "logits/rejected": -2.014895439147949,
1562
+ "logps/chosen": -84.7563247680664,
1563
+ "logps/rejected": -85.77194213867188,
1564
+ "loss": 0.4855,
1565
+ "rewards/accuracies": 0.800000011920929,
1566
+ "rewards/chosen": -0.5780550837516785,
1567
+ "rewards/margins": 0.6501516103744507,
1568
+ "rewards/rejected": -1.2282066345214844,
1569
+ "step": 1030
1570
+ },
1571
+ {
1572
+ "epoch": 2.1065957716166603,
1573
+ "grad_norm": 28.262638092041016,
1574
+ "learning_rate": 2.4524569207936445e-08,
1575
+ "logits/chosen": -2.0934982299804688,
1576
+ "logits/rejected": -1.962937355041504,
1577
+ "logps/chosen": -86.09654235839844,
1578
+ "logps/rejected": -91.18133544921875,
1579
+ "loss": 0.4598,
1580
+ "rewards/accuracies": 0.815625011920929,
1581
+ "rewards/chosen": -0.567767322063446,
1582
+ "rewards/margins": 0.8064430356025696,
1583
+ "rewards/rejected": -1.374210238456726,
1584
+ "step": 1040
1585
+ },
1586
+ {
1587
+ "epoch": 2.1268515001898973,
1588
+ "grad_norm": 30.161561965942383,
1589
+ "learning_rate": 2.351627031343008e-08,
1590
+ "logits/chosen": -2.134225368499756,
1591
+ "logits/rejected": -1.9962198734283447,
1592
+ "logps/chosen": -87.08121490478516,
1593
+ "logps/rejected": -90.91963958740234,
1594
+ "loss": 0.5068,
1595
+ "rewards/accuracies": 0.7718750238418579,
1596
+ "rewards/chosen": -0.5947022438049316,
1597
+ "rewards/margins": 0.6418746113777161,
1598
+ "rewards/rejected": -1.236576795578003,
1599
+ "step": 1050
1600
+ },
1601
+ {
1602
+ "epoch": 2.1471072287631348,
1603
+ "grad_norm": 29.3469295501709,
1604
+ "learning_rate": 2.2522725175653233e-08,
1605
+ "logits/chosen": -2.0764639377593994,
1606
+ "logits/rejected": -1.9766466617584229,
1607
+ "logps/chosen": -87.24481964111328,
1608
+ "logps/rejected": -93.39856719970703,
1609
+ "loss": 0.533,
1610
+ "rewards/accuracies": 0.7562500238418579,
1611
+ "rewards/chosen": -0.6166614294052124,
1612
+ "rewards/margins": 0.5650585293769836,
1613
+ "rewards/rejected": -1.1817197799682617,
1614
+ "step": 1060
1615
+ },
1616
+ {
1617
+ "epoch": 2.1673629573363717,
1618
+ "grad_norm": 23.277862548828125,
1619
+ "learning_rate": 2.154448728625668e-08,
1620
+ "logits/chosen": -2.1141998767852783,
1621
+ "logits/rejected": -1.9909133911132812,
1622
+ "logps/chosen": -84.21327209472656,
1623
+ "logps/rejected": -86.62323760986328,
1624
+ "loss": 0.4652,
1625
+ "rewards/accuracies": 0.800000011920929,
1626
+ "rewards/chosen": -0.5331219434738159,
1627
+ "rewards/margins": 0.7241859436035156,
1628
+ "rewards/rejected": -1.257307767868042,
1629
+ "step": 1070
1630
+ },
1631
+ {
1632
+ "epoch": 2.1876186859096087,
1633
+ "grad_norm": 26.303924560546875,
1634
+ "learning_rate": 2.0582101609413333e-08,
1635
+ "logits/chosen": -2.0846378803253174,
1636
+ "logits/rejected": -1.9430017471313477,
1637
+ "logps/chosen": -90.30846405029297,
1638
+ "logps/rejected": -91.7738265991211,
1639
+ "loss": 0.483,
1640
+ "rewards/accuracies": 0.7875000238418579,
1641
+ "rewards/chosen": -0.5784596800804138,
1642
+ "rewards/margins": 0.7019392251968384,
1643
+ "rewards/rejected": -1.2803988456726074,
1644
+ "step": 1080
1645
+ },
1646
+ {
1647
+ "epoch": 2.2078744144828457,
1648
+ "grad_norm": 29.66044044494629,
1649
+ "learning_rate": 1.9636104278225413e-08,
1650
+ "logits/chosen": -2.113520860671997,
1651
+ "logits/rejected": -2.006913661956787,
1652
+ "logps/chosen": -87.39739990234375,
1653
+ "logps/rejected": -91.94351959228516,
1654
+ "loss": 0.4885,
1655
+ "rewards/accuracies": 0.793749988079071,
1656
+ "rewards/chosen": -0.6303068995475769,
1657
+ "rewards/margins": 0.7132034301757812,
1658
+ "rewards/rejected": -1.343510389328003,
1659
+ "step": 1090
1660
+ },
1661
+ {
1662
+ "epoch": 2.228130143056083,
1663
+ "grad_norm": 31.82991600036621,
1664
+ "learning_rate": 1.8707022296051462e-08,
1665
+ "logits/chosen": -2.1318724155426025,
1666
+ "logits/rejected": -1.997719407081604,
1667
+ "logps/chosen": -83.96778869628906,
1668
+ "logps/rejected": -91.7020492553711,
1669
+ "loss": 0.4837,
1670
+ "rewards/accuracies": 0.7749999761581421,
1671
+ "rewards/chosen": -0.5827969312667847,
1672
+ "rewards/margins": 0.7294033765792847,
1673
+ "rewards/rejected": -1.3122001886367798,
1674
+ "step": 1100
1675
+ },
1676
+ {
1677
+ "epoch": 2.24838587162932,
1678
+ "grad_norm": 30.810279846191406,
1679
+ "learning_rate": 1.779537324291926e-08,
1680
+ "logits/chosen": -2.087120771408081,
1681
+ "logits/rejected": -1.974585771560669,
1682
+ "logps/chosen": -85.29585266113281,
1683
+ "logps/rejected": -90.92556762695312,
1684
+ "loss": 0.5036,
1685
+ "rewards/accuracies": 0.7749999761581421,
1686
+ "rewards/chosen": -0.6058118939399719,
1687
+ "rewards/margins": 0.6656503081321716,
1688
+ "rewards/rejected": -1.2714622020721436,
1689
+ "step": 1110
1690
+ },
1691
+ {
1692
+ "epoch": 2.268641600202557,
1693
+ "grad_norm": 24.00498390197754,
1694
+ "learning_rate": 1.6901664987188425e-08,
1695
+ "logits/chosen": -2.0903851985931396,
1696
+ "logits/rejected": -1.9751678705215454,
1697
+ "logps/chosen": -83.34523010253906,
1698
+ "logps/rejected": -86.21971130371094,
1699
+ "loss": 0.5044,
1700
+ "rewards/accuracies": 0.7437499761581421,
1701
+ "rewards/chosen": -0.5954999923706055,
1702
+ "rewards/margins": 0.6783249378204346,
1703
+ "rewards/rejected": -1.27382493019104,
1704
+ "step": 1120
1705
+ },
1706
+ {
1707
+ "epoch": 2.2888973287757945,
1708
+ "grad_norm": 39.50699234008789,
1709
+ "learning_rate": 1.6026395402623272e-08,
1710
+ "logits/chosen": -2.0663511753082275,
1711
+ "logits/rejected": -1.9365609884262085,
1712
+ "logps/chosen": -94.30004119873047,
1713
+ "logps/rejected": -97.46401977539062,
1714
+ "loss": 0.4973,
1715
+ "rewards/accuracies": 0.7562500238418579,
1716
+ "rewards/chosen": -0.6117586493492126,
1717
+ "rewards/margins": 0.6866195797920227,
1718
+ "rewards/rejected": -1.2983782291412354,
1719
+ "step": 1130
1720
+ },
1721
+ {
1722
+ "epoch": 2.3091530573490315,
1723
+ "grad_norm": 26.588275909423828,
1724
+ "learning_rate": 1.5170052091033552e-08,
1725
+ "logits/chosen": -2.1126387119293213,
1726
+ "logits/rejected": -1.959896445274353,
1727
+ "logps/chosen": -80.80674743652344,
1728
+ "logps/rejected": -83.63319396972656,
1729
+ "loss": 0.4726,
1730
+ "rewards/accuracies": 0.8218749761581421,
1731
+ "rewards/chosen": -0.616263747215271,
1732
+ "rewards/margins": 0.7580442428588867,
1733
+ "rewards/rejected": -1.3743079900741577,
1734
+ "step": 1140
1735
+ },
1736
+ {
1737
+ "epoch": 2.3294087859222685,
1738
+ "grad_norm": 28.325511932373047,
1739
+ "learning_rate": 1.4333112110637453e-08,
1740
+ "logits/chosen": -2.064669609069824,
1741
+ "logits/rejected": -1.9410665035247803,
1742
+ "logps/chosen": -84.78388977050781,
1743
+ "logps/rejected": -87.63703155517578,
1744
+ "loss": 0.493,
1745
+ "rewards/accuracies": 0.7875000238418579,
1746
+ "rewards/chosen": -0.5784262418746948,
1747
+ "rewards/margins": 0.691449761390686,
1748
+ "rewards/rejected": -1.2698760032653809,
1749
+ "step": 1150
1750
+ },
1751
+ {
1752
+ "epoch": 2.349664514495506,
1753
+ "grad_norm": 27.597017288208008,
1754
+ "learning_rate": 1.3516041710298498e-08,
1755
+ "logits/chosen": -2.1402578353881836,
1756
+ "logits/rejected": -2.004826068878174,
1757
+ "logps/chosen": -87.74010467529297,
1758
+ "logps/rejected": -89.4506607055664,
1759
+ "loss": 0.5047,
1760
+ "rewards/accuracies": 0.762499988079071,
1761
+ "rewards/chosen": -0.5920398831367493,
1762
+ "rewards/margins": 0.6377407908439636,
1763
+ "rewards/rejected": -1.2297805547714233,
1764
+ "step": 1160
1765
+ },
1766
+ {
1767
+ "epoch": 2.369920243068743,
1768
+ "grad_norm": 33.10106658935547,
1769
+ "learning_rate": 1.2719296069784063e-08,
1770
+ "logits/chosen": -2.062407970428467,
1771
+ "logits/rejected": -1.9447336196899414,
1772
+ "logps/chosen": -89.19010925292969,
1773
+ "logps/rejected": -95.0318374633789,
1774
+ "loss": 0.4953,
1775
+ "rewards/accuracies": 0.8187500238418579,
1776
+ "rewards/chosen": -0.5857541561126709,
1777
+ "rewards/margins": 0.7076044082641602,
1778
+ "rewards/rejected": -1.293358564376831,
1779
+ "step": 1170
1780
+ },
1781
+ {
1782
+ "epoch": 2.39017597164198,
1783
+ "grad_norm": 33.07532501220703,
1784
+ "learning_rate": 1.1943319046190332e-08,
1785
+ "logits/chosen": -2.074035167694092,
1786
+ "logits/rejected": -1.965685486793518,
1787
+ "logps/chosen": -80.5416030883789,
1788
+ "logps/rejected": -84.71125030517578,
1789
+ "loss": 0.4871,
1790
+ "rewards/accuracies": 0.8062499761581421,
1791
+ "rewards/chosen": -0.5754114389419556,
1792
+ "rewards/margins": 0.7324446439743042,
1793
+ "rewards/rejected": -1.3078559637069702,
1794
+ "step": 1180
1795
+ },
1796
+ {
1797
+ "epoch": 2.4104317002152174,
1798
+ "grad_norm": 26.195051193237305,
1799
+ "learning_rate": 1.1188542926675104e-08,
1800
+ "logits/chosen": -2.117806911468506,
1801
+ "logits/rejected": -1.9781955480575562,
1802
+ "logps/chosen": -86.0428466796875,
1803
+ "logps/rejected": -90.29086303710938,
1804
+ "loss": 0.4505,
1805
+ "rewards/accuracies": 0.831250011920929,
1806
+ "rewards/chosen": -0.5670899152755737,
1807
+ "rewards/margins": 0.8041001558303833,
1808
+ "rewards/rejected": -1.371190071105957,
1809
+ "step": 1190
1810
+ },
1811
+ {
1812
+ "epoch": 2.4306874287884543,
1813
+ "grad_norm": 23.817218780517578,
1814
+ "learning_rate": 1.0455388187635933e-08,
1815
+ "logits/chosen": -2.1228573322296143,
1816
+ "logits/rejected": -1.9943969249725342,
1817
+ "logps/chosen": -81.49883270263672,
1818
+ "logps/rejected": -83.27964782714844,
1819
+ "loss": 0.4844,
1820
+ "rewards/accuracies": 0.7718750238418579,
1821
+ "rewards/chosen": -0.6024131774902344,
1822
+ "rewards/margins": 0.6800934076309204,
1823
+ "rewards/rejected": -1.2825065851211548,
1824
+ "step": 1200
1825
+ },
1826
+ {
1827
+ "epoch": 2.4509431573616913,
1828
+ "grad_norm": 30.316747665405273,
1829
+ "learning_rate": 9.744263260468005e-09,
1830
+ "logits/chosen": -2.059378147125244,
1831
+ "logits/rejected": -1.9458458423614502,
1832
+ "logps/chosen": -92.07587432861328,
1833
+ "logps/rejected": -95.30807495117188,
1834
+ "loss": 0.4914,
1835
+ "rewards/accuracies": 0.7906249761581421,
1836
+ "rewards/chosen": -0.5844647884368896,
1837
+ "rewards/margins": 0.7100616097450256,
1838
+ "rewards/rejected": -1.2945263385772705,
1839
+ "step": 1210
1840
+ },
1841
+ {
1842
+ "epoch": 2.4711988859349283,
1843
+ "grad_norm": 28.162200927734375,
1844
+ "learning_rate": 9.055564304031981e-09,
1845
+ "logits/chosen": -2.082139730453491,
1846
+ "logits/rejected": -1.9701576232910156,
1847
+ "logps/chosen": -86.1491928100586,
1848
+ "logps/rejected": -92.10936737060547,
1849
+ "loss": 0.4954,
1850
+ "rewards/accuracies": 0.7718750238418579,
1851
+ "rewards/chosen": -0.5959383845329285,
1852
+ "rewards/margins": 0.7093779444694519,
1853
+ "rewards/rejected": -1.3053163290023804,
1854
+ "step": 1220
1855
+ },
1856
+ {
1857
+ "epoch": 2.4914546145081657,
1858
+ "grad_norm": 29.766904830932617,
1859
+ "learning_rate": 8.38967498395895e-09,
1860
+ "logits/chosen": -2.1094155311584473,
1861
+ "logits/rejected": -1.9789727926254272,
1862
+ "logps/chosen": -80.73603820800781,
1863
+ "logps/rejected": -84.64231872558594,
1864
+ "loss": 0.4904,
1865
+ "rewards/accuracies": 0.800000011920929,
1866
+ "rewards/chosen": -0.5383921265602112,
1867
+ "rewards/margins": 0.6760575175285339,
1868
+ "rewards/rejected": -1.2144496440887451,
1869
+ "step": 1230
1870
+ },
1871
+ {
1872
+ "epoch": 2.5117103430814027,
1873
+ "grad_norm": 35.819358825683594,
1874
+ "learning_rate": 7.746966258914988e-09,
1875
+ "logits/chosen": -2.1154112815856934,
1876
+ "logits/rejected": -1.9815971851348877,
1877
+ "logps/chosen": -86.95155334472656,
1878
+ "logps/rejected": -85.64454650878906,
1879
+ "loss": 0.5197,
1880
+ "rewards/accuracies": 0.734375,
1881
+ "rewards/chosen": -0.6145626306533813,
1882
+ "rewards/margins": 0.631356418132782,
1883
+ "rewards/rejected": -1.2459189891815186,
1884
+ "step": 1240
1885
+ },
1886
+ {
1887
+ "epoch": 2.5319660716546397,
1888
+ "grad_norm": 29.06343650817871,
1889
+ "learning_rate": 7.127796173944695e-09,
1890
+ "logits/chosen": -2.085669994354248,
1891
+ "logits/rejected": -1.9675817489624023,
1892
+ "logps/chosen": -89.59977722167969,
1893
+ "logps/rejected": -91.9139404296875,
1894
+ "loss": 0.4879,
1895
+ "rewards/accuracies": 0.796875,
1896
+ "rewards/chosen": -0.5644815564155579,
1897
+ "rewards/margins": 0.7162417769432068,
1898
+ "rewards/rejected": -1.2807233333587646,
1899
+ "step": 1250
1900
+ },
1901
+ {
1902
+ "epoch": 2.552221800227877,
1903
+ "grad_norm": 32.731590270996094,
1904
+ "learning_rate": 6.532509661008789e-09,
1905
+ "logits/chosen": -2.1157116889953613,
1906
+ "logits/rejected": -1.9942877292633057,
1907
+ "logps/chosen": -81.25712585449219,
1908
+ "logps/rejected": -85.86751556396484,
1909
+ "loss": 0.4855,
1910
+ "rewards/accuracies": 0.7749999761581421,
1911
+ "rewards/chosen": -0.5965838432312012,
1912
+ "rewards/margins": 0.7261613011360168,
1913
+ "rewards/rejected": -1.3227452039718628,
1914
+ "step": 1260
1915
+ },
1916
+ {
1917
+ "epoch": 2.572477528801114,
1918
+ "grad_norm": 21.976213455200195,
1919
+ "learning_rate": 5.9614383468267916e-09,
1920
+ "logits/chosen": -2.064387321472168,
1921
+ "logits/rejected": -1.939223289489746,
1922
+ "logps/chosen": -87.50946807861328,
1923
+ "logps/rejected": -90.81806945800781,
1924
+ "loss": 0.4677,
1925
+ "rewards/accuracies": 0.78125,
1926
+ "rewards/chosen": -0.5381378531455994,
1927
+ "rewards/margins": 0.7613744735717773,
1928
+ "rewards/rejected": -1.2995123863220215,
1929
+ "step": 1270
1930
+ },
1931
+ {
1932
+ "epoch": 2.592733257374351,
1933
+ "grad_norm": 26.645793914794922,
1934
+ "learning_rate": 5.4149003681318525e-09,
1935
+ "logits/chosen": -2.0983309745788574,
1936
+ "logits/rejected": -1.9681360721588135,
1937
+ "logps/chosen": -86.5027847290039,
1938
+ "logps/rejected": -88.28022003173828,
1939
+ "loss": 0.4846,
1940
+ "rewards/accuracies": 0.784375011920929,
1941
+ "rewards/chosen": -0.6178566217422485,
1942
+ "rewards/margins": 0.7156898975372314,
1943
+ "rewards/rejected": -1.3335466384887695,
1944
+ "step": 1280
1945
+ },
1946
+ {
1947
+ "epoch": 2.612988985947588,
1948
+ "grad_norm": 34.658424377441406,
1949
+ "learning_rate": 4.8932001944408e-09,
1950
+ "logits/chosen": -2.114567995071411,
1951
+ "logits/rejected": -2.003492832183838,
1952
+ "logps/chosen": -86.31463623046875,
1953
+ "logps/rejected": -87.3377456665039,
1954
+ "loss": 0.5289,
1955
+ "rewards/accuracies": 0.746874988079071,
1956
+ "rewards/chosen": -0.5726319551467896,
1957
+ "rewards/margins": 0.5831801891326904,
1958
+ "rewards/rejected": -1.1558120250701904,
1959
+ "step": 1290
1960
+ },
1961
+ {
1962
+ "epoch": 2.6332447145208255,
1963
+ "grad_norm": 29.894866943359375,
1964
+ "learning_rate": 4.396628458437912e-09,
1965
+ "logits/chosen": -2.057438373565674,
1966
+ "logits/rejected": -1.9271215200424194,
1967
+ "logps/chosen": -86.70679473876953,
1968
+ "logps/rejected": -89.4132308959961,
1969
+ "loss": 0.4833,
1970
+ "rewards/accuracies": 0.828125,
1971
+ "rewards/chosen": -0.5987198948860168,
1972
+ "rewards/margins": 0.7157739996910095,
1973
+ "rewards/rejected": -1.3144938945770264,
1974
+ "step": 1300
1975
+ },
1976
+ {
1977
+ "epoch": 2.6535004430940625,
1978
+ "grad_norm": 28.439125061035156,
1979
+ "learning_rate": 3.9254617940670474e-09,
1980
+ "logits/chosen": -2.0954787731170654,
1981
+ "logits/rejected": -1.9630225896835327,
1982
+ "logps/chosen": -84.08492279052734,
1983
+ "logps/rejected": -87.75090026855469,
1984
+ "loss": 0.4635,
1985
+ "rewards/accuracies": 0.800000011920929,
1986
+ "rewards/chosen": -0.523126482963562,
1987
+ "rewards/margins": 0.7484750151634216,
1988
+ "rewards/rejected": -1.2716015577316284,
1989
+ "step": 1310
1990
+ },
1991
+ {
1992
+ "epoch": 2.6737561716672995,
1993
+ "grad_norm": 25.333951950073242,
1994
+ "learning_rate": 3.479962682422366e-09,
1995
+ "logits/chosen": -2.1200668811798096,
1996
+ "logits/rejected": -1.9594342708587646,
1997
+ "logps/chosen": -83.06964111328125,
1998
+ "logps/rejected": -84.25973510742188,
1999
+ "loss": 0.4752,
2000
+ "rewards/accuracies": 0.8062499761581421,
2001
+ "rewards/chosen": -0.6142527461051941,
2002
+ "rewards/margins": 0.7399193048477173,
2003
+ "rewards/rejected": -1.3541719913482666,
2004
+ "step": 1320
2005
+ },
2006
+ {
2007
+ "epoch": 2.694011900240537,
2008
+ "grad_norm": 38.79462814331055,
2009
+ "learning_rate": 3.0603793055233194e-09,
2010
+ "logits/chosen": -2.078015089035034,
2011
+ "logits/rejected": -1.9544031620025635,
2012
+ "logps/chosen": -86.95954895019531,
2013
+ "logps/rejected": -87.41011810302734,
2014
+ "loss": 0.5153,
2015
+ "rewards/accuracies": 0.7562500238418579,
2016
+ "rewards/chosen": -0.6789627075195312,
2017
+ "rewards/margins": 0.6111471652984619,
2018
+ "rewards/rejected": -1.2901098728179932,
2019
+ "step": 1330
2020
+ },
2021
+ {
2022
+ "epoch": 2.714267628813774,
2023
+ "grad_norm": 25.715852737426758,
2024
+ "learning_rate": 2.6669454080555707e-09,
2025
+ "logits/chosen": -2.081672191619873,
2026
+ "logits/rejected": -1.9599504470825195,
2027
+ "logps/chosen": -81.85166931152344,
2028
+ "logps/rejected": -84.5459213256836,
2029
+ "loss": 0.4941,
2030
+ "rewards/accuracies": 0.78125,
2031
+ "rewards/chosen": -0.5518099069595337,
2032
+ "rewards/margins": 0.6579364538192749,
2033
+ "rewards/rejected": -1.2097463607788086,
2034
+ "step": 1340
2035
+ },
2036
+ {
2037
+ "epoch": 2.734523357387011,
2038
+ "grad_norm": 31.674190521240234,
2039
+ "learning_rate": 2.299880167154694e-09,
2040
+ "logits/chosen": -2.0664610862731934,
2041
+ "logits/rejected": -1.9412147998809814,
2042
+ "logps/chosen": -86.22771453857422,
2043
+ "logps/rejected": -90.61781311035156,
2044
+ "loss": 0.4909,
2045
+ "rewards/accuracies": 0.8125,
2046
+ "rewards/chosen": -0.5787885785102844,
2047
+ "rewards/margins": 0.6425621509552002,
2048
+ "rewards/rejected": -1.2213506698608398,
2049
+ "step": 1350
2050
+ },
2051
+ {
2052
+ "epoch": 2.754779085960248,
2053
+ "grad_norm": 21.180307388305664,
2054
+ "learning_rate": 1.959388070305368e-09,
2055
+ "logits/chosen": -2.1191658973693848,
2056
+ "logits/rejected": -1.9780559539794922,
2057
+ "logps/chosen": -84.12895202636719,
2058
+ "logps/rejected": -87.06913757324219,
2059
+ "loss": 0.4683,
2060
+ "rewards/accuracies": 0.784375011920929,
2061
+ "rewards/chosen": -0.598002016544342,
2062
+ "rewards/margins": 0.7502027750015259,
2063
+ "rewards/rejected": -1.3482048511505127,
2064
+ "step": 1360
2065
+ },
2066
+ {
2067
+ "epoch": 2.7750348145334853,
2068
+ "grad_norm": 31.467397689819336,
2069
+ "learning_rate": 1.6456588014238826e-09,
2070
+ "logits/chosen": -2.0679941177368164,
2071
+ "logits/rejected": -1.984758973121643,
2072
+ "logps/chosen": -82.20966339111328,
2073
+ "logps/rejected": -89.56126403808594,
2074
+ "loss": 0.5158,
2075
+ "rewards/accuracies": 0.7437499761581421,
2076
+ "rewards/chosen": -0.6504064798355103,
2077
+ "rewards/margins": 0.6637855768203735,
2078
+ "rewards/rejected": -1.3141919374465942,
2079
+ "step": 1370
2080
+ },
2081
+ {
2082
+ "epoch": 2.7952905431067223,
2083
+ "grad_norm": 25.813554763793945,
2084
+ "learning_rate": 1.3588671351876358e-09,
2085
+ "logits/chosen": -2.088512897491455,
2086
+ "logits/rejected": -1.9785858392715454,
2087
+ "logps/chosen": -86.8198013305664,
2088
+ "logps/rejected": -90.5335922241211,
2089
+ "loss": 0.4835,
2090
+ "rewards/accuracies": 0.7749999761581421,
2091
+ "rewards/chosen": -0.6225306987762451,
2092
+ "rewards/margins": 0.7479228973388672,
2093
+ "rewards/rejected": -1.3704535961151123,
2094
+ "step": 1380
2095
+ },
2096
+ {
2097
+ "epoch": 2.8155462716799597,
2098
+ "grad_norm": 27.169641494750977,
2099
+ "learning_rate": 1.099172839670298e-09,
2100
+ "logits/chosen": -2.0676891803741455,
2101
+ "logits/rejected": -1.9737918376922607,
2102
+ "logps/chosen": -78.63626861572266,
2103
+ "logps/rejected": -83.41529083251953,
2104
+ "loss": 0.5417,
2105
+ "rewards/accuracies": 0.715624988079071,
2106
+ "rewards/chosen": -0.6757029294967651,
2107
+ "rewards/margins": 0.6025967597961426,
2108
+ "rewards/rejected": -1.2782996892929077,
2109
+ "step": 1390
2110
+ },
2111
+ {
2112
+ "epoch": 2.8358020002531967,
2113
+ "grad_norm": 27.115802764892578,
2114
+ "learning_rate": 8.66720587337011e-10,
2115
+ "logits/chosen": -2.065960645675659,
2116
+ "logits/rejected": -1.9609451293945312,
2117
+ "logps/chosen": -87.2630615234375,
2118
+ "logps/rejected": -90.57234191894531,
2119
+ "loss": 0.5015,
2120
+ "rewards/accuracies": 0.762499988079071,
2121
+ "rewards/chosen": -0.619225263595581,
2122
+ "rewards/margins": 0.6657966375350952,
2123
+ "rewards/rejected": -1.2850219011306763,
2124
+ "step": 1400
2125
+ },
2126
+ {
2127
+ "epoch": 2.8560577288264337,
2128
+ "grad_norm": 24.163036346435547,
2129
+ "learning_rate": 6.616398744491825e-10,
2130
+ "logits/chosen": -2.079883337020874,
2131
+ "logits/rejected": -1.9338324069976807,
2132
+ "logps/chosen": -88.01893615722656,
2133
+ "logps/rejected": -89.63729095458984,
2134
+ "loss": 0.4572,
2135
+ "rewards/accuracies": 0.78125,
2136
+ "rewards/chosen": -0.5343765020370483,
2137
+ "rewards/margins": 0.7622194886207581,
2138
+ "rewards/rejected": -1.2965959310531616,
2139
+ "step": 1410
2140
+ },
2141
+ {
2142
+ "epoch": 2.8763134573996707,
2143
+ "grad_norm": 33.94268798828125,
2144
+ "learning_rate": 4.840449489236786e-10,
2145
+ "logits/chosen": -2.0775258541107178,
2146
+ "logits/rejected": -1.9470727443695068,
2147
+ "logps/chosen": -83.77029418945312,
2148
+ "logps/rejected": -87.10340881347656,
2149
+ "loss": 0.478,
2150
+ "rewards/accuracies": 0.809374988079071,
2151
+ "rewards/chosen": -0.578041672706604,
2152
+ "rewards/margins": 0.695804238319397,
2153
+ "rewards/rejected": -1.273845911026001,
2154
+ "step": 1420
2155
+ },
2156
+ {
2157
+ "epoch": 2.8965691859729077,
2158
+ "grad_norm": 31.864702224731445,
2159
+ "learning_rate": 3.3403474668677324e-10,
2160
+ "logits/chosen": -2.063178062438965,
2161
+ "logits/rejected": -1.9387401342391968,
2162
+ "logps/chosen": -88.2574234008789,
2163
+ "logps/rejected": -90.79651641845703,
2164
+ "loss": 0.5111,
2165
+ "rewards/accuracies": 0.7875000238418579,
2166
+ "rewards/chosen": -0.6772388219833374,
2167
+ "rewards/margins": 0.6761503219604492,
2168
+ "rewards/rejected": -1.3533891439437866,
2169
+ "step": 1430
2170
+ },
2171
+ {
2172
+ "epoch": 2.916824914546145,
2173
+ "grad_norm": 23.90968894958496,
2174
+ "learning_rate": 2.1169283655815274e-10,
2175
+ "logits/chosen": -2.082099199295044,
2176
+ "logits/rejected": -1.949532151222229,
2177
+ "logps/chosen": -87.7041244506836,
2178
+ "logps/rejected": -91.3929214477539,
2179
+ "loss": 0.4748,
2180
+ "rewards/accuracies": 0.7749999761581421,
2181
+ "rewards/chosen": -0.6174831390380859,
2182
+ "rewards/margins": 0.7841922044754028,
2183
+ "rewards/rejected": -1.4016753435134888,
2184
+ "step": 1440
2185
+ },
2186
+ {
2187
+ "epoch": 2.937080643119382,
2188
+ "grad_norm": 25.46677589416504,
2189
+ "learning_rate": 1.1708737369576228e-10,
2190
+ "logits/chosen": -2.081848621368408,
2191
+ "logits/rejected": -1.9594366550445557,
2192
+ "logps/chosen": -78.64682006835938,
2193
+ "logps/rejected": -87.83316040039062,
2194
+ "loss": 0.4826,
2195
+ "rewards/accuracies": 0.8125,
2196
+ "rewards/chosen": -0.5947951078414917,
2197
+ "rewards/margins": 0.7158970832824707,
2198
+ "rewards/rejected": -1.3106920719146729,
2199
+ "step": 1450
2200
+ },
2201
+ {
2202
+ "epoch": 2.9573363716926195,
2203
+ "grad_norm": 34.160301208496094,
2204
+ "learning_rate": 5.0271061627427115e-11,
2205
+ "logits/chosen": -2.1119987964630127,
2206
+ "logits/rejected": -1.9852664470672607,
2207
+ "logps/chosen": -80.1048355102539,
2208
+ "logps/rejected": -88.52069091796875,
2209
+ "loss": 0.5029,
2210
+ "rewards/accuracies": 0.7562500238418579,
2211
+ "rewards/chosen": -0.6696670651435852,
2212
+ "rewards/margins": 0.718749463558197,
2213
+ "rewards/rejected": -1.3884165287017822,
2214
+ "step": 1460
2215
+ },
2216
+ {
2217
+ "epoch": 2.9775921002658565,
2218
+ "grad_norm": 26.002410888671875,
2219
+ "learning_rate": 1.1281122890355322e-11,
2220
+ "logits/chosen": -2.076430082321167,
2221
+ "logits/rejected": -1.9372243881225586,
2222
+ "logps/chosen": -83.05271911621094,
2223
+ "logps/rejected": -85.09577941894531,
2224
+ "loss": 0.5069,
2225
+ "rewards/accuracies": 0.7562500238418579,
2226
+ "rewards/chosen": -0.542018711566925,
2227
+ "rewards/margins": 0.6729723215103149,
2228
+ "rewards/rejected": -1.2149909734725952,
2229
+ "step": 1470
2230
+ },
2231
+ {
2232
+ "epoch": 2.99582225598177,
2233
+ "step": 1479,
2234
+ "total_flos": 0.0,
2235
+ "train_loss": 0.546259533964032,
2236
+ "train_runtime": 14357.5178,
2237
+ "train_samples_per_second": 3.301,
2238
+ "train_steps_per_second": 0.103
2239
+ }
2240
+ ],
2241
+ "logging_steps": 10,
2242
+ "max_steps": 1479,
2243
+ "num_input_tokens_seen": 0,
2244
+ "num_train_epochs": 3,
2245
+ "save_steps": 100,
2246
+ "stateful_callbacks": {
2247
+ "TrainerControl": {
2248
+ "args": {
2249
+ "should_epoch_stop": false,
2250
+ "should_evaluate": false,
2251
+ "should_log": false,
2252
+ "should_save": true,
2253
+ "should_training_stop": false
2254
+ },
2255
+ "attributes": {}
2256
+ }
2257
+ },
2258
+ "total_flos": 0.0,
2259
+ "train_batch_size": 1,
2260
+ "trial_name": null,
2261
+ "trial_params": null
2262
+ }