hugodk-sch committed
Commit afc2ba1
1 Parent(s): 9c0b014

Model save

Files changed (5)
  1. README.md +13 -16
  2. adapter_model.safetensors +1 -1
  3. all_results.json +2 -15
  4. train_results.json +2 -2
  5. trainer_state.json +377 -377
README.md CHANGED
@@ -1,13 +1,10 @@
 ---
 library_name: peft
 tags:
- - alignment-handbook
 - trl
 - dpo
 - generated_from_trainer
 base_model: NbAiLab/nb-gpt-j-6B-v2
- datasets:
- - hugodk-sch/aftonposten_title_prefs
 model-index:
 - name: aftonposten-6b-align-scan
   results: []
@@ -18,17 +15,17 @@ should probably proofread and complete it, then remove this comment. -->

 # aftonposten-6b-align-scan

- This model is a fine-tuned version of [data/ap-gpt-j-6b-sft-qlora-04-08](https://huggingface.co/data/ap-gpt-j-6b-sft-qlora-04-08) on the hugodk-sch/aftonposten_title_prefs dataset.
+ This model is a fine-tuned version of [NbAiLab/nb-gpt-j-6B-v2](https://huggingface.co/NbAiLab/nb-gpt-j-6B-v2) on an unknown dataset.
 It achieves the following results on the evaluation set:
- - Loss: 0.9894
- - Rewards/chosen: -0.0296
- - Rewards/rejected: -0.0401
- - Rewards/accuracies: 0.5125
- - Rewards/margins: 0.0105
- - Logps/rejected: -37.6168
- - Logps/chosen: -34.1085
- - Logits/rejected: -2.2185
- - Logits/chosen: -2.2233
+ - Loss: 0.6920
+ - Rewards/chosen: -0.0273
+ - Rewards/rejected: -0.0406
+ - Rewards/accuracies: 0.5278
+ - Rewards/margins: 0.0133
+ - Logps/rejected: -37.5978
+ - Logps/chosen: -34.0891
+ - Logits/rejected: -2.2231
+ - Logits/chosen: -2.2279

 ## Model description

@@ -63,9 +60,9 @@ The following hyperparameters were used during training:

 | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
 |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
- | 0.9521 | 0.26 | 100 | 0.9920 | -0.0014 | -0.0094 | 0.5361 | 0.0080 | -37.5401 | -34.0381 | -2.2279 | -2.2327 |
- | 0.8515 | 0.52 | 200 | 0.9817 | -0.0042 | -0.0226 | 0.5594 | 0.0184 | -37.5730 | -34.0451 | -2.2226 | -2.2274 |
- | 0.7473 | 0.78 | 300 | 0.9830 | -0.0231 | -0.0399 | 0.5390 | 0.0169 | -37.6165 | -34.0922 | -2.2186 | -2.2234 |
+ | 0.6634 | 0.26 | 100 | 0.6931 | 0.0028 | -0.0041 | 0.5216 | 0.0069 | -37.5249 | -34.0290 | -2.2276 | -2.2324 |
+ | 0.6329 | 0.52 | 200 | 0.6905 | -0.0127 | -0.0275 | 0.5274 | 0.0148 | -37.5716 | -34.0600 | -2.2255 | -2.2304 |
+ | 0.5742 | 0.78 | 300 | 0.6920 | -0.0273 | -0.0406 | 0.5278 | 0.0133 | -37.5978 | -34.0891 | -2.2231 | -2.2279 |


 ### Framework versions
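Since the updated card documents a PEFT adapter trained with TRL's DPO trainer on top of NbAiLab/nb-gpt-j-6B-v2, a minimal usage sketch may help. The adapter repo id `hugodk-sch/aftonposten-6b-align-scan` is assumed from the model-index name above, and the prompt is only illustrative:

```python
# Hedged sketch: load the DPO-trained PEFT adapter on top of the base model.
# The adapter repo id is assumed from the model-index name; adjust if the
# adapter is hosted under a different id.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_id = "NbAiLab/nb-gpt-j-6B-v2"
adapter_id = "hugodk-sch/aftonposten-6b-align-scan"  # assumed repo id

tokenizer = AutoTokenizer.from_pretrained(base_id)
base_model = AutoModelForCausalLM.from_pretrained(base_id)
model = PeftModel.from_pretrained(base_model, adapter_id)  # attach the adapter

prompt = "Artikkel: ...\nTittel:"  # illustrative prompt only
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```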
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:f6c9ab0b5a807529fb4eda32e797f68875c5e54ce248261258a517a6afb9dd75
+ oid sha256:0aa405f5b86503e02e2b370eae5754be314df4fd8079a6033e67d561adc170c8
 size 176183216
all_results.json CHANGED
@@ -1,20 +1,7 @@
 {
 "epoch": 1.0,
- "eval_logits/chosen": -2.223268747329712,
- "eval_logits/rejected": -2.218452215194702,
- "eval_logps/chosen": -34.1085205078125,
- "eval_logps/rejected": -37.61680603027344,
- "eval_loss": 0.9893841743469238,
- "eval_rewards/accuracies": 0.5124584436416626,
- "eval_rewards/chosen": -0.02958693355321884,
- "eval_rewards/margins": 0.010487398132681847,
- "eval_rewards/rejected": -0.04007433354854584,
- "eval_runtime": 145.4187,
- "eval_samples": 343,
- "eval_samples_per_second": 2.359,
- "eval_steps_per_second": 0.296,
- "train_loss": 0.8990784768934373,
- "train_runtime": 3250.3805,
+ "train_loss": 0.64145151051608,
+ "train_runtime": 3249.8987,
 "train_samples": 3079,
 "train_samples_per_second": 0.947,
 "train_steps_per_second": 0.118
train_results.json CHANGED
@@ -1,7 +1,7 @@
 {
 "epoch": 1.0,
- "train_loss": 0.8990784768934373,
- "train_runtime": 3250.3805,
+ "train_loss": 0.64145151051608,
+ "train_runtime": 3249.8987,
 "train_samples": 3079,
 "train_samples_per_second": 0.947,
 "train_steps_per_second": 0.118
trainer_state.json CHANGED
@@ -15,7 +15,7 @@
15
  "logits/rejected": -1.7377450466156006,
16
  "logps/chosen": -29.553977966308594,
17
  "logps/rejected": -42.813133239746094,
18
- "loss": 1.0,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
@@ -25,589 +25,589 @@
25
  {
26
  "epoch": 0.03,
27
  "learning_rate": 1.282051282051282e-06,
28
- "logits/chosen": -1.8667426109313965,
29
- "logits/rejected": -1.8710602521896362,
30
- "logps/chosen": -36.991912841796875,
31
- "logps/rejected": -33.67206954956055,
32
- "loss": 0.9773,
33
- "rewards/accuracies": 0.5694444179534912,
34
- "rewards/chosen": 0.0058750128373503685,
35
- "rewards/margins": 0.022671451792120934,
36
- "rewards/rejected": -0.01679643802344799,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.05,
41
  "learning_rate": 2.564102564102564e-06,
42
- "logits/chosen": -1.9977442026138306,
43
- "logits/rejected": -2.0003952980041504,
44
- "logps/chosen": -29.659366607666016,
45
- "logps/rejected": -29.05437660217285,
46
- "loss": 1.0105,
47
- "rewards/accuracies": 0.4124999940395355,
48
- "rewards/chosen": -0.006868218071758747,
49
- "rewards/margins": -0.010493903420865536,
50
- "rewards/rejected": 0.003625686513260007,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.08,
55
  "learning_rate": 3.846153846153847e-06,
56
- "logits/chosen": -1.920693039894104,
57
- "logits/rejected": -1.91802179813385,
58
- "logps/chosen": -31.39971351623535,
59
- "logps/rejected": -33.21495819091797,
60
- "loss": 0.9948,
61
- "rewards/accuracies": 0.5625,
62
- "rewards/chosen": 0.006551843136548996,
63
- "rewards/margins": 0.0051523735746741295,
64
- "rewards/rejected": 0.001399471191689372,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.1,
69
  "learning_rate": 4.999896948438434e-06,
70
- "logits/chosen": -2.018057107925415,
71
- "logits/rejected": -2.0093047618865967,
72
- "logps/chosen": -32.565284729003906,
73
- "logps/rejected": -32.50053405761719,
74
- "loss": 1.0002,
75
- "rewards/accuracies": 0.5375000238418579,
76
- "rewards/chosen": 0.0046076299622654915,
77
- "rewards/margins": -0.00024683662923052907,
78
- "rewards/rejected": 0.004854466766119003,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.13,
83
  "learning_rate": 4.987541037542187e-06,
84
- "logits/chosen": -1.8627817630767822,
85
- "logits/rejected": -1.851999044418335,
86
- "logps/chosen": -33.549964904785156,
87
- "logps/rejected": -35.44340896606445,
88
- "loss": 1.0006,
89
- "rewards/accuracies": 0.4749999940395355,
90
- "rewards/chosen": 0.0030060340650379658,
91
- "rewards/margins": -0.0005688609671778977,
92
- "rewards/rejected": 0.003574896603822708,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.16,
97
  "learning_rate": 4.954691471941119e-06,
98
- "logits/chosen": -1.9416770935058594,
99
- "logits/rejected": -1.9436094760894775,
100
- "logps/chosen": -32.53351593017578,
101
- "logps/rejected": -33.217529296875,
102
- "loss": 0.9561,
103
- "rewards/accuracies": 0.5625,
104
- "rewards/chosen": 0.026666466146707535,
105
- "rewards/margins": 0.04512657970190048,
106
- "rewards/rejected": -0.018460111692547798,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.18,
111
  "learning_rate": 4.901618883413549e-06,
112
- "logits/chosen": -2.072779655456543,
113
- "logits/rejected": -2.077756881713867,
114
- "logps/chosen": -34.002342224121094,
115
- "logps/rejected": -36.633216857910156,
116
- "loss": 0.9798,
117
- "rewards/accuracies": 0.5249999761581421,
118
- "rewards/chosen": -0.0055419160053133965,
119
- "rewards/margins": 0.020196745172142982,
120
- "rewards/rejected": -0.025738662108778954,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.21,
125
  "learning_rate": 4.828760511501322e-06,
126
- "logits/chosen": -1.9332294464111328,
127
- "logits/rejected": -1.9363486766815186,
128
- "logps/chosen": -34.33064651489258,
129
- "logps/rejected": -34.64745330810547,
130
- "loss": 0.9391,
131
- "rewards/accuracies": 0.574999988079071,
132
- "rewards/chosen": 0.03886651247739792,
133
- "rewards/margins": 0.06088464334607124,
134
- "rewards/rejected": -0.022018127143383026,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.23,
139
  "learning_rate": 4.7367166013034295e-06,
140
- "logits/chosen": -1.9409536123275757,
141
- "logits/rejected": -1.9454677104949951,
142
- "logps/chosen": -32.381752014160156,
143
- "logps/rejected": -32.35805892944336,
144
- "loss": 0.9661,
145
- "rewards/accuracies": 0.612500011920929,
146
- "rewards/chosen": 0.0342683270573616,
147
- "rewards/margins": 0.033929694443941116,
148
- "rewards/rejected": 0.0003386303724255413,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.26,
153
  "learning_rate": 4.626245458345211e-06,
154
- "logits/chosen": -2.0382819175720215,
155
- "logits/rejected": -2.036304473876953,
156
- "logps/chosen": -32.15763473510742,
157
- "logps/rejected": -31.302764892578125,
158
- "loss": 0.9521,
159
- "rewards/accuracies": 0.5625,
160
- "rewards/chosen": 0.033279187977313995,
161
- "rewards/margins": 0.047900475561618805,
162
- "rewards/rejected": -0.014621290378272533,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.26,
167
- "eval_logits/chosen": -2.2327499389648438,
168
- "eval_logits/rejected": -2.227902889251709,
169
- "eval_logps/chosen": -34.038063049316406,
170
- "eval_logps/rejected": -37.54012680053711,
171
- "eval_loss": 0.9920137524604797,
172
- "eval_rewards/accuracies": 0.5361295938491821,
173
- "eval_rewards/chosen": -0.0014053798513486981,
174
- "eval_rewards/margins": 0.007998107932507992,
175
- "eval_rewards/rejected": -0.009403487667441368,
176
- "eval_runtime": 146.1087,
177
- "eval_samples_per_second": 2.348,
178
- "eval_steps_per_second": 0.294,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.29,
183
  "learning_rate": 4.498257201263691e-06,
184
- "logits/chosen": -1.9931761026382446,
185
- "logits/rejected": -1.9907801151275635,
186
- "logps/chosen": -33.120845794677734,
187
- "logps/rejected": -34.02234649658203,
188
- "loss": 0.9536,
189
- "rewards/accuracies": 0.6000000238418579,
190
- "rewards/chosen": 0.04918808490037918,
191
- "rewards/margins": 0.046377379447221756,
192
- "rewards/rejected": 0.0028106991667300463,
193
  "step": 110
194
  },
195
  {
196
  "epoch": 0.31,
197
  "learning_rate": 4.353806263777678e-06,
198
- "logits/chosen": -2.004035472869873,
199
- "logits/rejected": -1.9957201480865479,
200
- "logps/chosen": -32.31081771850586,
201
- "logps/rejected": -32.129127502441406,
202
- "loss": 0.9593,
203
  "rewards/accuracies": 0.574999988079071,
204
- "rewards/chosen": 0.05363499000668526,
205
- "rewards/margins": 0.04068244248628616,
206
- "rewards/rejected": 0.012952548451721668,
207
  "step": 120
208
  },
209
  {
210
  "epoch": 0.34,
211
  "learning_rate": 4.1940827077152755e-06,
212
- "logits/chosen": -2.0312001705169678,
213
- "logits/rejected": -2.023239850997925,
214
- "logps/chosen": -30.317270278930664,
215
- "logps/rejected": -32.05809783935547,
216
- "loss": 0.9384,
217
- "rewards/accuracies": 0.5625,
218
- "rewards/chosen": 0.06337819993495941,
219
- "rewards/margins": 0.06541591882705688,
220
- "rewards/rejected": -0.0020377314649522305,
221
  "step": 130
222
  },
223
  {
224
  "epoch": 0.36,
225
  "learning_rate": 4.0204024186666215e-06,
226
- "logits/chosen": -1.9611848592758179,
227
- "logits/rejected": -1.9714059829711914,
228
- "logps/chosen": -31.235523223876953,
229
- "logps/rejected": -32.55936813354492,
230
- "loss": 0.9165,
231
- "rewards/accuracies": 0.6499999761581421,
232
- "rewards/chosen": 0.07365532219409943,
233
- "rewards/margins": 0.08345074951648712,
234
- "rewards/rejected": -0.009795431979000568,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.39,
239
  "learning_rate": 3.834196265035119e-06,
240
- "logits/chosen": -1.8718490600585938,
241
- "logits/rejected": -1.8730093240737915,
242
- "logps/chosen": -33.873046875,
243
- "logps/rejected": -34.80980682373047,
244
- "loss": 0.8563,
245
- "rewards/accuracies": 0.6625000238418579,
246
- "rewards/chosen": 0.12785716354846954,
247
- "rewards/margins": 0.1488770991563797,
248
- "rewards/rejected": -0.021019931882619858,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.42,
253
  "learning_rate": 3.636998309800573e-06,
254
- "logits/chosen": -1.9235217571258545,
255
- "logits/rejected": -1.9201278686523438,
256
- "logps/chosen": -36.01029968261719,
257
- "logps/rejected": -32.690834045410156,
258
- "loss": 0.9482,
259
- "rewards/accuracies": 0.625,
260
- "rewards/chosen": 0.06455211341381073,
261
- "rewards/margins": 0.05178683251142502,
262
- "rewards/rejected": 0.012765283696353436,
263
  "step": 160
264
  },
265
  {
266
  "epoch": 0.44,
267
  "learning_rate": 3.4304331721118078e-06,
268
- "logits/chosen": -2.0244345664978027,
269
- "logits/rejected": -2.0171027183532715,
270
- "logps/chosen": -33.474937438964844,
271
- "logps/rejected": -31.42409324645996,
272
- "loss": 0.8419,
273
- "rewards/accuracies": 0.7124999761581421,
274
- "rewards/chosen": 0.12916772067546844,
275
- "rewards/margins": 0.15806543827056885,
276
- "rewards/rejected": -0.028897713869810104,
277
  "step": 170
278
  },
279
  {
280
  "epoch": 0.47,
281
  "learning_rate": 3.2162026428305436e-06,
282
- "logits/chosen": -2.0310873985290527,
283
- "logits/rejected": -2.036334276199341,
284
- "logps/chosen": -32.2062873840332,
285
- "logps/rejected": -32.42302703857422,
286
- "loss": 0.8919,
287
- "rewards/accuracies": 0.6625000238418579,
288
- "rewards/chosen": 0.1354007124900818,
289
- "rewards/margins": 0.11055666208267212,
290
- "rewards/rejected": 0.024844054132699966,
291
  "step": 180
292
  },
293
  {
294
  "epoch": 0.49,
295
  "learning_rate": 2.996071664294641e-06,
296
- "logits/chosen": -2.0314042568206787,
297
- "logits/rejected": -2.02862286567688,
298
- "logps/chosen": -31.278457641601562,
299
- "logps/rejected": -31.348251342773438,
300
- "loss": 0.9038,
301
- "rewards/accuracies": 0.6625000238418579,
302
- "rewards/chosen": 0.0841989517211914,
303
- "rewards/margins": 0.09621445834636688,
304
- "rewards/rejected": -0.012015508487820625,
305
  "step": 190
306
  },
307
  {
308
  "epoch": 0.52,
309
  "learning_rate": 2.7718537898066833e-06,
310
- "logits/chosen": -1.9020229578018188,
311
- "logits/rejected": -1.906660795211792,
312
- "logps/chosen": -31.28672218322754,
313
- "logps/rejected": -32.84270477294922,
314
- "loss": 0.8515,
315
- "rewards/accuracies": 0.7250000238418579,
316
- "rewards/chosen": 0.12782469391822815,
317
- "rewards/margins": 0.14853176474571228,
318
- "rewards/rejected": -0.020707078278064728,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.52,
323
- "eval_logits/chosen": -2.22743558883667,
324
- "eval_logits/rejected": -2.2226204872131348,
325
- "eval_logps/chosen": -34.04505920410156,
326
- "eval_logps/rejected": -37.573036193847656,
327
- "eval_loss": 0.9816663265228271,
328
- "eval_rewards/accuracies": 0.559385359287262,
329
- "eval_rewards/chosen": -0.004204160068184137,
330
- "eval_rewards/margins": 0.018361244350671768,
331
- "eval_rewards/rejected": -0.022565403953194618,
332
- "eval_runtime": 145.491,
333
- "eval_samples_per_second": 2.358,
334
- "eval_steps_per_second": 0.296,
335
  "step": 200
336
  },
337
  {
338
  "epoch": 0.55,
339
  "learning_rate": 2.5453962426402006e-06,
340
- "logits/chosen": -2.0146515369415283,
341
- "logits/rejected": -2.025285005569458,
342
- "logps/chosen": -31.747217178344727,
343
- "logps/rejected": -33.96654510498047,
344
- "loss": 0.8755,
345
  "rewards/accuracies": 0.6625000238418579,
346
- "rewards/chosen": 0.08963600546121597,
347
- "rewards/margins": 0.13117524981498718,
348
- "rewards/rejected": -0.04153924435377121,
349
  "step": 210
350
  },
351
  {
352
  "epoch": 0.57,
353
  "learning_rate": 2.3185646976551794e-06,
354
- "logits/chosen": -1.9072997570037842,
355
- "logits/rejected": -1.922041654586792,
356
- "logps/chosen": -29.81575584411621,
357
- "logps/rejected": -31.636306762695312,
358
- "loss": 0.8509,
359
- "rewards/accuracies": 0.7250000238418579,
360
- "rewards/chosen": 0.11831430345773697,
361
- "rewards/margins": 0.14956556260585785,
362
- "rewards/rejected": -0.03125125169754028,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 0.6,
367
  "learning_rate": 2.0932279108998323e-06,
368
- "logits/chosen": -1.9637253284454346,
369
- "logits/rejected": -1.9677000045776367,
370
- "logps/chosen": -33.084877014160156,
371
- "logps/rejected": -31.639108657836914,
372
- "loss": 0.8476,
373
- "rewards/accuracies": 0.6875,
374
- "rewards/chosen": 0.13303686678409576,
375
- "rewards/margins": 0.16895917057991028,
376
- "rewards/rejected": -0.03592229634523392,
377
  "step": 230
378
  },
379
  {
380
  "epoch": 0.62,
381
  "learning_rate": 1.8712423238279358e-06,
382
- "logits/chosen": -1.9612106084823608,
383
- "logits/rejected": -1.9394094944000244,
384
- "logps/chosen": -33.84415817260742,
385
- "logps/rejected": -35.132205963134766,
386
- "loss": 0.8154,
387
- "rewards/accuracies": 0.7124999761581421,
388
- "rewards/chosen": 0.1238357201218605,
389
- "rewards/margins": 0.19639968872070312,
390
- "rewards/rejected": -0.07256398350000381,
391
  "step": 240
392
  },
393
  {
394
  "epoch": 0.65,
395
  "learning_rate": 1.6544367689701824e-06,
396
- "logits/chosen": -2.002776861190796,
397
- "logits/rejected": -1.999464988708496,
398
- "logps/chosen": -32.74934387207031,
399
- "logps/rejected": -36.28064727783203,
400
- "loss": 0.9096,
401
- "rewards/accuracies": 0.574999988079071,
402
- "rewards/chosen": 0.0702909380197525,
403
- "rewards/margins": 0.09257940202951431,
404
- "rewards/rejected": -0.02228846587240696,
405
  "step": 250
406
  },
407
  {
408
  "epoch": 0.68,
409
  "learning_rate": 1.4445974030621963e-06,
410
- "logits/chosen": -1.8692924976348877,
411
- "logits/rejected": -1.8668625354766846,
412
- "logps/chosen": -33.982818603515625,
413
- "logps/rejected": -35.535736083984375,
414
- "loss": 0.9053,
415
- "rewards/accuracies": 0.6499999761581421,
416
- "rewards/chosen": 0.08223171532154083,
417
- "rewards/margins": 0.09467832744121552,
418
- "rewards/rejected": -0.012446624226868153,
419
  "step": 260
420
  },
421
  {
422
  "epoch": 0.7,
423
  "learning_rate": 1.243452991757889e-06,
424
- "logits/chosen": -1.8543907403945923,
425
- "logits/rejected": -1.8520078659057617,
426
- "logps/chosen": -34.20638656616211,
427
- "logps/rejected": -31.873388290405273,
428
- "loss": 0.8923,
429
- "rewards/accuracies": 0.6000000238418579,
430
- "rewards/chosen": 0.07648433744907379,
431
- "rewards/margins": 0.11730633676052094,
432
- "rewards/rejected": -0.040821999311447144,
433
  "step": 270
434
  },
435
  {
436
  "epoch": 0.73,
437
  "learning_rate": 1.0526606671603523e-06,
438
- "logits/chosen": -1.9576927423477173,
439
- "logits/rejected": -1.9472328424453735,
440
- "logps/chosen": -34.999000549316406,
441
- "logps/rejected": -31.88382911682129,
442
- "loss": 0.8398,
443
  "rewards/accuracies": 0.737500011920929,
444
- "rewards/chosen": 0.1430555135011673,
445
- "rewards/margins": 0.16019006073474884,
446
- "rewards/rejected": -0.017134560272097588,
447
  "step": 280
448
  },
449
  {
450
  "epoch": 0.75,
451
  "learning_rate": 8.737922755071455e-07,
452
- "logits/chosen": -2.0523831844329834,
453
- "logits/rejected": -2.037504196166992,
454
- "logps/chosen": -30.712234497070312,
455
- "logps/rejected": -32.659637451171875,
456
- "loss": 0.9146,
457
- "rewards/accuracies": 0.612500011920929,
458
- "rewards/chosen": 0.08475608378648758,
459
- "rewards/margins": 0.08787757158279419,
460
- "rewards/rejected": -0.0031214915215969086,
461
  "step": 290
462
  },
463
  {
464
  "epoch": 0.78,
465
  "learning_rate": 7.08321427484816e-07,
466
- "logits/chosen": -1.9236303567886353,
467
- "logits/rejected": -1.9211324453353882,
468
- "logps/chosen": -32.370330810546875,
469
- "logps/rejected": -30.952068328857422,
470
- "loss": 0.7473,
471
- "rewards/accuracies": 0.7875000238418579,
472
- "rewards/chosen": 0.2227112501859665,
473
- "rewards/margins": 0.27681466937065125,
474
- "rewards/rejected": -0.054103411734104156,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.78,
479
- "eval_logits/chosen": -2.223449468612671,
480
- "eval_logits/rejected": -2.2186241149902344,
481
- "eval_logps/chosen": -34.09219741821289,
482
- "eval_logps/rejected": -37.61648178100586,
483
- "eval_loss": 0.9830461740493774,
484
- "eval_rewards/accuracies": 0.5390365719795227,
485
- "eval_rewards/chosen": -0.023058738559484482,
486
- "eval_rewards/margins": 0.016884688287973404,
487
- "eval_rewards/rejected": -0.03994342312216759,
488
- "eval_runtime": 145.6848,
489
- "eval_samples_per_second": 2.354,
490
  "eval_steps_per_second": 0.295,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.81,
495
  "learning_rate": 5.576113578589035e-07,
496
- "logits/chosen": -1.908721923828125,
497
- "logits/rejected": -1.9054863452911377,
498
- "logps/chosen": -31.30953025817871,
499
- "logps/rejected": -33.8243408203125,
500
- "loss": 0.8639,
501
- "rewards/accuracies": 0.7250000238418579,
502
- "rewards/chosen": 0.1175277829170227,
503
- "rewards/margins": 0.15040358901023865,
504
- "rewards/rejected": -0.032875802367925644,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.83,
509
  "learning_rate": 4.229036944380913e-07,
510
- "logits/chosen": -1.958335518836975,
511
- "logits/rejected": -1.946170449256897,
512
- "logps/chosen": -34.308921813964844,
513
- "logps/rejected": -33.70027542114258,
514
- "loss": 0.8275,
515
- "rewards/accuracies": 0.6875,
516
- "rewards/chosen": 0.11486158519983292,
517
- "rewards/margins": 0.17943526804447174,
518
- "rewards/rejected": -0.06457368284463882,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.86,
523
  "learning_rate": 3.053082288996112e-07,
524
- "logits/chosen": -1.9931294918060303,
525
- "logits/rejected": -1.9917207956314087,
526
- "logps/chosen": -33.1971321105957,
527
- "logps/rejected": -32.582763671875,
528
- "loss": 0.8474,
529
- "rewards/accuracies": 0.7124999761581421,
530
- "rewards/chosen": 0.11546528339385986,
531
- "rewards/margins": 0.15673772990703583,
532
- "rewards/rejected": -0.04127243906259537,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.88,
537
  "learning_rate": 2.0579377374915805e-07,
538
- "logits/chosen": -2.0800864696502686,
539
- "logits/rejected": -2.064441204071045,
540
- "logps/chosen": -33.79554748535156,
541
- "logps/rejected": -33.09272384643555,
542
- "loss": 0.8566,
543
- "rewards/accuracies": 0.675000011920929,
544
- "rewards/chosen": 0.15353715419769287,
545
- "rewards/margins": 0.14341332018375397,
546
- "rewards/rejected": 0.010123846121132374,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.91,
551
  "learning_rate": 1.2518018074041684e-07,
552
- "logits/chosen": -1.9522091150283813,
553
- "logits/rejected": -1.9513927698135376,
554
- "logps/chosen": -32.85493469238281,
555
- "logps/rejected": -32.53038787841797,
556
- "loss": 0.818,
557
- "rewards/accuracies": 0.699999988079071,
558
- "rewards/chosen": 0.170864075422287,
559
- "rewards/margins": 0.19696101546287537,
560
- "rewards/rejected": -0.02609694004058838,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.94,
565
  "learning_rate": 6.41315865106129e-08,
566
- "logits/chosen": -1.9075958728790283,
567
- "logits/rejected": -1.9178472757339478,
568
- "logps/chosen": -31.86941909790039,
569
- "logps/rejected": -35.343692779541016,
570
- "loss": 0.8461,
571
- "rewards/accuracies": 0.675000011920929,
572
- "rewards/chosen": 0.1341095268726349,
573
- "rewards/margins": 0.1560816764831543,
574
- "rewards/rejected": -0.021972158923745155,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.96,
579
  "learning_rate": 2.3150941078050325e-08,
580
- "logits/chosen": -2.047290325164795,
581
- "logits/rejected": -2.0408453941345215,
582
- "logps/chosen": -33.33329772949219,
583
- "logps/rejected": -29.256567001342773,
584
- "loss": 0.8605,
585
- "rewards/accuracies": 0.7250000238418579,
586
- "rewards/chosen": 0.12746797502040863,
587
- "rewards/margins": 0.1395258903503418,
588
- "rewards/rejected": -0.012057906948029995,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.99,
593
  "learning_rate": 2.575864278703266e-09,
594
- "logits/chosen": -1.9072849750518799,
595
- "logits/rejected": -1.909480333328247,
596
- "logps/chosen": -33.89991760253906,
597
- "logps/rejected": -31.013565063476562,
598
- "loss": 0.8133,
599
- "rewards/accuracies": 0.6875,
600
- "rewards/chosen": 0.13565407693386078,
601
- "rewards/margins": 0.1920291632413864,
602
- "rewards/rejected": -0.05637507513165474,
603
  "step": 380
604
  },
605
  {
606
  "epoch": 1.0,
607
  "step": 385,
608
  "total_flos": 0.0,
609
- "train_loss": 0.8990784768934373,
610
- "train_runtime": 3250.3805,
611
  "train_samples_per_second": 0.947,
612
  "train_steps_per_second": 0.118
613
  }
 
15
  "logits/rejected": -1.7377450466156006,
16
  "logps/chosen": -29.553977966308594,
17
  "logps/rejected": -42.813133239746094,
18
+ "loss": 0.6931,
19
  "rewards/accuracies": 0.0,
20
  "rewards/chosen": 0.0,
21
  "rewards/margins": 0.0,
 
25
  {
26
  "epoch": 0.03,
27
  "learning_rate": 1.282051282051282e-06,
28
+ "logits/chosen": -1.8664319515228271,
29
+ "logits/rejected": -1.8707623481750488,
30
+ "logps/chosen": -36.98527526855469,
31
+ "logps/rejected": -33.654090881347656,
32
+ "loss": 0.6829,
33
+ "rewards/accuracies": 0.5416666865348816,
34
+ "rewards/chosen": 0.010662304237484932,
35
+ "rewards/margins": 0.02267039567232132,
36
+ "rewards/rejected": -0.012008090503513813,
37
  "step": 10
38
  },
39
  {
40
  "epoch": 0.05,
41
  "learning_rate": 2.564102564102564e-06,
42
+ "logits/chosen": -1.9980642795562744,
43
+ "logits/rejected": -2.0007288455963135,
44
+ "logps/chosen": -29.634414672851562,
45
+ "logps/rejected": -29.0543270111084,
46
+ "loss": 0.6943,
47
+ "rewards/accuracies": 0.5874999761581421,
48
+ "rewards/chosen": 0.0038894296158105135,
49
+ "rewards/margins": -0.000667938613332808,
50
+ "rewards/rejected": 0.004557368345558643,
51
  "step": 20
52
  },
53
  {
54
  "epoch": 0.08,
55
  "learning_rate": 3.846153846153847e-06,
56
+ "logits/chosen": -1.9210376739501953,
57
+ "logits/rejected": -1.9183601140975952,
58
+ "logps/chosen": -31.391239166259766,
59
+ "logps/rejected": -33.24319076538086,
60
+ "loss": 0.6828,
61
+ "rewards/accuracies": 0.6000000238418579,
62
+ "rewards/chosen": 0.012426799163222313,
63
+ "rewards/margins": 0.024792592972517014,
64
+ "rewards/rejected": -0.0123657938092947,
65
  "step": 30
66
  },
67
  {
68
  "epoch": 0.1,
69
  "learning_rate": 4.999896948438434e-06,
70
+ "logits/chosen": -2.017646312713623,
71
+ "logits/rejected": -2.0089142322540283,
72
+ "logps/chosen": -32.557518005371094,
73
+ "logps/rejected": -32.51502227783203,
74
+ "loss": 0.6894,
75
+ "rewards/accuracies": 0.4749999940395355,
76
+ "rewards/chosen": 0.009644975885748863,
77
+ "rewards/margins": 0.010818523354828358,
78
+ "rewards/rejected": -0.0011735468870028853,
79
  "step": 40
80
  },
81
  {
82
  "epoch": 0.13,
83
  "learning_rate": 4.987541037542187e-06,
84
+ "logits/chosen": -1.8620023727416992,
85
+ "logits/rejected": -1.8512481451034546,
86
+ "logps/chosen": -33.577735900878906,
87
+ "logps/rejected": -35.46040344238281,
88
+ "loss": 0.6982,
89
+ "rewards/accuracies": 0.5,
90
+ "rewards/chosen": -0.010127579793334007,
91
+ "rewards/margins": -0.0060965316370129585,
92
+ "rewards/rejected": -0.004031048621982336,
93
  "step": 50
94
  },
95
  {
96
  "epoch": 0.16,
97
  "learning_rate": 4.954691471941119e-06,
98
+ "logits/chosen": -1.9402154684066772,
99
+ "logits/rejected": -1.9421701431274414,
100
+ "logps/chosen": -32.552555084228516,
101
+ "logps/rejected": -33.22978973388672,
102
+ "loss": 0.6723,
103
+ "rewards/accuracies": 0.6000000238418579,
104
+ "rewards/chosen": 0.023814614862203598,
105
+ "rewards/margins": 0.053018856793642044,
106
+ "rewards/rejected": -0.029204240068793297,
107
  "step": 60
108
  },
109
  {
110
  "epoch": 0.18,
111
  "learning_rate": 4.901618883413549e-06,
112
+ "logits/chosen": -2.071889877319336,
113
+ "logits/rejected": -2.0768685340881348,
114
+ "logps/chosen": -33.997718811035156,
115
+ "logps/rejected": -36.63623809814453,
116
+ "loss": 0.6836,
117
+ "rewards/accuracies": 0.5375000238418579,
118
+ "rewards/chosen": -0.004616844467818737,
119
+ "rewards/margins": 0.02906452678143978,
120
+ "rewards/rejected": -0.033681370317935944,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.21,
125
  "learning_rate": 4.828760511501322e-06,
126
+ "logits/chosen": -1.9326432943344116,
127
+ "logits/rejected": -1.935786247253418,
128
+ "logps/chosen": -34.30440902709961,
129
+ "logps/rejected": -34.659637451171875,
130
+ "loss": 0.6521,
131
+ "rewards/accuracies": 0.625,
132
+ "rewards/chosen": 0.06169893592596054,
133
+ "rewards/margins": 0.09531383961439133,
134
+ "rewards/rejected": -0.03361489623785019,
135
  "step": 80
136
  },
137
  {
138
  "epoch": 0.23,
139
  "learning_rate": 4.7367166013034295e-06,
140
+ "logits/chosen": -1.9409675598144531,
141
+ "logits/rejected": -1.9454774856567383,
142
+ "logps/chosen": -32.3830680847168,
143
+ "logps/rejected": -32.33238983154297,
144
+ "loss": 0.6838,
145
+ "rewards/accuracies": 0.5874999761581421,
146
+ "rewards/chosen": 0.042176127433776855,
147
+ "rewards/margins": 0.028917592018842697,
148
+ "rewards/rejected": 0.013258534483611584,
149
  "step": 90
150
  },
151
  {
152
  "epoch": 0.26,
153
  "learning_rate": 4.626245458345211e-06,
154
+ "logits/chosen": -2.038121461868286,
155
+ "logits/rejected": -2.036132574081421,
156
+ "logps/chosen": -32.12568664550781,
157
+ "logps/rejected": -31.2890567779541,
158
+ "loss": 0.6634,
159
+ "rewards/accuracies": 0.5874999761581421,
160
+ "rewards/chosen": 0.05757413059473038,
161
+ "rewards/margins": 0.06899620592594147,
162
+ "rewards/rejected": -0.011422084644436836,
163
  "step": 100
164
  },
165
  {
166
  "epoch": 0.26,
167
+ "eval_logits/chosen": -2.232415199279785,
168
+ "eval_logits/rejected": -2.2275755405426025,
169
+ "eval_logps/chosen": -34.029048919677734,
170
+ "eval_logps/rejected": -37.52485275268555,
171
+ "eval_loss": 0.693107545375824,
172
+ "eval_rewards/accuracies": 0.5215947031974792,
173
+ "eval_rewards/chosen": 0.0027513643726706505,
174
+ "eval_rewards/margins": 0.006868092343211174,
175
+ "eval_rewards/rejected": -0.0041167279705405235,
176
+ "eval_runtime": 145.7484,
177
+ "eval_samples_per_second": 2.353,
178
+ "eval_steps_per_second": 0.295,
179
  "step": 100
180
  },
181
  {
182
  "epoch": 0.29,
183
  "learning_rate": 4.498257201263691e-06,
184
+ "logits/chosen": -1.9926633834838867,
185
+ "logits/rejected": -1.9902803897857666,
186
+ "logps/chosen": -33.11687088012695,
187
+ "logps/rejected": -34.01213836669922,
188
+ "loss": 0.6814,
189
+ "rewards/accuracies": 0.5874999761581421,
190
+ "rewards/chosen": 0.06347335875034332,
191
+ "rewards/margins": 0.054856397211551666,
192
+ "rewards/rejected": 0.008616959676146507,
193
  "step": 110
194
  },
195
  {
196
  "epoch": 0.31,
197
  "learning_rate": 4.353806263777678e-06,
198
+ "logits/chosen": -2.0042788982391357,
199
+ "logits/rejected": -1.9959495067596436,
200
+ "logps/chosen": -32.306739807128906,
201
+ "logps/rejected": -32.13039779663086,
202
+ "loss": 0.6734,
203
  "rewards/accuracies": 0.574999988079071,
204
+ "rewards/chosen": 0.06908417493104935,
205
+ "rewards/margins": 0.05353052541613579,
206
+ "rewards/rejected": 0.015553650446236134,
207
  "step": 120
208
  },
209
  {
210
  "epoch": 0.34,
211
  "learning_rate": 4.1940827077152755e-06,
212
+ "logits/chosen": -2.0326714515686035,
213
+ "logits/rejected": -2.0247092247009277,
214
+ "logps/chosen": -30.308746337890625,
215
+ "logps/rejected": -32.05224609375,
216
+ "loss": 0.6637,
217
+ "rewards/accuracies": 0.5874999761581421,
218
+ "rewards/chosen": 0.08348459005355835,
219
+ "rewards/margins": 0.08310474455356598,
220
+ "rewards/rejected": 0.00037985146627761424,
221
  "step": 130
222
  },
223
  {
224
  "epoch": 0.36,
225
  "learning_rate": 4.0204024186666215e-06,
226
+ "logits/chosen": -1.9627164602279663,
227
+ "logits/rejected": -1.9729163646697998,
228
+ "logps/chosen": -31.189788818359375,
229
+ "logps/rejected": -32.54594421386719,
230
+ "loss": 0.6424,
231
+ "rewards/accuracies": 0.612500011920929,
232
+ "rewards/chosen": 0.1149359717965126,
233
+ "rewards/margins": 0.12046756595373154,
234
+ "rewards/rejected": -0.005531603004783392,
235
  "step": 140
236
  },
237
  {
238
  "epoch": 0.39,
239
  "learning_rate": 3.834196265035119e-06,
240
+ "logits/chosen": -1.8740726709365845,
241
+ "logits/rejected": -1.875239372253418,
242
+ "logps/chosen": -33.88011932373047,
243
+ "logps/rejected": -34.779319763183594,
244
+ "loss": 0.6271,
245
+ "rewards/accuracies": 0.6000000238418579,
246
+ "rewards/chosen": 0.15628577768802643,
247
+ "rewards/margins": 0.1673184335231781,
248
+ "rewards/rejected": -0.011032682843506336,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.42,
253
  "learning_rate": 3.636998309800573e-06,
254
+ "logits/chosen": -1.9257261753082275,
255
+ "logits/rejected": -1.922323226928711,
256
+ "logps/chosen": -35.9793586730957,
257
+ "logps/rejected": -32.714969635009766,
258
+ "loss": 0.6539,
259
+ "rewards/accuracies": 0.612500011920929,
260
+ "rewards/chosen": 0.09615939855575562,
261
+ "rewards/margins": 0.09226818382740021,
262
+ "rewards/rejected": 0.003891219152137637,
263
  "step": 160
264
  },
265
  {
266
  "epoch": 0.44,
267
  "learning_rate": 3.4304331721118078e-06,
268
+ "logits/chosen": -2.027190923690796,
269
+ "logits/rejected": -2.019850254058838,
270
+ "logps/chosen": -33.4937629699707,
271
+ "logps/rejected": -31.404333114624023,
272
+ "loss": 0.6193,
273
+ "rewards/accuracies": 0.6875,
274
+ "rewards/chosen": 0.15204860270023346,
275
+ "rewards/margins": 0.17829009890556335,
276
+ "rewards/rejected": -0.026241496205329895,
277
  "step": 170
278
  },
279
  {
280
  "epoch": 0.47,
281
  "learning_rate": 3.2162026428305436e-06,
282
+ "logits/chosen": -2.033841609954834,
283
+ "logits/rejected": -2.039079427719116,
284
+ "logps/chosen": -32.22673797607422,
285
+ "logps/rejected": -32.453857421875,
286
+ "loss": 0.632,
287
+ "rewards/accuracies": 0.75,
288
+ "rewards/chosen": 0.1590258777141571,
289
+ "rewards/margins": 0.1433834582567215,
290
+ "rewards/rejected": 0.015642408281564713,
291
  "step": 180
292
  },
293
  {
294
  "epoch": 0.49,
295
  "learning_rate": 2.996071664294641e-06,
296
+ "logits/chosen": -2.03458833694458,
297
+ "logits/rejected": -2.031813144683838,
298
+ "logps/chosen": -31.249963760375977,
299
+ "logps/rejected": -31.329097747802734,
300
+ "loss": 0.6439,
301
+ "rewards/accuracies": 0.5874999761581421,
302
+ "rewards/chosen": 0.11949291080236435,
303
+ "rewards/margins": 0.12493407726287842,
304
+ "rewards/rejected": -0.005441152956336737,
305
  "step": 190
306
  },
307
  {
308
  "epoch": 0.52,
309
  "learning_rate": 2.7718537898066833e-06,
310
+ "logits/chosen": -1.9052807092666626,
311
+ "logits/rejected": -1.9099184274673462,
312
+ "logps/chosen": -31.314193725585938,
313
+ "logps/rejected": -32.81206512451172,
314
+ "loss": 0.6329,
315
+ "rewards/accuracies": 0.7124999761581421,
316
+ "rewards/chosen": 0.14604374766349792,
317
+ "rewards/margins": 0.15660937130451202,
318
+ "rewards/rejected": -0.010565629228949547,
319
  "step": 200
320
  },
321
  {
322
  "epoch": 0.52,
323
+ "eval_logits/chosen": -2.230377674102783,
324
+ "eval_logits/rejected": -2.225529193878174,
325
+ "eval_logps/chosen": -34.05998992919922,
326
+ "eval_logps/rejected": -37.57161331176758,
327
+ "eval_loss": 0.6905081868171692,
328
+ "eval_rewards/accuracies": 0.5274086594581604,
329
+ "eval_rewards/chosen": -0.012718739919364452,
330
+ "eval_rewards/margins": 0.014778696931898594,
331
+ "eval_rewards/rejected": -0.027497438713908195,
332
+ "eval_runtime": 145.701,
333
+ "eval_samples_per_second": 2.354,
334
+ "eval_steps_per_second": 0.295,
335
  "step": 200
336
  },
337
  {
338
  "epoch": 0.55,
339
  "learning_rate": 2.5453962426402006e-06,
340
+ "logits/chosen": -2.017439603805542,
341
+ "logits/rejected": -2.0280823707580566,
342
+ "logps/chosen": -31.72454261779785,
343
+ "logps/rejected": -33.935951232910156,
344
+ "loss": 0.6269,
345
  "rewards/accuracies": 0.6625000238418579,
346
+ "rewards/chosen": 0.12337962538003922,
347
+ "rewards/margins": 0.16000542044639587,
348
+ "rewards/rejected": -0.036625780165195465,
349
  "step": 210
350
  },
351
  {
352
  "epoch": 0.57,
353
  "learning_rate": 2.3185646976551794e-06,
354
+ "logits/chosen": -1.909328818321228,
355
+ "logits/rejected": -1.924088716506958,
356
+ "logps/chosen": -29.841415405273438,
357
+ "logps/rejected": -31.60904884338379,
358
+ "loss": 0.6274,
359
+ "rewards/accuracies": 0.675000011920929,
360
+ "rewards/chosen": 0.13506175577640533,
361
+ "rewards/margins": 0.16049805283546448,
362
+ "rewards/rejected": -0.02543630823493004,
363
  "step": 220
364
  },
365
  {
366
  "epoch": 0.6,
367
  "learning_rate": 2.0932279108998323e-06,
368
+ "logits/chosen": -1.966265320777893,
369
+ "logits/rejected": -1.970245361328125,
370
+ "logps/chosen": -33.091209411621094,
371
+ "logps/rejected": -31.639759063720703,
372
+ "loss": 0.6126,
373
+ "rewards/accuracies": 0.7250000238418579,
374
+ "rewards/chosen": 0.16312837600708008,
375
+ "rewards/margins": 0.20835788547992706,
376
+ "rewards/rejected": -0.045229505747556686,
377
  "step": 230
378
  },
379
  {
380
  "epoch": 0.62,
381
  "learning_rate": 1.8712423238279358e-06,
382
+ "logits/chosen": -1.9648067951202393,
383
+ "logits/rejected": -1.9429614543914795,
384
+ "logps/chosen": -33.82001495361328,
385
+ "logps/rejected": -35.11749267578125,
386
+ "loss": 0.5941,
387
+ "rewards/accuracies": 0.7250000238418579,
388
+ "rewards/chosen": 0.16686691343784332,
389
+ "rewards/margins": 0.25021862983703613,
390
+ "rewards/rejected": -0.08335171639919281,
391
  "step": 240
392
  },
393
  {
394
  "epoch": 0.65,
395
  "learning_rate": 1.6544367689701824e-06,
396
+ "logits/chosen": -2.005873441696167,
397
+ "logits/rejected": -2.002545118331909,
398
+ "logps/chosen": -32.70961380004883,
399
+ "logps/rejected": -36.252098083496094,
400
+ "loss": 0.6448,
401
+ "rewards/accuracies": 0.637499988079071,
402
+ "rewards/chosen": 0.10772605985403061,
403
+ "rewards/margins": 0.12131496518850327,
404
+ "rewards/rejected": -0.013588905334472656,
405
  "step": 250
406
  },
407
  {
408
  "epoch": 0.68,
409
  "learning_rate": 1.4445974030621963e-06,
410
+ "logits/chosen": -1.8737099170684814,
411
+ "logits/rejected": -1.8713098764419556,
412
+ "logps/chosen": -33.96501922607422,
413
+ "logps/rejected": -35.54829025268555,
414
+ "loss": 0.6377,
415
+ "rewards/accuracies": 0.7250000238418579,
416
+ "rewards/chosen": 0.11169042438268661,
417
+ "rewards/margins": 0.13352498412132263,
418
+ "rewards/rejected": -0.02183455601334572,
419
  "step": 260
420
  },
421
  {
422
  "epoch": 0.7,
423
  "learning_rate": 1.243452991757889e-06,
424
+ "logits/chosen": -1.858642578125,
425
+ "logits/rejected": -1.8562240600585938,
426
+ "logps/chosen": -34.18030548095703,
427
+ "logps/rejected": -31.82675552368164,
428
+ "loss": 0.6384,
429
+ "rewards/accuracies": 0.6625000238418579,
430
+ "rewards/chosen": 0.1086462140083313,
431
+ "rewards/margins": 0.1363571435213089,
432
+ "rewards/rejected": -0.027710938826203346,
433
  "step": 270
434
  },
435
  {
436
  "epoch": 0.73,
437
  "learning_rate": 1.0526606671603523e-06,
438
+ "logits/chosen": -1.9618957042694092,
439
+ "logits/rejected": -1.9513881206512451,
440
+ "logps/chosen": -35.000816345214844,
441
+ "logps/rejected": -31.879558563232422,
442
+ "loss": 0.6107,
443
  "rewards/accuracies": 0.737500011920929,
444
+ "rewards/chosen": 0.17791253328323364,
445
+ "rewards/margins": 0.1971966028213501,
446
+ "rewards/rejected": -0.019284065812826157,
447
  "step": 280
448
  },
449
  {
450
  "epoch": 0.75,
451
  "learning_rate": 8.737922755071455e-07,
452
+ "logits/chosen": -2.0570178031921387,
453
+ "logits/rejected": -2.04209566116333,
454
+ "logps/chosen": -30.695226669311523,
455
+ "logps/rejected": -32.64103317260742,
456
+ "loss": 0.654,
457
+ "rewards/accuracies": 0.5874999761581421,
458
+ "rewards/chosen": 0.11444780975580215,
459
+ "rewards/margins": 0.10904743522405624,
460
+ "rewards/rejected": 0.005400371737778187,
461
  "step": 290
462
  },
463
  {
464
  "epoch": 0.78,
465
  "learning_rate": 7.08321427484816e-07,
466
+ "logits/chosen": -1.9285688400268555,
467
+ "logits/rejected": -1.9260343313217163,
468
+ "logps/chosen": -32.38969039916992,
469
+ "logps/rejected": -30.898773193359375,
470
+ "loss": 0.5742,
471
+ "rewards/accuracies": 0.762499988079071,
472
+ "rewards/chosen": 0.2687075138092041,
473
+ "rewards/margins": 0.3096885085105896,
474
+ "rewards/rejected": -0.04098101332783699,
475
  "step": 300
476
  },
477
  {
478
  "epoch": 0.78,
479
+ "eval_logits/chosen": -2.2279160022735596,
480
+ "eval_logits/rejected": -2.223081350326538,
481
+ "eval_logps/chosen": -34.089088439941406,
482
+ "eval_logps/rejected": -37.59783935546875,
483
+ "eval_loss": 0.6919631958007812,
484
+ "eval_rewards/accuracies": 0.5278239250183105,
485
+ "eval_rewards/chosen": -0.027267219498753548,
486
+ "eval_rewards/margins": 0.013343668542802334,
487
+ "eval_rewards/rejected": -0.040610890835523605,
488
+ "eval_runtime": 145.7459,
489
+ "eval_samples_per_second": 2.353,
490
  "eval_steps_per_second": 0.295,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.81,
495
  "learning_rate": 5.576113578589035e-07,
496
+ "logits/chosen": -1.9126602411270142,
497
+ "logits/rejected": -1.9093825817108154,
498
+ "logps/chosen": -31.319168090820312,
499
+ "logps/rejected": -33.805519104003906,
500
+ "loss": 0.624,
501
+ "rewards/accuracies": 0.7749999761581421,
502
+ "rewards/chosen": 0.1420917958021164,
503
+ "rewards/margins": 0.1737762689590454,
504
+ "rewards/rejected": -0.03168448060750961,
505
  "step": 310
506
  },
507
  {
508
  "epoch": 0.83,
509
  "learning_rate": 4.229036944380913e-07,
510
+ "logits/chosen": -1.9627044200897217,
511
+ "logits/rejected": -1.9504749774932861,
512
+ "logps/chosen": -34.31007385253906,
513
+ "logps/rejected": -33.66672134399414,
514
+ "loss": 0.6084,
515
+ "rewards/accuracies": 0.7124999761581421,
516
+ "rewards/chosen": 0.1430002748966217,
517
+ "rewards/margins": 0.20694026350975037,
518
+ "rewards/rejected": -0.06393997371196747,
519
  "step": 320
520
  },
521
  {
522
  "epoch": 0.86,
523
  "learning_rate": 3.053082288996112e-07,
524
+ "logits/chosen": -1.9980976581573486,
525
+ "logits/rejected": -1.9966537952423096,
526
+ "logps/chosen": -33.16533660888672,
527
+ "logps/rejected": -32.55678939819336,
528
+ "loss": 0.6136,
529
+ "rewards/accuracies": 0.7250000238418579,
530
+ "rewards/chosen": 0.1602279245853424,
531
+ "rewards/margins": 0.19883206486701965,
532
+ "rewards/rejected": -0.03860412910580635,
533
  "step": 330
534
  },
535
  {
536
  "epoch": 0.88,
537
  "learning_rate": 2.0579377374915805e-07,
538
+ "logits/chosen": -2.0848796367645264,
539
+ "logits/rejected": -2.069186210632324,
540
+ "logps/chosen": -33.787841796875,
541
+ "logps/rejected": -33.07987976074219,
542
+ "loss": 0.6229,
543
+ "rewards/accuracies": 0.7250000238418579,
544
+ "rewards/chosen": 0.19577431678771973,
545
+ "rewards/margins": 0.17669571936130524,
546
+ "rewards/rejected": 0.01907859742641449,
547
  "step": 340
548
  },
549
  {
550
  "epoch": 0.91,
551
  "learning_rate": 1.2518018074041684e-07,
552
+ "logits/chosen": -1.957275390625,
553
+ "logits/rejected": -1.9564218521118164,
554
+ "logps/chosen": -32.81622314453125,
555
+ "logps/rejected": -32.52650833129883,
556
+ "loss": 0.5923,
557
+ "rewards/accuracies": 0.7124999761581421,
558
+ "rewards/chosen": 0.23293733596801758,
559
+ "rewards/margins": 0.2636169195175171,
560
+ "rewards/rejected": -0.030679568648338318,
561
  "step": 350
562
  },
563
  {
564
  "epoch": 0.94,
565
  "learning_rate": 6.41315865106129e-08,
566
+ "logits/chosen": -1.9124408960342407,
567
+ "logits/rejected": -1.92275071144104,
568
+ "logps/chosen": -31.859888076782227,
569
+ "logps/rejected": -35.33869934082031,
570
+ "loss": 0.6119,
571
+ "rewards/accuracies": 0.699999988079071,
572
+ "rewards/chosen": 0.17240020632743835,
573
+ "rewards/margins": 0.19736871123313904,
574
+ "rewards/rejected": -0.024968529120087624,
575
  "step": 360
576
  },
577
  {
578
  "epoch": 0.96,
579
  "learning_rate": 2.3150941078050325e-08,
580
+ "logits/chosen": -2.0522782802581787,
581
+ "logits/rejected": -2.045797824859619,
582
+ "logps/chosen": -33.34915542602539,
583
+ "logps/rejected": -29.27215576171875,
584
+ "loss": 0.6194,
585
+ "rewards/accuracies": 0.75,
586
+ "rewards/chosen": 0.15140748023986816,
587
+ "rewards/margins": 0.1742721050977707,
588
+ "rewards/rejected": -0.022864630445837975,
589
  "step": 370
590
  },
591
  {
592
  "epoch": 0.99,
593
  "learning_rate": 2.575864278703266e-09,
594
+ "logits/chosen": -1.9120715856552124,
595
+ "logits/rejected": -1.9142844676971436,
596
+ "logps/chosen": -33.86906051635742,
597
+ "logps/rejected": -30.961559295654297,
598
+ "loss": 0.5996,
599
+ "rewards/accuracies": 0.7250000238418579,
600
+ "rewards/chosen": 0.1849948763847351,
601
+ "rewards/margins": 0.22946183383464813,
602
+ "rewards/rejected": -0.04446694999933243,
603
  "step": 380
604
  },
605
  {
606
  "epoch": 1.0,
607
  "step": 385,
608
  "total_flos": 0.0,
609
+ "train_loss": 0.64145151051608,
610
+ "train_runtime": 3249.8987,
611
  "train_samples_per_second": 0.947,
612
  "train_steps_per_second": 0.118
613
  }
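For readers scanning the `trainer_state.json` diff above: the per-step `rewards/*` and `loss` fields come from TRL's DPO training loop, where the implicit reward is the beta-scaled log-probability ratio between the policy and the reference model. Below is a hedged sketch of that bookkeeping, not code from this repo; beta = 0.1 is an assumed value that the commit does not state. It also shows why the new run's very first logged loss is 0.6931 (ln 2, i.e. zero margin under the sigmoid DPO loss), whereas the previous run's logs started at 1.0, which suggests a different `loss_type` was used before this change.

```python
# Hedged sketch (not from this repo): how the logged DPO metrics relate to
# policy/reference log-probabilities. beta=0.1 is an assumed value.
import math

def dpo_step_metrics(policy_chosen_logp, policy_rejected_logp,
                     ref_chosen_logp, ref_rejected_logp, beta=0.1):
    # Implicit rewards: beta-scaled log-ratio of policy vs. reference.
    chosen = beta * (policy_chosen_logp - ref_chosen_logp)
    rejected = beta * (policy_rejected_logp - ref_rejected_logp)
    margin = chosen - rejected
    # Sigmoid DPO loss; at zero margin it equals ln 2 ~= 0.6931.
    loss = -math.log(1.0 / (1.0 + math.exp(-margin)))
    return {"rewards/chosen": chosen, "rewards/rejected": rejected,
            "rewards/margins": margin, "loss": loss}

# At step 0 the policy still equals the reference, so every reward is 0.0 and
# the loss is ln 2, matching the first logged entry of the new run.
print(dpo_step_metrics(-29.554, -42.813, -29.554, -42.813))
```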