{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 100, "global_step": 1540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 11.875, "learning_rate": 3.2467532467532474e-08, "logits/chosen": -2.7358343601226807, "logits/rejected": -2.7480404376983643, "logps/chosen": -27.35565757751465, "logps/rejected": -21.06114387512207, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 19.375, "learning_rate": 3.2467532467532465e-07, "logits/chosen": -3.009194850921631, "logits/rejected": -2.9980297088623047, "logps/chosen": -33.18357849121094, "logps/rejected": -31.97453498840332, "loss": 0.499, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": -0.0015809221658855677, "rewards/margins": 0.004794789478182793, "rewards/rejected": -0.006375710014253855, "step": 10 }, { "epoch": 0.05, "grad_norm": 12.1875, "learning_rate": 6.493506493506493e-07, "logits/chosen": -2.89937162399292, "logits/rejected": -2.8941903114318848, "logps/chosen": -32.48249053955078, "logps/rejected": -28.95150375366211, "loss": 0.5038, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.0044335778802633286, "rewards/margins": -0.022651705890893936, "rewards/rejected": 0.01821812614798546, "step": 20 }, { "epoch": 0.08, "grad_norm": 11.75, "learning_rate": 9.740259740259742e-07, "logits/chosen": -3.0963780879974365, "logits/rejected": -3.108004093170166, "logps/chosen": -32.87579345703125, "logps/rejected": -30.174633026123047, "loss": 0.4983, "rewards/accuracies": 0.5, "rewards/chosen": 0.01973636820912361, "rewards/margins": 0.006884939037263393, "rewards/rejected": 0.012851427309215069, "step": 30 }, { "epoch": 0.1, "grad_norm": 13.625, "learning_rate": 1.2987012987012986e-06, "logits/chosen": -2.8654465675354004, "logits/rejected": -2.8560633659362793, "logps/chosen": -31.759456634521484, "logps/rejected": -32.346187591552734, "loss": 0.492, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.04251789674162865, "rewards/margins": 0.03575839102268219, "rewards/rejected": 0.006759509444236755, "step": 40 }, { "epoch": 0.13, "grad_norm": 12.3125, "learning_rate": 1.6233766233766235e-06, "logits/chosen": -2.8855957984924316, "logits/rejected": -2.883223533630371, "logps/chosen": -29.667795181274414, "logps/rejected": -30.0822696685791, "loss": 0.4909, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.043420128524303436, "rewards/margins": 0.03941510617733002, "rewards/rejected": 0.004005011636763811, "step": 50 }, { "epoch": 0.16, "grad_norm": 11.0625, "learning_rate": 1.9480519480519483e-06, "logits/chosen": -2.915156126022339, "logits/rejected": -2.917161226272583, "logps/chosen": -30.017940521240234, "logps/rejected": -27.95892906188965, "loss": 0.4856, "rewards/accuracies": 0.625, "rewards/chosen": 0.06709733605384827, "rewards/margins": 0.06904710829257965, "rewards/rejected": -0.0019497796893119812, "step": 60 }, { "epoch": 0.18, "grad_norm": 13.4375, "learning_rate": 2.2727272727272728e-06, "logits/chosen": -2.99483060836792, "logits/rejected": -3.0007681846618652, "logps/chosen": -29.220535278320312, "logps/rejected": -30.8293399810791, "loss": 0.4983, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.03625893592834473, "rewards/margins": -0.0012179904151707888, "rewards/rejected": 0.037476930767297745, "step": 70 }, { "epoch": 0.21, "grad_norm": 12.0625, "learning_rate": 2.597402597402597e-06, "logits/chosen": -2.8088645935058594, "logits/rejected": -2.8247311115264893, "logps/chosen": -29.434457778930664, "logps/rejected": -29.687795639038086, "loss": 0.4873, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0637618824839592, "rewards/margins": 0.07186902314424515, "rewards/rejected": -0.008107141591608524, "step": 80 }, { "epoch": 0.23, "grad_norm": 13.4375, "learning_rate": 2.922077922077922e-06, "logits/chosen": -2.89559006690979, "logits/rejected": -2.877815008163452, "logps/chosen": -32.699546813964844, "logps/rejected": -30.035818099975586, "loss": 0.4783, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.08212994039058685, "rewards/margins": 0.09063344448804855, "rewards/rejected": -0.008503502234816551, "step": 90 }, { "epoch": 0.26, "grad_norm": 10.5, "learning_rate": 3.246753246753247e-06, "logits/chosen": -2.999152421951294, "logits/rejected": -2.9996345043182373, "logps/chosen": -31.857818603515625, "logps/rejected": -30.728164672851562, "loss": 0.4852, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.11943354457616806, "rewards/margins": 0.06662966310977936, "rewards/rejected": 0.0528038814663887, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.804969072341919, "eval_logits/rejected": -2.8022663593292236, "eval_logps/chosen": -31.14324188232422, "eval_logps/rejected": -34.634891510009766, "eval_loss": 0.48944562673568726, "eval_rewards/accuracies": 0.5689368844032288, "eval_rewards/chosen": 0.08352527022361755, "eval_rewards/margins": 0.04556553438305855, "eval_rewards/rejected": 0.03795973211526871, "eval_runtime": 113.3036, "eval_samples_per_second": 3.027, "eval_steps_per_second": 0.38, "step": 100 }, { "epoch": 0.29, "grad_norm": 12.375, "learning_rate": 3.5714285714285718e-06, "logits/chosen": -2.951740264892578, "logits/rejected": -2.927734851837158, "logps/chosen": -31.90311622619629, "logps/rejected": -31.108535766601562, "loss": 0.4668, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.18820719420909882, "rewards/margins": 0.15802007913589478, "rewards/rejected": 0.03018711879849434, "step": 110 }, { "epoch": 0.31, "grad_norm": 12.125, "learning_rate": 3.896103896103897e-06, "logits/chosen": -3.0347812175750732, "logits/rejected": -3.0642948150634766, "logps/chosen": -28.706600189208984, "logps/rejected": -34.014793395996094, "loss": 0.4566, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.274380087852478, "rewards/margins": 0.19472572207450867, "rewards/rejected": 0.07965437322854996, "step": 120 }, { "epoch": 0.34, "grad_norm": 7.28125, "learning_rate": 4.220779220779221e-06, "logits/chosen": -2.735238790512085, "logits/rejected": -2.7312469482421875, "logps/chosen": -28.54458236694336, "logps/rejected": -30.073749542236328, "loss": 0.4549, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2580580413341522, "rewards/margins": 0.2220243662595749, "rewards/rejected": 0.036033693701028824, "step": 130 }, { "epoch": 0.36, "grad_norm": 10.0, "learning_rate": 4.5454545454545455e-06, "logits/chosen": -3.010335683822632, "logits/rejected": -3.0075175762176514, "logps/chosen": -27.002349853515625, "logps/rejected": -31.557636260986328, "loss": 0.4537, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.29032427072525024, "rewards/margins": 0.23084287345409393, "rewards/rejected": 0.059481363743543625, "step": 140 }, { "epoch": 0.39, "grad_norm": 10.3125, "learning_rate": 4.870129870129871e-06, "logits/chosen": -2.8053736686706543, "logits/rejected": -2.8005282878875732, "logps/chosen": -27.175460815429688, "logps/rejected": -31.19356346130371, "loss": 0.4212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3900282680988312, "rewards/margins": 0.3719804286956787, "rewards/rejected": 0.018047798424959183, "step": 150 }, { "epoch": 0.42, "grad_norm": 10.8125, "learning_rate": 4.999768804644796e-06, "logits/chosen": -3.122887372970581, "logits/rejected": -3.1051273345947266, "logps/chosen": -31.603347778320312, "logps/rejected": -29.00677490234375, "loss": 0.3923, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.5330235958099365, "rewards/margins": 0.5492233633995056, "rewards/rejected": -0.016199791803956032, "step": 160 }, { "epoch": 0.44, "grad_norm": 9.875, "learning_rate": 4.998356098992574e-06, "logits/chosen": -2.93461012840271, "logits/rejected": -2.9427475929260254, "logps/chosen": -29.249469757080078, "logps/rejected": -31.181896209716797, "loss": 0.4135, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3870863616466522, "rewards/margins": 0.4365348815917969, "rewards/rejected": -0.04944852739572525, "step": 170 }, { "epoch": 0.47, "grad_norm": 7.6875, "learning_rate": 4.9956598544545566e-06, "logits/chosen": -2.784058094024658, "logits/rejected": -2.781888484954834, "logps/chosen": -28.929378509521484, "logps/rejected": -29.65081787109375, "loss": 0.4245, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.44428300857543945, "rewards/margins": 0.37386995553970337, "rewards/rejected": 0.07041303813457489, "step": 180 }, { "epoch": 0.49, "grad_norm": 5.40625, "learning_rate": 4.991681456235483e-06, "logits/chosen": -2.9030683040618896, "logits/rejected": -2.899808406829834, "logps/chosen": -29.135570526123047, "logps/rejected": -28.150049209594727, "loss": 0.4023, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6306881308555603, "rewards/margins": 0.5061992406845093, "rewards/rejected": 0.12448881566524506, "step": 190 }, { "epoch": 0.52, "grad_norm": 5.90625, "learning_rate": 4.986422948250881e-06, "logits/chosen": -2.975163698196411, "logits/rejected": -2.963109254837036, "logps/chosen": -32.520572662353516, "logps/rejected": -30.08755111694336, "loss": 0.4094, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7070678472518921, "rewards/margins": 0.4991455078125, "rewards/rejected": 0.20792225003242493, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.8150291442871094, "eval_logits/rejected": -2.8121931552886963, "eval_logps/chosen": -31.104522705078125, "eval_logps/rejected": -34.668701171875, "eval_loss": 0.482407808303833, "eval_rewards/accuracies": 0.5431894063949585, "eval_rewards/chosen": 0.10675713419914246, "eval_rewards/margins": 0.0890858992934227, "eval_rewards/rejected": 0.01767122559249401, "eval_runtime": 112.9842, "eval_samples_per_second": 3.036, "eval_steps_per_second": 0.381, "step": 200 }, { "epoch": 0.55, "grad_norm": 11.5, "learning_rate": 4.9798870320769884e-06, "logits/chosen": -2.911447763442993, "logits/rejected": -2.913109302520752, "logps/chosen": -31.8501033782959, "logps/rejected": -33.66575241088867, "loss": 0.3828, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.709983766078949, "rewards/margins": 0.5925556421279907, "rewards/rejected": 0.11742812395095825, "step": 210 }, { "epoch": 0.57, "grad_norm": 8.6875, "learning_rate": 4.9720770655628216e-06, "logits/chosen": -2.8921656608581543, "logits/rejected": -2.907845973968506, "logps/chosen": -29.020355224609375, "logps/rejected": -28.375995635986328, "loss": 0.3833, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6946095824241638, "rewards/margins": 0.6032464504241943, "rewards/rejected": 0.09136321395635605, "step": 220 }, { "epoch": 0.6, "grad_norm": 10.0, "learning_rate": 4.96299706110506e-06, "logits/chosen": -2.9413981437683105, "logits/rejected": -2.9465761184692383, "logps/chosen": -30.13448143005371, "logps/rejected": -31.341196060180664, "loss": 0.42, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.610433042049408, "rewards/margins": 0.36025816202163696, "rewards/rejected": 0.250174880027771, "step": 230 }, { "epoch": 0.62, "grad_norm": 10.6875, "learning_rate": 4.952651683586668e-06, "logits/chosen": -3.0000805854797363, "logits/rejected": -3.006808280944824, "logps/chosen": -29.295434951782227, "logps/rejected": -30.107656478881836, "loss": 0.3337, "rewards/accuracies": 0.8125, "rewards/chosen": 0.9656942486763, "rewards/margins": 0.889144778251648, "rewards/rejected": 0.07654955983161926, "step": 240 }, { "epoch": 0.65, "grad_norm": 8.4375, "learning_rate": 4.9410462479802945e-06, "logits/chosen": -2.8317043781280518, "logits/rejected": -2.821113109588623, "logps/chosen": -25.70285415649414, "logps/rejected": -29.257152557373047, "loss": 0.3883, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7508188486099243, "rewards/margins": 0.5861265659332275, "rewards/rejected": 0.1646922081708908, "step": 250 }, { "epoch": 0.68, "grad_norm": 4.96875, "learning_rate": 4.928186716617686e-06, "logits/chosen": -2.8137967586517334, "logits/rejected": -2.833986759185791, "logps/chosen": -28.409215927124023, "logps/rejected": -34.153076171875, "loss": 0.3551, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8524467349052429, "rewards/margins": 0.8398415446281433, "rewards/rejected": 0.012605251744389534, "step": 260 }, { "epoch": 0.7, "grad_norm": 7.5625, "learning_rate": 4.914079696126526e-06, "logits/chosen": -2.9588234424591064, "logits/rejected": -2.965420961380005, "logps/chosen": -29.723834991455078, "logps/rejected": -30.076526641845703, "loss": 0.3619, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.6345151662826538, "rewards/margins": 0.7612727284431458, "rewards/rejected": -0.12675747275352478, "step": 270 }, { "epoch": 0.73, "grad_norm": 6.8125, "learning_rate": 4.8987324340362445e-06, "logits/chosen": -2.973771333694458, "logits/rejected": -2.9608845710754395, "logps/chosen": -29.548681259155273, "logps/rejected": -28.605484008789062, "loss": 0.376, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.7038027048110962, "rewards/margins": 0.6882829070091248, "rewards/rejected": 0.01551983691751957, "step": 280 }, { "epoch": 0.75, "grad_norm": 5.625, "learning_rate": 4.882152815054587e-06, "logits/chosen": -2.9029154777526855, "logits/rejected": -2.8847804069519043, "logps/chosen": -30.586185455322266, "logps/rejected": -30.917659759521484, "loss": 0.2623, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 1.2300727367401123, "rewards/margins": 1.466684341430664, "rewards/rejected": -0.23661167919635773, "step": 290 }, { "epoch": 0.78, "grad_norm": 9.875, "learning_rate": 4.864349357016816e-06, "logits/chosen": -2.906841278076172, "logits/rejected": -2.9034266471862793, "logps/chosen": -30.452688217163086, "logps/rejected": -27.45937156677246, "loss": 0.3408, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0619806051254272, "rewards/margins": 0.9754523038864136, "rewards/rejected": 0.08652831614017487, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.8212053775787354, "eval_logits/rejected": -2.818753719329834, "eval_logps/chosen": -31.249738693237305, "eval_logps/rejected": -34.84955596923828, "eval_loss": 0.48274698853492737, "eval_rewards/accuracies": 0.5693521499633789, "eval_rewards/chosen": 0.01962737925350666, "eval_rewards/margins": 0.11046534031629562, "eval_rewards/rejected": -0.09083796292543411, "eval_runtime": 113.0013, "eval_samples_per_second": 3.035, "eval_steps_per_second": 0.381, "step": 300 }, { "epoch": 0.81, "grad_norm": 7.28125, "learning_rate": 4.84533120650964e-06, "logits/chosen": -2.788771152496338, "logits/rejected": -2.804905652999878, "logps/chosen": -28.02764320373535, "logps/rejected": -31.13899803161621, "loss": 0.345, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7210657596588135, "rewards/margins": 0.9652621150016785, "rewards/rejected": -0.2441963255405426, "step": 310 }, { "epoch": 0.83, "grad_norm": 6.09375, "learning_rate": 4.825108134172131e-06, "logits/chosen": -3.036255359649658, "logits/rejected": -3.0221335887908936, "logps/chosen": -28.51851463317871, "logps/rejected": -28.734134674072266, "loss": 0.3044, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 1.0158424377441406, "rewards/margins": 1.2650604248046875, "rewards/rejected": -0.2492178976535797, "step": 320 }, { "epoch": 0.86, "grad_norm": 3.484375, "learning_rate": 4.80369052967602e-06, "logits/chosen": -2.9529166221618652, "logits/rejected": -2.9361720085144043, "logps/chosen": -26.438312530517578, "logps/rejected": -30.691097259521484, "loss": 0.2802, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 1.1542550325393677, "rewards/margins": 1.3339264392852783, "rewards/rejected": -0.17967157065868378, "step": 330 }, { "epoch": 0.88, "grad_norm": 8.25, "learning_rate": 4.781089396387968e-06, "logits/chosen": -3.163755178451538, "logits/rejected": -3.1707050800323486, "logps/chosen": -29.714208602905273, "logps/rejected": -33.160545349121094, "loss": 0.2852, "rewards/accuracies": 0.8125, "rewards/chosen": 1.15499746799469, "rewards/margins": 1.4337189197540283, "rewards/rejected": -0.2787213623523712, "step": 340 }, { "epoch": 0.91, "grad_norm": 4.53125, "learning_rate": 4.757316345716554e-06, "logits/chosen": -3.045511484146118, "logits/rejected": -3.04934024810791, "logps/chosen": -28.422229766845703, "logps/rejected": -31.450841903686523, "loss": 0.2882, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 1.430792212486267, "rewards/margins": 1.3933117389678955, "rewards/rejected": 0.03748062998056412, "step": 350 }, { "epoch": 0.94, "grad_norm": 6.09375, "learning_rate": 4.73238359114687e-06, "logits/chosen": -2.8853230476379395, "logits/rejected": -2.8869376182556152, "logps/chosen": -26.608606338500977, "logps/rejected": -30.023727416992188, "loss": 0.2917, "rewards/accuracies": 0.8125, "rewards/chosen": 1.224672555923462, "rewards/margins": 1.3403455018997192, "rewards/rejected": -0.11567302793264389, "step": 360 }, { "epoch": 0.96, "grad_norm": 12.1875, "learning_rate": 4.706303941965804e-06, "logits/chosen": -2.9616503715515137, "logits/rejected": -2.9597020149230957, "logps/chosen": -28.82027816772461, "logps/rejected": -32.02310562133789, "loss": 0.3079, "rewards/accuracies": 0.8125, "rewards/chosen": 1.1798226833343506, "rewards/margins": 1.2368601560592651, "rewards/rejected": -0.05703740566968918, "step": 370 }, { "epoch": 0.99, "grad_norm": 7.28125, "learning_rate": 4.679090796681225e-06, "logits/chosen": -2.9148154258728027, "logits/rejected": -2.8975720405578613, "logps/chosen": -27.317724227905273, "logps/rejected": -28.09055519104004, "loss": 0.3185, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9763870239257812, "rewards/margins": 1.0969483852386475, "rewards/rejected": -0.12056143581867218, "step": 380 }, { "epoch": 1.01, "grad_norm": 6.8125, "learning_rate": 4.650758136138454e-06, "logits/chosen": -3.2071735858917236, "logits/rejected": -3.1806423664093018, "logps/chosen": -26.902801513671875, "logps/rejected": -34.96044921875, "loss": 0.2235, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 1.6331195831298828, "rewards/margins": 2.0143303871154785, "rewards/rejected": -0.38121047616004944, "step": 390 }, { "epoch": 1.04, "grad_norm": 3.546875, "learning_rate": 4.621320516337559e-06, "logits/chosen": -2.971569538116455, "logits/rejected": -2.980116844177246, "logps/chosen": -28.79498863220215, "logps/rejected": -30.81607437133789, "loss": 0.1572, "rewards/accuracies": 0.875, "rewards/chosen": 2.092515707015991, "rewards/margins": 2.7485079765319824, "rewards/rejected": -0.6559919118881226, "step": 400 }, { "epoch": 1.04, "eval_logits/chosen": -2.817176580429077, "eval_logits/rejected": -2.81557559967041, "eval_logps/chosen": -31.311691284179688, "eval_logps/rejected": -34.947532653808594, "eval_loss": 0.47979074716567993, "eval_rewards/accuracies": 0.5838870406150818, "eval_rewards/chosen": -0.017545443028211594, "eval_rewards/margins": 0.13208113610744476, "eval_rewards/rejected": -0.14962658286094666, "eval_runtime": 112.8693, "eval_samples_per_second": 3.039, "eval_steps_per_second": 0.381, "step": 400 }, { "epoch": 1.06, "grad_norm": 8.25, "learning_rate": 4.590793060955158e-06, "logits/chosen": -2.916222095489502, "logits/rejected": -2.9033970832824707, "logps/chosen": -24.827913284301758, "logps/rejected": -29.30416488647461, "loss": 0.169, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.7523695230484009, "rewards/margins": 2.7877392768859863, "rewards/rejected": -1.035369634628296, "step": 410 }, { "epoch": 1.09, "grad_norm": 2.28125, "learning_rate": 4.559191453574582e-06, "logits/chosen": -2.95746111869812, "logits/rejected": -2.9747726917266846, "logps/chosen": -28.64497947692871, "logps/rejected": -28.555246353149414, "loss": 0.1894, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.0714821815490723, "rewards/margins": 2.5132362842559814, "rewards/rejected": -0.4417542815208435, "step": 420 }, { "epoch": 1.12, "grad_norm": 4.28125, "learning_rate": 4.52653192962838e-06, "logits/chosen": -2.9137301445007324, "logits/rejected": -2.9347903728485107, "logps/chosen": -25.830142974853516, "logps/rejected": -31.51810073852539, "loss": 0.1695, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.7540576457977295, "rewards/margins": 2.549172878265381, "rewards/rejected": -0.7951155304908752, "step": 430 }, { "epoch": 1.14, "grad_norm": 4.59375, "learning_rate": 4.492831268057307e-06, "logits/chosen": -3.002916097640991, "logits/rejected": -2.986323356628418, "logps/chosen": -30.73443031311035, "logps/rejected": -32.88414764404297, "loss": 0.1378, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.124145269393921, "rewards/margins": 2.7998132705688477, "rewards/rejected": -0.6756676435470581, "step": 440 }, { "epoch": 1.17, "grad_norm": 2.4375, "learning_rate": 4.458106782690094e-06, "logits/chosen": -2.8221781253814697, "logits/rejected": -2.826918125152588, "logps/chosen": -26.489513397216797, "logps/rejected": -31.854440689086914, "loss": 0.1559, "rewards/accuracies": 0.9375, "rewards/chosen": 2.130248546600342, "rewards/margins": 2.6519882678985596, "rewards/rejected": -0.5217396020889282, "step": 450 }, { "epoch": 1.19, "grad_norm": 3.3125, "learning_rate": 4.422376313348405e-06, "logits/chosen": -2.8889822959899902, "logits/rejected": -2.8780407905578613, "logps/chosen": -26.380014419555664, "logps/rejected": -35.219970703125, "loss": 0.13, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.2302517890930176, "rewards/margins": 3.1337485313415527, "rewards/rejected": -0.9034968614578247, "step": 460 }, { "epoch": 1.22, "grad_norm": 2.46875, "learning_rate": 4.3856582166815696e-06, "logits/chosen": -2.947139024734497, "logits/rejected": -2.954092025756836, "logps/chosen": -28.266454696655273, "logps/rejected": -32.47053146362305, "loss": 0.1473, "rewards/accuracies": 0.9375, "rewards/chosen": 2.311420202255249, "rewards/margins": 3.206058979034424, "rewards/rejected": -0.8946388959884644, "step": 470 }, { "epoch": 1.25, "grad_norm": 3.34375, "learning_rate": 4.347971356735789e-06, "logits/chosen": -2.88096284866333, "logits/rejected": -2.8864543437957764, "logps/chosen": -24.51046371459961, "logps/rejected": -30.8259334564209, "loss": 0.1619, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.911630392074585, "rewards/margins": 2.5901265144348145, "rewards/rejected": -0.6784960627555847, "step": 480 }, { "epoch": 1.27, "grad_norm": 3.40625, "learning_rate": 4.309335095262675e-06, "logits/chosen": -2.9428093433380127, "logits/rejected": -2.9582343101501465, "logps/chosen": -28.8426456451416, "logps/rejected": -31.713489532470703, "loss": 0.1316, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.3431544303894043, "rewards/margins": 3.1132519245147705, "rewards/rejected": -0.7700973749160767, "step": 490 }, { "epoch": 1.3, "grad_norm": 7.09375, "learning_rate": 4.269769281772082e-06, "logits/chosen": -3.09596586227417, "logits/rejected": -3.0849087238311768, "logps/chosen": -25.846426010131836, "logps/rejected": -34.608150482177734, "loss": 0.1463, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.3493077754974365, "rewards/margins": 3.3844122886657715, "rewards/rejected": -1.0351048707962036, "step": 500 }, { "epoch": 1.3, "eval_logits/chosen": -2.829404354095459, "eval_logits/rejected": -2.828195095062256, "eval_logps/chosen": -31.34617805480957, "eval_logps/rejected": -34.976097106933594, "eval_loss": 0.48155340552330017, "eval_rewards/accuracies": 0.5843023061752319, "eval_rewards/chosen": -0.0382361114025116, "eval_rewards/margins": 0.12853005528450012, "eval_rewards/rejected": -0.1667661815881729, "eval_runtime": 113.0419, "eval_samples_per_second": 3.034, "eval_steps_per_second": 0.38, "step": 500 }, { "epoch": 1.32, "grad_norm": 4.375, "learning_rate": 4.22929424333435e-06, "logits/chosen": -2.8922245502471924, "logits/rejected": -2.8978965282440186, "logps/chosen": -26.80906105041504, "logps/rejected": -30.753646850585938, "loss": 0.1301, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.5040323734283447, "rewards/margins": 3.261317729949951, "rewards/rejected": -0.7572852373123169, "step": 510 }, { "epoch": 1.35, "grad_norm": 5.78125, "learning_rate": 4.1879307741372085e-06, "logits/chosen": -2.823896884918213, "logits/rejected": -2.8274755477905273, "logps/chosen": -25.659046173095703, "logps/rejected": -31.231517791748047, "loss": 0.1458, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.4062535762786865, "rewards/margins": 3.25431489944458, "rewards/rejected": -0.8480612635612488, "step": 520 }, { "epoch": 1.38, "grad_norm": 2.625, "learning_rate": 4.145700124802693e-06, "logits/chosen": -2.9729692935943604, "logits/rejected": -2.9581494331359863, "logps/chosen": -26.197402954101562, "logps/rejected": -31.588998794555664, "loss": 0.1273, "rewards/accuracies": 0.9375, "rewards/chosen": 2.31791615486145, "rewards/margins": 3.3834166526794434, "rewards/rejected": -1.0655006170272827, "step": 530 }, { "epoch": 1.4, "grad_norm": 3.484375, "learning_rate": 4.102623991469562e-06, "logits/chosen": -3.1310153007507324, "logits/rejected": -3.138054609298706, "logps/chosen": -25.74569320678711, "logps/rejected": -31.806625366210938, "loss": 0.1364, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.515667200088501, "rewards/margins": 3.3743622303009033, "rewards/rejected": -0.858695387840271, "step": 540 }, { "epoch": 1.43, "grad_norm": 3.015625, "learning_rate": 4.058724504646834e-06, "logits/chosen": -3.091869592666626, "logits/rejected": -3.0971007347106934, "logps/chosen": -27.216278076171875, "logps/rejected": -29.325424194335938, "loss": 0.1242, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.619089365005493, "rewards/margins": 3.465157985687256, "rewards/rejected": -0.8460685610771179, "step": 550 }, { "epoch": 1.45, "grad_norm": 2.984375, "learning_rate": 4.014024217844167e-06, "logits/chosen": -2.855199098587036, "logits/rejected": -2.83630108833313, "logps/chosen": -26.59674072265625, "logps/rejected": -29.897335052490234, "loss": 0.1373, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.618032455444336, "rewards/margins": 3.229351043701172, "rewards/rejected": -0.6113186478614807, "step": 560 }, { "epoch": 1.48, "grad_norm": 3.515625, "learning_rate": 3.968546095984911e-06, "logits/chosen": -3.108513832092285, "logits/rejected": -3.09846830368042, "logps/chosen": -26.349437713623047, "logps/rejected": -29.17409896850586, "loss": 0.1452, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.472120761871338, "rewards/margins": 3.112577199935913, "rewards/rejected": -0.640457034111023, "step": 570 }, { "epoch": 1.51, "grad_norm": 0.86328125, "learning_rate": 3.922313503607806e-06, "logits/chosen": -3.004727840423584, "logits/rejected": -2.995328187942505, "logps/chosen": -25.118703842163086, "logps/rejected": -33.02366256713867, "loss": 0.1304, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.349811315536499, "rewards/margins": 3.2119812965393066, "rewards/rejected": -0.862169623374939, "step": 580 }, { "epoch": 1.53, "grad_norm": 4.28125, "learning_rate": 3.875350192863368e-06, "logits/chosen": -2.822783946990967, "logits/rejected": -2.7982170581817627, "logps/chosen": -24.033367156982422, "logps/rejected": -28.908611297607422, "loss": 0.1569, "rewards/accuracies": 0.875, "rewards/chosen": 2.0938639640808105, "rewards/margins": 2.7519569396972656, "rewards/rejected": -0.6580930948257446, "step": 590 }, { "epoch": 1.56, "grad_norm": 2.1875, "learning_rate": 3.8276802913111436e-06, "logits/chosen": -2.8916754722595215, "logits/rejected": -2.901120662689209, "logps/chosen": -25.461360931396484, "logps/rejected": -30.766143798828125, "loss": 0.1315, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.5561599731445312, "rewards/margins": 3.3379065990448, "rewards/rejected": -0.7817465662956238, "step": 600 }, { "epoch": 1.56, "eval_logits/chosen": -2.8065245151519775, "eval_logits/rejected": -2.8038008213043213, "eval_logps/chosen": -31.303083419799805, "eval_logps/rejected": -34.967655181884766, "eval_loss": 0.47860151529312134, "eval_rewards/accuracies": 0.5510797500610352, "eval_rewards/chosen": -0.012380531057715416, "eval_rewards/margins": 0.14931893348693848, "eval_rewards/rejected": -0.16169947385787964, "eval_runtime": 113.0498, "eval_samples_per_second": 3.034, "eval_steps_per_second": 0.38, "step": 600 }, { "epoch": 1.58, "grad_norm": 3.421875, "learning_rate": 3.7793282895240927e-06, "logits/chosen": -2.9185099601745605, "logits/rejected": -2.915782928466797, "logps/chosen": -27.880090713500977, "logps/rejected": -31.514537811279297, "loss": 0.0942, "rewards/accuracies": 0.9375, "rewards/chosen": 2.876309871673584, "rewards/margins": 3.7296981811523438, "rewards/rejected": -0.8533883094787598, "step": 610 }, { "epoch": 1.61, "grad_norm": 2.546875, "learning_rate": 3.730319028506478e-06, "logits/chosen": -2.991306781768799, "logits/rejected": -3.000715970993042, "logps/chosen": -26.9293212890625, "logps/rejected": -32.55492401123047, "loss": 0.1366, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.5626492500305176, "rewards/margins": 3.308913469314575, "rewards/rejected": -0.7462641596794128, "step": 620 }, { "epoch": 1.64, "grad_norm": 5.46875, "learning_rate": 3.6806776869317074e-06, "logits/chosen": -2.8689351081848145, "logits/rejected": -2.8681142330169678, "logps/chosen": -25.376575469970703, "logps/rejected": -29.066162109375, "loss": 0.1343, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.63903546333313, "rewards/margins": 3.1918487548828125, "rewards/rejected": -0.5528129935264587, "step": 630 }, { "epoch": 1.66, "grad_norm": 3.625, "learning_rate": 3.6304297682067146e-06, "logits/chosen": -2.7976834774017334, "logits/rejected": -2.789928436279297, "logps/chosen": -25.94685935974121, "logps/rejected": -30.736852645874023, "loss": 0.1106, "rewards/accuracies": 0.9375, "rewards/chosen": 2.5344443321228027, "rewards/margins": 3.1657357215881348, "rewards/rejected": -0.6312912702560425, "step": 640 }, { "epoch": 1.69, "grad_norm": 3.890625, "learning_rate": 3.579601087369492e-06, "logits/chosen": -3.023878574371338, "logits/rejected": -3.0125577449798584, "logps/chosen": -27.215251922607422, "logps/rejected": -33.43865966796875, "loss": 0.0966, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.687009334564209, "rewards/margins": 3.534433364868164, "rewards/rejected": -0.8474239110946655, "step": 650 }, { "epoch": 1.71, "grad_norm": 3.5625, "learning_rate": 3.5282177578265295e-06, "logits/chosen": -2.894012689590454, "logits/rejected": -2.8996853828430176, "logps/chosen": -26.658580780029297, "logps/rejected": -30.440664291381836, "loss": 0.1015, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.902081251144409, "rewards/margins": 3.6854026317596436, "rewards/rejected": -0.7833213210105896, "step": 660 }, { "epoch": 1.74, "grad_norm": 1.8671875, "learning_rate": 3.476306177936961e-06, "logits/chosen": -2.879511594772339, "logits/rejected": -2.8856289386749268, "logps/chosen": -26.245128631591797, "logps/rejected": -31.559085845947266, "loss": 0.0948, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.720550537109375, "rewards/margins": 3.60906720161438, "rewards/rejected": -0.8885162472724915, "step": 670 }, { "epoch": 1.77, "grad_norm": 2.171875, "learning_rate": 3.423893017450324e-06, "logits/chosen": -3.0573570728302, "logits/rejected": -3.0592801570892334, "logps/chosen": -24.206645965576172, "logps/rejected": -33.46504592895508, "loss": 0.0873, "rewards/accuracies": 0.9375, "rewards/chosen": 3.1212868690490723, "rewards/margins": 4.180509090423584, "rewards/rejected": -1.0592223405838013, "step": 680 }, { "epoch": 1.79, "grad_norm": 2.671875, "learning_rate": 3.3710052038048794e-06, "logits/chosen": -3.090211868286133, "logits/rejected": -3.0656895637512207, "logps/chosen": -27.19948959350586, "logps/rejected": -33.74449920654297, "loss": 0.1154, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.8534696102142334, "rewards/margins": 3.8204524517059326, "rewards/rejected": -0.9669831395149231, "step": 690 }, { "epoch": 1.82, "grad_norm": 2.59375, "learning_rate": 3.3176699082935546e-06, "logits/chosen": -3.083780288696289, "logits/rejected": -3.0772273540496826, "logps/chosen": -23.855234146118164, "logps/rejected": -34.134037017822266, "loss": 0.1119, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.035677433013916, "rewards/margins": 3.9378445148468018, "rewards/rejected": -0.902167022228241, "step": 700 }, { "epoch": 1.82, "eval_logits/chosen": -2.8201990127563477, "eval_logits/rejected": -2.8174102306365967, "eval_logps/chosen": -31.423402786254883, "eval_logps/rejected": -35.12089157104492, "eval_loss": 0.47779276967048645, "eval_rewards/accuracies": 0.5892857313156128, "eval_rewards/chosen": -0.08456874638795853, "eval_rewards/margins": 0.16907250881195068, "eval_rewards/rejected": -0.2536412179470062, "eval_runtime": 113.0163, "eval_samples_per_second": 3.035, "eval_steps_per_second": 0.38, "step": 700 }, { "epoch": 1.84, "grad_norm": 4.0, "learning_rate": 3.2639145321045933e-06, "logits/chosen": -2.902108669281006, "logits/rejected": -2.90989351272583, "logps/chosen": -27.421884536743164, "logps/rejected": -33.41091537475586, "loss": 0.1206, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.9890270233154297, "rewards/margins": 3.5715935230255127, "rewards/rejected": -0.5825664401054382, "step": 710 }, { "epoch": 1.87, "grad_norm": 4.4375, "learning_rate": 3.2097666922441107e-06, "logits/chosen": -3.0041208267211914, "logits/rejected": -3.002070665359497, "logps/chosen": -27.410593032836914, "logps/rejected": -31.874393463134766, "loss": 0.1144, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.121032238006592, "rewards/margins": 3.904160737991333, "rewards/rejected": -0.7831286787986755, "step": 720 }, { "epoch": 1.9, "grad_norm": 3.390625, "learning_rate": 3.1552542073477554e-06, "logits/chosen": -2.928129196166992, "logits/rejected": -2.9367146492004395, "logps/chosen": -23.772188186645508, "logps/rejected": -30.644943237304688, "loss": 0.1195, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.626952648162842, "rewards/margins": 3.675017833709717, "rewards/rejected": -1.0480650663375854, "step": 730 }, { "epoch": 1.92, "grad_norm": 2.234375, "learning_rate": 3.100405083388799e-06, "logits/chosen": -3.0098233222961426, "logits/rejected": -3.0144054889678955, "logps/chosen": -26.03444480895996, "logps/rejected": -37.69293212890625, "loss": 0.0753, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.2283482551574707, "rewards/margins": 4.672045707702637, "rewards/rejected": -1.4436976909637451, "step": 740 }, { "epoch": 1.95, "grad_norm": 3.84375, "learning_rate": 3.0452474992899645e-06, "logits/chosen": -3.0085926055908203, "logits/rejected": -2.999788999557495, "logps/chosen": -27.861740112304688, "logps/rejected": -33.77484893798828, "loss": 0.1034, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.2917075157165527, "rewards/margins": 4.111935615539551, "rewards/rejected": -0.8202279806137085, "step": 750 }, { "epoch": 1.97, "grad_norm": 1.0625, "learning_rate": 2.989809792446417e-06, "logits/chosen": -2.915017604827881, "logits/rejected": -2.9230797290802, "logps/chosen": -25.956058502197266, "logps/rejected": -30.55405616760254, "loss": 0.0828, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.0310025215148926, "rewards/margins": 3.933666229248047, "rewards/rejected": -0.9026637077331543, "step": 760 }, { "epoch": 2.0, "grad_norm": 5.65625, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -2.9238173961639404, "logits/rejected": -2.93845534324646, "logps/chosen": -25.6632137298584, "logps/rejected": -33.923255920410156, "loss": 0.1017, "rewards/accuracies": 0.9375, "rewards/chosen": 3.0752081871032715, "rewards/margins": 3.832324981689453, "rewards/rejected": -0.7571166157722473, "step": 770 }, { "epoch": 2.03, "grad_norm": 1.328125, "learning_rate": 2.878208065043501e-06, "logits/chosen": -2.993861675262451, "logits/rejected": -2.9855971336364746, "logps/chosen": -26.52446937561035, "logps/rejected": -31.801513671875, "loss": 0.0571, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.4877707958221436, "rewards/margins": 4.527015686035156, "rewards/rejected": -1.0392451286315918, "step": 780 }, { "epoch": 2.05, "grad_norm": 3.65625, "learning_rate": 2.8221013802485974e-06, "logits/chosen": -2.987353801727295, "logits/rejected": -2.9833192825317383, "logps/chosen": -21.95352554321289, "logps/rejected": -31.86166763305664, "loss": 0.0711, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.0340464115142822, "rewards/margins": 4.315891742706299, "rewards/rejected": -1.2818453311920166, "step": 790 }, { "epoch": 2.08, "grad_norm": 0.95703125, "learning_rate": 2.76582921478147e-06, "logits/chosen": -2.9272208213806152, "logits/rejected": -2.90736722946167, "logps/chosen": -24.432416915893555, "logps/rejected": -34.16832733154297, "loss": 0.0639, "rewards/accuracies": 0.9375, "rewards/chosen": 3.5171732902526855, "rewards/margins": 4.869467258453369, "rewards/rejected": -1.352293610572815, "step": 800 }, { "epoch": 2.08, "eval_logits/chosen": -2.8334927558898926, "eval_logits/rejected": -2.8306193351745605, "eval_logps/chosen": -31.477209091186523, "eval_logps/rejected": -35.219730377197266, "eval_loss": 0.4745987057685852, "eval_rewards/accuracies": 0.5689368844032288, "eval_rewards/chosen": -0.11685475707054138, "eval_rewards/margins": 0.1960887312889099, "eval_rewards/rejected": -0.3129434883594513, "eval_runtime": 112.9451, "eval_samples_per_second": 3.037, "eval_steps_per_second": 0.381, "step": 800 }, { "epoch": 2.1, "grad_norm": 1.2109375, "learning_rate": 2.7094204786572254e-06, "logits/chosen": -3.0222575664520264, "logits/rejected": -3.0411593914031982, "logps/chosen": -25.39664077758789, "logps/rejected": -34.01559066772461, "loss": 0.0482, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.7271015644073486, "rewards/margins": 5.292603492736816, "rewards/rejected": -1.5655021667480469, "step": 810 }, { "epoch": 2.13, "grad_norm": 1.8671875, "learning_rate": 2.6529041520546072e-06, "logits/chosen": -3.014270067214966, "logits/rejected": -3.003432512283325, "logps/chosen": -26.43511390686035, "logps/rejected": -32.2878303527832, "loss": 0.0526, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.8910248279571533, "rewards/margins": 5.3643479347229, "rewards/rejected": -1.4733226299285889, "step": 820 }, { "epoch": 2.16, "grad_norm": 2.109375, "learning_rate": 2.5963092704273302e-06, "logits/chosen": -2.8816628456115723, "logits/rejected": -2.8790135383605957, "logps/chosen": -25.601741790771484, "logps/rejected": -27.62906837463379, "loss": 0.0569, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.6401939392089844, "rewards/margins": 4.657577991485596, "rewards/rejected": -1.0173836946487427, "step": 830 }, { "epoch": 2.18, "grad_norm": 1.4921875, "learning_rate": 2.53966490958702e-06, "logits/chosen": -2.982262372970581, "logits/rejected": -2.9851441383361816, "logps/chosen": -26.071903228759766, "logps/rejected": -29.37398910522461, "loss": 0.0427, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.9142680168151855, "rewards/margins": 5.082181453704834, "rewards/rejected": -1.1679133176803589, "step": 840 }, { "epoch": 2.21, "grad_norm": 3.0, "learning_rate": 2.4830001707654135e-06, "logits/chosen": -3.063699245452881, "logits/rejected": -3.06714129447937, "logps/chosen": -23.322050094604492, "logps/rejected": -32.086116790771484, "loss": 0.0488, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.732470989227295, "rewards/margins": 4.926577568054199, "rewards/rejected": -1.1941062211990356, "step": 850 }, { "epoch": 2.23, "grad_norm": 1.40625, "learning_rate": 2.4263441656635054e-06, "logits/chosen": -2.901484489440918, "logits/rejected": -2.9057114124298096, "logps/chosen": -18.839258193969727, "logps/rejected": -29.934463500976562, "loss": 0.069, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.2325949668884277, "rewards/margins": 4.606924057006836, "rewards/rejected": -1.374328851699829, "step": 860 }, { "epoch": 2.26, "grad_norm": 1.578125, "learning_rate": 2.3697260014953107e-06, "logits/chosen": -3.07415509223938, "logits/rejected": -3.0575265884399414, "logps/chosen": -25.639835357666016, "logps/rejected": -29.889537811279297, "loss": 0.0567, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.7141928672790527, "rewards/margins": 4.955038070678711, "rewards/rejected": -1.2408453226089478, "step": 870 }, { "epoch": 2.29, "grad_norm": 1.4375, "learning_rate": 2.3131747660339396e-06, "logits/chosen": -2.9700875282287598, "logits/rejected": -2.947779417037964, "logps/chosen": -25.307472229003906, "logps/rejected": -32.87110137939453, "loss": 0.0694, "rewards/accuracies": 0.9375, "rewards/chosen": 3.471764087677002, "rewards/margins": 4.802722930908203, "rewards/rejected": -1.330958604812622, "step": 880 }, { "epoch": 2.31, "grad_norm": 1.0546875, "learning_rate": 2.256719512667651e-06, "logits/chosen": -2.9568545818328857, "logits/rejected": -2.9552457332611084, "logps/chosen": -26.856555938720703, "logps/rejected": -35.93258285522461, "loss": 0.0539, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.675328493118286, "rewards/margins": 5.377382278442383, "rewards/rejected": -1.7020537853240967, "step": 890 }, { "epoch": 2.34, "grad_norm": 1.046875, "learning_rate": 2.2003892454735786e-06, "logits/chosen": -2.968878746032715, "logits/rejected": -2.987762928009033, "logps/chosen": -24.137746810913086, "logps/rejected": -33.46139144897461, "loss": 0.0635, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.534593105316162, "rewards/margins": 5.318085670471191, "rewards/rejected": -1.7834930419921875, "step": 900 }, { "epoch": 2.34, "eval_logits/chosen": -2.834638833999634, "eval_logits/rejected": -2.8313143253326416, "eval_logps/chosen": -31.596710205078125, "eval_logps/rejected": -35.334754943847656, "eval_loss": 0.47681429982185364, "eval_rewards/accuracies": 0.5776578187942505, "eval_rewards/chosen": -0.18855668604373932, "eval_rewards/margins": 0.19340036809444427, "eval_rewards/rejected": -0.381957083940506, "eval_runtime": 112.9515, "eval_samples_per_second": 3.037, "eval_steps_per_second": 0.381, "step": 900 }, { "epoch": 2.36, "grad_norm": 1.2109375, "learning_rate": 2.1442129043167877e-06, "logits/chosen": -2.9530720710754395, "logits/rejected": -2.942460775375366, "logps/chosen": -25.267536163330078, "logps/rejected": -34.3054313659668, "loss": 0.0519, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.6163666248321533, "rewards/margins": 5.410401344299316, "rewards/rejected": -1.794034719467163, "step": 910 }, { "epoch": 2.39, "grad_norm": 1.359375, "learning_rate": 2.088219349982323e-06, "logits/chosen": -3.026940107345581, "logits/rejected": -3.0076868534088135, "logps/chosen": -25.835464477539062, "logps/rejected": -31.962453842163086, "loss": 0.0514, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.233044624328613, "rewards/margins": 5.530001640319824, "rewards/rejected": -1.2969573736190796, "step": 920 }, { "epoch": 2.42, "grad_norm": 1.96875, "learning_rate": 2.0324373493478803e-06, "logits/chosen": -2.927703857421875, "logits/rejected": -2.92795991897583, "logps/chosen": -24.350366592407227, "logps/rejected": -34.520790100097656, "loss": 0.051, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.440142869949341, "rewards/margins": 5.1205925941467285, "rewards/rejected": -1.6804498434066772, "step": 930 }, { "epoch": 2.44, "grad_norm": 1.5625, "learning_rate": 1.976895560604729e-06, "logits/chosen": -3.020810604095459, "logits/rejected": -3.0526111125946045, "logps/chosen": -23.749736785888672, "logps/rejected": -32.38919448852539, "loss": 0.0596, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.4667294025421143, "rewards/margins": 4.964657783508301, "rewards/rejected": -1.4979279041290283, "step": 940 }, { "epoch": 2.47, "grad_norm": 1.4140625, "learning_rate": 1.921622518534466e-06, "logits/chosen": -2.960104465484619, "logits/rejected": -2.963212251663208, "logps/chosen": -23.69285774230957, "logps/rejected": -34.711917877197266, "loss": 0.0857, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.276454210281372, "rewards/margins": 4.899614334106445, "rewards/rejected": -1.6231597661972046, "step": 950 }, { "epoch": 2.49, "grad_norm": 1.765625, "learning_rate": 1.8666466198491794e-06, "logits/chosen": -3.0337109565734863, "logits/rejected": -3.014859676361084, "logps/chosen": -25.189592361450195, "logps/rejected": -37.11577224731445, "loss": 0.0675, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.5953078269958496, "rewards/margins": 5.337515830993652, "rewards/rejected": -1.7422077655792236, "step": 960 }, { "epoch": 2.52, "grad_norm": 0.5390625, "learning_rate": 1.8119961086025376e-06, "logits/chosen": -3.1791131496429443, "logits/rejected": -3.1823792457580566, "logps/chosen": -22.690166473388672, "logps/rejected": -33.159767150878906, "loss": 0.0519, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.744643449783325, "rewards/margins": 5.208704948425293, "rewards/rejected": -1.4640613794326782, "step": 970 }, { "epoch": 2.55, "grad_norm": 1.203125, "learning_rate": 1.7576990616793139e-06, "logits/chosen": -3.0011444091796875, "logits/rejected": -3.0270564556121826, "logps/chosen": -24.048646926879883, "logps/rejected": -33.56853103637695, "loss": 0.0571, "rewards/accuracies": 0.9375, "rewards/chosen": 3.940692901611328, "rewards/margins": 5.312582969665527, "rewards/rejected": -1.3718901872634888, "step": 980 }, { "epoch": 2.57, "grad_norm": 1.0234375, "learning_rate": 1.7037833743707892e-06, "logits/chosen": -2.9023399353027344, "logits/rejected": -2.9006032943725586, "logps/chosen": -27.717641830444336, "logps/rejected": -31.604290008544922, "loss": 0.0834, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6766304969787598, "rewards/margins": 4.718010902404785, "rewards/rejected": -1.041380763053894, "step": 990 }, { "epoch": 2.6, "grad_norm": 3.4375, "learning_rate": 1.6502767460434588e-06, "logits/chosen": -3.0629963874816895, "logits/rejected": -3.062066078186035, "logps/chosen": -25.74740219116211, "logps/rejected": -34.4100456237793, "loss": 0.0517, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.686403274536133, "rewards/margins": 4.985658645629883, "rewards/rejected": -1.29925537109375, "step": 1000 }, { "epoch": 2.6, "eval_logits/chosen": -2.8335611820220947, "eval_logits/rejected": -2.830505609512329, "eval_logps/chosen": -31.62620735168457, "eval_logps/rejected": -35.325984954833984, "eval_loss": 0.48039767146110535, "eval_rewards/accuracies": 0.5544019937515259, "eval_rewards/chosen": -0.20625291764736176, "eval_rewards/margins": 0.17044585943222046, "eval_rewards/rejected": -0.3766987919807434, "eval_runtime": 113.0374, "eval_samples_per_second": 3.034, "eval_steps_per_second": 0.38, "step": 1000 }, { "epoch": 2.62, "grad_norm": 2.40625, "learning_rate": 1.5972066659083796e-06, "logits/chosen": -3.0035667419433594, "logits/rejected": -2.9842052459716797, "logps/chosen": -24.36931610107422, "logps/rejected": -33.009544372558594, "loss": 0.0701, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.5192456245422363, "rewards/margins": 4.74137020111084, "rewards/rejected": -1.2221249341964722, "step": 1010 }, { "epoch": 2.65, "grad_norm": 1.3359375, "learning_rate": 1.5446003988985041e-06, "logits/chosen": -3.0026066303253174, "logits/rejected": -3.0053837299346924, "logps/chosen": -22.129589080810547, "logps/rejected": -31.10976219177246, "loss": 0.0458, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.6123199462890625, "rewards/margins": 5.211001396179199, "rewards/rejected": -1.5986814498901367, "step": 1020 }, { "epoch": 2.68, "grad_norm": 0.69140625, "learning_rate": 1.4924849716612211e-06, "logits/chosen": -2.9331469535827637, "logits/rejected": -2.917952537536621, "logps/chosen": -24.52996826171875, "logps/rejected": -33.4886474609375, "loss": 0.0447, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.6283657550811768, "rewards/margins": 5.028862476348877, "rewards/rejected": -1.4004970788955688, "step": 1030 }, { "epoch": 2.7, "grad_norm": 5.5625, "learning_rate": 1.440887158673332e-06, "logits/chosen": -2.8529088497161865, "logits/rejected": -2.8682234287261963, "logps/chosen": -26.911334991455078, "logps/rejected": -35.011165618896484, "loss": 0.0626, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.6526360511779785, "rewards/margins": 5.054805755615234, "rewards/rejected": -1.4021697044372559, "step": 1040 }, { "epoch": 2.73, "grad_norm": 1.46875, "learning_rate": 1.3898334684855647e-06, "logits/chosen": -2.9247629642486572, "logits/rejected": -2.9314653873443604, "logps/chosen": -23.7067813873291, "logps/rejected": -32.50202941894531, "loss": 0.0761, "rewards/accuracies": 0.9375, "rewards/chosen": 3.3870720863342285, "rewards/margins": 4.524316310882568, "rewards/rejected": -1.137244462966919, "step": 1050 }, { "epoch": 2.75, "grad_norm": 3.046875, "learning_rate": 1.3393501301037245e-06, "logits/chosen": -3.0412840843200684, "logits/rejected": -3.038510799407959, "logps/chosen": -24.021516799926758, "logps/rejected": -32.73785400390625, "loss": 0.056, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.6689066886901855, "rewards/margins": 4.808653354644775, "rewards/rejected": -1.1397463083267212, "step": 1060 }, { "epoch": 2.78, "grad_norm": 1.828125, "learning_rate": 1.2894630795134454e-06, "logits/chosen": -2.927729368209839, "logits/rejected": -2.9417080879211426, "logps/chosen": -25.402706146240234, "logps/rejected": -34.0269775390625, "loss": 0.0478, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.6457531452178955, "rewards/margins": 5.079578399658203, "rewards/rejected": -1.4338250160217285, "step": 1070 }, { "epoch": 2.81, "grad_norm": 1.2734375, "learning_rate": 1.2401979463554984e-06, "logits/chosen": -2.8187551498413086, "logits/rejected": -2.80017352104187, "logps/chosen": -25.68526840209961, "logps/rejected": -31.5473690032959, "loss": 0.0722, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 3.6150219440460205, "rewards/margins": 4.627933502197266, "rewards/rejected": -1.0129120349884033, "step": 1080 }, { "epoch": 2.83, "grad_norm": 1.734375, "learning_rate": 1.1915800407584705e-06, "logits/chosen": -2.89567232131958, "logits/rejected": -2.8840785026550293, "logps/chosen": -26.24384117126465, "logps/rejected": -30.80573081970215, "loss": 0.0608, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.66672945022583, "rewards/margins": 5.016554355621338, "rewards/rejected": -1.3498249053955078, "step": 1090 }, { "epoch": 2.86, "grad_norm": 3.21875, "learning_rate": 1.1436343403356019e-06, "logits/chosen": -3.0152509212493896, "logits/rejected": -3.0058329105377197, "logps/chosen": -25.186405181884766, "logps/rejected": -36.0217399597168, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": 4.073953628540039, "rewards/margins": 5.811936855316162, "rewards/rejected": -1.7379831075668335, "step": 1100 }, { "epoch": 2.86, "eval_logits/chosen": -2.833212375640869, "eval_logits/rejected": -2.829836368560791, "eval_logps/chosen": -31.67898941040039, "eval_logps/rejected": -35.403167724609375, "eval_loss": 0.4793899655342102, "eval_rewards/accuracies": 0.5689368844032288, "eval_rewards/chosen": -0.23792240023612976, "eval_rewards/margins": 0.1850845217704773, "eval_rewards/rejected": -0.42300689220428467, "eval_runtime": 112.9301, "eval_samples_per_second": 3.037, "eval_steps_per_second": 0.381, "step": 1100 }, { "epoch": 2.88, "grad_norm": 1.0546875, "learning_rate": 1.0963854773524548e-06, "logits/chosen": -2.9000511169433594, "logits/rejected": -2.882561206817627, "logps/chosen": -26.57952880859375, "logps/rejected": -35.87388229370117, "loss": 0.0781, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.605377674102783, "rewards/margins": 4.869858741760254, "rewards/rejected": -1.2644808292388916, "step": 1110 }, { "epoch": 2.91, "grad_norm": 0.72265625, "learning_rate": 1.049857726072005e-06, "logits/chosen": -2.9708096981048584, "logits/rejected": -2.9758188724517822, "logps/chosen": -23.589977264404297, "logps/rejected": -32.91487503051758, "loss": 0.0506, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.568798780441284, "rewards/margins": 5.034553050994873, "rewards/rejected": -1.4657539129257202, "step": 1120 }, { "epoch": 2.94, "grad_norm": 2.96875, "learning_rate": 1.0040749902836508e-06, "logits/chosen": -2.955108165740967, "logits/rejected": -2.952389717102051, "logps/chosen": -21.273967742919922, "logps/rejected": -29.598825454711914, "loss": 0.0378, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.8687362670898438, "rewards/margins": 5.249798774719238, "rewards/rejected": -1.3810631036758423, "step": 1130 }, { "epoch": 2.96, "grad_norm": 2.140625, "learning_rate": 9.59060791022566e-07, "logits/chosen": -2.8295435905456543, "logits/rejected": -2.8248634338378906, "logps/chosen": -25.344301223754883, "logps/rejected": -32.20793151855469, "loss": 0.0594, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.755171537399292, "rewards/margins": 4.977625846862793, "rewards/rejected": -1.2224547863006592, "step": 1140 }, { "epoch": 2.99, "grad_norm": 1.890625, "learning_rate": 9.148382544856885e-07, "logits/chosen": -3.0266003608703613, "logits/rejected": -3.0318973064422607, "logps/chosen": -20.908588409423828, "logps/rejected": -32.997406005859375, "loss": 0.0657, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.2996649742126465, "rewards/margins": 4.838730812072754, "rewards/rejected": -1.5390652418136597, "step": 1150 }, { "epoch": 3.01, "grad_norm": 1.171875, "learning_rate": 8.714301001505568e-07, "logits/chosen": -2.8871684074401855, "logits/rejected": -2.872318983078003, "logps/chosen": -24.629959106445312, "logps/rejected": -36.02922058105469, "loss": 0.0492, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.7981956005096436, "rewards/margins": 5.512874603271484, "rewards/rejected": -1.7146790027618408, "step": 1160 }, { "epoch": 3.04, "grad_norm": 1.0390625, "learning_rate": 8.288586291031025e-07, "logits/chosen": -3.016979694366455, "logits/rejected": -3.015721559524536, "logps/chosen": -22.779308319091797, "logps/rejected": -33.03929138183594, "loss": 0.04, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.8180980682373047, "rewards/margins": 5.5419487953186035, "rewards/rejected": -1.7238508462905884, "step": 1170 }, { "epoch": 3.06, "grad_norm": 1.1015625, "learning_rate": 7.871457125803897e-07, "logits/chosen": -2.9248671531677246, "logits/rejected": -2.9196035861968994, "logps/chosen": -27.77500343322754, "logps/rejected": -34.656959533691406, "loss": 0.0455, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.9855456352233887, "rewards/margins": 5.358269691467285, "rewards/rejected": -1.3727240562438965, "step": 1180 }, { "epoch": 3.09, "grad_norm": 1.2421875, "learning_rate": 7.463127807341966e-07, "logits/chosen": -3.0505237579345703, "logits/rejected": -3.050422191619873, "logps/chosen": -26.378360748291016, "logps/rejected": -33.47454833984375, "loss": 0.0472, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.9589202404022217, "rewards/margins": 5.243517875671387, "rewards/rejected": -1.2845983505249023, "step": 1190 }, { "epoch": 3.12, "grad_norm": 1.03125, "learning_rate": 7.063808116212021e-07, "logits/chosen": -3.0457065105438232, "logits/rejected": -3.035034656524658, "logps/chosen": -24.127140045166016, "logps/rejected": -32.09668731689453, "loss": 0.0511, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.5936577320098877, "rewards/margins": 4.924044609069824, "rewards/rejected": -1.3303877115249634, "step": 1200 }, { "epoch": 3.12, "eval_logits/chosen": -2.83296275138855, "eval_logits/rejected": -2.8296968936920166, "eval_logps/chosen": -31.673555374145508, "eval_logps/rejected": -35.419559478759766, "eval_loss": 0.47834986448287964, "eval_rewards/accuracies": 0.5805647969245911, "eval_rewards/chosen": -0.23466025292873383, "eval_rewards/margins": 0.1981838196516037, "eval_rewards/rejected": -0.4328440725803375, "eval_runtime": 113.0122, "eval_samples_per_second": 3.035, "eval_steps_per_second": 0.38, "step": 1200 }, { "epoch": 3.14, "grad_norm": 2.0625, "learning_rate": 6.673703204254348e-07, "logits/chosen": -2.946625232696533, "logits/rejected": -2.9526054859161377, "logps/chosen": -23.166765213012695, "logps/rejected": -31.085351943969727, "loss": 0.0453, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.759180784225464, "rewards/margins": 5.341059684753418, "rewards/rejected": -1.5818792581558228, "step": 1210 }, { "epoch": 3.17, "grad_norm": 1.109375, "learning_rate": 6.293013489185315e-07, "logits/chosen": -2.9684956073760986, "logits/rejected": -2.977858781814575, "logps/chosen": -24.800304412841797, "logps/rejected": -33.38405227661133, "loss": 0.0494, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 4.063855171203613, "rewards/margins": 5.38551139831543, "rewards/rejected": -1.3216559886932373, "step": 1220 }, { "epoch": 3.19, "grad_norm": 0.7265625, "learning_rate": 5.921934551632086e-07, "logits/chosen": -2.8769516944885254, "logits/rejected": -2.883462429046631, "logps/chosen": -24.908557891845703, "logps/rejected": -33.673988342285156, "loss": 0.064, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 3.6550304889678955, "rewards/margins": 5.116556644439697, "rewards/rejected": -1.4615256786346436, "step": 1230 }, { "epoch": 3.22, "grad_norm": 0.498046875, "learning_rate": 5.560657034652405e-07, "logits/chosen": -2.9312660694122314, "logits/rejected": -2.920850992202759, "logps/chosen": -24.9986572265625, "logps/rejected": -32.03197479248047, "loss": 0.0444, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.929927349090576, "rewards/margins": 5.388668537139893, "rewards/rejected": -1.4587411880493164, "step": 1240 }, { "epoch": 3.25, "grad_norm": 1.421875, "learning_rate": 5.2093665457911e-07, "logits/chosen": -2.887247085571289, "logits/rejected": -2.8908119201660156, "logps/chosen": -25.24740982055664, "logps/rejected": -35.18986892700195, "loss": 0.0313, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.033031940460205, "rewards/margins": 5.677703857421875, "rewards/rejected": -1.6446723937988281, "step": 1250 }, { "epoch": 3.27, "grad_norm": 0.671875, "learning_rate": 4.868243561723535e-07, "logits/chosen": -2.9369864463806152, "logits/rejected": -2.932027578353882, "logps/chosen": -23.15317153930664, "logps/rejected": -33.180938720703125, "loss": 0.0521, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.6488914489746094, "rewards/margins": 5.095983028411865, "rewards/rejected": -1.4470914602279663, "step": 1260 }, { "epoch": 3.3, "grad_norm": 0.73828125, "learning_rate": 4.537463335535161e-07, "logits/chosen": -2.9909985065460205, "logits/rejected": -2.983795166015625, "logps/chosen": -23.164331436157227, "logps/rejected": -32.18230056762695, "loss": 0.0417, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.8721091747283936, "rewards/margins": 5.354145526885986, "rewards/rejected": -1.4820353984832764, "step": 1270 }, { "epoch": 3.32, "grad_norm": 1.6796875, "learning_rate": 4.217195806684629e-07, "logits/chosen": -3.09962797164917, "logits/rejected": -3.095383644104004, "logps/chosen": -25.109115600585938, "logps/rejected": -32.72521209716797, "loss": 0.0508, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.8072261810302734, "rewards/margins": 5.412503719329834, "rewards/rejected": -1.6052783727645874, "step": 1280 }, { "epoch": 3.35, "grad_norm": 0.8671875, "learning_rate": 3.907605513696808e-07, "logits/chosen": -3.1644155979156494, "logits/rejected": -3.155869722366333, "logps/chosen": -24.283798217773438, "logps/rejected": -37.04186248779297, "loss": 0.0609, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.075181484222412, "rewards/margins": 5.640608787536621, "rewards/rejected": -1.5654271841049194, "step": 1290 }, { "epoch": 3.38, "grad_norm": 0.4375, "learning_rate": 3.6088515096305675e-07, "logits/chosen": -3.0158133506774902, "logits/rejected": -3.00400972366333, "logps/chosen": -23.823991775512695, "logps/rejected": -33.074520111083984, "loss": 0.0414, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 4.1293792724609375, "rewards/margins": 5.917204856872559, "rewards/rejected": -1.7878259420394897, "step": 1300 }, { "epoch": 3.38, "eval_logits/chosen": -2.8339529037475586, "eval_logits/rejected": -2.830853223800659, "eval_logps/chosen": -31.698429107666016, "eval_logps/rejected": -35.45466613769531, "eval_loss": 0.4777953028678894, "eval_rewards/accuracies": 0.5660299062728882, "eval_rewards/chosen": -0.24958597123622894, "eval_rewards/margins": 0.20431919395923615, "eval_rewards/rejected": -0.4539051353931427, "eval_runtime": 112.9348, "eval_samples_per_second": 3.037, "eval_steps_per_second": 0.381, "step": 1300 }, { "epoch": 3.4, "grad_norm": 1.609375, "learning_rate": 3.321087280364757e-07, "logits/chosen": -2.985424757003784, "logits/rejected": -2.9805543422698975, "logps/chosen": -21.739099502563477, "logps/rejected": -34.769596099853516, "loss": 0.0479, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.8243980407714844, "rewards/margins": 5.4849443435668945, "rewards/rejected": -1.6605466604232788, "step": 1310 }, { "epoch": 3.43, "grad_norm": 1.2578125, "learning_rate": 3.044460665744284e-07, "logits/chosen": -2.931580066680908, "logits/rejected": -2.9400153160095215, "logps/chosen": -23.88373565673828, "logps/rejected": -33.76406478881836, "loss": 0.0368, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.7448341846466064, "rewards/margins": 5.480520248413086, "rewards/rejected": -1.73568594455719, "step": 1320 }, { "epoch": 3.45, "grad_norm": 1.453125, "learning_rate": 2.779113783626916e-07, "logits/chosen": -2.940342426300049, "logits/rejected": -2.9283084869384766, "logps/chosen": -23.365154266357422, "logps/rejected": -34.24005889892578, "loss": 0.0354, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.842486619949341, "rewards/margins": 5.5103759765625, "rewards/rejected": -1.6678886413574219, "step": 1330 }, { "epoch": 3.48, "grad_norm": 1.46875, "learning_rate": 2.5251829568697204e-07, "logits/chosen": -3.1717753410339355, "logits/rejected": -3.1576473712921143, "logps/chosen": -24.95553207397461, "logps/rejected": -30.344104766845703, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 3.9849143028259277, "rewards/margins": 5.419525146484375, "rewards/rejected": -1.4346110820770264, "step": 1340 }, { "epoch": 3.51, "grad_norm": 1.5, "learning_rate": 2.2827986432927774e-07, "logits/chosen": -3.0225768089294434, "logits/rejected": -3.0164501667022705, "logps/chosen": -23.988834381103516, "logps/rejected": -33.98307418823242, "loss": 0.0308, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 4.108965873718262, "rewards/margins": 5.866541385650635, "rewards/rejected": -1.7575757503509521, "step": 1350 }, { "epoch": 3.53, "grad_norm": 3.046875, "learning_rate": 2.0520853686560177e-07, "logits/chosen": -2.9172768592834473, "logits/rejected": -2.905526638031006, "logps/chosen": -24.87664222717285, "logps/rejected": -34.49529266357422, "loss": 0.0388, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.101553916931152, "rewards/margins": 5.823625564575195, "rewards/rejected": -1.7220712900161743, "step": 1360 }, { "epoch": 3.56, "grad_norm": 1.109375, "learning_rate": 1.833161662683672e-07, "logits/chosen": -2.9339916706085205, "logits/rejected": -2.931077480316162, "logps/chosen": -23.854618072509766, "logps/rejected": -31.55521011352539, "loss": 0.0439, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.770139217376709, "rewards/margins": 5.255643844604492, "rewards/rejected": -1.485504388809204, "step": 1370 }, { "epoch": 3.58, "grad_norm": 0.76171875, "learning_rate": 1.626139998169246e-07, "logits/chosen": -2.949082851409912, "logits/rejected": -2.9616751670837402, "logps/chosen": -22.38051986694336, "logps/rejected": -30.219009399414062, "loss": 0.0559, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.778834819793701, "rewards/margins": 5.045391082763672, "rewards/rejected": -1.2665560245513916, "step": 1380 }, { "epoch": 3.61, "grad_norm": 2.609375, "learning_rate": 1.4311267331922535e-07, "logits/chosen": -3.1200642585754395, "logits/rejected": -3.123692750930786, "logps/chosen": -23.121355056762695, "logps/rejected": -33.11595916748047, "loss": 0.0457, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.6164307594299316, "rewards/margins": 4.941426753997803, "rewards/rejected": -1.3249962329864502, "step": 1390 }, { "epoch": 3.64, "grad_norm": 0.498046875, "learning_rate": 1.2482220564763669e-07, "logits/chosen": -2.9267444610595703, "logits/rejected": -2.906433343887329, "logps/chosen": -26.804752349853516, "logps/rejected": -32.748016357421875, "loss": 0.0334, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.143013954162598, "rewards/margins": 5.595001220703125, "rewards/rejected": -1.4519875049591064, "step": 1400 }, { "epoch": 3.64, "eval_logits/chosen": -2.8337981700897217, "eval_logits/rejected": -2.8307693004608154, "eval_logps/chosen": -31.6983642578125, "eval_logps/rejected": -35.452125549316406, "eval_loss": 0.47829362750053406, "eval_rewards/accuracies": 0.5776578187942505, "eval_rewards/chosen": -0.2495480477809906, "eval_rewards/margins": 0.20283380150794983, "eval_rewards/rejected": -0.4523819386959076, "eval_runtime": 113.0096, "eval_samples_per_second": 3.035, "eval_steps_per_second": 0.38, "step": 1400 }, { "epoch": 3.66, "grad_norm": 1.421875, "learning_rate": 1.0775199359171346e-07, "logits/chosen": -2.8893215656280518, "logits/rejected": -2.9038751125335693, "logps/chosen": -25.14376449584961, "logps/rejected": -33.68744659423828, "loss": 0.0468, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 4.028775691986084, "rewards/margins": 5.332303047180176, "rewards/rejected": -1.3035272359848022, "step": 1410 }, { "epoch": 3.69, "grad_norm": 3.109375, "learning_rate": 9.191080703056604e-08, "logits/chosen": -3.1276631355285645, "logits/rejected": -3.1215927600860596, "logps/chosen": -24.765342712402344, "logps/rejected": -32.85124588012695, "loss": 0.0417, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.918041706085205, "rewards/margins": 5.568787574768066, "rewards/rejected": -1.6507457494735718, "step": 1420 }, { "epoch": 3.71, "grad_norm": 2.515625, "learning_rate": 7.730678442730539e-08, "logits/chosen": -2.9715523719787598, "logits/rejected": -2.958172559738159, "logps/chosen": -26.20791244506836, "logps/rejected": -33.907264709472656, "loss": 0.0413, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 3.9364962577819824, "rewards/margins": 5.6175336837768555, "rewards/rejected": -1.6810375452041626, "step": 1430 }, { "epoch": 3.74, "grad_norm": 0.46484375, "learning_rate": 6.394742864787806e-08, "logits/chosen": -2.957505226135254, "logits/rejected": -2.9644668102264404, "logps/chosen": -23.84396743774414, "logps/rejected": -34.71217346191406, "loss": 0.0407, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.8953583240509033, "rewards/margins": 5.603337287902832, "rewards/rejected": -1.7079784870147705, "step": 1440 }, { "epoch": 3.77, "grad_norm": 2.65625, "learning_rate": 5.183960310644748e-08, "logits/chosen": -2.929403781890869, "logits/rejected": -2.931849241256714, "logps/chosen": -25.9273738861084, "logps/rejected": -34.08827209472656, "loss": 0.0449, "rewards/accuracies": 0.9375, "rewards/chosen": 4.037843227386475, "rewards/margins": 5.682816505432129, "rewards/rejected": -1.644972801208496, "step": 1450 }, { "epoch": 3.79, "grad_norm": 0.79296875, "learning_rate": 4.098952823928693e-08, "logits/chosen": -2.9166524410247803, "logits/rejected": -2.912785530090332, "logps/chosen": -22.781108856201172, "logps/rejected": -33.448795318603516, "loss": 0.0463, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.875708818435669, "rewards/margins": 5.689621925354004, "rewards/rejected": -1.8139129877090454, "step": 1460 }, { "epoch": 3.82, "grad_norm": 1.890625, "learning_rate": 3.1402778309014284e-08, "logits/chosen": -2.9308972358703613, "logits/rejected": -2.915548086166382, "logps/chosen": -21.67947006225586, "logps/rejected": -30.910449981689453, "loss": 0.056, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.7967326641082764, "rewards/margins": 5.388635158538818, "rewards/rejected": -1.5919020175933838, "step": 1470 }, { "epoch": 3.84, "grad_norm": 2.046875, "learning_rate": 2.3084278540791427e-08, "logits/chosen": -2.89516019821167, "logits/rejected": -2.9039645195007324, "logps/chosen": -26.142074584960938, "logps/rejected": -32.463157653808594, "loss": 0.0454, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.7165653705596924, "rewards/margins": 5.240030765533447, "rewards/rejected": -1.5234657526016235, "step": 1480 }, { "epoch": 3.87, "grad_norm": 1.3046875, "learning_rate": 1.6038302591975807e-08, "logits/chosen": -2.971940755844116, "logits/rejected": -2.9643890857696533, "logps/chosen": -20.724720001220703, "logps/rejected": -28.788055419921875, "loss": 0.0551, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.707993984222412, "rewards/margins": 5.004820823669434, "rewards/rejected": -1.2968261241912842, "step": 1490 }, { "epoch": 3.9, "grad_norm": 1.203125, "learning_rate": 1.0268470356514237e-08, "logits/chosen": -2.888521194458008, "logits/rejected": -2.901872158050537, "logps/chosen": -23.54452133178711, "logps/rejected": -31.10415267944336, "loss": 0.0416, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.892826557159424, "rewards/margins": 5.418337821960449, "rewards/rejected": -1.525511384010315, "step": 1500 }, { "epoch": 3.9, "eval_logits/chosen": -2.8337180614471436, "eval_logits/rejected": -2.830277919769287, "eval_logps/chosen": -31.692487716674805, "eval_logps/rejected": -35.43448257446289, "eval_loss": 0.47854000329971313, "eval_rewards/accuracies": 0.5631229281425476, "eval_rewards/chosen": -0.2460203766822815, "eval_rewards/margins": 0.19577398896217346, "eval_rewards/rejected": -0.44179436564445496, "eval_runtime": 112.8174, "eval_samples_per_second": 3.04, "eval_steps_per_second": 0.381, "step": 1500 }, { "epoch": 3.92, "grad_norm": 0.6484375, "learning_rate": 5.777746105209147e-09, "logits/chosen": -3.0734591484069824, "logits/rejected": -3.082270860671997, "logps/chosen": -26.544811248779297, "logps/rejected": -34.36488723754883, "loss": 0.0576, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.7020084857940674, "rewards/margins": 4.973893165588379, "rewards/rejected": -1.271884560585022, "step": 1510 }, { "epoch": 3.95, "grad_norm": 2.390625, "learning_rate": 2.5684369628148352e-09, "logits/chosen": -2.895297050476074, "logits/rejected": -2.900613784790039, "logps/chosen": -22.58160400390625, "logps/rejected": -32.83278274536133, "loss": 0.0541, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 3.7639541625976562, "rewards/margins": 5.176316261291504, "rewards/rejected": -1.4123618602752686, "step": 1520 }, { "epoch": 3.97, "grad_norm": 0.73046875, "learning_rate": 6.421917227455999e-10, "logits/chosen": -2.971235990524292, "logits/rejected": -2.9728827476501465, "logps/chosen": -19.740856170654297, "logps/rejected": -28.33868408203125, "loss": 0.0461, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.6634414196014404, "rewards/margins": 4.870739936828613, "rewards/rejected": -1.2072982788085938, "step": 1530 }, { "epoch": 4.0, "grad_norm": 1.734375, "learning_rate": 0.0, "logits/chosen": -2.88759446144104, "logits/rejected": -2.8974978923797607, "logps/chosen": -24.240150451660156, "logps/rejected": -35.22872543334961, "loss": 0.0434, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.8821120262145996, "rewards/margins": 5.640433311462402, "rewards/rejected": -1.758321762084961, "step": 1540 }, { "epoch": 4.0, "step": 1540, "total_flos": 0.0, "train_loss": 0.15808662716057392, "train_runtime": 11226.5517, "train_samples_per_second": 1.097, "train_steps_per_second": 0.137 } ], "logging_steps": 10, "max_steps": 1540, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }