{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 100, "global_step": 2208, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": 0.8079685568809509, "debug/policy_chosen_logps": -5.034485816955566, "debug/policy_rejected_logits": 0.6268295645713806, "debug/policy_rejected_logps": -2.0584616661071777, "debug/reference_chosen_logps": -5.034485816955566, "debug/reference_rejected_logps": -2.0584616661071777, "epoch": 0.0036231884057971015, "grad_norm": 118.50866867312655, "learning_rate": 1e-09, "logits/chosen": 0.8079685568809509, "logits/rejected": 0.6268295645713806, "logps/chosen": -5.034485816955566, "logps/rejected": -2.0584616661071777, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": 1.131899118423462, "debug/policy_chosen_logps": -5.150993347167969, "debug/policy_rejected_logits": 1.3698731660842896, "debug/policy_rejected_logps": -47.780250549316406, "debug/reference_chosen_logps": -5.141074180603027, "debug/reference_rejected_logps": -47.71269607543945, "epoch": 0.018115942028985508, "grad_norm": 175.6192907924916, "learning_rate": 5e-09, "logits/chosen": 1.131899118423462, "logits/rejected": 1.3698731660842896, "logps/chosen": -5.150993347167969, "logps/rejected": -47.780250549316406, "loss": 0.7053, "rewards/accuracies": 0.125, "rewards/chosen": -0.0049594249576330185, "rewards/margins": 0.028817227110266685, "rewards/rejected": -0.03377665579319, "step": 5 }, { "debug/policy_chosen_logits": 1.3391051292419434, "debug/policy_chosen_logps": -37.599971771240234, "debug/policy_rejected_logits": 1.6075389385223389, "debug/policy_rejected_logps": -8.87445068359375, "debug/reference_chosen_logps": -37.657630920410156, "debug/reference_rejected_logps": -8.866057395935059, "epoch": 0.036231884057971016, "grad_norm": 69.22118779858911, "learning_rate": 1e-08, "logits/chosen": 1.3391051292419434, "logits/rejected": 1.6075389385223389, "logps/chosen": -37.599971771240234, "logps/rejected": -8.87445068359375, "loss": 0.7058, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.028827469795942307, "rewards/margins": 0.03302405774593353, "rewards/rejected": -0.004196587018668652, "step": 10 }, { "debug/policy_chosen_logits": 1.2805339097976685, "debug/policy_chosen_logps": -7.345069885253906, "debug/policy_rejected_logits": 1.636499047279358, "debug/policy_rejected_logps": -5.37490701675415, "debug/reference_chosen_logps": -7.438336372375488, "debug/reference_rejected_logps": -5.3998122215271, "epoch": 0.05434782608695652, "grad_norm": 44.27540031985504, "learning_rate": 1.5e-08, "logits/chosen": 1.2805339097976685, "logits/rejected": 1.636499047279358, "logps/chosen": -7.345069885253906, "logps/rejected": -5.37490701675415, "loss": 0.7201, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.04663294553756714, "rewards/margins": 0.0341799296438694, "rewards/rejected": 0.012453016825020313, "step": 15 }, { "debug/policy_chosen_logits": 1.2522757053375244, "debug/policy_chosen_logps": -4.2000932693481445, "debug/policy_rejected_logits": 1.598736047744751, "debug/policy_rejected_logps": -88.78929138183594, "debug/reference_chosen_logps": -4.198297023773193, "debug/reference_rejected_logps": -88.761962890625, "epoch": 0.07246376811594203, "grad_norm": 53.407424491790465, "learning_rate": 2e-08, "logits/chosen": 1.2522757053375244, "logits/rejected": 1.598736047744751, "logps/chosen": -4.2000932693481445, "logps/rejected": -88.78929138183594, "loss": 0.7338, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.0008979484555311501, "rewards/margins": 0.012763584032654762, "rewards/rejected": -0.013661530800163746, "step": 20 }, { "debug/policy_chosen_logits": 1.3289144039154053, "debug/policy_chosen_logps": -208.35336303710938, "debug/policy_rejected_logits": 1.7015018463134766, "debug/policy_rejected_logps": -267.64886474609375, "debug/reference_chosen_logps": -208.846923828125, "debug/reference_rejected_logps": -267.5577697753906, "epoch": 0.09057971014492754, "grad_norm": 46.08877968346987, "learning_rate": 2.5e-08, "logits/chosen": 1.3289144039154053, "logits/rejected": 1.7015018463134766, "logps/chosen": -208.35336303710938, "logps/rejected": -267.64886474609375, "loss": 0.6779, "rewards/accuracies": 0.375, "rewards/chosen": 0.24678325653076172, "rewards/margins": 0.2923273742198944, "rewards/rejected": -0.04554413631558418, "step": 25 }, { "debug/policy_chosen_logits": 1.496201992034912, "debug/policy_chosen_logps": -61.36063766479492, "debug/policy_rejected_logits": 1.8403695821762085, "debug/policy_rejected_logps": -5.367673397064209, "debug/reference_chosen_logps": -61.356834411621094, "debug/reference_rejected_logps": -5.380406379699707, "epoch": 0.10869565217391304, "grad_norm": 102.1474816795724, "learning_rate": 3e-08, "logits/chosen": 1.496201992034912, "logits/rejected": 1.8403695821762085, "logps/chosen": -61.36063766479492, "logps/rejected": -5.367673397064209, "loss": 0.6918, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.0019006371730938554, "rewards/margins": -0.008266950026154518, "rewards/rejected": 0.006366312503814697, "step": 30 }, { "debug/policy_chosen_logits": 1.5898544788360596, "debug/policy_chosen_logps": -67.1188735961914, "debug/policy_rejected_logits": 1.9144293069839478, "debug/policy_rejected_logps": -7.447409152984619, "debug/reference_chosen_logps": -67.15159606933594, "debug/reference_rejected_logps": -7.461578369140625, "epoch": 0.12681159420289856, "grad_norm": 223.8053306345378, "learning_rate": 3.4999999999999996e-08, "logits/chosen": 1.5898544788360596, "logits/rejected": 1.9144293069839478, "logps/chosen": -67.1188735961914, "logps/rejected": -7.447409152984619, "loss": 0.6929, "rewards/accuracies": 0.375, "rewards/chosen": 0.016367986798286438, "rewards/margins": 0.009283947758376598, "rewards/rejected": 0.00708403903990984, "step": 35 }, { "debug/policy_chosen_logits": 1.4932410717010498, "debug/policy_chosen_logps": -90.79304504394531, "debug/policy_rejected_logits": 1.819014549255371, "debug/policy_rejected_logps": -80.99085998535156, "debug/reference_chosen_logps": -90.82229614257812, "debug/reference_rejected_logps": -81.13166809082031, "epoch": 0.14492753623188406, "grad_norm": 200.4244323839049, "learning_rate": 4e-08, "logits/chosen": 1.4932410717010498, "logits/rejected": 1.819014549255371, "logps/chosen": -90.79304504394531, "logps/rejected": -80.99085998535156, "loss": 0.7192, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.014629915356636047, "rewards/margins": -0.05577345937490463, "rewards/rejected": 0.07040338218212128, "step": 40 }, { "debug/policy_chosen_logits": 1.446497917175293, "debug/policy_chosen_logps": -43.30194091796875, "debug/policy_rejected_logits": 1.6278880834579468, "debug/policy_rejected_logps": -7.595385551452637, "debug/reference_chosen_logps": -43.420326232910156, "debug/reference_rejected_logps": -7.612727165222168, "epoch": 0.16304347826086957, "grad_norm": 313.62324907533645, "learning_rate": 4.5e-08, "logits/chosen": 1.446497917175293, "logits/rejected": 1.6278880834579468, "logps/chosen": -43.30194091796875, "logps/rejected": -7.595385551452637, "loss": 0.6923, "rewards/accuracies": 0.375, "rewards/chosen": 0.05919359251856804, "rewards/margins": 0.050522781908512115, "rewards/rejected": 0.008670812472701073, "step": 45 }, { "debug/policy_chosen_logits": 1.3541991710662842, "debug/policy_chosen_logps": -88.19208526611328, "debug/policy_rejected_logits": 1.7921403646469116, "debug/policy_rejected_logps": -3.398169755935669, "debug/reference_chosen_logps": -88.64862823486328, "debug/reference_rejected_logps": -3.3678181171417236, "epoch": 0.18115942028985507, "grad_norm": 159.2874904795196, "learning_rate": 5e-08, "logits/chosen": 1.3541991710662842, "logits/rejected": 1.7921403646469116, "logps/chosen": -88.19208526611328, "logps/rejected": -3.398169755935669, "loss": 0.6975, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.2282753884792328, "rewards/margins": 0.24345119297504425, "rewards/rejected": -0.015175792388617992, "step": 50 }, { "debug/policy_chosen_logits": 1.3330062627792358, "debug/policy_chosen_logps": -11.84801197052002, "debug/policy_rejected_logits": 1.8746674060821533, "debug/policy_rejected_logps": -3.4664337635040283, "debug/reference_chosen_logps": -11.87784194946289, "debug/reference_rejected_logps": -3.4915835857391357, "epoch": 0.19927536231884058, "grad_norm": 56.1991536597922, "learning_rate": 5.5e-08, "logits/chosen": 1.3330062627792358, "logits/rejected": 1.8746674060821533, "logps/chosen": -11.84801197052002, "logps/rejected": -3.4664337635040283, "loss": 0.6965, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.014915320090949535, "rewards/margins": 0.002340298844501376, "rewards/rejected": 0.012575021013617516, "step": 55 }, { "debug/policy_chosen_logits": 1.5304359197616577, "debug/policy_chosen_logps": -4.556481838226318, "debug/policy_rejected_logits": 1.8311818838119507, "debug/policy_rejected_logps": -7.999899864196777, "debug/reference_chosen_logps": -4.53155517578125, "debug/reference_rejected_logps": -8.040096282958984, "epoch": 0.21739130434782608, "grad_norm": 284.9058064479617, "learning_rate": 6e-08, "logits/chosen": 1.5304359197616577, "logits/rejected": 1.8311818838119507, "logps/chosen": -4.556481838226318, "logps/rejected": -7.999899864196777, "loss": 0.7528, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.012463441118597984, "rewards/margins": -0.032561782747507095, "rewards/rejected": 0.020098339766263962, "step": 60 }, { "debug/policy_chosen_logits": 1.486358880996704, "debug/policy_chosen_logps": -133.09585571289062, "debug/policy_rejected_logits": 1.8627755641937256, "debug/policy_rejected_logps": -12.550134658813477, "debug/reference_chosen_logps": -133.0397491455078, "debug/reference_rejected_logps": -12.575475692749023, "epoch": 0.23550724637681159, "grad_norm": 43.20761255014075, "learning_rate": 6.5e-08, "logits/chosen": 1.486358880996704, "logits/rejected": 1.8627755641937256, "logps/chosen": -133.09585571289062, "logps/rejected": -12.550134658813477, "loss": 0.7002, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.028051769360899925, "rewards/margins": -0.04072146862745285, "rewards/rejected": 0.012669695541262627, "step": 65 }, { "debug/policy_chosen_logits": 1.6480764150619507, "debug/policy_chosen_logps": -7.052374839782715, "debug/policy_rejected_logits": 1.6341689825057983, "debug/policy_rejected_logps": -95.78614807128906, "debug/reference_chosen_logps": -7.0487470626831055, "debug/reference_rejected_logps": -95.66188049316406, "epoch": 0.2536231884057971, "grad_norm": 48.09046379061911, "learning_rate": 6.999999999999999e-08, "logits/chosen": 1.6480764150619507, "logits/rejected": 1.6341689825057983, "logps/chosen": -7.052374839782715, "logps/rejected": -95.78614807128906, "loss": 0.6899, "rewards/accuracies": 0.25, "rewards/chosen": -0.001814147806726396, "rewards/margins": 0.060319315642118454, "rewards/rejected": -0.06213346868753433, "step": 70 }, { "debug/policy_chosen_logits": 1.405449628829956, "debug/policy_chosen_logps": -122.58160400390625, "debug/policy_rejected_logits": 1.7929956912994385, "debug/policy_rejected_logps": -41.420372009277344, "debug/reference_chosen_logps": -122.72642517089844, "debug/reference_rejected_logps": -41.365840911865234, "epoch": 0.2717391304347826, "grad_norm": 732.069178943941, "learning_rate": 7.5e-08, "logits/chosen": 1.405449628829956, "logits/rejected": 1.7929956912994385, "logps/chosen": -122.58160400390625, "logps/rejected": -41.420372009277344, "loss": 0.7176, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.07241252809762955, "rewards/margins": 0.09967795759439468, "rewards/rejected": -0.027265435084700584, "step": 75 }, { "debug/policy_chosen_logits": 1.3693842887878418, "debug/policy_chosen_logps": -5.879665374755859, "debug/policy_rejected_logits": 1.7491607666015625, "debug/policy_rejected_logps": -4.8096513748168945, "debug/reference_chosen_logps": -5.829151630401611, "debug/reference_rejected_logps": -4.762129306793213, "epoch": 0.2898550724637681, "grad_norm": 46.50217415864538, "learning_rate": 8e-08, "logits/chosen": 1.3693842887878418, "logits/rejected": 1.7491607666015625, "logps/chosen": -5.879665374755859, "logps/rejected": -4.8096513748168945, "loss": 0.6944, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.025256846100091934, "rewards/margins": -0.00149595330003649, "rewards/rejected": -0.02376089058816433, "step": 80 }, { "debug/policy_chosen_logits": 1.5072828531265259, "debug/policy_chosen_logps": -64.70248413085938, "debug/policy_rejected_logits": 2.131293773651123, "debug/policy_rejected_logps": -7.926405429840088, "debug/reference_chosen_logps": -64.85469055175781, "debug/reference_rejected_logps": -7.927921295166016, "epoch": 0.3079710144927536, "grad_norm": 122.89270429487259, "learning_rate": 8.5e-08, "logits/chosen": 1.5072828531265259, "logits/rejected": 2.131293773651123, "logps/chosen": -64.70248413085938, "logps/rejected": -7.926405429840088, "loss": 0.6919, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.07610483467578888, "rewards/margins": 0.0753467008471489, "rewards/rejected": 0.0007581293466500938, "step": 85 }, { "debug/policy_chosen_logits": 1.246946096420288, "debug/policy_chosen_logps": -122.2060546875, "debug/policy_rejected_logits": 1.4303045272827148, "debug/policy_rejected_logps": -7.572357177734375, "debug/reference_chosen_logps": -122.35246276855469, "debug/reference_rejected_logps": -7.5182623863220215, "epoch": 0.32608695652173914, "grad_norm": 37.796949435252394, "learning_rate": 9e-08, "logits/chosen": 1.246946096420288, "logits/rejected": 1.4303045272827148, "logps/chosen": -122.2060546875, "logps/rejected": -7.572357177734375, "loss": 0.6851, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.07321014255285263, "rewards/margins": 0.10025773197412491, "rewards/rejected": -0.02704758569598198, "step": 90 }, { "debug/policy_chosen_logits": 1.4417003393173218, "debug/policy_chosen_logps": -149.5094757080078, "debug/policy_rejected_logits": 1.7644884586334229, "debug/policy_rejected_logps": -8.178075790405273, "debug/reference_chosen_logps": -149.43124389648438, "debug/reference_rejected_logps": -8.184698104858398, "epoch": 0.3442028985507246, "grad_norm": 37.42875850861949, "learning_rate": 9.499999999999999e-08, "logits/chosen": 1.4417003393173218, "logits/rejected": 1.7644884586334229, "logps/chosen": -149.5094757080078, "logps/rejected": -8.178075790405273, "loss": 0.7104, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.03911998122930527, "rewards/margins": -0.04243061691522598, "rewards/rejected": 0.0033106356859207153, "step": 95 }, { "debug/policy_chosen_logits": 1.2033050060272217, "debug/policy_chosen_logps": -6.482167720794678, "debug/policy_rejected_logits": 1.8862864971160889, "debug/policy_rejected_logps": -1.8354628086090088, "debug/reference_chosen_logps": -6.515327453613281, "debug/reference_rejected_logps": -1.8118356466293335, "epoch": 0.36231884057971014, "grad_norm": 190.5019169051825, "learning_rate": 1e-07, "logits/chosen": 1.2033050060272217, "logits/rejected": 1.8862864971160889, "logps/chosen": -6.482167720794678, "logps/rejected": -1.8354628086090088, "loss": 0.6965, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.016580190509557724, "rewards/margins": 0.028393691405653954, "rewards/rejected": -0.01181350089609623, "step": 100 }, { "epoch": 0.36231884057971014, "eval_debug/policy_chosen_logits": 1.6604009866714478, "eval_debug/policy_chosen_logps": -122.82534790039062, "eval_debug/policy_rejected_logits": 1.7215158939361572, "eval_debug/policy_rejected_logps": -63.74076843261719, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6604009866714478, "eval_logits/rejected": 1.7215158939361572, "eval_logps/chosen": -122.82534790039062, "eval_logps/rejected": -63.74076843261719, "eval_loss": 0.6847904324531555, "eval_rewards/accuracies": 0.28947368264198303, "eval_rewards/chosen": 0.16135363280773163, "eval_rewards/margins": 0.08820728957653046, "eval_rewards/rejected": 0.07314635068178177, "eval_runtime": 28.7197, "eval_samples_per_second": 20.892, "eval_steps_per_second": 0.662, "step": 100 }, { "debug/policy_chosen_logits": 1.129212737083435, "debug/policy_chosen_logps": -5.465607643127441, "debug/policy_rejected_logits": 1.2478989362716675, "debug/policy_rejected_logps": -100.41439819335938, "debug/reference_chosen_logps": -5.478997230529785, "debug/reference_rejected_logps": -100.13005828857422, "epoch": 0.3804347826086957, "grad_norm": 872.8316211149054, "learning_rate": 9.97628083491461e-08, "logits/chosen": 1.129212737083435, "logits/rejected": 1.2478989362716675, "logps/chosen": -5.465607643127441, "logps/rejected": -100.41439819335938, "loss": 0.7151, "rewards/accuracies": 0.25, "rewards/chosen": 0.006695074029266834, "rewards/margins": 0.14886613190174103, "rewards/rejected": -0.14217105507850647, "step": 105 }, { "debug/policy_chosen_logits": 1.2888004779815674, "debug/policy_chosen_logps": -69.70719909667969, "debug/policy_rejected_logits": 1.5768632888793945, "debug/policy_rejected_logps": -6.7308454513549805, "debug/reference_chosen_logps": -70.15248107910156, "debug/reference_rejected_logps": -6.677098274230957, "epoch": 0.39855072463768115, "grad_norm": 36.900362994892625, "learning_rate": 9.952561669829221e-08, "logits/chosen": 1.2888004779815674, "logits/rejected": 1.5768632888793945, "logps/chosen": -69.70719909667969, "logps/rejected": -6.7308454513549805, "loss": 0.6744, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.22263307869434357, "rewards/margins": 0.24950680136680603, "rewards/rejected": -0.02687370777130127, "step": 110 }, { "debug/policy_chosen_logits": 1.5455763339996338, "debug/policy_chosen_logps": -73.1484375, "debug/policy_rejected_logits": 1.9271223545074463, "debug/policy_rejected_logps": -17.94552993774414, "debug/reference_chosen_logps": -73.14714050292969, "debug/reference_rejected_logps": -17.959739685058594, "epoch": 0.4166666666666667, "grad_norm": 42.96576617802578, "learning_rate": 9.928842504743833e-08, "logits/chosen": 1.5455763339996338, "logits/rejected": 1.9271223545074463, "logps/chosen": -73.1484375, "logps/rejected": -17.94552993774414, "loss": 0.7542, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.000652603805065155, "rewards/margins": -0.007757800631225109, "rewards/rejected": 0.007105196826159954, "step": 115 }, { "debug/policy_chosen_logits": 1.331587553024292, "debug/policy_chosen_logps": -47.55541229248047, "debug/policy_rejected_logits": 1.6876602172851562, "debug/policy_rejected_logps": -2.451244592666626, "debug/reference_chosen_logps": -47.64037322998047, "debug/reference_rejected_logps": -2.4745194911956787, "epoch": 0.43478260869565216, "grad_norm": 74.51417012747747, "learning_rate": 9.905123339658443e-08, "logits/chosen": 1.331587553024292, "logits/rejected": 1.6876602172851562, "logps/chosen": -47.55541229248047, "logps/rejected": -2.451244592666626, "loss": 0.7, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": 0.042478710412979126, "rewards/margins": 0.030841302126646042, "rewards/rejected": 0.01163740735501051, "step": 120 }, { "debug/policy_chosen_logits": 1.4097095727920532, "debug/policy_chosen_logps": -6.3295207023620605, "debug/policy_rejected_logits": 1.6506379842758179, "debug/policy_rejected_logps": -58.54729461669922, "debug/reference_chosen_logps": -6.291218280792236, "debug/reference_rejected_logps": -58.57655715942383, "epoch": 0.4528985507246377, "grad_norm": 92.66385681106617, "learning_rate": 9.881404174573055e-08, "logits/chosen": 1.4097095727920532, "logits/rejected": 1.6506379842758179, "logps/chosen": -6.3295207023620605, "logps/rejected": -58.54729461669922, "loss": 0.7034, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.019150998443365097, "rewards/margins": -0.03378099948167801, "rewards/rejected": 0.014630001969635487, "step": 125 }, { "debug/policy_chosen_logits": 1.1956632137298584, "debug/policy_chosen_logps": -4.7902512550354, "debug/policy_rejected_logits": 1.6222461462020874, "debug/policy_rejected_logps": -7.702892303466797, "debug/reference_chosen_logps": -4.811456203460693, "debug/reference_rejected_logps": -7.753373622894287, "epoch": 0.47101449275362317, "grad_norm": 52.775474562516436, "learning_rate": 9.857685009487665e-08, "logits/chosen": 1.1956632137298584, "logits/rejected": 1.6222461462020874, "logps/chosen": -4.7902512550354, "logps/rejected": -7.702892303466797, "loss": 0.6857, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.010602576658129692, "rewards/margins": -0.01463851798325777, "rewards/rejected": 0.025241095572710037, "step": 130 }, { "debug/policy_chosen_logits": 1.3353550434112549, "debug/policy_chosen_logps": -7.6248626708984375, "debug/policy_rejected_logits": 1.7967023849487305, "debug/policy_rejected_logps": -45.980072021484375, "debug/reference_chosen_logps": -7.610405921936035, "debug/reference_rejected_logps": -46.041690826416016, "epoch": 0.4891304347826087, "grad_norm": 65.22552646407442, "learning_rate": 9.833965844402277e-08, "logits/chosen": 1.3353550434112549, "logits/rejected": 1.7967023849487305, "logps/chosen": -7.6248626708984375, "logps/rejected": -45.980072021484375, "loss": 0.7197, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.007228744216263294, "rewards/margins": -0.03803950548171997, "rewards/rejected": 0.03081076219677925, "step": 135 }, { "debug/policy_chosen_logits": 1.713627576828003, "debug/policy_chosen_logps": -10.879376411437988, "debug/policy_rejected_logits": 2.008789539337158, "debug/policy_rejected_logps": -1.9460251331329346, "debug/reference_chosen_logps": -10.920201301574707, "debug/reference_rejected_logps": -1.9509022235870361, "epoch": 0.5072463768115942, "grad_norm": 529.2040554388126, "learning_rate": 9.810246679316887e-08, "logits/chosen": 1.713627576828003, "logits/rejected": 2.008789539337158, "logps/chosen": -10.879376411437988, "logps/rejected": -1.9460251331329346, "loss": 0.7262, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.020412564277648926, "rewards/margins": 0.017973994836211205, "rewards/rejected": 0.002438568277284503, "step": 140 }, { "debug/policy_chosen_logits": 1.2310256958007812, "debug/policy_chosen_logps": -227.7897491455078, "debug/policy_rejected_logits": 1.555720329284668, "debug/policy_rejected_logps": -5.876959800720215, "debug/reference_chosen_logps": -228.1366729736328, "debug/reference_rejected_logps": -5.889040946960449, "epoch": 0.5253623188405797, "grad_norm": 236.73352158678884, "learning_rate": 9.786527514231498e-08, "logits/chosen": 1.2310256958007812, "logits/rejected": 1.555720329284668, "logps/chosen": -227.7897491455078, "logps/rejected": -5.876959800720215, "loss": 0.7378, "rewards/accuracies": 0.375, "rewards/chosen": 0.17345476150512695, "rewards/margins": 0.1674138605594635, "rewards/rejected": 0.006040886044502258, "step": 145 }, { "debug/policy_chosen_logits": 1.595587968826294, "debug/policy_chosen_logps": -196.78579711914062, "debug/policy_rejected_logits": 2.0610172748565674, "debug/policy_rejected_logps": -70.01274108886719, "debug/reference_chosen_logps": -198.35165405273438, "debug/reference_rejected_logps": -70.21018981933594, "epoch": 0.5434782608695652, "grad_norm": 55.01605119997572, "learning_rate": 9.76280834914611e-08, "logits/chosen": 1.595587968826294, "logits/rejected": 2.0610172748565674, "logps/chosen": -196.78579711914062, "logps/rejected": -70.01274108886719, "loss": 0.7071, "rewards/accuracies": 0.25, "rewards/chosen": 0.7829268574714661, "rewards/margins": 0.6841999292373657, "rewards/rejected": 0.09872697293758392, "step": 150 }, { "debug/policy_chosen_logits": 1.3722137212753296, "debug/policy_chosen_logps": -188.15171813964844, "debug/policy_rejected_logits": 1.7261148691177368, "debug/policy_rejected_logps": -11.316301345825195, "debug/reference_chosen_logps": -188.67599487304688, "debug/reference_rejected_logps": -11.309015274047852, "epoch": 0.5615942028985508, "grad_norm": 83.60951340920663, "learning_rate": 9.73908918406072e-08, "logits/chosen": 1.3722137212753296, "logits/rejected": 1.7261148691177368, "logps/chosen": -188.15171813964844, "logps/rejected": -11.316301345825195, "loss": 0.6875, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.262137770652771, "rewards/margins": 0.2657802700996399, "rewards/rejected": -0.0036424933932721615, "step": 155 }, { "debug/policy_chosen_logits": 1.1060752868652344, "debug/policy_chosen_logps": -41.36880111694336, "debug/policy_rejected_logits": 1.57027268409729, "debug/policy_rejected_logps": -7.269781589508057, "debug/reference_chosen_logps": -41.46500778198242, "debug/reference_rejected_logps": -7.236231803894043, "epoch": 0.5797101449275363, "grad_norm": 33.19604528563128, "learning_rate": 9.715370018975332e-08, "logits/chosen": 1.1060752868652344, "logits/rejected": 1.57027268409729, "logps/chosen": -41.36880111694336, "logps/rejected": -7.269781589508057, "loss": 0.6971, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.048101745545864105, "rewards/margins": 0.06487661600112915, "rewards/rejected": -0.016774868592619896, "step": 160 }, { "debug/policy_chosen_logits": 1.6114082336425781, "debug/policy_chosen_logps": -47.111656188964844, "debug/policy_rejected_logits": 1.4320528507232666, "debug/policy_rejected_logps": -26.484411239624023, "debug/reference_chosen_logps": -47.207061767578125, "debug/reference_rejected_logps": -26.676687240600586, "epoch": 0.5978260869565217, "grad_norm": 72.49400066520543, "learning_rate": 9.691650853889942e-08, "logits/chosen": 1.6114082336425781, "logits/rejected": 1.4320528507232666, "logps/chosen": -47.111656188964844, "logps/rejected": -26.484411239624023, "loss": 0.7271, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.04770234227180481, "rewards/margins": -0.04843612387776375, "rewards/rejected": 0.09613846242427826, "step": 165 }, { "debug/policy_chosen_logits": 1.2819362878799438, "debug/policy_chosen_logps": -50.41461944580078, "debug/policy_rejected_logits": 1.592810869216919, "debug/policy_rejected_logps": -6.382050514221191, "debug/reference_chosen_logps": -49.824684143066406, "debug/reference_rejected_logps": -6.327885627746582, "epoch": 0.6159420289855072, "grad_norm": 502.02145890304405, "learning_rate": 9.667931688804554e-08, "logits/chosen": 1.2819362878799438, "logits/rejected": 1.592810869216919, "logps/chosen": -50.41461944580078, "logps/rejected": -6.382050514221191, "loss": 0.8875, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.2949669361114502, "rewards/margins": -0.26788461208343506, "rewards/rejected": -0.02708229422569275, "step": 170 }, { "debug/policy_chosen_logits": 1.4071213006973267, "debug/policy_chosen_logps": -11.66738510131836, "debug/policy_rejected_logits": 1.8494924306869507, "debug/policy_rejected_logps": -113.35693359375, "debug/reference_chosen_logps": -11.554305076599121, "debug/reference_rejected_logps": -113.70204162597656, "epoch": 0.6340579710144928, "grad_norm": 314.2762190508336, "learning_rate": 9.644212523719165e-08, "logits/chosen": 1.4071213006973267, "logits/rejected": 1.8494924306869507, "logps/chosen": -11.66738510131836, "logps/rejected": -113.35693359375, "loss": 0.845, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.056540071964263916, "rewards/margins": -0.22908970713615417, "rewards/rejected": 0.17254965007305145, "step": 175 }, { "debug/policy_chosen_logits": 1.2379229068756104, "debug/policy_chosen_logps": -6.554471015930176, "debug/policy_rejected_logits": 1.6064598560333252, "debug/policy_rejected_logps": -11.22156047821045, "debug/reference_chosen_logps": -6.455367088317871, "debug/reference_rejected_logps": -11.164003372192383, "epoch": 0.6521739130434783, "grad_norm": 474.8677886823652, "learning_rate": 9.620493358633775e-08, "logits/chosen": 1.2379229068756104, "logits/rejected": 1.6064598560333252, "logps/chosen": -6.554471015930176, "logps/rejected": -11.22156047821045, "loss": 0.7054, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.04955184832215309, "rewards/margins": -0.020772721618413925, "rewards/rejected": -0.028779124841094017, "step": 180 }, { "debug/policy_chosen_logits": 1.2265291213989258, "debug/policy_chosen_logps": -7.861398220062256, "debug/policy_rejected_logits": 1.6796070337295532, "debug/policy_rejected_logps": -15.574020385742188, "debug/reference_chosen_logps": -7.793820858001709, "debug/reference_rejected_logps": -15.452303886413574, "epoch": 0.6702898550724637, "grad_norm": 57.714314187125964, "learning_rate": 9.596774193548388e-08, "logits/chosen": 1.2265291213989258, "logits/rejected": 1.6796070337295532, "logps/chosen": -7.861398220062256, "logps/rejected": -15.574020385742188, "loss": 0.7027, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.03378872200846672, "rewards/margins": 0.027070069685578346, "rewards/rejected": -0.060858793556690216, "step": 185 }, { "debug/policy_chosen_logits": 1.2329134941101074, "debug/policy_chosen_logps": -55.34379196166992, "debug/policy_rejected_logits": 1.30684494972229, "debug/policy_rejected_logps": -3.619515895843506, "debug/reference_chosen_logps": -55.711204528808594, "debug/reference_rejected_logps": -3.6667861938476562, "epoch": 0.6884057971014492, "grad_norm": 30.498840212087302, "learning_rate": 9.573055028462997e-08, "logits/chosen": 1.2329134941101074, "logits/rejected": 1.30684494972229, "logps/chosen": -55.34379196166992, "logps/rejected": -3.619515895843506, "loss": 0.7574, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.1837066113948822, "rewards/margins": 0.16007152199745178, "rewards/rejected": 0.023635083809494972, "step": 190 }, { "debug/policy_chosen_logits": 1.7879149913787842, "debug/policy_chosen_logps": -100.79790496826172, "debug/policy_rejected_logits": 2.1582083702087402, "debug/policy_rejected_logps": -83.45491790771484, "debug/reference_chosen_logps": -101.08064270019531, "debug/reference_rejected_logps": -83.28086853027344, "epoch": 0.7065217391304348, "grad_norm": 56.125196435822126, "learning_rate": 9.549335863377609e-08, "logits/chosen": 1.7879149913787842, "logits/rejected": 2.1582083702087402, "logps/chosen": -100.79790496826172, "logps/rejected": -83.45491790771484, "loss": 0.7087, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.14136560261249542, "rewards/margins": 0.22838811576366425, "rewards/rejected": -0.08702252060174942, "step": 195 }, { "debug/policy_chosen_logits": 1.245687484741211, "debug/policy_chosen_logps": -4.824423313140869, "debug/policy_rejected_logits": 1.5642822980880737, "debug/policy_rejected_logps": -52.92164993286133, "debug/reference_chosen_logps": -4.822422027587891, "debug/reference_rejected_logps": -53.10357666015625, "epoch": 0.7246376811594203, "grad_norm": 125.87588383424739, "learning_rate": 9.525616698292219e-08, "logits/chosen": 1.245687484741211, "logits/rejected": 1.5642822980880737, "logps/chosen": -4.824423313140869, "logps/rejected": -52.92164993286133, "loss": 0.7398, "rewards/accuracies": 0.25, "rewards/chosen": -0.001000724732875824, "rewards/margins": -0.0919642522931099, "rewards/rejected": 0.09096352756023407, "step": 200 }, { "epoch": 0.7246376811594203, "eval_debug/policy_chosen_logits": 1.651292324066162, "eval_debug/policy_chosen_logps": -122.15211486816406, "eval_debug/policy_rejected_logits": 1.7104647159576416, "eval_debug/policy_rejected_logps": -63.662506103515625, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.651292324066162, "eval_logits/rejected": 1.7104647159576416, "eval_logps/chosen": -122.15211486816406, "eval_logps/rejected": -63.662506103515625, "eval_loss": 0.7128049731254578, "eval_rewards/accuracies": 0.32894736528396606, "eval_rewards/chosen": 0.4979678690433502, "eval_rewards/margins": 0.38569140434265137, "eval_rewards/rejected": 0.11227651685476303, "eval_runtime": 28.5192, "eval_samples_per_second": 21.038, "eval_steps_per_second": 0.666, "step": 200 }, { "debug/policy_chosen_logits": 1.3564214706420898, "debug/policy_chosen_logps": -4.859026908874512, "debug/policy_rejected_logits": 1.701321005821228, "debug/policy_rejected_logps": -85.96968078613281, "debug/reference_chosen_logps": -4.833613872528076, "debug/reference_rejected_logps": -85.68836212158203, "epoch": 0.7427536231884058, "grad_norm": 187.4770935806715, "learning_rate": 9.501897533206831e-08, "logits/chosen": 1.3564214706420898, "logits/rejected": 1.701321005821228, "logps/chosen": -4.859026908874512, "logps/rejected": -85.96968078613281, "loss": 0.715, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.012706214562058449, "rewards/margins": 0.12795329093933105, "rewards/rejected": -0.14065949618816376, "step": 205 }, { "debug/policy_chosen_logits": 1.188867211341858, "debug/policy_chosen_logps": -63.4585075378418, "debug/policy_rejected_logits": 1.4904444217681885, "debug/policy_rejected_logps": -45.49162673950195, "debug/reference_chosen_logps": -63.378448486328125, "debug/reference_rejected_logps": -45.54474639892578, "epoch": 0.7608695652173914, "grad_norm": 186.05283960282506, "learning_rate": 9.478178368121442e-08, "logits/chosen": 1.188867211341858, "logits/rejected": 1.4904444217681885, "logps/chosen": -63.4585075378418, "logps/rejected": -45.49162673950195, "loss": 0.6978, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": -0.04002813994884491, "rewards/margins": -0.06658516079187393, "rewards/rejected": 0.02655702270567417, "step": 210 }, { "debug/policy_chosen_logits": 1.4695371389389038, "debug/policy_chosen_logps": -46.51490020751953, "debug/policy_rejected_logits": 1.8625717163085938, "debug/policy_rejected_logps": -7.945758819580078, "debug/reference_chosen_logps": -46.571571350097656, "debug/reference_rejected_logps": -7.923023223876953, "epoch": 0.7789855072463768, "grad_norm": 65.86153267046119, "learning_rate": 9.454459203036053e-08, "logits/chosen": 1.4695371389389038, "logits/rejected": 1.8625717163085938, "logps/chosen": -46.51490020751953, "logps/rejected": -7.945758819580078, "loss": 0.7128, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.028337795287370682, "rewards/margins": 0.039705730974674225, "rewards/rejected": -0.011367934755980968, "step": 215 }, { "debug/policy_chosen_logits": 0.9705718159675598, "debug/policy_chosen_logps": -26.911203384399414, "debug/policy_rejected_logits": 1.2969322204589844, "debug/policy_rejected_logps": -10.081793785095215, "debug/reference_chosen_logps": -26.971553802490234, "debug/reference_rejected_logps": -10.127599716186523, "epoch": 0.7971014492753623, "grad_norm": 48.78014783910225, "learning_rate": 9.430740037950665e-08, "logits/chosen": 0.9705718159675598, "logits/rejected": 1.2969322204589844, "logps/chosen": -26.911203384399414, "logps/rejected": -10.081793785095215, "loss": 0.7075, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.030174437910318375, "rewards/margins": 0.007271638605743647, "rewards/rejected": 0.022902797907590866, "step": 220 }, { "debug/policy_chosen_logits": 1.3417659997940063, "debug/policy_chosen_logps": -7.4791460037231445, "debug/policy_rejected_logits": 1.6347606182098389, "debug/policy_rejected_logps": -117.86802673339844, "debug/reference_chosen_logps": -7.492678165435791, "debug/reference_rejected_logps": -117.96882629394531, "epoch": 0.8152173913043478, "grad_norm": 74.15979287187052, "learning_rate": 9.407020872865274e-08, "logits/chosen": 1.3417659997940063, "logits/rejected": 1.6347606182098389, "logps/chosen": -7.4791460037231445, "logps/rejected": -117.86802673339844, "loss": 0.6954, "rewards/accuracies": 0.25, "rewards/chosen": 0.00676552951335907, "rewards/margins": -0.04363274946808815, "rewards/rejected": 0.05039827898144722, "step": 225 }, { "debug/policy_chosen_logits": 1.200887680053711, "debug/policy_chosen_logps": -44.43962860107422, "debug/policy_rejected_logits": 1.4172102212905884, "debug/policy_rejected_logps": -9.612649917602539, "debug/reference_chosen_logps": -44.462120056152344, "debug/reference_rejected_logps": -9.55119514465332, "epoch": 0.8333333333333334, "grad_norm": 312.5199970537055, "learning_rate": 9.383301707779886e-08, "logits/chosen": 1.200887680053711, "logits/rejected": 1.4172102212905884, "logps/chosen": -44.43962860107422, "logps/rejected": -9.612649917602539, "loss": 0.6928, "rewards/accuracies": 0.25, "rewards/chosen": 0.011244731023907661, "rewards/margins": 0.041972629725933075, "rewards/rejected": -0.030727898702025414, "step": 230 }, { "debug/policy_chosen_logits": 1.3822534084320068, "debug/policy_chosen_logps": -3.829446315765381, "debug/policy_rejected_logits": 1.554442286491394, "debug/policy_rejected_logps": -11.288410186767578, "debug/reference_chosen_logps": -3.7865378856658936, "debug/reference_rejected_logps": -11.303999900817871, "epoch": 0.8514492753623188, "grad_norm": 54.12299928135151, "learning_rate": 9.359582542694496e-08, "logits/chosen": 1.3822534084320068, "logits/rejected": 1.554442286491394, "logps/chosen": -3.829446315765381, "logps/rejected": -11.288410186767578, "loss": 0.6842, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.021454567089676857, "rewards/margins": -0.02925025299191475, "rewards/rejected": 0.007795685436576605, "step": 235 }, { "debug/policy_chosen_logits": 1.377142310142517, "debug/policy_chosen_logps": -3.6146438121795654, "debug/policy_rejected_logits": 1.973607063293457, "debug/policy_rejected_logps": -36.35700225830078, "debug/reference_chosen_logps": -3.5993995666503906, "debug/reference_rejected_logps": -36.39490509033203, "epoch": 0.8695652173913043, "grad_norm": 178.44275630260412, "learning_rate": 9.335863377609108e-08, "logits/chosen": 1.377142310142517, "logits/rejected": 1.973607063293457, "logps/chosen": -3.6146438121795654, "logps/rejected": -36.35700225830078, "loss": 0.7103, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.007622136268764734, "rewards/margins": -0.026574622839689255, "rewards/rejected": 0.018952488899230957, "step": 240 }, { "debug/policy_chosen_logits": 1.1033878326416016, "debug/policy_chosen_logps": -3.6646347045898438, "debug/policy_rejected_logits": 1.455925703048706, "debug/policy_rejected_logps": -171.12744140625, "debug/reference_chosen_logps": -3.6889584064483643, "debug/reference_rejected_logps": -171.53671264648438, "epoch": 0.8876811594202898, "grad_norm": 225.02145248999597, "learning_rate": 9.312144212523719e-08, "logits/chosen": 1.1033878326416016, "logits/rejected": 1.455925703048706, "logps/chosen": -3.6646347045898438, "logps/rejected": -171.12744140625, "loss": 0.7275, "rewards/accuracies": 0.375, "rewards/chosen": 0.01216198317706585, "rewards/margins": -0.1924702376127243, "rewards/rejected": 0.2046322375535965, "step": 245 }, { "debug/policy_chosen_logits": 1.2224252223968506, "debug/policy_chosen_logps": -5.483409404754639, "debug/policy_rejected_logits": 1.5863492488861084, "debug/policy_rejected_logps": -4.8543500900268555, "debug/reference_chosen_logps": -5.442919731140137, "debug/reference_rejected_logps": -4.868186950683594, "epoch": 0.9057971014492754, "grad_norm": 68.12397986831508, "learning_rate": 9.28842504743833e-08, "logits/chosen": 1.2224252223968506, "logits/rejected": 1.5863492488861084, "logps/chosen": -5.483409404754639, "logps/rejected": -4.8543500900268555, "loss": 0.7082, "rewards/accuracies": 0.125, "rewards/chosen": -0.0202449019998312, "rewards/margins": -0.027163231745362282, "rewards/rejected": 0.006918327417224646, "step": 250 }, { "debug/policy_chosen_logits": 1.4105212688446045, "debug/policy_chosen_logps": -6.510067939758301, "debug/policy_rejected_logits": 1.5815021991729736, "debug/policy_rejected_logps": -2.819765090942383, "debug/reference_chosen_logps": -6.483206748962402, "debug/reference_rejected_logps": -2.824536085128784, "epoch": 0.9239130434782609, "grad_norm": 544.4628153289185, "learning_rate": 9.26470588235294e-08, "logits/chosen": 1.4105212688446045, "logits/rejected": 1.5815021991729736, "logps/chosen": -6.510067939758301, "logps/rejected": -2.819765090942383, "loss": 0.7231, "rewards/accuracies": 0.25, "rewards/chosen": -0.013430392369627953, "rewards/margins": -0.015815991908311844, "rewards/rejected": 0.002385598374530673, "step": 255 }, { "debug/policy_chosen_logits": 1.3324449062347412, "debug/policy_chosen_logps": -67.6996078491211, "debug/policy_rejected_logits": 1.9034836292266846, "debug/policy_rejected_logps": -47.44561004638672, "debug/reference_chosen_logps": -67.80909729003906, "debug/reference_rejected_logps": -47.41716384887695, "epoch": 0.9420289855072463, "grad_norm": 98.65509016552036, "learning_rate": 9.240986717267551e-08, "logits/chosen": 1.3324449062347412, "logits/rejected": 1.9034836292266846, "logps/chosen": -67.6996078491211, "logps/rejected": -47.44561004638672, "loss": 0.6812, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.054740529507398605, "rewards/margins": 0.06896495819091797, "rewards/rejected": -0.014224430546164513, "step": 260 }, { "debug/policy_chosen_logits": 1.5492708683013916, "debug/policy_chosen_logps": -38.544578552246094, "debug/policy_rejected_logits": 1.7015316486358643, "debug/policy_rejected_logps": -132.37881469726562, "debug/reference_chosen_logps": -38.708030700683594, "debug/reference_rejected_logps": -132.49644470214844, "epoch": 0.9601449275362319, "grad_norm": 56.237752102493005, "learning_rate": 9.217267552182164e-08, "logits/chosen": 1.5492708683013916, "logits/rejected": 1.7015316486358643, "logps/chosen": -38.544578552246094, "logps/rejected": -132.37881469726562, "loss": 0.68, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.08172450959682465, "rewards/margins": 0.02291094698011875, "rewards/rejected": 0.058813564479351044, "step": 265 }, { "debug/policy_chosen_logits": 1.5024263858795166, "debug/policy_chosen_logps": -5.89066219329834, "debug/policy_rejected_logits": 2.0262081623077393, "debug/policy_rejected_logps": -3.53810453414917, "debug/reference_chosen_logps": -5.859536170959473, "debug/reference_rejected_logps": -3.5223605632781982, "epoch": 0.9782608695652174, "grad_norm": 142.5879016451601, "learning_rate": 9.193548387096773e-08, "logits/chosen": 1.5024263858795166, "logits/rejected": 2.0262081623077393, "logps/chosen": -5.89066219329834, "logps/rejected": -3.53810453414917, "loss": 0.6951, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.015562695451080799, "rewards/margins": -0.007690657861530781, "rewards/rejected": -0.007872037589550018, "step": 270 }, { "debug/policy_chosen_logits": 1.1445276737213135, "debug/policy_chosen_logps": -4.187214374542236, "debug/policy_rejected_logits": 1.4142119884490967, "debug/policy_rejected_logps": -12.04640007019043, "debug/reference_chosen_logps": -4.2250447273254395, "debug/reference_rejected_logps": -11.967859268188477, "epoch": 0.9963768115942029, "grad_norm": 192.43202029990996, "learning_rate": 9.169829222011385e-08, "logits/chosen": 1.1445276737213135, "logits/rejected": 1.4142119884490967, "logps/chosen": -4.187214374542236, "logps/rejected": -12.04640007019043, "loss": 0.7002, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.018915316089987755, "rewards/margins": 0.058185137808322906, "rewards/rejected": -0.039269816130399704, "step": 275 }, { "debug/policy_chosen_logits": 1.3898887634277344, "debug/policy_chosen_logps": -134.59823608398438, "debug/policy_rejected_logits": 1.847193956375122, "debug/policy_rejected_logps": -10.337038040161133, "debug/reference_chosen_logps": -135.3412628173828, "debug/reference_rejected_logps": -10.286213874816895, "epoch": 1.0144927536231885, "grad_norm": 32.19448121040258, "learning_rate": 9.146110056925995e-08, "logits/chosen": 1.3898887634277344, "logits/rejected": 1.847193956375122, "logps/chosen": -134.59823608398438, "logps/rejected": -10.337038040161133, "loss": 0.7074, "rewards/accuracies": 0.25, "rewards/chosen": 0.3715194761753082, "rewards/margins": 0.3969319462776184, "rewards/rejected": -0.025412458926439285, "step": 280 }, { "debug/policy_chosen_logits": 1.271003246307373, "debug/policy_chosen_logps": -40.884300231933594, "debug/policy_rejected_logits": 1.509128451347351, "debug/policy_rejected_logps": -7.454216003417969, "debug/reference_chosen_logps": -40.90019989013672, "debug/reference_rejected_logps": -7.467019557952881, "epoch": 1.0326086956521738, "grad_norm": 91.58014447347324, "learning_rate": 9.122390891840607e-08, "logits/chosen": 1.271003246307373, "logits/rejected": 1.509128451347351, "logps/chosen": -40.884300231933594, "logps/rejected": -7.454216003417969, "loss": 0.7221, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.007949036546051502, "rewards/margins": 0.0015475660329684615, "rewards/rejected": 0.006401470396667719, "step": 285 }, { "debug/policy_chosen_logits": 1.5717649459838867, "debug/policy_chosen_logps": -128.00160217285156, "debug/policy_rejected_logits": 1.6953779458999634, "debug/policy_rejected_logps": -2.0961408615112305, "debug/reference_chosen_logps": -128.1494140625, "debug/reference_rejected_logps": -2.083263397216797, "epoch": 1.0507246376811594, "grad_norm": 32.314050279641414, "learning_rate": 9.098671726755218e-08, "logits/chosen": 1.5717649459838867, "logits/rejected": 1.6953779458999634, "logps/chosen": -128.00160217285156, "logps/rejected": -2.0961408615112305, "loss": 0.675, "rewards/accuracies": 0.25, "rewards/chosen": 0.0739077627658844, "rewards/margins": 0.08034648001194, "rewards/rejected": -0.006438717246055603, "step": 290 }, { "debug/policy_chosen_logits": 1.3084014654159546, "debug/policy_chosen_logps": -3.206479549407959, "debug/policy_rejected_logits": 1.7398570775985718, "debug/policy_rejected_logps": -5.167227268218994, "debug/reference_chosen_logps": -3.226156711578369, "debug/reference_rejected_logps": -5.182940483093262, "epoch": 1.068840579710145, "grad_norm": 86.82984931901706, "learning_rate": 9.074952561669828e-08, "logits/chosen": 1.3084014654159546, "logits/rejected": 1.7398570775985718, "logps/chosen": -3.206479549407959, "logps/rejected": -5.167227268218994, "loss": 0.6856, "rewards/accuracies": 0.25, "rewards/chosen": 0.009838591329753399, "rewards/margins": 0.0019819277804344893, "rewards/rejected": 0.00785666424781084, "step": 295 }, { "debug/policy_chosen_logits": 1.0967118740081787, "debug/policy_chosen_logps": -6.225718021392822, "debug/policy_rejected_logits": 1.3784626722335815, "debug/policy_rejected_logps": -92.94539642333984, "debug/reference_chosen_logps": -6.221102237701416, "debug/reference_rejected_logps": -93.07762145996094, "epoch": 1.0869565217391304, "grad_norm": 31.049893652088976, "learning_rate": 9.05123339658444e-08, "logits/chosen": 1.0967118740081787, "logits/rejected": 1.3784626722335815, "logps/chosen": -6.225718021392822, "logps/rejected": -92.94539642333984, "loss": 0.7007, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.002307932125404477, "rewards/margins": -0.06841960549354553, "rewards/rejected": 0.06611166894435883, "step": 300 }, { "epoch": 1.0869565217391304, "eval_debug/policy_chosen_logits": 1.6541943550109863, "eval_debug/policy_chosen_logps": -122.3354263305664, "eval_debug/policy_rejected_logits": 1.7138471603393555, "eval_debug/policy_rejected_logps": -63.888336181640625, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6541943550109863, "eval_logits/rejected": 1.7138471603393555, "eval_logps/chosen": -122.3354263305664, "eval_logps/rejected": -63.888336181640625, "eval_loss": 0.6869186162948608, "eval_rewards/accuracies": 0.31578946113586426, "eval_rewards/chosen": 0.4063166081905365, "eval_rewards/margins": 0.40696093440055847, "eval_rewards/rejected": -0.0006443716702051461, "eval_runtime": 28.693, "eval_samples_per_second": 20.911, "eval_steps_per_second": 0.662, "step": 300 }, { "debug/policy_chosen_logits": 1.7249126434326172, "debug/policy_chosen_logps": -8.13222599029541, "debug/policy_rejected_logits": 1.9329736232757568, "debug/policy_rejected_logps": -64.0086441040039, "debug/reference_chosen_logps": -8.141159057617188, "debug/reference_rejected_logps": -64.1399154663086, "epoch": 1.105072463768116, "grad_norm": 33.15124267114398, "learning_rate": 9.02751423149905e-08, "logits/chosen": 1.7249126434326172, "logits/rejected": 1.9329736232757568, "logps/chosen": -8.13222599029541, "logps/rejected": -64.0086441040039, "loss": 0.7002, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.0044667720794677734, "rewards/margins": -0.06117189675569534, "rewards/rejected": 0.06563866883516312, "step": 305 }, { "debug/policy_chosen_logits": 1.439462423324585, "debug/policy_chosen_logps": -44.76249313354492, "debug/policy_rejected_logits": 1.6918329000473022, "debug/policy_rejected_logps": -68.22378540039062, "debug/reference_chosen_logps": -44.858192443847656, "debug/reference_rejected_logps": -68.05248260498047, "epoch": 1.1231884057971016, "grad_norm": 34.51294011136934, "learning_rate": 9.003795066413662e-08, "logits/chosen": 1.439462423324585, "logits/rejected": 1.6918329000473022, "logps/chosen": -44.76249313354492, "logps/rejected": -68.22378540039062, "loss": 0.679, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.04784851148724556, "rewards/margins": 0.13349848985671997, "rewards/rejected": -0.08564998209476471, "step": 310 }, { "debug/policy_chosen_logits": 1.0631589889526367, "debug/policy_chosen_logps": -142.385986328125, "debug/policy_rejected_logits": 1.4526264667510986, "debug/policy_rejected_logps": -43.890926361083984, "debug/reference_chosen_logps": -142.84730529785156, "debug/reference_rejected_logps": -44.07756805419922, "epoch": 1.141304347826087, "grad_norm": 58.48047034793869, "learning_rate": 8.980075901328272e-08, "logits/chosen": 1.0631589889526367, "logits/rejected": 1.4526264667510986, "logps/chosen": -142.385986328125, "logps/rejected": -43.890926361083984, "loss": 0.7323, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.23066258430480957, "rewards/margins": 0.1373397707939148, "rewards/rejected": 0.09332280606031418, "step": 315 }, { "debug/policy_chosen_logits": 1.5473772287368774, "debug/policy_chosen_logps": -7.956326961517334, "debug/policy_rejected_logits": 1.761932373046875, "debug/policy_rejected_logps": -5.67390251159668, "debug/reference_chosen_logps": -7.988713264465332, "debug/reference_rejected_logps": -5.633814811706543, "epoch": 1.1594202898550725, "grad_norm": 306.0362402178197, "learning_rate": 8.956356736242884e-08, "logits/chosen": 1.5473772287368774, "logits/rejected": 1.761932373046875, "logps/chosen": -7.956326961517334, "logps/rejected": -5.67390251159668, "loss": 0.6887, "rewards/accuracies": 0.375, "rewards/chosen": 0.016192886978387833, "rewards/margins": 0.03623682260513306, "rewards/rejected": -0.020043935626745224, "step": 320 }, { "debug/policy_chosen_logits": 0.9684053659439087, "debug/policy_chosen_logps": -8.3453369140625, "debug/policy_rejected_logits": 1.254480004310608, "debug/policy_rejected_logps": -9.08045768737793, "debug/reference_chosen_logps": -8.260573387145996, "debug/reference_rejected_logps": -9.093328475952148, "epoch": 1.177536231884058, "grad_norm": 528.7831530461841, "learning_rate": 8.932637571157495e-08, "logits/chosen": 0.9684053659439087, "logits/rejected": 1.254480004310608, "logps/chosen": -8.3453369140625, "logps/rejected": -9.08045768737793, "loss": 1.0389, "rewards/accuracies": 0.125, "rewards/chosen": -0.042381651699543, "rewards/margins": -0.04881778731942177, "rewards/rejected": 0.006436133291572332, "step": 325 }, { "debug/policy_chosen_logits": 0.7250394821166992, "debug/policy_chosen_logps": -49.0622444152832, "debug/policy_rejected_logits": 1.2071263790130615, "debug/policy_rejected_logps": -3.729813814163208, "debug/reference_chosen_logps": -46.528934478759766, "debug/reference_rejected_logps": -3.632153034210205, "epoch": 1.1956521739130435, "grad_norm": 639.0341143070133, "learning_rate": 8.908918406072106e-08, "logits/chosen": 0.7250394821166992, "logits/rejected": 1.2071263790130615, "logps/chosen": -49.0622444152832, "logps/rejected": -3.729813814163208, "loss": 1.6964, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -1.266656517982483, "rewards/margins": -1.217826008796692, "rewards/rejected": -0.048830386251211166, "step": 330 }, { "debug/policy_chosen_logits": 1.2238719463348389, "debug/policy_chosen_logps": -63.63777542114258, "debug/policy_rejected_logits": 1.4221211671829224, "debug/policy_rejected_logps": -6.221766471862793, "debug/reference_chosen_logps": -62.81486129760742, "debug/reference_rejected_logps": -6.199158668518066, "epoch": 1.213768115942029, "grad_norm": 733.6748681173494, "learning_rate": 8.885199240986718e-08, "logits/chosen": 1.2238719463348389, "logits/rejected": 1.4221211671829224, "logps/chosen": -63.63777542114258, "logps/rejected": -6.221766471862793, "loss": 1.2481, "rewards/accuracies": 0.25, "rewards/chosen": -0.4114552438259125, "rewards/margins": -0.40015095472335815, "rewards/rejected": -0.01130426861345768, "step": 335 }, { "debug/policy_chosen_logits": 1.6413097381591797, "debug/policy_chosen_logps": -119.95709228515625, "debug/policy_rejected_logits": 1.803786039352417, "debug/policy_rejected_logps": -6.664618492126465, "debug/reference_chosen_logps": -120.7728271484375, "debug/reference_rejected_logps": -6.6778130531311035, "epoch": 1.2318840579710144, "grad_norm": 475.87558907684684, "learning_rate": 8.861480075901327e-08, "logits/chosen": 1.6413097381591797, "logits/rejected": 1.803786039352417, "logps/chosen": -119.95709228515625, "logps/rejected": -6.664618492126465, "loss": 0.7478, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.40786734223365784, "rewards/margins": 0.4012700021266937, "rewards/rejected": 0.006597369909286499, "step": 340 }, { "debug/policy_chosen_logits": 1.3061001300811768, "debug/policy_chosen_logps": -127.8729019165039, "debug/policy_rejected_logits": 1.5316654443740845, "debug/policy_rejected_logps": -9.507734298706055, "debug/reference_chosen_logps": -128.16708374023438, "debug/reference_rejected_logps": -9.438863754272461, "epoch": 1.25, "grad_norm": 737.1680986615111, "learning_rate": 8.83776091081594e-08, "logits/chosen": 1.3061001300811768, "logits/rejected": 1.5316654443740845, "logps/chosen": -127.8729019165039, "logps/rejected": -9.507734298706055, "loss": 0.841, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.14709270000457764, "rewards/margins": 0.18152830004692078, "rewards/rejected": -0.03443557769060135, "step": 345 }, { "debug/policy_chosen_logits": 1.0943431854248047, "debug/policy_chosen_logps": -16.573427200317383, "debug/policy_rejected_logits": 1.4561195373535156, "debug/policy_rejected_logps": -146.78561401367188, "debug/reference_chosen_logps": -16.5395565032959, "debug/reference_rejected_logps": -146.82769775390625, "epoch": 1.2681159420289856, "grad_norm": 53.506517409781054, "learning_rate": 8.814041745730549e-08, "logits/chosen": 1.0943431854248047, "logits/rejected": 1.4561195373535156, "logps/chosen": -16.573427200317383, "logps/rejected": -146.78561401367188, "loss": 0.6884, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.016934193670749664, "rewards/margins": -0.03797728568315506, "rewards/rejected": 0.021043092012405396, "step": 350 }, { "debug/policy_chosen_logits": 1.564039945602417, "debug/policy_chosen_logps": -66.35012817382812, "debug/policy_rejected_logits": 1.7537450790405273, "debug/policy_rejected_logps": -63.73201370239258, "debug/reference_chosen_logps": -66.51875305175781, "debug/reference_rejected_logps": -63.463356018066406, "epoch": 1.286231884057971, "grad_norm": 469.9966104068159, "learning_rate": 8.790322580645161e-08, "logits/chosen": 1.564039945602417, "logits/rejected": 1.7537450790405273, "logps/chosen": -66.35012817382812, "logps/rejected": -63.73201370239258, "loss": 0.7607, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.08431331068277359, "rewards/margins": 0.21863976120948792, "rewards/rejected": -0.13432642817497253, "step": 355 }, { "debug/policy_chosen_logits": 0.9238218069076538, "debug/policy_chosen_logps": -46.65524673461914, "debug/policy_rejected_logits": 1.189935564994812, "debug/policy_rejected_logps": -14.190457344055176, "debug/reference_chosen_logps": -46.70463180541992, "debug/reference_rejected_logps": -14.1810302734375, "epoch": 1.3043478260869565, "grad_norm": 42.68472454156339, "learning_rate": 8.766603415559772e-08, "logits/chosen": 0.9238218069076538, "logits/rejected": 1.189935564994812, "logps/chosen": -46.65524673461914, "logps/rejected": -14.190457344055176, "loss": 0.7393, "rewards/accuracies": 0.25, "rewards/chosen": 0.024695372208952904, "rewards/margins": 0.029408371075987816, "rewards/rejected": -0.004712998867034912, "step": 360 }, { "debug/policy_chosen_logits": 1.375157117843628, "debug/policy_chosen_logps": -84.59254455566406, "debug/policy_rejected_logits": 1.9235658645629883, "debug/policy_rejected_logps": -3.298945665359497, "debug/reference_chosen_logps": -84.7721939086914, "debug/reference_rejected_logps": -3.231293201446533, "epoch": 1.322463768115942, "grad_norm": 413.05456101269164, "learning_rate": 8.742884250474383e-08, "logits/chosen": 1.375157117843628, "logits/rejected": 1.9235658645629883, "logps/chosen": -84.59254455566406, "logps/rejected": -3.298945665359497, "loss": 0.7246, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.08982699364423752, "rewards/margins": 0.12365307658910751, "rewards/rejected": -0.033826082944869995, "step": 365 }, { "debug/policy_chosen_logits": 1.4550307989120483, "debug/policy_chosen_logps": -87.75889587402344, "debug/policy_rejected_logits": 1.7019217014312744, "debug/policy_rejected_logps": -8.397745132446289, "debug/reference_chosen_logps": -87.77662658691406, "debug/reference_rejected_logps": -8.368696212768555, "epoch": 1.3405797101449275, "grad_norm": 34.958416811535464, "learning_rate": 8.719165085388994e-08, "logits/chosen": 1.4550307989120483, "logits/rejected": 1.7019217014312744, "logps/chosen": -87.75889587402344, "logps/rejected": -8.397745132446289, "loss": 0.7064, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.008868198841810226, "rewards/margins": 0.02339218556880951, "rewards/rejected": -0.014523985795676708, "step": 370 }, { "debug/policy_chosen_logits": 1.2568262815475464, "debug/policy_chosen_logps": -5.916423797607422, "debug/policy_rejected_logits": 1.5424621105194092, "debug/policy_rejected_logps": -6.34293270111084, "debug/reference_chosen_logps": -5.9267425537109375, "debug/reference_rejected_logps": -6.289887428283691, "epoch": 1.358695652173913, "grad_norm": 343.93203882496323, "learning_rate": 8.695445920303604e-08, "logits/chosen": 1.2568262815475464, "logits/rejected": 1.5424621105194092, "logps/chosen": -5.916423797607422, "logps/rejected": -6.34293270111084, "loss": 0.7012, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.005159348249435425, "rewards/margins": 0.031682200729846954, "rewards/rejected": -0.02652285061776638, "step": 375 }, { "debug/policy_chosen_logits": 1.2270126342773438, "debug/policy_chosen_logps": -39.1225700378418, "debug/policy_rejected_logits": 1.5721514225006104, "debug/policy_rejected_logps": -10.081442832946777, "debug/reference_chosen_logps": -39.23326110839844, "debug/reference_rejected_logps": -10.048616409301758, "epoch": 1.3768115942028984, "grad_norm": 44.24369918808943, "learning_rate": 8.671726755218217e-08, "logits/chosen": 1.2270126342773438, "logits/rejected": 1.5721514225006104, "logps/chosen": -39.1225700378418, "logps/rejected": -10.081442832946777, "loss": 0.6815, "rewards/accuracies": 0.25, "rewards/chosen": 0.05534300208091736, "rewards/margins": 0.07175685465335846, "rewards/rejected": -0.016413848847150803, "step": 380 }, { "debug/policy_chosen_logits": 1.5944477319717407, "debug/policy_chosen_logps": -71.57782745361328, "debug/policy_rejected_logits": 1.711469292640686, "debug/policy_rejected_logps": -6.252926826477051, "debug/reference_chosen_logps": -71.53660583496094, "debug/reference_rejected_logps": -6.130346775054932, "epoch": 1.394927536231884, "grad_norm": 150.4639186350394, "learning_rate": 8.648007590132826e-08, "logits/chosen": 1.5944477319717407, "logits/rejected": 1.711469292640686, "logps/chosen": -71.57782745361328, "logps/rejected": -6.252926826477051, "loss": 0.6992, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.020604563876986504, "rewards/margins": 0.040685635060071945, "rewards/rejected": -0.0612901970744133, "step": 385 }, { "debug/policy_chosen_logits": 1.674203872680664, "debug/policy_chosen_logps": -301.27923583984375, "debug/policy_rejected_logits": 1.4842541217803955, "debug/policy_rejected_logps": -162.65579223632812, "debug/reference_chosen_logps": -303.56982421875, "debug/reference_rejected_logps": -163.62176513671875, "epoch": 1.4130434782608696, "grad_norm": 107.49071038138865, "learning_rate": 8.624288425047438e-08, "logits/chosen": 1.674203872680664, "logits/rejected": 1.4842541217803955, "logps/chosen": -301.27923583984375, "logps/rejected": -162.65579223632812, "loss": 0.7026, "rewards/accuracies": 0.375, "rewards/chosen": 1.1452780961990356, "rewards/margins": 0.6622902154922485, "rewards/rejected": 0.48298779129981995, "step": 390 }, { "debug/policy_chosen_logits": 1.0491282939910889, "debug/policy_chosen_logps": -74.3606185913086, "debug/policy_rejected_logits": 1.3029966354370117, "debug/policy_rejected_logps": -6.755346775054932, "debug/reference_chosen_logps": -74.72174835205078, "debug/reference_rejected_logps": -6.651388645172119, "epoch": 1.431159420289855, "grad_norm": 847.5471100410522, "learning_rate": 8.600569259962049e-08, "logits/chosen": 1.0491282939910889, "logits/rejected": 1.3029966354370117, "logps/chosen": -74.3606185913086, "logps/rejected": -6.755346775054932, "loss": 0.725, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.18056420981884003, "rewards/margins": 0.23254287242889404, "rewards/rejected": -0.05197867751121521, "step": 395 }, { "debug/policy_chosen_logits": 1.3033652305603027, "debug/policy_chosen_logps": -4.185441017150879, "debug/policy_rejected_logits": 1.4282686710357666, "debug/policy_rejected_logps": -3.240309476852417, "debug/reference_chosen_logps": -4.213369846343994, "debug/reference_rejected_logps": -3.2468135356903076, "epoch": 1.4492753623188406, "grad_norm": 747.80368385623, "learning_rate": 8.57685009487666e-08, "logits/chosen": 1.3033652305603027, "logits/rejected": 1.4282686710357666, "logps/chosen": -4.185441017150879, "logps/rejected": -3.240309476852417, "loss": 0.7084, "rewards/accuracies": 0.25, "rewards/chosen": 0.013964463956654072, "rewards/margins": 0.010712547227740288, "rewards/rejected": 0.0032519162632524967, "step": 400 }, { "epoch": 1.4492753623188406, "eval_debug/policy_chosen_logits": 1.6406279802322388, "eval_debug/policy_chosen_logps": -122.28227996826172, "eval_debug/policy_rejected_logits": 1.7009400129318237, "eval_debug/policy_rejected_logps": -63.632041931152344, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6406279802322388, "eval_logits/rejected": 1.7009400129318237, "eval_logps/chosen": -122.28227996826172, "eval_logps/rejected": -63.632041931152344, "eval_loss": 0.7387707233428955, "eval_rewards/accuracies": 0.30263158679008484, "eval_rewards/chosen": 0.4328933656215668, "eval_rewards/margins": 0.30538782477378845, "eval_rewards/rejected": 0.12750548124313354, "eval_runtime": 28.6398, "eval_samples_per_second": 20.95, "eval_steps_per_second": 0.663, "step": 400 }, { "debug/policy_chosen_logits": 1.4270344972610474, "debug/policy_chosen_logps": -47.14699935913086, "debug/policy_rejected_logits": 1.9358320236206055, "debug/policy_rejected_logps": -4.528962135314941, "debug/reference_chosen_logps": -47.18468475341797, "debug/reference_rejected_logps": -4.431929111480713, "epoch": 1.4673913043478262, "grad_norm": 167.19844255179274, "learning_rate": 8.553130929791271e-08, "logits/chosen": 1.4270344972610474, "logits/rejected": 1.9358320236206055, "logps/chosen": -47.14699935913086, "logps/rejected": -4.528962135314941, "loss": 0.7022, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.01883995532989502, "rewards/margins": 0.0673564076423645, "rewards/rejected": -0.04851645231246948, "step": 405 }, { "debug/policy_chosen_logits": 1.4621151685714722, "debug/policy_chosen_logps": -69.35401916503906, "debug/policy_rejected_logits": 1.699241280555725, "debug/policy_rejected_logps": -89.02747344970703, "debug/reference_chosen_logps": -69.37290954589844, "debug/reference_rejected_logps": -88.96483612060547, "epoch": 1.4855072463768115, "grad_norm": 52.834115400844055, "learning_rate": 8.529411764705881e-08, "logits/chosen": 1.4621151685714722, "logits/rejected": 1.699241280555725, "logps/chosen": -69.35401916503906, "logps/rejected": -89.02747344970703, "loss": 0.6987, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.009439739398658276, "rewards/margins": 0.040753789246082306, "rewards/rejected": -0.031314048916101456, "step": 410 }, { "debug/policy_chosen_logits": 1.0053094625473022, "debug/policy_chosen_logps": -119.05128479003906, "debug/policy_rejected_logits": 1.4087492227554321, "debug/policy_rejected_logps": -10.053757667541504, "debug/reference_chosen_logps": -119.50848388671875, "debug/reference_rejected_logps": -10.004406929016113, "epoch": 1.5036231884057971, "grad_norm": 305.30682423923275, "learning_rate": 8.505692599620494e-08, "logits/chosen": 1.0053094625473022, "logits/rejected": 1.4087492227554321, "logps/chosen": -119.05128479003906, "logps/rejected": -10.053757667541504, "loss": 0.687, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.2285972386598587, "rewards/margins": 0.2532727122306824, "rewards/rejected": -0.024675479158759117, "step": 415 }, { "debug/policy_chosen_logits": 1.6585094928741455, "debug/policy_chosen_logps": -131.85653686523438, "debug/policy_rejected_logits": 2.2336809635162354, "debug/policy_rejected_logps": -4.011847972869873, "debug/reference_chosen_logps": -132.0562286376953, "debug/reference_rejected_logps": -4.032851219177246, "epoch": 1.5217391304347827, "grad_norm": 246.44167715965466, "learning_rate": 8.481973434535103e-08, "logits/chosen": 1.6585094928741455, "logits/rejected": 2.2336809635162354, "logps/chosen": -131.85653686523438, "logps/rejected": -4.011847972869873, "loss": 0.7019, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": 0.09984876960515976, "rewards/margins": 0.0893472209572792, "rewards/rejected": 0.01050154771655798, "step": 420 }, { "debug/policy_chosen_logits": 1.4657008647918701, "debug/policy_chosen_logps": -10.862765312194824, "debug/policy_rejected_logits": 2.0177993774414062, "debug/policy_rejected_logps": -7.8859543800354, "debug/reference_chosen_logps": -10.919036865234375, "debug/reference_rejected_logps": -7.864465236663818, "epoch": 1.539855072463768, "grad_norm": 27.38774575159296, "learning_rate": 8.458254269449715e-08, "logits/chosen": 1.4657008647918701, "logits/rejected": 2.0177993774414062, "logps/chosen": -10.862765312194824, "logps/rejected": -7.8859543800354, "loss": 0.6958, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.02813572809100151, "rewards/margins": 0.03888028487563133, "rewards/rejected": -0.010744556784629822, "step": 425 }, { "debug/policy_chosen_logits": 0.9924972653388977, "debug/policy_chosen_logps": -8.320334434509277, "debug/policy_rejected_logits": 1.369627594947815, "debug/policy_rejected_logps": -10.872963905334473, "debug/reference_chosen_logps": -8.375343322753906, "debug/reference_rejected_logps": -10.8840970993042, "epoch": 1.5579710144927537, "grad_norm": 627.3150930618979, "learning_rate": 8.434535104364326e-08, "logits/chosen": 0.9924972653388977, "logits/rejected": 1.369627594947815, "logps/chosen": -8.320334434509277, "logps/rejected": -10.872963905334473, "loss": 0.7279, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.027504444122314453, "rewards/margins": 0.021937314420938492, "rewards/rejected": 0.005567132029682398, "step": 430 }, { "debug/policy_chosen_logits": 1.4116184711456299, "debug/policy_chosen_logps": -6.752942085266113, "debug/policy_rejected_logits": 1.6035079956054688, "debug/policy_rejected_logps": -13.037881851196289, "debug/reference_chosen_logps": -6.73760461807251, "debug/reference_rejected_logps": -13.031814575195312, "epoch": 1.5760869565217392, "grad_norm": 42.83597199870891, "learning_rate": 8.410815939278937e-08, "logits/chosen": 1.4116184711456299, "logits/rejected": 1.6035079956054688, "logps/chosen": -6.752942085266113, "logps/rejected": -13.037881851196289, "loss": 0.7111, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.0076690674759447575, "rewards/margins": -0.004635795950889587, "rewards/rejected": -0.00303327152505517, "step": 435 }, { "debug/policy_chosen_logits": 1.3784153461456299, "debug/policy_chosen_logps": -6.586009979248047, "debug/policy_rejected_logits": 1.8527352809906006, "debug/policy_rejected_logps": -5.8097124099731445, "debug/reference_chosen_logps": -6.581674098968506, "debug/reference_rejected_logps": -5.773827075958252, "epoch": 1.5942028985507246, "grad_norm": 116.9424276235437, "learning_rate": 8.387096774193548e-08, "logits/chosen": 1.3784153461456299, "logits/rejected": 1.8527352809906006, "logps/chosen": -6.586009979248047, "logps/rejected": -5.8097124099731445, "loss": 0.6864, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.0021682351361960173, "rewards/margins": 0.01577456295490265, "rewards/rejected": -0.017942797392606735, "step": 440 }, { "debug/policy_chosen_logits": 1.4121224880218506, "debug/policy_chosen_logps": -88.59452819824219, "debug/policy_rejected_logits": 1.6039297580718994, "debug/policy_rejected_logps": -94.99937438964844, "debug/reference_chosen_logps": -88.7278823852539, "debug/reference_rejected_logps": -94.71257019042969, "epoch": 1.6123188405797102, "grad_norm": 189.59157456561374, "learning_rate": 8.363377609108159e-08, "logits/chosen": 1.4121224880218506, "logits/rejected": 1.6039297580718994, "logps/chosen": -88.59452819824219, "logps/rejected": -94.99937438964844, "loss": 0.6958, "rewards/accuracies": 0.375, "rewards/chosen": 0.0666738897562027, "rewards/margins": 0.21007855236530304, "rewards/rejected": -0.14340464770793915, "step": 445 }, { "debug/policy_chosen_logits": 1.2935651540756226, "debug/policy_chosen_logps": -7.066756248474121, "debug/policy_rejected_logits": 1.3916139602661133, "debug/policy_rejected_logps": -8.284643173217773, "debug/reference_chosen_logps": -7.1367597579956055, "debug/reference_rejected_logps": -8.24354362487793, "epoch": 1.6304347826086958, "grad_norm": 464.25002969198334, "learning_rate": 8.339658444022771e-08, "logits/chosen": 1.2935651540756226, "logits/rejected": 1.3916139602661133, "logps/chosen": -7.066756248474121, "logps/rejected": -8.284643173217773, "loss": 0.6772, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.035002078860998154, "rewards/margins": 0.05555196478962898, "rewards/rejected": -0.02054988220334053, "step": 450 }, { "debug/policy_chosen_logits": 1.2559988498687744, "debug/policy_chosen_logps": -9.722308158874512, "debug/policy_rejected_logits": 1.5358774662017822, "debug/policy_rejected_logps": -2.9760749340057373, "debug/reference_chosen_logps": -9.742944717407227, "debug/reference_rejected_logps": -2.9481358528137207, "epoch": 1.6485507246376812, "grad_norm": 86.12380223266854, "learning_rate": 8.31593927893738e-08, "logits/chosen": 1.2559988498687744, "logits/rejected": 1.5358774662017822, "logps/chosen": -9.722308158874512, "logps/rejected": -2.9760749340057373, "loss": 0.7192, "rewards/accuracies": 0.375, "rewards/chosen": 0.0103188157081604, "rewards/margins": 0.02428845688700676, "rewards/rejected": -0.013969642110168934, "step": 455 }, { "debug/policy_chosen_logits": 1.4306228160858154, "debug/policy_chosen_logps": -7.769255638122559, "debug/policy_rejected_logits": 1.6392366886138916, "debug/policy_rejected_logps": -5.45258092880249, "debug/reference_chosen_logps": -7.756170749664307, "debug/reference_rejected_logps": -5.4446258544921875, "epoch": 1.6666666666666665, "grad_norm": 147.1858789234327, "learning_rate": 8.292220113851992e-08, "logits/chosen": 1.4306228160858154, "logits/rejected": 1.6392366886138916, "logps/chosen": -7.769255638122559, "logps/rejected": -5.45258092880249, "loss": 0.6835, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.006542977876961231, "rewards/margins": -0.0025654465425759554, "rewards/rejected": -0.003977531101554632, "step": 460 }, { "debug/policy_chosen_logits": 1.3728729486465454, "debug/policy_chosen_logps": -7.748582363128662, "debug/policy_rejected_logits": 1.6508550643920898, "debug/policy_rejected_logps": -3.9221482276916504, "debug/reference_chosen_logps": -7.709532737731934, "debug/reference_rejected_logps": -3.8724441528320312, "epoch": 1.6847826086956523, "grad_norm": 202.72166226970583, "learning_rate": 8.268500948766603e-08, "logits/chosen": 1.3728729486465454, "logits/rejected": 1.6508550643920898, "logps/chosen": -7.748582363128662, "logps/rejected": -3.9221482276916504, "loss": 0.6955, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.019524943083524704, "rewards/margins": 0.00532715767621994, "rewards/rejected": -0.024852100759744644, "step": 465 }, { "debug/policy_chosen_logits": 1.353232741355896, "debug/policy_chosen_logps": -104.42596435546875, "debug/policy_rejected_logits": 1.9469273090362549, "debug/policy_rejected_logps": -9.105962753295898, "debug/reference_chosen_logps": -104.1981201171875, "debug/reference_rejected_logps": -9.050098419189453, "epoch": 1.7028985507246377, "grad_norm": 121.28442186605086, "learning_rate": 8.244781783681214e-08, "logits/chosen": 1.353232741355896, "logits/rejected": 1.9469273090362549, "logps/chosen": -104.42596435546875, "logps/rejected": -9.105962753295898, "loss": 0.712, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.1139189600944519, "rewards/margins": -0.08598621189594269, "rewards/rejected": -0.027932751923799515, "step": 470 }, { "debug/policy_chosen_logits": 1.4889159202575684, "debug/policy_chosen_logps": -7.201774597167969, "debug/policy_rejected_logits": 1.6883522272109985, "debug/policy_rejected_logps": -5.375905990600586, "debug/reference_chosen_logps": -7.227102756500244, "debug/reference_rejected_logps": -5.338844299316406, "epoch": 1.721014492753623, "grad_norm": 36.3134523637275, "learning_rate": 8.221062618595825e-08, "logits/chosen": 1.4889159202575684, "logits/rejected": 1.6883522272109985, "logps/chosen": -7.201774597167969, "logps/rejected": -5.375905990600586, "loss": 0.6904, "rewards/accuracies": 0.375, "rewards/chosen": 0.012663990259170532, "rewards/margins": 0.03119487129151821, "rewards/rejected": -0.01853088103234768, "step": 475 }, { "debug/policy_chosen_logits": 1.3426107168197632, "debug/policy_chosen_logps": -81.00315856933594, "debug/policy_rejected_logits": 1.5184953212738037, "debug/policy_rejected_logps": -109.05195617675781, "debug/reference_chosen_logps": -80.98526000976562, "debug/reference_rejected_logps": -109.05181884765625, "epoch": 1.7391304347826086, "grad_norm": 347.1004857199992, "learning_rate": 8.197343453510436e-08, "logits/chosen": 1.3426107168197632, "logits/rejected": 1.5184953212738037, "logps/chosen": -81.00315856933594, "logps/rejected": -109.05195617675781, "loss": 0.6874, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.008948716334998608, "rewards/margins": -0.00889271218329668, "rewards/rejected": -5.600452277576551e-05, "step": 480 }, { "debug/policy_chosen_logits": 1.5653622150421143, "debug/policy_chosen_logps": -40.81863021850586, "debug/policy_rejected_logits": 1.8120641708374023, "debug/policy_rejected_logps": -4.929619789123535, "debug/reference_chosen_logps": -40.81630325317383, "debug/reference_rejected_logps": -4.973757266998291, "epoch": 1.7572463768115942, "grad_norm": 32.194610089800186, "learning_rate": 8.173624288425048e-08, "logits/chosen": 1.5653622150421143, "logits/rejected": 1.8120641708374023, "logps/chosen": -40.81863021850586, "logps/rejected": -4.929619789123535, "loss": 0.683, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.0011654272675514221, "rewards/margins": -0.023234261199831963, "rewards/rejected": 0.02206883393228054, "step": 485 }, { "debug/policy_chosen_logits": 1.5561188459396362, "debug/policy_chosen_logps": -6.576440334320068, "debug/policy_rejected_logits": 1.847739577293396, "debug/policy_rejected_logps": -5.0680413246154785, "debug/reference_chosen_logps": -6.592990875244141, "debug/reference_rejected_logps": -5.0225749015808105, "epoch": 1.7753623188405796, "grad_norm": 35.345947583637496, "learning_rate": 8.149905123339657e-08, "logits/chosen": 1.5561188459396362, "logits/rejected": 1.847739577293396, "logps/chosen": -6.576440334320068, "logps/rejected": -5.0680413246154785, "loss": 0.6844, "rewards/accuracies": 0.375, "rewards/chosen": 0.008275196887552738, "rewards/margins": 0.031008299440145493, "rewards/rejected": -0.02273310348391533, "step": 490 }, { "debug/policy_chosen_logits": 1.288992166519165, "debug/policy_chosen_logps": -23.104061126708984, "debug/policy_rejected_logits": 1.577358365058899, "debug/policy_rejected_logps": -66.31800842285156, "debug/reference_chosen_logps": -23.14602279663086, "debug/reference_rejected_logps": -66.37723541259766, "epoch": 1.7934782608695652, "grad_norm": 64.71683306502125, "learning_rate": 8.12618595825427e-08, "logits/chosen": 1.288992166519165, "logits/rejected": 1.577358365058899, "logps/chosen": -23.104061126708984, "logps/rejected": -66.31800842285156, "loss": 0.6989, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.020980743691325188, "rewards/margins": -0.008632917888462543, "rewards/rejected": 0.029613662511110306, "step": 495 }, { "debug/policy_chosen_logits": 1.3208658695220947, "debug/policy_chosen_logps": -102.35582733154297, "debug/policy_rejected_logits": 1.5698192119598389, "debug/policy_rejected_logps": -53.2846794128418, "debug/reference_chosen_logps": -102.49649810791016, "debug/reference_rejected_logps": -53.1512451171875, "epoch": 1.8115942028985508, "grad_norm": 307.4823378624926, "learning_rate": 8.10246679316888e-08, "logits/chosen": 1.3208658695220947, "logits/rejected": 1.5698192119598389, "logps/chosen": -102.35582733154297, "logps/rejected": -53.2846794128418, "loss": 0.693, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.07032980769872665, "rewards/margins": 0.1370486319065094, "rewards/rejected": -0.06671881675720215, "step": 500 }, { "epoch": 1.8115942028985508, "eval_debug/policy_chosen_logits": 1.6430633068084717, "eval_debug/policy_chosen_logps": -122.76631927490234, "eval_debug/policy_rejected_logits": 1.7034589052200317, "eval_debug/policy_rejected_logps": -63.999725341796875, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6430633068084717, "eval_logits/rejected": 1.7034589052200317, "eval_logps/chosen": -122.76631927490234, "eval_logps/rejected": -63.999725341796875, "eval_loss": 0.6927152872085571, "eval_rewards/accuracies": 0.31578946113586426, "eval_rewards/chosen": 0.19087384641170502, "eval_rewards/margins": 0.24721020460128784, "eval_rewards/rejected": -0.05633634701371193, "eval_runtime": 28.7683, "eval_samples_per_second": 20.856, "eval_steps_per_second": 0.66, "step": 500 }, { "debug/policy_chosen_logits": 1.3585078716278076, "debug/policy_chosen_logps": -7.259619235992432, "debug/policy_rejected_logits": 1.4920176267623901, "debug/policy_rejected_logps": -124.12772369384766, "debug/reference_chosen_logps": -7.294312953948975, "debug/reference_rejected_logps": -122.4188232421875, "epoch": 1.8297101449275361, "grad_norm": 32.571297547370094, "learning_rate": 8.078747628083491e-08, "logits/chosen": 1.3585078716278076, "logits/rejected": 1.4920176267623901, "logps/chosen": -7.259619235992432, "logps/rejected": -124.12772369384766, "loss": 0.7036, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.01734679564833641, "rewards/margins": 0.8717970848083496, "rewards/rejected": -0.8544502258300781, "step": 505 }, { "debug/policy_chosen_logits": 1.4707800149917603, "debug/policy_chosen_logps": -102.15882873535156, "debug/policy_rejected_logits": 1.6607221364974976, "debug/policy_rejected_logps": -11.263330459594727, "debug/reference_chosen_logps": -102.32771301269531, "debug/reference_rejected_logps": -11.211788177490234, "epoch": 1.8478260869565217, "grad_norm": 39.58651719451268, "learning_rate": 8.055028462998102e-08, "logits/chosen": 1.4707800149917603, "logits/rejected": 1.6607221364974976, "logps/chosen": -102.15882873535156, "logps/rejected": -11.263330459594727, "loss": 0.8752, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.08444677293300629, "rewards/margins": 0.11021725088357925, "rewards/rejected": -0.025770481675863266, "step": 510 }, { "debug/policy_chosen_logits": 1.5113954544067383, "debug/policy_chosen_logps": -25.85674476623535, "debug/policy_rejected_logits": 1.6705751419067383, "debug/policy_rejected_logps": -28.088943481445312, "debug/reference_chosen_logps": -25.6488094329834, "debug/reference_rejected_logps": -27.818252563476562, "epoch": 1.8659420289855073, "grad_norm": 73.34944417322548, "learning_rate": 8.031309297912713e-08, "logits/chosen": 1.5113954544067383, "logits/rejected": 1.6705751419067383, "logps/chosen": -25.85674476623535, "logps/rejected": -28.088943481445312, "loss": 0.7965, "rewards/accuracies": 0.25, "rewards/chosen": -0.10396593809127808, "rewards/margins": 0.03138067200779915, "rewards/rejected": -0.13534662127494812, "step": 515 }, { "debug/policy_chosen_logits": 1.6345298290252686, "debug/policy_chosen_logps": -235.7809600830078, "debug/policy_rejected_logits": 1.7204082012176514, "debug/policy_rejected_logps": -212.71463012695312, "debug/reference_chosen_logps": -234.6603546142578, "debug/reference_rejected_logps": -211.98037719726562, "epoch": 1.8840579710144927, "grad_norm": 75.18454684608517, "learning_rate": 8.007590132827324e-08, "logits/chosen": 1.6345298290252686, "logits/rejected": 1.7204082012176514, "logps/chosen": -235.7809600830078, "logps/rejected": -212.71463012695312, "loss": 0.8068, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.560310959815979, "rewards/margins": -0.19318583607673645, "rewards/rejected": -0.36712518334388733, "step": 520 }, { "debug/policy_chosen_logits": 1.3294028043746948, "debug/policy_chosen_logps": -49.67485809326172, "debug/policy_rejected_logits": 1.7704395055770874, "debug/policy_rejected_logps": -45.25579833984375, "debug/reference_chosen_logps": -49.641510009765625, "debug/reference_rejected_logps": -45.34258270263672, "epoch": 1.9021739130434783, "grad_norm": 84.5021189180937, "learning_rate": 7.983870967741935e-08, "logits/chosen": 1.3294028043746948, "logits/rejected": 1.7704395055770874, "logps/chosen": -49.67485809326172, "logps/rejected": -45.25579833984375, "loss": 0.6898, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.016674524173140526, "rewards/margins": -0.060069240629673004, "rewards/rejected": 0.04339471831917763, "step": 525 }, { "debug/policy_chosen_logits": 1.281179666519165, "debug/policy_chosen_logps": -305.73809814453125, "debug/policy_rejected_logits": 1.3902794122695923, "debug/policy_rejected_logps": -84.81549835205078, "debug/reference_chosen_logps": -306.0018005371094, "debug/reference_rejected_logps": -85.27592468261719, "epoch": 1.9202898550724639, "grad_norm": 175.45007444244777, "learning_rate": 7.960151802656547e-08, "logits/chosen": 1.281179666519165, "logits/rejected": 1.3902794122695923, "logps/chosen": -305.73809814453125, "logps/rejected": -84.81549835205078, "loss": 0.7057, "rewards/accuracies": 0.375, "rewards/chosen": 0.13186052441596985, "rewards/margins": -0.09835465997457504, "rewards/rejected": 0.2302151620388031, "step": 530 }, { "debug/policy_chosen_logits": 1.5570428371429443, "debug/policy_chosen_logps": -86.09352111816406, "debug/policy_rejected_logits": 1.8406089544296265, "debug/policy_rejected_logps": -4.976877689361572, "debug/reference_chosen_logps": -86.55270385742188, "debug/reference_rejected_logps": -4.980034828186035, "epoch": 1.9384057971014492, "grad_norm": 495.39745079780516, "learning_rate": 7.936432637571158e-08, "logits/chosen": 1.5570428371429443, "logits/rejected": 1.8406089544296265, "logps/chosen": -86.09352111816406, "logps/rejected": -4.976877689361572, "loss": 0.6949, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.22958973050117493, "rewards/margins": 0.22801141440868378, "rewards/rejected": 0.001578301191329956, "step": 535 }, { "debug/policy_chosen_logits": 1.1591947078704834, "debug/policy_chosen_logps": -6.6574835777282715, "debug/policy_rejected_logits": 1.4891539812088013, "debug/policy_rejected_logps": -13.411718368530273, "debug/reference_chosen_logps": -6.643076419830322, "debug/reference_rejected_logps": -13.368135452270508, "epoch": 1.9565217391304348, "grad_norm": 198.4671088547006, "learning_rate": 7.912713472485768e-08, "logits/chosen": 1.1591947078704834, "logits/rejected": 1.4891539812088013, "logps/chosen": -6.6574835777282715, "logps/rejected": -13.411718368530273, "loss": 0.6868, "rewards/accuracies": 0.375, "rewards/chosen": -0.007203725166618824, "rewards/margins": 0.014587935991585255, "rewards/rejected": -0.02179166115820408, "step": 540 }, { "debug/policy_chosen_logits": 1.2819187641143799, "debug/policy_chosen_logps": -7.776444435119629, "debug/policy_rejected_logits": 1.532454490661621, "debug/policy_rejected_logps": -5.6756720542907715, "debug/reference_chosen_logps": -7.7561540603637695, "debug/reference_rejected_logps": -5.6011643409729, "epoch": 1.9746376811594204, "grad_norm": 263.1485829204638, "learning_rate": 7.888994307400379e-08, "logits/chosen": 1.2819187641143799, "logits/rejected": 1.532454490661621, "logps/chosen": -7.776444435119629, "logps/rejected": -5.6756720542907715, "loss": 0.7006, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.010145354084670544, "rewards/margins": 0.027108699083328247, "rewards/rejected": -0.037254054099321365, "step": 545 }, { "debug/policy_chosen_logits": 1.057496190071106, "debug/policy_chosen_logps": -108.1541519165039, "debug/policy_rejected_logits": 1.136480689048767, "debug/policy_rejected_logps": -48.04246139526367, "debug/reference_chosen_logps": -108.74848937988281, "debug/reference_rejected_logps": -48.089027404785156, "epoch": 1.9927536231884058, "grad_norm": 86.36595815385945, "learning_rate": 7.86527514231499e-08, "logits/chosen": 1.057496190071106, "logits/rejected": 1.136480689048767, "logps/chosen": -108.1541519165039, "logps/rejected": -48.04246139526367, "loss": 0.6911, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.29717230796813965, "rewards/margins": 0.27388888597488403, "rewards/rejected": 0.02328340709209442, "step": 550 }, { "debug/policy_chosen_logits": 1.115027666091919, "debug/policy_chosen_logps": -183.26589965820312, "debug/policy_rejected_logits": 1.7919365167617798, "debug/policy_rejected_logps": -3.0790908336639404, "debug/reference_chosen_logps": -184.52658081054688, "debug/reference_rejected_logps": -3.0741100311279297, "epoch": 2.010869565217391, "grad_norm": 63.85933046142476, "learning_rate": 7.841555977229601e-08, "logits/chosen": 1.115027666091919, "logits/rejected": 1.7919365167617798, "logps/chosen": -183.26589965820312, "logps/rejected": -3.0790908336639404, "loss": 0.6736, "rewards/accuracies": 0.375, "rewards/chosen": 0.6303540468215942, "rewards/margins": 0.6328445672988892, "rewards/rejected": -0.002490498125553131, "step": 555 }, { "debug/policy_chosen_logits": 1.646459937095642, "debug/policy_chosen_logps": -182.5862579345703, "debug/policy_rejected_logits": 1.895364761352539, "debug/policy_rejected_logps": -6.778590202331543, "debug/reference_chosen_logps": -182.94830322265625, "debug/reference_rejected_logps": -6.724902153015137, "epoch": 2.028985507246377, "grad_norm": 40.14666154579184, "learning_rate": 7.817836812144212e-08, "logits/chosen": 1.646459937095642, "logits/rejected": 1.895364761352539, "logps/chosen": -182.5862579345703, "logps/rejected": -6.778590202331543, "loss": 0.68, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.1810242235660553, "rewards/margins": 0.20786818861961365, "rewards/rejected": -0.026843953877687454, "step": 560 }, { "debug/policy_chosen_logits": 1.4563318490982056, "debug/policy_chosen_logps": -6.22916316986084, "debug/policy_rejected_logits": 1.7887405157089233, "debug/policy_rejected_logps": -65.63624572753906, "debug/reference_chosen_logps": -6.2621893882751465, "debug/reference_rejected_logps": -65.73162078857422, "epoch": 2.0471014492753623, "grad_norm": 82.41124184394604, "learning_rate": 7.794117647058824e-08, "logits/chosen": 1.4563318490982056, "logits/rejected": 1.7887405157089233, "logps/chosen": -6.22916316986084, "logps/rejected": -65.63624572753906, "loss": 0.6707, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.01651296205818653, "rewards/margins": -0.03118007816374302, "rewards/rejected": 0.04769303649663925, "step": 565 }, { "debug/policy_chosen_logits": 1.4452693462371826, "debug/policy_chosen_logps": -6.1301751136779785, "debug/policy_rejected_logits": 1.7755908966064453, "debug/policy_rejected_logps": -13.246371269226074, "debug/reference_chosen_logps": -6.186774253845215, "debug/reference_rejected_logps": -13.234382629394531, "epoch": 2.0652173913043477, "grad_norm": 140.4168701028282, "learning_rate": 7.770398481973435e-08, "logits/chosen": 1.4452693462371826, "logits/rejected": 1.7755908966064453, "logps/chosen": -6.1301751136779785, "logps/rejected": -13.246371269226074, "loss": 0.6668, "rewards/accuracies": 0.375, "rewards/chosen": 0.028299903497099876, "rewards/margins": 0.03429514914751053, "rewards/rejected": -0.005995243787765503, "step": 570 }, { "debug/policy_chosen_logits": 1.3798478841781616, "debug/policy_chosen_logps": -6.857008457183838, "debug/policy_rejected_logits": 1.69475519657135, "debug/policy_rejected_logps": -5.494004249572754, "debug/reference_chosen_logps": -6.833249092102051, "debug/reference_rejected_logps": -5.51195764541626, "epoch": 2.0833333333333335, "grad_norm": 25.02189718675772, "learning_rate": 7.746679316888045e-08, "logits/chosen": 1.3798478841781616, "logits/rejected": 1.69475519657135, "logps/chosen": -6.857008457183838, "logps/rejected": -5.494004249572754, "loss": 0.6829, "rewards/accuracies": 0.25, "rewards/chosen": -0.011879793368279934, "rewards/margins": -0.02085670456290245, "rewards/rejected": 0.008976912125945091, "step": 575 }, { "debug/policy_chosen_logits": 0.9362372159957886, "debug/policy_chosen_logps": -8.133824348449707, "debug/policy_rejected_logits": 1.341139554977417, "debug/policy_rejected_logps": -3.681180238723755, "debug/reference_chosen_logps": -8.1328763961792, "debug/reference_rejected_logps": -3.7049155235290527, "epoch": 2.101449275362319, "grad_norm": 39.225377320762426, "learning_rate": 7.722960151802656e-08, "logits/chosen": 0.9362372159957886, "logits/rejected": 1.341139554977417, "logps/chosen": -8.133824348449707, "logps/rejected": -3.681180238723755, "loss": 0.6871, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.0004737779381684959, "rewards/margins": -0.012341367080807686, "rewards/rejected": 0.011867588385939598, "step": 580 }, { "debug/policy_chosen_logits": 1.500811219215393, "debug/policy_chosen_logps": -24.10525131225586, "debug/policy_rejected_logits": 1.750267744064331, "debug/policy_rejected_logps": -20.673208236694336, "debug/reference_chosen_logps": -24.254491806030273, "debug/reference_rejected_logps": -20.668045043945312, "epoch": 2.119565217391304, "grad_norm": 214.29806101630356, "learning_rate": 7.699240986717267e-08, "logits/chosen": 1.500811219215393, "logits/rejected": 1.750267744064331, "logps/chosen": -24.10525131225586, "logps/rejected": -20.673208236694336, "loss": 0.6841, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.07462036609649658, "rewards/margins": 0.07720223814249039, "rewards/rejected": -0.002581870649009943, "step": 585 }, { "debug/policy_chosen_logits": 1.4485924243927002, "debug/policy_chosen_logps": -152.88980102539062, "debug/policy_rejected_logits": 1.827588438987732, "debug/policy_rejected_logps": -121.49317932128906, "debug/reference_chosen_logps": -153.42547607421875, "debug/reference_rejected_logps": -121.61518859863281, "epoch": 2.13768115942029, "grad_norm": 386.35883285487347, "learning_rate": 7.675521821631878e-08, "logits/chosen": 1.4485924243927002, "logits/rejected": 1.827588438987732, "logps/chosen": -152.88980102539062, "logps/rejected": -121.49317932128906, "loss": 0.6839, "rewards/accuracies": 0.375, "rewards/chosen": 0.2678472101688385, "rewards/margins": 0.20683827996253967, "rewards/rejected": 0.061008960008621216, "step": 590 }, { "debug/policy_chosen_logits": 1.124606728553772, "debug/policy_chosen_logps": -3.096755266189575, "debug/policy_rejected_logits": 1.4715321063995361, "debug/policy_rejected_logps": -39.47820281982422, "debug/reference_chosen_logps": -3.100511074066162, "debug/reference_rejected_logps": -39.13794708251953, "epoch": 2.1557971014492754, "grad_norm": 44.56984436252543, "learning_rate": 7.651802656546489e-08, "logits/chosen": 1.124606728553772, "logits/rejected": 1.4715321063995361, "logps/chosen": -3.096755266189575, "logps/rejected": -39.47820281982422, "loss": 0.664, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": 0.0018780737882480025, "rewards/margins": 0.17200471460819244, "rewards/rejected": -0.17012664675712585, "step": 595 }, { "debug/policy_chosen_logits": 1.2990307807922363, "debug/policy_chosen_logps": -89.97127532958984, "debug/policy_rejected_logits": 1.5649951696395874, "debug/policy_rejected_logps": -8.741037368774414, "debug/reference_chosen_logps": -90.09474182128906, "debug/reference_rejected_logps": -8.721701622009277, "epoch": 2.1739130434782608, "grad_norm": 35.873624799602794, "learning_rate": 7.628083491461101e-08, "logits/chosen": 1.2990307807922363, "logits/rejected": 1.5649951696395874, "logps/chosen": -89.97127532958984, "logps/rejected": -8.741037368774414, "loss": 0.6683, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.061729829758405685, "rewards/margins": 0.07139773666858673, "rewards/rejected": -0.009667905978858471, "step": 600 }, { "epoch": 2.1739130434782608, "eval_debug/policy_chosen_logits": 1.644214391708374, "eval_debug/policy_chosen_logps": -122.55877685546875, "eval_debug/policy_rejected_logits": 1.7044687271118164, "eval_debug/policy_rejected_logps": -63.846473693847656, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.644214391708374, "eval_logits/rejected": 1.7044687271118164, "eval_logps/chosen": -122.55877685546875, "eval_logps/rejected": -63.846473693847656, "eval_loss": 0.6754893064498901, "eval_rewards/accuracies": 0.34210526943206787, "eval_rewards/chosen": 0.29464319348335266, "eval_rewards/margins": 0.27435302734375, "eval_rewards/rejected": 0.020290151238441467, "eval_runtime": 28.4828, "eval_samples_per_second": 21.065, "eval_steps_per_second": 0.667, "step": 600 }, { "debug/policy_chosen_logits": 1.3662563562393188, "debug/policy_chosen_logps": -102.12466430664062, "debug/policy_rejected_logits": 1.9596894979476929, "debug/policy_rejected_logps": -195.77943420410156, "debug/reference_chosen_logps": -102.28282928466797, "debug/reference_rejected_logps": -195.50665283203125, "epoch": 2.1920289855072466, "grad_norm": 201.68860680196659, "learning_rate": 7.60436432637571e-08, "logits/chosen": 1.3662563562393188, "logits/rejected": 1.9596894979476929, "logps/chosen": -102.12466430664062, "logps/rejected": -195.77943420410156, "loss": 0.7024, "rewards/accuracies": 0.25, "rewards/chosen": 0.07908296585083008, "rewards/margins": 0.2154727429151535, "rewards/rejected": -0.13638977706432343, "step": 605 }, { "debug/policy_chosen_logits": 1.356082558631897, "debug/policy_chosen_logps": -4.553298473358154, "debug/policy_rejected_logits": 1.4579359292984009, "debug/policy_rejected_logps": -7.470458984375, "debug/reference_chosen_logps": -4.522200584411621, "debug/reference_rejected_logps": -7.417989253997803, "epoch": 2.210144927536232, "grad_norm": 49.81543420435375, "learning_rate": 7.580645161290323e-08, "logits/chosen": 1.356082558631897, "logits/rejected": 1.4579359292984009, "logps/chosen": -4.553298473358154, "logps/rejected": -7.470458984375, "loss": 0.679, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.015549642033874989, "rewards/margins": 0.01068491768091917, "rewards/rejected": -0.026234561577439308, "step": 610 }, { "debug/policy_chosen_logits": 1.3849835395812988, "debug/policy_chosen_logps": -94.78125762939453, "debug/policy_rejected_logits": 1.5779485702514648, "debug/policy_rejected_logps": -141.0235595703125, "debug/reference_chosen_logps": -94.77066802978516, "debug/reference_rejected_logps": -140.92489624023438, "epoch": 2.2282608695652173, "grad_norm": 45.39418811155164, "learning_rate": 7.556925996204933e-08, "logits/chosen": 1.3849835395812988, "logits/rejected": 1.5779485702514648, "logps/chosen": -94.78125762939453, "logps/rejected": -141.0235595703125, "loss": 0.6903, "rewards/accuracies": 0.375, "rewards/chosen": -0.0052950503304600716, "rewards/margins": 0.04403238743543625, "rewards/rejected": -0.049327440559864044, "step": 615 }, { "debug/policy_chosen_logits": 1.0549323558807373, "debug/policy_chosen_logps": -5.365431308746338, "debug/policy_rejected_logits": 1.5493519306182861, "debug/policy_rejected_logps": -10.255617141723633, "debug/reference_chosen_logps": -5.394388675689697, "debug/reference_rejected_logps": -10.254476547241211, "epoch": 2.246376811594203, "grad_norm": 41.36942695995215, "learning_rate": 7.533206831119544e-08, "logits/chosen": 1.0549323558807373, "logits/rejected": 1.5493519306182861, "logps/chosen": -5.365431308746338, "logps/rejected": -10.255617141723633, "loss": 0.6817, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.014478540048003197, "rewards/margins": 0.015048688277602196, "rewards/rejected": -0.000570148229598999, "step": 620 }, { "debug/policy_chosen_logits": 1.2149797677993774, "debug/policy_chosen_logps": -68.41226959228516, "debug/policy_rejected_logits": 1.3799153566360474, "debug/policy_rejected_logps": -122.28169250488281, "debug/reference_chosen_logps": -68.59146118164062, "debug/reference_rejected_logps": -121.55644226074219, "epoch": 2.2644927536231885, "grad_norm": 65.37527575329054, "learning_rate": 7.509487666034155e-08, "logits/chosen": 1.2149797677993774, "logits/rejected": 1.3799153566360474, "logps/chosen": -68.41226959228516, "logps/rejected": -122.28169250488281, "loss": 0.6709, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.08960092067718506, "rewards/margins": 0.45222634077072144, "rewards/rejected": -0.3626253604888916, "step": 625 }, { "debug/policy_chosen_logits": 1.4795180559158325, "debug/policy_chosen_logps": -95.09278106689453, "debug/policy_rejected_logits": 1.6099354028701782, "debug/policy_rejected_logps": -8.328186988830566, "debug/reference_chosen_logps": -95.49501037597656, "debug/reference_rejected_logps": -8.245024681091309, "epoch": 2.282608695652174, "grad_norm": 50.42058389866633, "learning_rate": 7.485768500948766e-08, "logits/chosen": 1.4795180559158325, "logits/rejected": 1.6099354028701782, "logps/chosen": -95.09278106689453, "logps/rejected": -8.328186988830566, "loss": 0.6583, "rewards/accuracies": 0.375, "rewards/chosen": 0.20111560821533203, "rewards/margins": 0.242696613073349, "rewards/rejected": -0.041580989956855774, "step": 630 }, { "debug/policy_chosen_logits": 1.329341173171997, "debug/policy_chosen_logps": -90.06200408935547, "debug/policy_rejected_logits": 1.570401906967163, "debug/policy_rejected_logps": -11.742565155029297, "debug/reference_chosen_logps": -90.65675354003906, "debug/reference_rejected_logps": -11.677980422973633, "epoch": 2.300724637681159, "grad_norm": 43.61572952416094, "learning_rate": 7.462049335863377e-08, "logits/chosen": 1.329341173171997, "logits/rejected": 1.570401906967163, "logps/chosen": -90.06200408935547, "logps/rejected": -11.742565155029297, "loss": 0.6834, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.29737308621406555, "rewards/margins": 0.32966578006744385, "rewards/rejected": -0.03229271247982979, "step": 635 }, { "debug/policy_chosen_logits": 1.1645739078521729, "debug/policy_chosen_logps": -115.27735900878906, "debug/policy_rejected_logits": 1.530078649520874, "debug/policy_rejected_logps": -7.676455020904541, "debug/reference_chosen_logps": -115.78373718261719, "debug/reference_rejected_logps": -7.740739345550537, "epoch": 2.318840579710145, "grad_norm": 45.72636697239578, "learning_rate": 7.438330170777988e-08, "logits/chosen": 1.1645739078521729, "logits/rejected": 1.530078649520874, "logps/chosen": -115.27735900878906, "logps/rejected": -7.676455020904541, "loss": 0.6806, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": 0.2531926929950714, "rewards/margins": 0.22105033695697784, "rewards/rejected": 0.03214232623577118, "step": 640 }, { "debug/policy_chosen_logits": 1.3330034017562866, "debug/policy_chosen_logps": -36.74895477294922, "debug/policy_rejected_logits": 1.2860333919525146, "debug/policy_rejected_logps": -4.696889400482178, "debug/reference_chosen_logps": -36.7678108215332, "debug/reference_rejected_logps": -4.662884712219238, "epoch": 2.3369565217391304, "grad_norm": 36.238225612970574, "learning_rate": 7.4146110056926e-08, "logits/chosen": 1.3330034017562866, "logits/rejected": 1.2860333919525146, "logps/chosen": -36.74895477294922, "logps/rejected": -4.696889400482178, "loss": 0.6826, "rewards/accuracies": 0.25, "rewards/chosen": 0.009427306242287159, "rewards/margins": 0.02643001638352871, "rewards/rejected": -0.017002711072564125, "step": 645 }, { "debug/policy_chosen_logits": 1.5750983953475952, "debug/policy_chosen_logps": -173.6097869873047, "debug/policy_rejected_logits": 1.6201187372207642, "debug/policy_rejected_logps": -7.340598106384277, "debug/reference_chosen_logps": -173.88534545898438, "debug/reference_rejected_logps": -7.317915916442871, "epoch": 2.355072463768116, "grad_norm": 57.201455613129845, "learning_rate": 7.39089184060721e-08, "logits/chosen": 1.5750983953475952, "logits/rejected": 1.6201187372207642, "logps/chosen": -173.6097869873047, "logps/rejected": -7.340598106384277, "loss": 0.6708, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.13778185844421387, "rewards/margins": 0.14912331104278564, "rewards/rejected": -0.011341440491378307, "step": 650 }, { "debug/policy_chosen_logits": 1.4286606311798096, "debug/policy_chosen_logps": -72.2771987915039, "debug/policy_rejected_logits": 1.7335174083709717, "debug/policy_rejected_logps": -5.899569511413574, "debug/reference_chosen_logps": -72.91654205322266, "debug/reference_rejected_logps": -5.8764119148254395, "epoch": 2.3731884057971016, "grad_norm": 47.877666221545006, "learning_rate": 7.367172675521821e-08, "logits/chosen": 1.4286606311798096, "logits/rejected": 1.7335174083709717, "logps/chosen": -72.2771987915039, "logps/rejected": -5.899569511413574, "loss": 0.6727, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.31967201828956604, "rewards/margins": 0.33125072717666626, "rewards/rejected": -0.011578726582229137, "step": 655 }, { "debug/policy_chosen_logits": 1.1493260860443115, "debug/policy_chosen_logps": -157.1625518798828, "debug/policy_rejected_logits": 1.4012162685394287, "debug/policy_rejected_logps": -3.772329330444336, "debug/reference_chosen_logps": -157.91355895996094, "debug/reference_rejected_logps": -3.7736668586730957, "epoch": 2.391304347826087, "grad_norm": 122.31361658456863, "learning_rate": 7.343453510436432e-08, "logits/chosen": 1.1493260860443115, "logits/rejected": 1.4012162685394287, "logps/chosen": -157.1625518798828, "logps/rejected": -3.772329330444336, "loss": 0.6944, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.3755073845386505, "rewards/margins": 0.37483853101730347, "rewards/rejected": 0.0006688535213470459, "step": 660 }, { "debug/policy_chosen_logits": 1.3708937168121338, "debug/policy_chosen_logps": -218.97811889648438, "debug/policy_rejected_logits": 1.7635114192962646, "debug/policy_rejected_logps": -50.349647521972656, "debug/reference_chosen_logps": -219.96499633789062, "debug/reference_rejected_logps": -50.28038787841797, "epoch": 2.4094202898550723, "grad_norm": 27.806124818037347, "learning_rate": 7.319734345351043e-08, "logits/chosen": 1.3708937168121338, "logits/rejected": 1.7635114192962646, "logps/chosen": -218.97811889648438, "logps/rejected": -50.349647521972656, "loss": 0.6884, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.4934414327144623, "rewards/margins": 0.5280711650848389, "rewards/rejected": -0.03462976589798927, "step": 665 }, { "debug/policy_chosen_logits": 1.7052303552627563, "debug/policy_chosen_logps": -100.59894561767578, "debug/policy_rejected_logits": 1.9349712133407593, "debug/policy_rejected_logps": -2.9665045738220215, "debug/reference_chosen_logps": -100.52616882324219, "debug/reference_rejected_logps": -2.9794654846191406, "epoch": 2.427536231884058, "grad_norm": 20.447905844568204, "learning_rate": 7.296015180265654e-08, "logits/chosen": 1.7052303552627563, "logits/rejected": 1.9349712133407593, "logps/chosen": -100.59894561767578, "logps/rejected": -2.9665045738220215, "loss": 0.6877, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.03639256954193115, "rewards/margins": -0.04287288337945938, "rewards/rejected": 0.006480316631495953, "step": 670 }, { "debug/policy_chosen_logits": 1.3092758655548096, "debug/policy_chosen_logps": -19.385467529296875, "debug/policy_rejected_logits": 1.4740664958953857, "debug/policy_rejected_logps": -7.161191463470459, "debug/reference_chosen_logps": -19.44676971435547, "debug/reference_rejected_logps": -7.09658145904541, "epoch": 2.4456521739130435, "grad_norm": 31.129548945960707, "learning_rate": 7.272296015180265e-08, "logits/chosen": 1.3092758655548096, "logits/rejected": 1.4740664958953857, "logps/chosen": -19.385467529296875, "logps/rejected": -7.161191463470459, "loss": 0.6774, "rewards/accuracies": 0.375, "rewards/chosen": 0.030650924891233444, "rewards/margins": 0.0629563108086586, "rewards/rejected": -0.03230538219213486, "step": 675 }, { "debug/policy_chosen_logits": 1.6983013153076172, "debug/policy_chosen_logps": -8.618484497070312, "debug/policy_rejected_logits": 1.692448616027832, "debug/policy_rejected_logps": -6.639832973480225, "debug/reference_chosen_logps": -8.653813362121582, "debug/reference_rejected_logps": -6.547207832336426, "epoch": 2.463768115942029, "grad_norm": 34.88739012034015, "learning_rate": 7.248576850094877e-08, "logits/chosen": 1.6983013153076172, "logits/rejected": 1.692448616027832, "logps/chosen": -8.618484497070312, "logps/rejected": -6.639832973480225, "loss": 0.6889, "rewards/accuracies": 0.375, "rewards/chosen": 0.017664363607764244, "rewards/margins": 0.06397727876901627, "rewards/rejected": -0.04631291329860687, "step": 680 }, { "debug/policy_chosen_logits": 1.3575327396392822, "debug/policy_chosen_logps": -8.445387840270996, "debug/policy_rejected_logits": 1.6548616886138916, "debug/policy_rejected_logps": -8.764449119567871, "debug/reference_chosen_logps": -8.536941528320312, "debug/reference_rejected_logps": -8.682905197143555, "epoch": 2.4818840579710146, "grad_norm": 50.10009031553424, "learning_rate": 7.224857685009488e-08, "logits/chosen": 1.3575327396392822, "logits/rejected": 1.6548616886138916, "logps/chosen": -8.445387840270996, "logps/rejected": -8.764449119567871, "loss": 0.6739, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.04577675089240074, "rewards/margins": 0.08654932677745819, "rewards/rejected": -0.04077257961034775, "step": 685 }, { "debug/policy_chosen_logits": 1.2889703512191772, "debug/policy_chosen_logps": -87.86897277832031, "debug/policy_rejected_logits": 1.5911585092544556, "debug/policy_rejected_logps": -2.9361891746520996, "debug/reference_chosen_logps": -88.01163482666016, "debug/reference_rejected_logps": -2.9231133460998535, "epoch": 2.5, "grad_norm": 35.68134163315367, "learning_rate": 7.201138519924098e-08, "logits/chosen": 1.2889703512191772, "logits/rejected": 1.5911585092544556, "logps/chosen": -87.86897277832031, "logps/rejected": -2.9361891746520996, "loss": 0.6706, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.07133243978023529, "rewards/margins": 0.07787024229764938, "rewards/rejected": -0.006537807174026966, "step": 690 }, { "debug/policy_chosen_logits": 1.3106393814086914, "debug/policy_chosen_logps": -40.185882568359375, "debug/policy_rejected_logits": 1.425025224685669, "debug/policy_rejected_logps": -5.529564380645752, "debug/reference_chosen_logps": -40.38248062133789, "debug/reference_rejected_logps": -5.549628257751465, "epoch": 2.5181159420289854, "grad_norm": 29.589086362649926, "learning_rate": 7.177419354838709e-08, "logits/chosen": 1.3106393814086914, "logits/rejected": 1.425025224685669, "logps/chosen": -40.185882568359375, "logps/rejected": -5.529564380645752, "loss": 0.6803, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.09830110520124435, "rewards/margins": 0.08826910704374313, "rewards/rejected": 0.010031996294856071, "step": 695 }, { "debug/policy_chosen_logits": 1.4557925462722778, "debug/policy_chosen_logps": -6.179305076599121, "debug/policy_rejected_logits": 1.7278168201446533, "debug/policy_rejected_logps": -296.1449890136719, "debug/reference_chosen_logps": -6.198336601257324, "debug/reference_rejected_logps": -296.1982116699219, "epoch": 2.536231884057971, "grad_norm": 454.9316086536511, "learning_rate": 7.15370018975332e-08, "logits/chosen": 1.4557925462722778, "logits/rejected": 1.7278168201446533, "logps/chosen": -6.179305076599121, "logps/rejected": -296.1449890136719, "loss": 0.7035, "rewards/accuracies": 0.375, "rewards/chosen": 0.009515708312392235, "rewards/margins": -0.017088066786527634, "rewards/rejected": 0.026603782549500465, "step": 700 }, { "epoch": 2.536231884057971, "eval_debug/policy_chosen_logits": 1.644766926765442, "eval_debug/policy_chosen_logps": -122.86726379394531, "eval_debug/policy_rejected_logits": 1.705788016319275, "eval_debug/policy_rejected_logps": -63.944515228271484, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.644766926765442, "eval_logits/rejected": 1.705788016319275, "eval_logps/chosen": -122.86726379394531, "eval_logps/rejected": -63.944515228271484, "eval_loss": 0.6898954510688782, "eval_rewards/accuracies": 0.31578946113586426, "eval_rewards/chosen": 0.14039388298988342, "eval_rewards/margins": 0.16912512481212616, "eval_rewards/rejected": -0.028731241822242737, "eval_runtime": 28.566, "eval_samples_per_second": 21.004, "eval_steps_per_second": 0.665, "step": 700 }, { "debug/policy_chosen_logits": 0.8984088897705078, "debug/policy_chosen_logps": -77.83113098144531, "debug/policy_rejected_logits": 1.2360783815383911, "debug/policy_rejected_logps": -60.0334587097168, "debug/reference_chosen_logps": -65.52136993408203, "debug/reference_rejected_logps": -53.42998504638672, "epoch": 2.5543478260869565, "grad_norm": 364.05960292695886, "learning_rate": 7.129981024667931e-08, "logits/chosen": 0.8984088897705078, "logits/rejected": 1.2360783815383911, "logps/chosen": -77.83113098144531, "logps/rejected": -60.0334587097168, "loss": 3.8204, "rewards/accuracies": 0.25, "rewards/chosen": -6.154881477355957, "rewards/margins": -2.853144407272339, "rewards/rejected": -3.301736831665039, "step": 705 }, { "debug/policy_chosen_logits": 1.1741610765457153, "debug/policy_chosen_logps": -41.56535339355469, "debug/policy_rejected_logits": 1.4587411880493164, "debug/policy_rejected_logps": -61.12058639526367, "debug/reference_chosen_logps": -39.279052734375, "debug/reference_rejected_logps": -59.38611602783203, "epoch": 2.572463768115942, "grad_norm": 185.74132444303763, "learning_rate": 7.106261859582542e-08, "logits/chosen": 1.1741610765457153, "logits/rejected": 1.4587411880493164, "logps/chosen": -41.56535339355469, "logps/rejected": -61.12058639526367, "loss": 2.7867, "rewards/accuracies": 0.375, "rewards/chosen": -1.1431523561477661, "rewards/margins": -0.27591538429260254, "rewards/rejected": -0.8672367334365845, "step": 710 }, { "debug/policy_chosen_logits": 1.0475952625274658, "debug/policy_chosen_logps": -67.15113830566406, "debug/policy_rejected_logits": 1.3525660037994385, "debug/policy_rejected_logps": -99.1638412475586, "debug/reference_chosen_logps": -66.37857818603516, "debug/reference_rejected_logps": -97.83031463623047, "epoch": 2.5905797101449277, "grad_norm": 123.05388352230544, "learning_rate": 7.082542694497154e-08, "logits/chosen": 1.0475952625274658, "logits/rejected": 1.3525660037994385, "logps/chosen": -67.15113830566406, "logps/rejected": -99.1638412475586, "loss": 0.7536, "rewards/accuracies": 0.25, "rewards/chosen": -0.3862820565700531, "rewards/margins": 0.28047889471054077, "rewards/rejected": -0.6667609214782715, "step": 715 }, { "debug/policy_chosen_logits": 1.1804733276367188, "debug/policy_chosen_logps": -192.32504272460938, "debug/policy_rejected_logits": 1.3969064950942993, "debug/policy_rejected_logps": -50.71982955932617, "debug/reference_chosen_logps": -192.1670684814453, "debug/reference_rejected_logps": -50.335174560546875, "epoch": 2.608695652173913, "grad_norm": 50.79946239637441, "learning_rate": 7.058823529411765e-08, "logits/chosen": 1.1804733276367188, "logits/rejected": 1.3969064950942993, "logps/chosen": -192.32504272460938, "logps/rejected": -50.71982955932617, "loss": 0.7901, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.07899363338947296, "rewards/margins": 0.11333222687244415, "rewards/rejected": -0.19232586026191711, "step": 720 }, { "debug/policy_chosen_logits": 1.3812291622161865, "debug/policy_chosen_logps": -3.2952938079833984, "debug/policy_rejected_logits": 2.040213108062744, "debug/policy_rejected_logps": -6.706567287445068, "debug/reference_chosen_logps": -3.298539400100708, "debug/reference_rejected_logps": -6.633472442626953, "epoch": 2.6268115942028984, "grad_norm": 28.92697606996415, "learning_rate": 7.035104364326376e-08, "logits/chosen": 1.3812291622161865, "logits/rejected": 2.040213108062744, "logps/chosen": -3.2952938079833984, "logps/rejected": -6.706567287445068, "loss": 0.6813, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.0016226947773247957, "rewards/margins": 0.03817015141248703, "rewards/rejected": -0.0365474596619606, "step": 725 }, { "debug/policy_chosen_logits": 1.2535831928253174, "debug/policy_chosen_logps": -96.40589141845703, "debug/policy_rejected_logits": 1.5496419668197632, "debug/policy_rejected_logps": -6.55911111831665, "debug/reference_chosen_logps": -96.49998474121094, "debug/reference_rejected_logps": -6.519639492034912, "epoch": 2.644927536231884, "grad_norm": 37.98233788439568, "learning_rate": 7.011385199240986e-08, "logits/chosen": 1.2535831928253174, "logits/rejected": 1.5496419668197632, "logps/chosen": -96.40589141845703, "logps/rejected": -6.55911111831665, "loss": 0.6875, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.04704520106315613, "rewards/margins": 0.06678075343370438, "rewards/rejected": -0.01973554864525795, "step": 730 }, { "debug/policy_chosen_logits": 1.347577691078186, "debug/policy_chosen_logps": -7.822085380554199, "debug/policy_rejected_logits": 1.7887099981307983, "debug/policy_rejected_logps": -8.985967636108398, "debug/reference_chosen_logps": -7.975428581237793, "debug/reference_rejected_logps": -9.003293991088867, "epoch": 2.6630434782608696, "grad_norm": 29.845063662908643, "learning_rate": 6.987666034155597e-08, "logits/chosen": 1.347577691078186, "logits/rejected": 1.7887099981307983, "logps/chosen": -7.822085380554199, "logps/rejected": -8.985967636108398, "loss": 0.6936, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.07667167484760284, "rewards/margins": 0.06800887733697891, "rewards/rejected": 0.008662795647978783, "step": 735 }, { "debug/policy_chosen_logits": 1.4809266328811646, "debug/policy_chosen_logps": -51.48683547973633, "debug/policy_rejected_logits": 1.280070185661316, "debug/policy_rejected_logps": -100.51704406738281, "debug/reference_chosen_logps": -51.798500061035156, "debug/reference_rejected_logps": -100.50444030761719, "epoch": 2.681159420289855, "grad_norm": 49.65768647656362, "learning_rate": 6.963946869070208e-08, "logits/chosen": 1.4809266328811646, "logits/rejected": 1.280070185661316, "logps/chosen": -51.48683547973633, "logps/rejected": -100.51704406738281, "loss": 0.6941, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.1558307558298111, "rewards/margins": 0.16213369369506836, "rewards/rejected": -0.0063029201701283455, "step": 740 }, { "debug/policy_chosen_logits": 1.4046558141708374, "debug/policy_chosen_logps": -6.247913837432861, "debug/policy_rejected_logits": 1.377934455871582, "debug/policy_rejected_logps": -9.825262069702148, "debug/reference_chosen_logps": -6.19936466217041, "debug/reference_rejected_logps": -9.75121021270752, "epoch": 2.699275362318841, "grad_norm": 68.96263731577534, "learning_rate": 6.940227703984819e-08, "logits/chosen": 1.4046558141708374, "logits/rejected": 1.377934455871582, "logps/chosen": -6.247913837432861, "logps/rejected": -9.825262069702148, "loss": 0.6786, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.024274742230772972, "rewards/margins": 0.012750571593642235, "rewards/rejected": -0.03702531382441521, "step": 745 }, { "debug/policy_chosen_logits": 1.218973994255066, "debug/policy_chosen_logps": -131.67039489746094, "debug/policy_rejected_logits": 1.3298184871673584, "debug/policy_rejected_logps": -5.238399505615234, "debug/reference_chosen_logps": -132.68191528320312, "debug/reference_rejected_logps": -5.177205562591553, "epoch": 2.717391304347826, "grad_norm": 218.32632982801243, "learning_rate": 6.916508538899431e-08, "logits/chosen": 1.218973994255066, "logits/rejected": 1.3298184871673584, "logps/chosen": -131.67039489746094, "logps/rejected": -5.238399505615234, "loss": 0.6819, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.5057564973831177, "rewards/margins": 0.5363535284996033, "rewards/rejected": -0.030597001314163208, "step": 750 }, { "debug/policy_chosen_logits": 1.3596988916397095, "debug/policy_chosen_logps": -11.752751350402832, "debug/policy_rejected_logits": 1.469085693359375, "debug/policy_rejected_logps": -6.199671268463135, "debug/reference_chosen_logps": -11.79719352722168, "debug/reference_rejected_logps": -6.178770542144775, "epoch": 2.7355072463768115, "grad_norm": 27.59877331527671, "learning_rate": 6.892789373814042e-08, "logits/chosen": 1.3596988916397095, "logits/rejected": 1.469085693359375, "logps/chosen": -11.752751350402832, "logps/rejected": -6.199671268463135, "loss": 0.6845, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0222210343927145, "rewards/margins": 0.032671328634023666, "rewards/rejected": -0.010450294241309166, "step": 755 }, { "debug/policy_chosen_logits": 1.371497392654419, "debug/policy_chosen_logps": -75.68782043457031, "debug/policy_rejected_logits": 1.8690340518951416, "debug/policy_rejected_logps": -5.5169219970703125, "debug/reference_chosen_logps": -75.92655181884766, "debug/reference_rejected_logps": -5.4222517013549805, "epoch": 2.753623188405797, "grad_norm": 66.14093578354746, "learning_rate": 6.869070208728653e-08, "logits/chosen": 1.371497392654419, "logits/rejected": 1.8690340518951416, "logps/chosen": -75.68782043457031, "logps/rejected": -5.5169219970703125, "loss": 0.6738, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.11936624348163605, "rewards/margins": 0.16670118272304535, "rewards/rejected": -0.0473349466919899, "step": 760 }, { "debug/policy_chosen_logits": 1.4766225814819336, "debug/policy_chosen_logps": -159.9072723388672, "debug/policy_rejected_logits": 1.5807374715805054, "debug/policy_rejected_logps": -104.93505859375, "debug/reference_chosen_logps": -160.2303009033203, "debug/reference_rejected_logps": -105.09793853759766, "epoch": 2.7717391304347827, "grad_norm": 53.70920997735293, "learning_rate": 6.845351043643264e-08, "logits/chosen": 1.4766225814819336, "logits/rejected": 1.5807374715805054, "logps/chosen": -159.9072723388672, "logps/rejected": -104.93505859375, "loss": 0.6922, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.16150960326194763, "rewards/margins": 0.08006690442562103, "rewards/rejected": 0.0814426988363266, "step": 765 }, { "debug/policy_chosen_logits": 1.590087652206421, "debug/policy_chosen_logps": -6.2314581871032715, "debug/policy_rejected_logits": 2.109851360321045, "debug/policy_rejected_logps": -100.70685577392578, "debug/reference_chosen_logps": -6.2502970695495605, "debug/reference_rejected_logps": -100.2561264038086, "epoch": 2.789855072463768, "grad_norm": 38.86144600314118, "learning_rate": 6.821631878557874e-08, "logits/chosen": 1.590087652206421, "logits/rejected": 2.109851360321045, "logps/chosen": -6.2314581871032715, "logps/rejected": -100.70685577392578, "loss": 0.6748, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.009419417008757591, "rewards/margins": 0.23478055000305176, "rewards/rejected": -0.22536113858222961, "step": 770 }, { "debug/policy_chosen_logits": 1.7171459197998047, "debug/policy_chosen_logps": -8.094103813171387, "debug/policy_rejected_logits": 2.0026705265045166, "debug/policy_rejected_logps": -11.521265983581543, "debug/reference_chosen_logps": -8.093632698059082, "debug/reference_rejected_logps": -11.414515495300293, "epoch": 2.807971014492754, "grad_norm": 44.16565292451917, "learning_rate": 6.797912713472485e-08, "logits/chosen": 1.7171459197998047, "logits/rejected": 2.0026705265045166, "logps/chosen": -8.094103813171387, "logps/rejected": -11.521265983581543, "loss": 0.6808, "rewards/accuracies": 0.375, "rewards/chosen": -0.0002360701619181782, "rewards/margins": 0.05313878506422043, "rewards/rejected": -0.05337485671043396, "step": 775 }, { "debug/policy_chosen_logits": 1.2627537250518799, "debug/policy_chosen_logps": -6.5639848709106445, "debug/policy_rejected_logits": 1.7579565048217773, "debug/policy_rejected_logps": -7.902961730957031, "debug/reference_chosen_logps": -6.597743034362793, "debug/reference_rejected_logps": -7.864927768707275, "epoch": 2.8260869565217392, "grad_norm": 75.000309316966, "learning_rate": 6.774193548387096e-08, "logits/chosen": 1.2627537250518799, "logits/rejected": 1.7579565048217773, "logps/chosen": -6.5639848709106445, "logps/rejected": -7.902961730957031, "loss": 0.7307, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.016879335045814514, "rewards/margins": 0.03589591383934021, "rewards/rejected": -0.019016582518815994, "step": 780 }, { "debug/policy_chosen_logits": 1.2688283920288086, "debug/policy_chosen_logps": -163.25997924804688, "debug/policy_rejected_logits": 1.6192766427993774, "debug/policy_rejected_logps": -28.987003326416016, "debug/reference_chosen_logps": -164.0539093017578, "debug/reference_rejected_logps": -28.904144287109375, "epoch": 2.8442028985507246, "grad_norm": 29.33603885802658, "learning_rate": 6.750474383301707e-08, "logits/chosen": 1.2688283920288086, "logits/rejected": 1.6192766427993774, "logps/chosen": -163.25997924804688, "logps/rejected": -28.987003326416016, "loss": 0.6927, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.39695531129837036, "rewards/margins": 0.43838492035865784, "rewards/rejected": -0.04142959788441658, "step": 785 }, { "debug/policy_chosen_logits": 1.4841022491455078, "debug/policy_chosen_logps": -6.473947048187256, "debug/policy_rejected_logits": 1.8745334148406982, "debug/policy_rejected_logps": -8.771561622619629, "debug/reference_chosen_logps": -6.4667768478393555, "debug/reference_rejected_logps": -8.72008991241455, "epoch": 2.86231884057971, "grad_norm": 31.240541834394914, "learning_rate": 6.726755218216319e-08, "logits/chosen": 1.4841022491455078, "logits/rejected": 1.8745334148406982, "logps/chosen": -6.473947048187256, "logps/rejected": -8.771561622619629, "loss": 0.6856, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.003585115075111389, "rewards/margins": 0.022150352597236633, "rewards/rejected": -0.025735467672348022, "step": 790 }, { "debug/policy_chosen_logits": 1.3624491691589355, "debug/policy_chosen_logps": -200.53170776367188, "debug/policy_rejected_logits": 1.612990140914917, "debug/policy_rejected_logps": -7.673493385314941, "debug/reference_chosen_logps": -202.3122100830078, "debug/reference_rejected_logps": -7.594631195068359, "epoch": 2.880434782608696, "grad_norm": 40.789995267796925, "learning_rate": 6.70303605313093e-08, "logits/chosen": 1.3624491691589355, "logits/rejected": 1.612990140914917, "logps/chosen": -200.53170776367188, "logps/rejected": -7.673493385314941, "loss": 0.6763, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.8902603387832642, "rewards/margins": 0.929690957069397, "rewards/rejected": -0.03943060338497162, "step": 795 }, { "debug/policy_chosen_logits": 1.4642894268035889, "debug/policy_chosen_logps": -74.65554809570312, "debug/policy_rejected_logits": 1.6648118495941162, "debug/policy_rejected_logps": -109.84590911865234, "debug/reference_chosen_logps": -75.12210845947266, "debug/reference_rejected_logps": -110.198974609375, "epoch": 2.898550724637681, "grad_norm": 42.99863348572588, "learning_rate": 6.679316888045541e-08, "logits/chosen": 1.4642894268035889, "logits/rejected": 1.6648118495941162, "logps/chosen": -74.65554809570312, "logps/rejected": -109.84590911865234, "loss": 0.685, "rewards/accuracies": 0.25, "rewards/chosen": 0.23327946662902832, "rewards/margins": 0.05675656720995903, "rewards/rejected": 0.176522895693779, "step": 800 }, { "epoch": 2.898550724637681, "eval_debug/policy_chosen_logits": 1.649984359741211, "eval_debug/policy_chosen_logps": -122.28389739990234, "eval_debug/policy_rejected_logits": 1.7109218835830688, "eval_debug/policy_rejected_logps": -63.73524856567383, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.649984359741211, "eval_logits/rejected": 1.7109218835830688, "eval_logps/chosen": -122.28389739990234, "eval_logps/rejected": -63.73524856567383, "eval_loss": 0.6978356838226318, "eval_rewards/accuracies": 0.3947368562221527, "eval_rewards/chosen": 0.4320771098136902, "eval_rewards/margins": 0.35617244243621826, "eval_rewards/rejected": 0.07590456306934357, "eval_runtime": 28.5199, "eval_samples_per_second": 21.038, "eval_steps_per_second": 0.666, "step": 800 }, { "debug/policy_chosen_logits": 1.3750041723251343, "debug/policy_chosen_logps": -5.2688679695129395, "debug/policy_rejected_logits": 1.8108991384506226, "debug/policy_rejected_logps": -81.34712982177734, "debug/reference_chosen_logps": -5.291328430175781, "debug/reference_rejected_logps": -81.42278289794922, "epoch": 2.9166666666666665, "grad_norm": 51.26223742153452, "learning_rate": 6.655597722960152e-08, "logits/chosen": 1.3750041723251343, "logits/rejected": 1.8108991384506226, "logps/chosen": -5.2688679695129395, "logps/rejected": -81.34712982177734, "loss": 0.6734, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.011229927651584148, "rewards/margins": -0.026595916599035263, "rewards/rejected": 0.037825845181941986, "step": 805 }, { "debug/policy_chosen_logits": 1.3802759647369385, "debug/policy_chosen_logps": -9.522491455078125, "debug/policy_rejected_logits": 1.6636362075805664, "debug/policy_rejected_logps": -81.15298461914062, "debug/reference_chosen_logps": -9.574371337890625, "debug/reference_rejected_logps": -81.211669921875, "epoch": 2.9347826086956523, "grad_norm": 503.32597875041324, "learning_rate": 6.631878557874762e-08, "logits/chosen": 1.3802759647369385, "logits/rejected": 1.6636362075805664, "logps/chosen": -9.522491455078125, "logps/rejected": -81.15298461914062, "loss": 0.6835, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.025939855724573135, "rewards/margins": -0.0034011632669717073, "rewards/rejected": 0.029341017827391624, "step": 810 }, { "debug/policy_chosen_logits": 1.1557979583740234, "debug/policy_chosen_logps": -234.61660766601562, "debug/policy_rejected_logits": 1.5335800647735596, "debug/policy_rejected_logps": -86.92341613769531, "debug/reference_chosen_logps": -234.8483428955078, "debug/reference_rejected_logps": -86.75303649902344, "epoch": 2.9528985507246377, "grad_norm": 252.73991707492058, "learning_rate": 6.608159392789373e-08, "logits/chosen": 1.1557979583740234, "logits/rejected": 1.5335800647735596, "logps/chosen": -234.61660766601562, "logps/rejected": -86.92341613769531, "loss": 0.6859, "rewards/accuracies": 0.375, "rewards/chosen": 0.11585275083780289, "rewards/margins": 0.20104579627513885, "rewards/rejected": -0.08519303798675537, "step": 815 }, { "debug/policy_chosen_logits": 1.4925482273101807, "debug/policy_chosen_logps": -4.081394195556641, "debug/policy_rejected_logits": 2.085655689239502, "debug/policy_rejected_logps": -43.84490203857422, "debug/reference_chosen_logps": -4.089119911193848, "debug/reference_rejected_logps": -43.911102294921875, "epoch": 2.971014492753623, "grad_norm": 44.296976333982215, "learning_rate": 6.584440227703984e-08, "logits/chosen": 1.4925482273101807, "logits/rejected": 2.085655689239502, "logps/chosen": -4.081394195556641, "logps/rejected": -43.84490203857422, "loss": 0.6954, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": 0.003862895769998431, "rewards/margins": -0.029234904795885086, "rewards/rejected": 0.03309779614210129, "step": 820 }, { "debug/policy_chosen_logits": 1.5940030813217163, "debug/policy_chosen_logps": -137.58055114746094, "debug/policy_rejected_logits": 1.8865711688995361, "debug/policy_rejected_logps": -2.2771830558776855, "debug/reference_chosen_logps": -137.60731506347656, "debug/reference_rejected_logps": -2.2748847007751465, "epoch": 2.9891304347826084, "grad_norm": 162.93185756186395, "learning_rate": 6.560721062618596e-08, "logits/chosen": 1.5940030813217163, "logits/rejected": 1.8865711688995361, "logps/chosen": -137.58055114746094, "logps/rejected": -2.2771830558776855, "loss": 0.6831, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.013367986306548119, "rewards/margins": 0.014517134055495262, "rewards/rejected": -0.001149150775745511, "step": 825 }, { "debug/policy_chosen_logits": 1.4136964082717896, "debug/policy_chosen_logps": -175.1110076904297, "debug/policy_rejected_logits": 1.8977985382080078, "debug/policy_rejected_logps": -120.87762451171875, "debug/reference_chosen_logps": -175.3301239013672, "debug/reference_rejected_logps": -120.7976303100586, "epoch": 3.0072463768115942, "grad_norm": 62.994534461409096, "learning_rate": 6.537001897533207e-08, "logits/chosen": 1.4136964082717896, "logits/rejected": 1.8977985382080078, "logps/chosen": -175.1110076904297, "logps/rejected": -120.87762451171875, "loss": 0.6804, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.10956084728240967, "rewards/margins": 0.149554044008255, "rewards/rejected": -0.039993204176425934, "step": 830 }, { "debug/policy_chosen_logits": 1.498468279838562, "debug/policy_chosen_logps": -44.90773391723633, "debug/policy_rejected_logits": 1.6970798969268799, "debug/policy_rejected_logps": -5.429778575897217, "debug/reference_chosen_logps": -45.01803970336914, "debug/reference_rejected_logps": -5.35309362411499, "epoch": 3.0253623188405796, "grad_norm": 82.26235791828255, "learning_rate": 6.513282732447818e-08, "logits/chosen": 1.498468279838562, "logits/rejected": 1.6970798969268799, "logps/chosen": -44.90773391723633, "logps/rejected": -5.429778575897217, "loss": 0.6764, "rewards/accuracies": 0.375, "rewards/chosen": 0.05515369027853012, "rewards/margins": 0.09349614381790161, "rewards/rejected": -0.038342446088790894, "step": 835 }, { "debug/policy_chosen_logits": 1.7101024389266968, "debug/policy_chosen_logps": -6.839188575744629, "debug/policy_rejected_logits": 1.4906718730926514, "debug/policy_rejected_logps": -49.02787399291992, "debug/reference_chosen_logps": -6.925424098968506, "debug/reference_rejected_logps": -48.94139862060547, "epoch": 3.0434782608695654, "grad_norm": 56.75596944384774, "learning_rate": 6.489563567362429e-08, "logits/chosen": 1.7101024389266968, "logits/rejected": 1.4906718730926514, "logps/chosen": -6.839188575744629, "logps/rejected": -49.02787399291992, "loss": 0.6811, "rewards/accuracies": 0.5, "rewards/chosen": 0.04311814904212952, "rewards/margins": 0.08635678142309189, "rewards/rejected": -0.04323863610625267, "step": 840 }, { "debug/policy_chosen_logits": 1.5343866348266602, "debug/policy_chosen_logps": -5.370896816253662, "debug/policy_rejected_logits": 1.6845684051513672, "debug/policy_rejected_logps": -104.4680404663086, "debug/reference_chosen_logps": -5.335769176483154, "debug/reference_rejected_logps": -104.54981994628906, "epoch": 3.0615942028985508, "grad_norm": 107.118062729067, "learning_rate": 6.46584440227704e-08, "logits/chosen": 1.5343866348266602, "logits/rejected": 1.6845684051513672, "logps/chosen": -5.370896816253662, "logps/rejected": -104.4680404663086, "loss": 0.6675, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.017563676461577415, "rewards/margins": -0.05845416709780693, "rewards/rejected": 0.040890492498874664, "step": 845 }, { "debug/policy_chosen_logits": 1.2884478569030762, "debug/policy_chosen_logps": -153.8372039794922, "debug/policy_rejected_logits": 1.742032766342163, "debug/policy_rejected_logps": -6.086215972900391, "debug/reference_chosen_logps": -155.05332946777344, "debug/reference_rejected_logps": -6.033763408660889, "epoch": 3.079710144927536, "grad_norm": 23.942763881828547, "learning_rate": 6.44212523719165e-08, "logits/chosen": 1.2884478569030762, "logits/rejected": 1.742032766342163, "logps/chosen": -153.8372039794922, "logps/rejected": -6.086215972900391, "loss": 0.6647, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.6080719232559204, "rewards/margins": 0.6342981457710266, "rewards/rejected": -0.02622619830071926, "step": 850 }, { "debug/policy_chosen_logits": 1.6489976644515991, "debug/policy_chosen_logps": -65.41344451904297, "debug/policy_rejected_logits": 1.7344964742660522, "debug/policy_rejected_logps": -55.25066375732422, "debug/reference_chosen_logps": -65.62288665771484, "debug/reference_rejected_logps": -55.23314666748047, "epoch": 3.097826086956522, "grad_norm": 35.02611219345287, "learning_rate": 6.418406072106261e-08, "logits/chosen": 1.6489976644515991, "logits/rejected": 1.7344964742660522, "logps/chosen": -65.41344451904297, "logps/rejected": -55.25066375732422, "loss": 0.6721, "rewards/accuracies": 0.375, "rewards/chosen": 0.10472525656223297, "rewards/margins": 0.11348487436771393, "rewards/rejected": -0.008759623393416405, "step": 855 }, { "debug/policy_chosen_logits": 1.3655107021331787, "debug/policy_chosen_logps": -5.15181827545166, "debug/policy_rejected_logits": 1.4160335063934326, "debug/policy_rejected_logps": -11.163800239562988, "debug/reference_chosen_logps": -5.2109761238098145, "debug/reference_rejected_logps": -11.119375228881836, "epoch": 3.1159420289855073, "grad_norm": 38.99326080918646, "learning_rate": 6.394686907020873e-08, "logits/chosen": 1.3655107021331787, "logits/rejected": 1.4160335063934326, "logps/chosen": -5.15181827545166, "logps/rejected": -11.163800239562988, "loss": 0.6704, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.02957860566675663, "rewards/margins": 0.05179072171449661, "rewards/rejected": -0.022212114185094833, "step": 860 }, { "debug/policy_chosen_logits": 1.4471279382705688, "debug/policy_chosen_logps": -130.35665893554688, "debug/policy_rejected_logits": 1.8090507984161377, "debug/policy_rejected_logps": -37.323341369628906, "debug/reference_chosen_logps": -131.31796264648438, "debug/reference_rejected_logps": -37.071983337402344, "epoch": 3.1340579710144927, "grad_norm": 67.25784977750303, "learning_rate": 6.370967741935484e-08, "logits/chosen": 1.4471279382705688, "logits/rejected": 1.8090507984161377, "logps/chosen": -130.35665893554688, "logps/rejected": -37.323341369628906, "loss": 0.6671, "rewards/accuracies": 0.375, "rewards/chosen": 0.48065805435180664, "rewards/margins": 0.6063353419303894, "rewards/rejected": -0.12567731738090515, "step": 865 }, { "debug/policy_chosen_logits": 1.6527878046035767, "debug/policy_chosen_logps": -133.9925537109375, "debug/policy_rejected_logits": 1.9235893487930298, "debug/policy_rejected_logps": -5.531818389892578, "debug/reference_chosen_logps": -134.39736938476562, "debug/reference_rejected_logps": -5.434831142425537, "epoch": 3.1521739130434785, "grad_norm": 127.87270142016548, "learning_rate": 6.347248576850095e-08, "logits/chosen": 1.6527878046035767, "logits/rejected": 1.9235893487930298, "logps/chosen": -133.9925537109375, "logps/rejected": -5.531818389892578, "loss": 0.6697, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.20242159068584442, "rewards/margins": 0.25091513991355896, "rewards/rejected": -0.048493556678295135, "step": 870 }, { "debug/policy_chosen_logits": 1.5905077457427979, "debug/policy_chosen_logps": -140.4800567626953, "debug/policy_rejected_logits": 1.6325492858886719, "debug/policy_rejected_logps": -4.193604469299316, "debug/reference_chosen_logps": -140.52279663085938, "debug/reference_rejected_logps": -4.109394550323486, "epoch": 3.170289855072464, "grad_norm": 52.58602662343731, "learning_rate": 6.323529411764706e-08, "logits/chosen": 1.5905077457427979, "logits/rejected": 1.6325492858886719, "logps/chosen": -140.4800567626953, "logps/rejected": -4.193604469299316, "loss": 0.6734, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.02137126587331295, "rewards/margins": 0.0634760633111, "rewards/rejected": -0.04210479184985161, "step": 875 }, { "debug/policy_chosen_logits": 1.4793184995651245, "debug/policy_chosen_logps": -28.353017807006836, "debug/policy_rejected_logits": 1.8253729343414307, "debug/policy_rejected_logps": -8.04998779296875, "debug/reference_chosen_logps": -28.440570831298828, "debug/reference_rejected_logps": -8.051187515258789, "epoch": 3.1884057971014492, "grad_norm": 44.40022158150952, "learning_rate": 6.299810246679317e-08, "logits/chosen": 1.4793184995651245, "logits/rejected": 1.8253729343414307, "logps/chosen": -28.353017807006836, "logps/rejected": -8.04998779296875, "loss": 0.6669, "rewards/accuracies": 0.25, "rewards/chosen": 0.043776437640190125, "rewards/margins": 0.04317639768123627, "rewards/rejected": 0.0006000399589538574, "step": 880 }, { "debug/policy_chosen_logits": 1.1218178272247314, "debug/policy_chosen_logps": -6.345554351806641, "debug/policy_rejected_logits": 1.4467482566833496, "debug/policy_rejected_logps": -71.78057098388672, "debug/reference_chosen_logps": -6.258852481842041, "debug/reference_rejected_logps": -71.7362289428711, "epoch": 3.2065217391304346, "grad_norm": 342.00589619469815, "learning_rate": 6.276091081593927e-08, "logits/chosen": 1.1218178272247314, "logits/rejected": 1.4467482566833496, "logps/chosen": -6.345554351806641, "logps/rejected": -71.78057098388672, "loss": 0.6734, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.043350961059331894, "rewards/margins": -0.02117815986275673, "rewards/rejected": -0.022172803059220314, "step": 885 }, { "debug/policy_chosen_logits": 1.4122637510299683, "debug/policy_chosen_logps": -7.052284240722656, "debug/policy_rejected_logits": 1.6874958276748657, "debug/policy_rejected_logps": -58.975929260253906, "debug/reference_chosen_logps": -7.0944318771362305, "debug/reference_rejected_logps": -57.73896408081055, "epoch": 3.2246376811594204, "grad_norm": 87.33038061157116, "learning_rate": 6.252371916508538e-08, "logits/chosen": 1.4122637510299683, "logits/rejected": 1.6874958276748657, "logps/chosen": -7.052284240722656, "logps/rejected": -58.975929260253906, "loss": 0.6652, "rewards/accuracies": 0.25, "rewards/chosen": 0.021073898300528526, "rewards/margins": 0.6395517587661743, "rewards/rejected": -0.6184778213500977, "step": 890 }, { "debug/policy_chosen_logits": 1.5235027074813843, "debug/policy_chosen_logps": -143.93533325195312, "debug/policy_rejected_logits": 1.7289530038833618, "debug/policy_rejected_logps": -9.569393157958984, "debug/reference_chosen_logps": -144.22349548339844, "debug/reference_rejected_logps": -9.525235176086426, "epoch": 3.2427536231884058, "grad_norm": 45.00237352292008, "learning_rate": 6.22865275142315e-08, "logits/chosen": 1.5235027074813843, "logits/rejected": 1.7289530038833618, "logps/chosen": -143.93533325195312, "logps/rejected": -9.569393157958984, "loss": 0.6688, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.14409488439559937, "rewards/margins": 0.1661745011806488, "rewards/rejected": -0.02207961305975914, "step": 895 }, { "debug/policy_chosen_logits": 1.2280583381652832, "debug/policy_chosen_logps": -12.240262985229492, "debug/policy_rejected_logits": 1.7034275531768799, "debug/policy_rejected_logps": -5.093583106994629, "debug/reference_chosen_logps": -12.303363800048828, "debug/reference_rejected_logps": -5.078009605407715, "epoch": 3.260869565217391, "grad_norm": 32.373095154361955, "learning_rate": 6.20493358633776e-08, "logits/chosen": 1.2280583381652832, "logits/rejected": 1.7034275531768799, "logps/chosen": -12.240262985229492, "logps/rejected": -5.093583106994629, "loss": 0.6585, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.03154980391263962, "rewards/margins": 0.03933659940958023, "rewards/rejected": -0.007786794099956751, "step": 900 }, { "epoch": 3.260869565217391, "eval_debug/policy_chosen_logits": 1.6527211666107178, "eval_debug/policy_chosen_logps": -122.30872344970703, "eval_debug/policy_rejected_logits": 1.7148381471633911, "eval_debug/policy_rejected_logps": -63.61892318725586, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6527211666107178, "eval_logits/rejected": 1.7148381471633911, "eval_logps/chosen": -122.30872344970703, "eval_logps/rejected": -63.61892318725586, "eval_loss": 0.7157950401306152, "eval_rewards/accuracies": 0.2763157784938812, "eval_rewards/chosen": 0.419659286737442, "eval_rewards/margins": 0.2855943441390991, "eval_rewards/rejected": 0.13406497240066528, "eval_runtime": 28.7434, "eval_samples_per_second": 20.874, "eval_steps_per_second": 0.661, "step": 900 }, { "debug/policy_chosen_logits": 1.506245493888855, "debug/policy_chosen_logps": -92.83143615722656, "debug/policy_rejected_logits": 1.8087244033813477, "debug/policy_rejected_logps": -6.149545669555664, "debug/reference_chosen_logps": -93.24861145019531, "debug/reference_rejected_logps": -6.105729579925537, "epoch": 3.278985507246377, "grad_norm": 29.706176087146048, "learning_rate": 6.181214421252372e-08, "logits/chosen": 1.506245493888855, "logits/rejected": 1.8087244033813477, "logps/chosen": -92.83143615722656, "logps/rejected": -6.149545669555664, "loss": 0.6613, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.2085857391357422, "rewards/margins": 0.23049357533454895, "rewards/rejected": -0.021907825022935867, "step": 905 }, { "debug/policy_chosen_logits": 1.368865728378296, "debug/policy_chosen_logps": -10.314421653747559, "debug/policy_rejected_logits": 1.4819577932357788, "debug/policy_rejected_logps": -57.02370071411133, "debug/reference_chosen_logps": -10.355507850646973, "debug/reference_rejected_logps": -56.988853454589844, "epoch": 3.2971014492753623, "grad_norm": 97.75770294945315, "learning_rate": 6.157495256166983e-08, "logits/chosen": 1.368865728378296, "logits/rejected": 1.4819577932357788, "logps/chosen": -10.314421653747559, "logps/rejected": -57.02370071411133, "loss": 0.6813, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.02054334245622158, "rewards/margins": 0.03796694427728653, "rewards/rejected": -0.0174235999584198, "step": 910 }, { "debug/policy_chosen_logits": 1.7693036794662476, "debug/policy_chosen_logps": -8.325143814086914, "debug/policy_rejected_logits": 1.9025001525878906, "debug/policy_rejected_logps": -6.738936424255371, "debug/reference_chosen_logps": -8.424813270568848, "debug/reference_rejected_logps": -6.69986629486084, "epoch": 3.3152173913043477, "grad_norm": 31.05892642492881, "learning_rate": 6.133776091081594e-08, "logits/chosen": 1.7693036794662476, "logits/rejected": 1.9025001525878906, "logps/chosen": -8.325143814086914, "logps/rejected": -6.738936424255371, "loss": 0.6744, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0498344786465168, "rewards/margins": 0.06936945021152496, "rewards/rejected": -0.019534965977072716, "step": 915 }, { "debug/policy_chosen_logits": 1.393141508102417, "debug/policy_chosen_logps": -60.4977912902832, "debug/policy_rejected_logits": 1.3986886739730835, "debug/policy_rejected_logps": -8.130289077758789, "debug/reference_chosen_logps": -60.804039001464844, "debug/reference_rejected_logps": -8.141077995300293, "epoch": 3.3333333333333335, "grad_norm": 396.1448102601587, "learning_rate": 6.110056925996205e-08, "logits/chosen": 1.393141508102417, "logits/rejected": 1.3986886739730835, "logps/chosen": -60.4977912902832, "logps/rejected": -8.130289077758789, "loss": 0.6884, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.1531251072883606, "rewards/margins": 0.1477302759885788, "rewards/rejected": 0.005394837353378534, "step": 920 }, { "debug/policy_chosen_logits": 1.3195728063583374, "debug/policy_chosen_logps": -11.974189758300781, "debug/policy_rejected_logits": 1.433007001876831, "debug/policy_rejected_logps": -5.216119289398193, "debug/reference_chosen_logps": -12.01789665222168, "debug/reference_rejected_logps": -5.1840620040893555, "epoch": 3.351449275362319, "grad_norm": 28.456626922659662, "learning_rate": 6.086337760910815e-08, "logits/chosen": 1.3195728063583374, "logits/rejected": 1.433007001876831, "logps/chosen": -11.974189758300781, "logps/rejected": -5.216119289398193, "loss": 0.669, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.021853357553482056, "rewards/margins": 0.03788194805383682, "rewards/rejected": -0.016028590500354767, "step": 925 }, { "debug/policy_chosen_logits": 1.210734486579895, "debug/policy_chosen_logps": -7.432974815368652, "debug/policy_rejected_logits": 1.521740198135376, "debug/policy_rejected_logps": -5.7173967361450195, "debug/reference_chosen_logps": -7.4084978103637695, "debug/reference_rejected_logps": -5.688941478729248, "epoch": 3.369565217391304, "grad_norm": 45.1685596107324, "learning_rate": 6.062618595825426e-08, "logits/chosen": 1.210734486579895, "logits/rejected": 1.521740198135376, "logps/chosen": -7.432974815368652, "logps/rejected": -5.7173967361450195, "loss": 0.6866, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.01223852951079607, "rewards/margins": 0.0019893646240234375, "rewards/rejected": -0.014227894134819508, "step": 930 }, { "debug/policy_chosen_logits": 1.1847336292266846, "debug/policy_chosen_logps": -87.63777160644531, "debug/policy_rejected_logits": 1.6733314990997314, "debug/policy_rejected_logps": -21.106548309326172, "debug/reference_chosen_logps": -87.83734893798828, "debug/reference_rejected_logps": -21.036624908447266, "epoch": 3.38768115942029, "grad_norm": 41.31399498873852, "learning_rate": 6.038899430740037e-08, "logits/chosen": 1.1847336292266846, "logits/rejected": 1.6733314990997314, "logps/chosen": -87.63777160644531, "logps/rejected": -21.106548309326172, "loss": 0.6597, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.0997919961810112, "rewards/margins": 0.13475386798381805, "rewards/rejected": -0.034961871802806854, "step": 935 }, { "debug/policy_chosen_logits": 1.2867705821990967, "debug/policy_chosen_logps": -123.9426498413086, "debug/policy_rejected_logits": 1.4360134601593018, "debug/policy_rejected_logps": -11.085226058959961, "debug/reference_chosen_logps": -123.60845947265625, "debug/reference_rejected_logps": -11.164692878723145, "epoch": 3.4057971014492754, "grad_norm": 336.52599734996716, "learning_rate": 6.015180265654649e-08, "logits/chosen": 1.2867705821990967, "logits/rejected": 1.4360134601593018, "logps/chosen": -123.9426498413086, "logps/rejected": -11.085226058959961, "loss": 0.7623, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.16710162162780762, "rewards/margins": -0.20683518052101135, "rewards/rejected": 0.03973354771733284, "step": 940 }, { "debug/policy_chosen_logits": 1.283972978591919, "debug/policy_chosen_logps": -9.375557899475098, "debug/policy_rejected_logits": 1.6528282165527344, "debug/policy_rejected_logps": -3.876771926879883, "debug/reference_chosen_logps": -9.403645515441895, "debug/reference_rejected_logps": -3.788437604904175, "epoch": 3.4239130434782608, "grad_norm": 135.35186954778095, "learning_rate": 5.99146110056926e-08, "logits/chosen": 1.283972978591919, "logits/rejected": 1.6528282165527344, "logps/chosen": -9.375557899475098, "logps/rejected": -3.876771926879883, "loss": 0.7806, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.014043962582945824, "rewards/margins": 0.0582113154232502, "rewards/rejected": -0.044167350977659225, "step": 945 }, { "debug/policy_chosen_logits": 1.0533493757247925, "debug/policy_chosen_logps": -3.2173144817352295, "debug/policy_rejected_logits": 1.362418532371521, "debug/policy_rejected_logps": -74.07160186767578, "debug/reference_chosen_logps": -3.216484785079956, "debug/reference_rejected_logps": -73.69139099121094, "epoch": 3.4420289855072466, "grad_norm": 38.11708955230098, "learning_rate": 5.967741935483871e-08, "logits/chosen": 1.0533493757247925, "logits/rejected": 1.362418532371521, "logps/chosen": -3.2173144817352295, "logps/rejected": -74.07160186767578, "loss": 0.6794, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.00041468889685347676, "rewards/margins": 0.18968746066093445, "rewards/rejected": -0.19010214507579803, "step": 950 }, { "debug/policy_chosen_logits": 1.368204116821289, "debug/policy_chosen_logps": -3.931748628616333, "debug/policy_rejected_logits": 1.6319797039031982, "debug/policy_rejected_logps": -177.16738891601562, "debug/reference_chosen_logps": -3.9458231925964355, "debug/reference_rejected_logps": -176.2043914794922, "epoch": 3.460144927536232, "grad_norm": 24.718669038042705, "learning_rate": 5.944022770398481e-08, "logits/chosen": 1.368204116821289, "logits/rejected": 1.6319797039031982, "logps/chosen": -3.931748628616333, "logps/rejected": -177.16738891601562, "loss": 0.6774, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.007037323899567127, "rewards/margins": 0.48852816224098206, "rewards/rejected": -0.4814907908439636, "step": 955 }, { "debug/policy_chosen_logits": 1.3896573781967163, "debug/policy_chosen_logps": -103.21891784667969, "debug/policy_rejected_logits": 1.6598775386810303, "debug/policy_rejected_logps": -100.97898864746094, "debug/reference_chosen_logps": -103.34651947021484, "debug/reference_rejected_logps": -101.48641967773438, "epoch": 3.4782608695652173, "grad_norm": 424.7175203430712, "learning_rate": 5.9203036053130925e-08, "logits/chosen": 1.3896573781967163, "logits/rejected": 1.6598775386810303, "logps/chosen": -103.21891784667969, "logps/rejected": -100.97898864746094, "loss": 0.6986, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0638008564710617, "rewards/margins": -0.189919114112854, "rewards/rejected": 0.2537199556827545, "step": 960 }, { "debug/policy_chosen_logits": 1.5393651723861694, "debug/policy_chosen_logps": -45.15822219848633, "debug/policy_rejected_logits": 1.8976068496704102, "debug/policy_rejected_logps": -48.85960006713867, "debug/reference_chosen_logps": -44.924034118652344, "debug/reference_rejected_logps": -48.36165237426758, "epoch": 3.496376811594203, "grad_norm": 290.9924632792359, "learning_rate": 5.896584440227703e-08, "logits/chosen": 1.5393651723861694, "logits/rejected": 1.8976068496704102, "logps/chosen": -45.15822219848633, "logps/rejected": -48.85960006713867, "loss": 0.6822, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.1170935407280922, "rewards/margins": 0.13188210129737854, "rewards/rejected": -0.24897566437721252, "step": 965 }, { "debug/policy_chosen_logits": 1.3908889293670654, "debug/policy_chosen_logps": -76.97760009765625, "debug/policy_rejected_logits": 1.7035739421844482, "debug/policy_rejected_logps": -9.426881790161133, "debug/reference_chosen_logps": -77.12386322021484, "debug/reference_rejected_logps": -9.334480285644531, "epoch": 3.5144927536231885, "grad_norm": 253.96035079847826, "learning_rate": 5.872865275142315e-08, "logits/chosen": 1.3908889293670654, "logits/rejected": 1.7035739421844482, "logps/chosen": -76.97760009765625, "logps/rejected": -9.426881790161133, "loss": 0.6734, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.07313155382871628, "rewards/margins": 0.11933251470327377, "rewards/rejected": -0.046200960874557495, "step": 970 }, { "debug/policy_chosen_logits": 1.400263786315918, "debug/policy_chosen_logps": -83.63748931884766, "debug/policy_rejected_logits": 1.6685125827789307, "debug/policy_rejected_logps": -7.530575752258301, "debug/reference_chosen_logps": -84.18830871582031, "debug/reference_rejected_logps": -7.56585168838501, "epoch": 3.532608695652174, "grad_norm": 33.06476433752453, "learning_rate": 5.849146110056926e-08, "logits/chosen": 1.400263786315918, "logits/rejected": 1.6685125827789307, "logps/chosen": -83.63748931884766, "logps/rejected": -7.530575752258301, "loss": 0.6623, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.275409996509552, "rewards/margins": 0.2577720284461975, "rewards/rejected": 0.017638003453612328, "step": 975 }, { "debug/policy_chosen_logits": 1.360818862915039, "debug/policy_chosen_logps": -7.1180925369262695, "debug/policy_rejected_logits": 1.9248683452606201, "debug/policy_rejected_logps": -4.067748069763184, "debug/reference_chosen_logps": -7.126869201660156, "debug/reference_rejected_logps": -4.064708709716797, "epoch": 3.550724637681159, "grad_norm": 27.29103858210508, "learning_rate": 5.8254269449715365e-08, "logits/chosen": 1.360818862915039, "logits/rejected": 1.9248683452606201, "logps/chosen": -7.1180925369262695, "logps/rejected": -4.067748069763184, "loss": 0.6823, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.004388138651847839, "rewards/margins": 0.005907702259719372, "rewards/rejected": -0.001519563840702176, "step": 980 }, { "debug/policy_chosen_logits": 1.4152826070785522, "debug/policy_chosen_logps": -7.399645805358887, "debug/policy_rejected_logits": 1.6108105182647705, "debug/policy_rejected_logps": -2.091935634613037, "debug/reference_chosen_logps": -7.488625526428223, "debug/reference_rejected_logps": -2.104032516479492, "epoch": 3.568840579710145, "grad_norm": 39.09288221095929, "learning_rate": 5.801707779886148e-08, "logits/chosen": 1.4152826070785522, "logits/rejected": 1.6108105182647705, "logps/chosen": -7.399645805358887, "logps/rejected": -2.091935634613037, "loss": 0.6844, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.04448971524834633, "rewards/margins": 0.038441337645053864, "rewards/rejected": 0.00604837853461504, "step": 985 }, { "debug/policy_chosen_logits": 1.5785908699035645, "debug/policy_chosen_logps": -112.74638366699219, "debug/policy_rejected_logits": 1.5525871515274048, "debug/policy_rejected_logps": -6.454891204833984, "debug/reference_chosen_logps": -113.3122787475586, "debug/reference_rejected_logps": -6.36132287979126, "epoch": 3.5869565217391304, "grad_norm": 37.242591626914724, "learning_rate": 5.777988614800758e-08, "logits/chosen": 1.5785908699035645, "logits/rejected": 1.5525871515274048, "logps/chosen": -112.74638366699219, "logps/rejected": -6.454891204833984, "loss": 0.6605, "rewards/accuracies": 0.375, "rewards/chosen": 0.28295522928237915, "rewards/margins": 0.3297395706176758, "rewards/rejected": -0.04678431898355484, "step": 990 }, { "debug/policy_chosen_logits": 1.284183144569397, "debug/policy_chosen_logps": -3.157134771347046, "debug/policy_rejected_logits": 1.7949405908584595, "debug/policy_rejected_logps": -7.089175224304199, "debug/reference_chosen_logps": -3.14458966255188, "debug/reference_rejected_logps": -7.091891288757324, "epoch": 3.605072463768116, "grad_norm": 33.53482029704163, "learning_rate": 5.7542694497153696e-08, "logits/chosen": 1.284183144569397, "logits/rejected": 1.7949405908584595, "logps/chosen": -3.157134771347046, "logps/rejected": -7.089175224304199, "loss": 0.701, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.006272452883422375, "rewards/margins": -0.007630518171936274, "rewards/rejected": 0.0013580650556832552, "step": 995 }, { "debug/policy_chosen_logits": 1.6103435754776, "debug/policy_chosen_logps": -37.78554153442383, "debug/policy_rejected_logits": 1.8071285486221313, "debug/policy_rejected_logps": -5.487974166870117, "debug/reference_chosen_logps": -38.062889099121094, "debug/reference_rejected_logps": -5.441411018371582, "epoch": 3.6231884057971016, "grad_norm": 48.48861746668537, "learning_rate": 5.7305502846299804e-08, "logits/chosen": 1.6103435754776, "logits/rejected": 1.8071285486221313, "logps/chosen": -37.78554153442383, "logps/rejected": -5.487974166870117, "loss": 0.6654, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.13867545127868652, "rewards/margins": 0.16195693612098694, "rewards/rejected": -0.02328147552907467, "step": 1000 }, { "epoch": 3.6231884057971016, "eval_debug/policy_chosen_logits": 1.645974040031433, "eval_debug/policy_chosen_logps": -122.32249450683594, "eval_debug/policy_rejected_logits": 1.7063970565795898, "eval_debug/policy_rejected_logps": -63.88512420654297, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.645974040031433, "eval_logits/rejected": 1.7063970565795898, "eval_logps/chosen": -122.32249450683594, "eval_logps/rejected": -63.88512420654297, "eval_loss": 0.6837486028671265, "eval_rewards/accuracies": 0.3947368562221527, "eval_rewards/chosen": 0.4127763509750366, "eval_rewards/margins": 0.4118105173110962, "eval_rewards/rejected": 0.0009658289491198957, "eval_runtime": 28.546, "eval_samples_per_second": 21.019, "eval_steps_per_second": 0.666, "step": 1000 }, { "debug/policy_chosen_logits": 1.2776224613189697, "debug/policy_chosen_logps": -18.016647338867188, "debug/policy_rejected_logits": 1.898296594619751, "debug/policy_rejected_logps": -21.252689361572266, "debug/reference_chosen_logps": -18.150279998779297, "debug/reference_rejected_logps": -21.3471736907959, "epoch": 3.641304347826087, "grad_norm": 82.94473601422003, "learning_rate": 5.706831119544592e-08, "logits/chosen": 1.2776224613189697, "logits/rejected": 1.898296594619751, "logps/chosen": -18.016647338867188, "logps/rejected": -21.252689361572266, "loss": 0.6709, "rewards/accuracies": 0.25, "rewards/chosen": 0.06681646406650543, "rewards/margins": 0.01957358419895172, "rewards/rejected": 0.04724288359284401, "step": 1005 }, { "debug/policy_chosen_logits": 1.313188076019287, "debug/policy_chosen_logps": -145.06497192382812, "debug/policy_rejected_logits": 1.890424370765686, "debug/policy_rejected_logps": -91.81851196289062, "debug/reference_chosen_logps": -145.67306518554688, "debug/reference_rejected_logps": -91.565673828125, "epoch": 3.6594202898550723, "grad_norm": 247.88794560402675, "learning_rate": 5.6831119544592034e-08, "logits/chosen": 1.313188076019287, "logits/rejected": 1.890424370765686, "logps/chosen": -145.06497192382812, "logps/rejected": -91.81851196289062, "loss": 0.6852, "rewards/accuracies": 0.375, "rewards/chosen": 0.30404406785964966, "rewards/margins": 0.43046265840530396, "rewards/rejected": -0.1264186054468155, "step": 1010 }, { "debug/policy_chosen_logits": 1.620037317276001, "debug/policy_chosen_logps": -38.632469177246094, "debug/policy_rejected_logits": 1.4626463651657104, "debug/policy_rejected_logps": -43.076271057128906, "debug/reference_chosen_logps": -38.78121566772461, "debug/reference_rejected_logps": -42.984474182128906, "epoch": 3.677536231884058, "grad_norm": 24.26556632667471, "learning_rate": 5.6593927893738136e-08, "logits/chosen": 1.620037317276001, "logits/rejected": 1.4626463651657104, "logps/chosen": -38.632469177246094, "logps/rejected": -43.076271057128906, "loss": 0.674, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.07437404990196228, "rewards/margins": 0.12027297168970108, "rewards/rejected": -0.0458989143371582, "step": 1015 }, { "debug/policy_chosen_logits": 1.0805531740188599, "debug/policy_chosen_logps": -130.40225219726562, "debug/policy_rejected_logits": 1.529587745666504, "debug/policy_rejected_logps": -7.021085262298584, "debug/reference_chosen_logps": -130.89085388183594, "debug/reference_rejected_logps": -7.041855812072754, "epoch": 3.6956521739130435, "grad_norm": 44.64804354015198, "learning_rate": 5.635673624288425e-08, "logits/chosen": 1.0805531740188599, "logits/rejected": 1.529587745666504, "logps/chosen": -130.40225219726562, "logps/rejected": -7.021085262298584, "loss": 0.65, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.24429699778556824, "rewards/margins": 0.23391170799732208, "rewards/rejected": 0.010385322384536266, "step": 1020 }, { "debug/policy_chosen_logits": 1.5907633304595947, "debug/policy_chosen_logps": -6.0800862312316895, "debug/policy_rejected_logits": 2.0582656860351562, "debug/policy_rejected_logps": -64.01649475097656, "debug/reference_chosen_logps": -6.047281265258789, "debug/reference_rejected_logps": -63.94500732421875, "epoch": 3.713768115942029, "grad_norm": 73.65459187636674, "learning_rate": 5.611954459203035e-08, "logits/chosen": 1.5907633304595947, "logits/rejected": 2.0582656860351562, "logps/chosen": -6.0800862312316895, "logps/rejected": -64.01649475097656, "loss": 0.6801, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.01640259101986885, "rewards/margins": 0.019338121637701988, "rewards/rejected": -0.03574071079492569, "step": 1025 }, { "debug/policy_chosen_logits": 1.3926115036010742, "debug/policy_chosen_logps": -4.225093364715576, "debug/policy_rejected_logits": 1.6491317749023438, "debug/policy_rejected_logps": -11.89829158782959, "debug/reference_chosen_logps": -4.215153217315674, "debug/reference_rejected_logps": -11.835853576660156, "epoch": 3.7318840579710146, "grad_norm": 93.88462207611535, "learning_rate": 5.588235294117647e-08, "logits/chosen": 1.3926115036010742, "logits/rejected": 1.6491317749023438, "logps/chosen": -4.225093364715576, "logps/rejected": -11.89829158782959, "loss": 0.668, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.004970122128725052, "rewards/margins": 0.026248831301927567, "rewards/rejected": -0.03121895156800747, "step": 1030 }, { "debug/policy_chosen_logits": 1.1844418048858643, "debug/policy_chosen_logps": -119.50179290771484, "debug/policy_rejected_logits": 1.3030064105987549, "debug/policy_rejected_logps": -3.5948143005371094, "debug/reference_chosen_logps": -119.53863525390625, "debug/reference_rejected_logps": -3.519488573074341, "epoch": 3.75, "grad_norm": 28.751939921176653, "learning_rate": 5.5645161290322576e-08, "logits/chosen": 1.1844418048858643, "logits/rejected": 1.3030064105987549, "logps/chosen": -119.50179290771484, "logps/rejected": -3.5948143005371094, "loss": 0.6732, "rewards/accuracies": 0.375, "rewards/chosen": 0.018416905775666237, "rewards/margins": 0.05607951804995537, "rewards/rejected": -0.03766261413693428, "step": 1035 }, { "debug/policy_chosen_logits": 1.936018943786621, "debug/policy_chosen_logps": -5.443009376525879, "debug/policy_rejected_logits": 2.2490768432617188, "debug/policy_rejected_logps": -97.88577270507812, "debug/reference_chosen_logps": -5.451835632324219, "debug/reference_rejected_logps": -97.595947265625, "epoch": 3.7681159420289854, "grad_norm": 37.5310388892098, "learning_rate": 5.540796963946869e-08, "logits/chosen": 1.936018943786621, "logits/rejected": 2.2490768432617188, "logps/chosen": -5.443009376525879, "logps/rejected": -97.88577270507812, "loss": 0.6684, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.00441314559429884, "rewards/margins": 0.149323970079422, "rewards/rejected": -0.1449107974767685, "step": 1040 }, { "debug/policy_chosen_logits": 1.719305396080017, "debug/policy_chosen_logps": -10.679169654846191, "debug/policy_rejected_logits": 1.9739372730255127, "debug/policy_rejected_logps": -23.52250099182129, "debug/reference_chosen_logps": -10.742517471313477, "debug/reference_rejected_logps": -23.387409210205078, "epoch": 3.786231884057971, "grad_norm": 52.265453618978505, "learning_rate": 5.5170777988614805e-08, "logits/chosen": 1.719305396080017, "logits/rejected": 1.9739372730255127, "logps/chosen": -10.679169654846191, "logps/rejected": -23.52250099182129, "loss": 0.6774, "rewards/accuracies": 0.5, "rewards/chosen": 0.03167480602860451, "rewards/margins": 0.09922030568122864, "rewards/rejected": -0.06754550337791443, "step": 1045 }, { "debug/policy_chosen_logits": 1.4039462804794312, "debug/policy_chosen_logps": -6.643633842468262, "debug/policy_rejected_logits": 1.8524723052978516, "debug/policy_rejected_logps": -4.875354766845703, "debug/reference_chosen_logps": -6.616199493408203, "debug/reference_rejected_logps": -4.851325988769531, "epoch": 3.8043478260869565, "grad_norm": 34.673889387372014, "learning_rate": 5.493358633776091e-08, "logits/chosen": 1.4039462804794312, "logits/rejected": 1.8524723052978516, "logps/chosen": -6.643633842468262, "logps/rejected": -4.875354766845703, "loss": 0.6663, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.013717399910092354, "rewards/margins": -0.0017030939925462008, "rewards/rejected": -0.012014305219054222, "step": 1050 }, { "debug/policy_chosen_logits": 1.5869477987289429, "debug/policy_chosen_logps": -66.59515380859375, "debug/policy_rejected_logits": 1.9052488803863525, "debug/policy_rejected_logps": -5.9034318923950195, "debug/reference_chosen_logps": -66.7884292602539, "debug/reference_rejected_logps": -5.737187385559082, "epoch": 3.822463768115942, "grad_norm": 128.39014325147528, "learning_rate": 5.469639468690702e-08, "logits/chosen": 1.5869477987289429, "logits/rejected": 1.9052488803863525, "logps/chosen": -66.59515380859375, "logps/rejected": -5.9034318923950195, "loss": 0.6698, "rewards/accuracies": 0.375, "rewards/chosen": 0.09663376957178116, "rewards/margins": 0.17975611984729767, "rewards/rejected": -0.08312235027551651, "step": 1055 }, { "debug/policy_chosen_logits": 1.5089061260223389, "debug/policy_chosen_logps": -78.43992614746094, "debug/policy_rejected_logits": 1.70491623878479, "debug/policy_rejected_logps": -55.27770233154297, "debug/reference_chosen_logps": -78.50907897949219, "debug/reference_rejected_logps": -55.16875457763672, "epoch": 3.8405797101449277, "grad_norm": 66.80061039445472, "learning_rate": 5.4459203036053124e-08, "logits/chosen": 1.5089061260223389, "logits/rejected": 1.70491623878479, "logps/chosen": -78.43992614746094, "logps/rejected": -55.27770233154297, "loss": 0.6586, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.034575335681438446, "rewards/margins": 0.08905027061700821, "rewards/rejected": -0.05447493866086006, "step": 1060 }, { "debug/policy_chosen_logits": 1.6448147296905518, "debug/policy_chosen_logps": -71.85960388183594, "debug/policy_rejected_logits": 1.7115284204483032, "debug/policy_rejected_logps": -13.00658893585205, "debug/reference_chosen_logps": -72.53044128417969, "debug/reference_rejected_logps": -12.9281587600708, "epoch": 3.858695652173913, "grad_norm": 75.64182668372877, "learning_rate": 5.422201138519924e-08, "logits/chosen": 1.6448147296905518, "logits/rejected": 1.7115284204483032, "logps/chosen": -71.85960388183594, "logps/rejected": -13.00658893585205, "loss": 0.6784, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.33541780710220337, "rewards/margins": 0.3746335506439209, "rewards/rejected": -0.039215732365846634, "step": 1065 }, { "debug/policy_chosen_logits": 1.4140228033065796, "debug/policy_chosen_logps": -128.88458251953125, "debug/policy_rejected_logits": 1.6629931926727295, "debug/policy_rejected_logps": -3.1154098510742188, "debug/reference_chosen_logps": -129.23959350585938, "debug/reference_rejected_logps": -3.075037717819214, "epoch": 3.8768115942028984, "grad_norm": 60.964338245576684, "learning_rate": 5.398481973434535e-08, "logits/chosen": 1.4140228033065796, "logits/rejected": 1.6629931926727295, "logps/chosen": -128.88458251953125, "logps/rejected": -3.1154098510742188, "loss": 0.6603, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.17750807106494904, "rewards/margins": 0.19769418239593506, "rewards/rejected": -0.020186107605695724, "step": 1070 }, { "debug/policy_chosen_logits": 1.244246244430542, "debug/policy_chosen_logps": -6.111571788787842, "debug/policy_rejected_logits": 1.5681354999542236, "debug/policy_rejected_logps": -2.8084378242492676, "debug/reference_chosen_logps": -6.159215450286865, "debug/reference_rejected_logps": -2.7770676612854004, "epoch": 3.894927536231884, "grad_norm": 29.815211429709244, "learning_rate": 5.374762808349146e-08, "logits/chosen": 1.244246244430542, "logits/rejected": 1.5681354999542236, "logps/chosen": -6.111571788787842, "logps/rejected": -2.8084378242492676, "loss": 0.6679, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.023822065442800522, "rewards/margins": 0.039506882429122925, "rewards/rejected": -0.0156848207116127, "step": 1075 }, { "debug/policy_chosen_logits": 1.356063961982727, "debug/policy_chosen_logps": -8.535842895507812, "debug/policy_rejected_logits": 1.7975889444351196, "debug/policy_rejected_logps": -46.58405303955078, "debug/reference_chosen_logps": -8.5914306640625, "debug/reference_rejected_logps": -46.42311477661133, "epoch": 3.9130434782608696, "grad_norm": 79.76055224945848, "learning_rate": 5.3510436432637577e-08, "logits/chosen": 1.356063961982727, "logits/rejected": 1.7975889444351196, "logps/chosen": -8.535842895507812, "logps/rejected": -46.58405303955078, "loss": 0.6677, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.027794048190116882, "rewards/margins": 0.10826502740383148, "rewards/rejected": -0.080470971763134, "step": 1080 }, { "debug/policy_chosen_logits": 1.3838964700698853, "debug/policy_chosen_logps": -23.5528507232666, "debug/policy_rejected_logits": 1.8698123693466187, "debug/policy_rejected_logps": -69.71846771240234, "debug/reference_chosen_logps": -23.595714569091797, "debug/reference_rejected_logps": -69.52255249023438, "epoch": 3.931159420289855, "grad_norm": 122.04688496099257, "learning_rate": 5.327324478178368e-08, "logits/chosen": 1.3838964700698853, "logits/rejected": 1.8698123693466187, "logps/chosen": -23.5528507232666, "logps/rejected": -69.71846771240234, "loss": 0.6586, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.02143268845975399, "rewards/margins": 0.11939145624637604, "rewards/rejected": -0.0979587659239769, "step": 1085 }, { "debug/policy_chosen_logits": 1.2569609880447388, "debug/policy_chosen_logps": -3.7661705017089844, "debug/policy_rejected_logits": 1.753953218460083, "debug/policy_rejected_logps": -96.56436157226562, "debug/reference_chosen_logps": -3.756516695022583, "debug/reference_rejected_logps": -96.35320281982422, "epoch": 3.949275362318841, "grad_norm": 35.041344726194716, "learning_rate": 5.303605313092979e-08, "logits/chosen": 1.2569609880447388, "logits/rejected": 1.753953218460083, "logps/chosen": -3.7661705017089844, "logps/rejected": -96.56436157226562, "loss": 0.6618, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.004826811142265797, "rewards/margins": 0.10075213760137558, "rewards/rejected": -0.1055789440870285, "step": 1090 }, { "debug/policy_chosen_logits": 1.1286402940750122, "debug/policy_chosen_logps": -94.75798034667969, "debug/policy_rejected_logits": 1.4070926904678345, "debug/policy_rejected_logps": -120.85664367675781, "debug/reference_chosen_logps": -94.8940658569336, "debug/reference_rejected_logps": -120.67181396484375, "epoch": 3.967391304347826, "grad_norm": 35.83768532386312, "learning_rate": 5.2798861480075895e-08, "logits/chosen": 1.1286402940750122, "logits/rejected": 1.4070926904678345, "logps/chosen": -94.75798034667969, "logps/rejected": -120.85664367675781, "loss": 0.6677, "rewards/accuracies": 0.375, "rewards/chosen": 0.06804393231868744, "rewards/margins": 0.1604512482881546, "rewards/rejected": -0.09240730851888657, "step": 1095 }, { "debug/policy_chosen_logits": 1.5355989933013916, "debug/policy_chosen_logps": -147.57003784179688, "debug/policy_rejected_logits": 1.769222617149353, "debug/policy_rejected_logps": -44.36610794067383, "debug/reference_chosen_logps": -147.768310546875, "debug/reference_rejected_logps": -44.236473083496094, "epoch": 3.9855072463768115, "grad_norm": 53.260024345556445, "learning_rate": 5.256166982922201e-08, "logits/chosen": 1.5355989933013916, "logits/rejected": 1.769222617149353, "logps/chosen": -147.57003784179688, "logps/rejected": -44.36610794067383, "loss": 0.669, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.09914834797382355, "rewards/margins": 0.16396939754486084, "rewards/rejected": -0.0648210421204567, "step": 1100 }, { "epoch": 3.9855072463768115, "eval_debug/policy_chosen_logits": 1.641280174255371, "eval_debug/policy_chosen_logps": -122.61564636230469, "eval_debug/policy_rejected_logits": 1.700822114944458, "eval_debug/policy_rejected_logps": -63.91725540161133, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.641280174255371, "eval_logits/rejected": 1.700822114944458, "eval_logps/chosen": -122.61564636230469, "eval_logps/rejected": -63.91725540161133, "eval_loss": 0.6801173686981201, "eval_rewards/accuracies": 0.3815789520740509, "eval_rewards/chosen": 0.26619815826416016, "eval_rewards/margins": 0.2813015282154083, "eval_rewards/rejected": -0.015103358775377274, "eval_runtime": 28.7437, "eval_samples_per_second": 20.874, "eval_steps_per_second": 0.661, "step": 1100 }, { "debug/policy_chosen_logits": 1.2795231342315674, "debug/policy_chosen_logps": -31.767568588256836, "debug/policy_rejected_logits": 1.2254111766815186, "debug/policy_rejected_logps": -41.531005859375, "debug/reference_chosen_logps": -31.847381591796875, "debug/reference_rejected_logps": -41.46240234375, "epoch": 4.003623188405797, "grad_norm": 28.700431056322046, "learning_rate": 5.232447817836811e-08, "logits/chosen": 1.2795231342315674, "logits/rejected": 1.2254111766815186, "logps/chosen": -31.767568588256836, "logps/rejected": -41.531005859375, "loss": 0.6727, "rewards/accuracies": 0.375, "rewards/chosen": 0.039905983954668045, "rewards/margins": 0.07420708239078522, "rewards/rejected": -0.03430110216140747, "step": 1105 }, { "debug/policy_chosen_logits": 1.331516981124878, "debug/policy_chosen_logps": -16.836349487304688, "debug/policy_rejected_logits": 1.8580379486083984, "debug/policy_rejected_logps": -73.23863220214844, "debug/reference_chosen_logps": -16.892589569091797, "debug/reference_rejected_logps": -73.21855926513672, "epoch": 4.021739130434782, "grad_norm": 31.724751181379688, "learning_rate": 5.2087286527514226e-08, "logits/chosen": 1.331516981124878, "logits/rejected": 1.8580379486083984, "logps/chosen": -16.836349487304688, "logps/rejected": -73.23863220214844, "loss": 0.657, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.028120670467615128, "rewards/margins": 0.03815806657075882, "rewards/rejected": -0.010037397965788841, "step": 1110 }, { "debug/policy_chosen_logits": 1.4044967889785767, "debug/policy_chosen_logps": -4.733355522155762, "debug/policy_rejected_logits": 1.539889931678772, "debug/policy_rejected_logps": -9.622430801391602, "debug/reference_chosen_logps": -4.702683448791504, "debug/reference_rejected_logps": -9.626226425170898, "epoch": 4.0398550724637685, "grad_norm": 30.3654953708822, "learning_rate": 5.185009487666034e-08, "logits/chosen": 1.4044967889785767, "logits/rejected": 1.539889931678772, "logps/chosen": -4.733355522155762, "logps/rejected": -9.622430801391602, "loss": 0.6663, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.015336352400481701, "rewards/margins": -0.01723439060151577, "rewards/rejected": 0.00189803761895746, "step": 1115 }, { "debug/policy_chosen_logits": 1.3917351961135864, "debug/policy_chosen_logps": -94.19818878173828, "debug/policy_rejected_logits": 1.4854586124420166, "debug/policy_rejected_logps": -50.3610725402832, "debug/reference_chosen_logps": -93.95970153808594, "debug/reference_rejected_logps": -50.229618072509766, "epoch": 4.057971014492754, "grad_norm": 40.08886436389265, "learning_rate": 5.161290322580645e-08, "logits/chosen": 1.3917351961135864, "logits/rejected": 1.4854586124420166, "logps/chosen": -94.19818878173828, "logps/rejected": -50.3610725402832, "loss": 0.6816, "rewards/accuracies": 0.5, "rewards/chosen": -0.11923716217279434, "rewards/margins": -0.053507816046476364, "rewards/rejected": -0.06572934985160828, "step": 1120 }, { "debug/policy_chosen_logits": 1.3438831567764282, "debug/policy_chosen_logps": -7.876392364501953, "debug/policy_rejected_logits": 1.6717513799667358, "debug/policy_rejected_logps": -11.951692581176758, "debug/reference_chosen_logps": -7.895465850830078, "debug/reference_rejected_logps": -11.934921264648438, "epoch": 4.076086956521739, "grad_norm": 58.14431953226711, "learning_rate": 5.1375711574952564e-08, "logits/chosen": 1.3438831567764282, "logits/rejected": 1.6717513799667358, "logps/chosen": -7.876392364501953, "logps/rejected": -11.951692581176758, "loss": 0.6634, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.00953676737844944, "rewards/margins": 0.017922287806868553, "rewards/rejected": -0.008385521359741688, "step": 1125 }, { "debug/policy_chosen_logits": 1.5080369710922241, "debug/policy_chosen_logps": -8.126718521118164, "debug/policy_rejected_logits": 1.7252181768417358, "debug/policy_rejected_logps": -7.337622165679932, "debug/reference_chosen_logps": -8.121663093566895, "debug/reference_rejected_logps": -7.3379998207092285, "epoch": 4.094202898550725, "grad_norm": 60.19796258080323, "learning_rate": 5.1138519924098666e-08, "logits/chosen": 1.5080369710922241, "logits/rejected": 1.7252181768417358, "logps/chosen": -8.126718521118164, "logps/rejected": -7.337622165679932, "loss": 0.67, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.0025281489361077547, "rewards/margins": -0.002717047929763794, "rewards/rejected": 0.00018889903731178492, "step": 1130 }, { "debug/policy_chosen_logits": 1.1528054475784302, "debug/policy_chosen_logps": -235.63473510742188, "debug/policy_rejected_logits": 1.4311878681182861, "debug/policy_rejected_logps": -6.018296241760254, "debug/reference_chosen_logps": -236.1083984375, "debug/reference_rejected_logps": -5.9557414054870605, "epoch": 4.11231884057971, "grad_norm": 190.37360555410805, "learning_rate": 5.090132827324478e-08, "logits/chosen": 1.1528054475784302, "logits/rejected": 1.4311878681182861, "logps/chosen": -235.63473510742188, "logps/rejected": -6.018296241760254, "loss": 0.6707, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.23681211471557617, "rewards/margins": 0.2680895924568176, "rewards/rejected": -0.03127748891711235, "step": 1135 }, { "debug/policy_chosen_logits": 1.1075921058654785, "debug/policy_chosen_logps": -3.195159912109375, "debug/policy_rejected_logits": 1.636969804763794, "debug/policy_rejected_logps": -3.538886547088623, "debug/reference_chosen_logps": -3.2428536415100098, "debug/reference_rejected_logps": -3.4997127056121826, "epoch": 4.130434782608695, "grad_norm": 46.44049527345961, "learning_rate": 5.066413662239088e-08, "logits/chosen": 1.1075921058654785, "logits/rejected": 1.636969804763794, "logps/chosen": -3.195159912109375, "logps/rejected": -3.538886547088623, "loss": 0.6743, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.023846620693802834, "rewards/margins": 0.04343365505337715, "rewards/rejected": -0.01958703249692917, "step": 1140 }, { "debug/policy_chosen_logits": 1.2994215488433838, "debug/policy_chosen_logps": -8.438199996948242, "debug/policy_rejected_logits": 1.5658494234085083, "debug/policy_rejected_logps": -100.09513854980469, "debug/reference_chosen_logps": -8.481250762939453, "debug/reference_rejected_logps": -99.89872741699219, "epoch": 4.148550724637682, "grad_norm": 34.85948721596176, "learning_rate": 5.0426944971537e-08, "logits/chosen": 1.2994215488433838, "logits/rejected": 1.5658494234085083, "logps/chosen": -8.438199996948242, "logps/rejected": -100.09513854980469, "loss": 0.6515, "rewards/accuracies": 0.375, "rewards/chosen": 0.0215253084897995, "rewards/margins": 0.11973126977682114, "rewards/rejected": -0.09820596128702164, "step": 1145 }, { "debug/policy_chosen_logits": 1.3623902797698975, "debug/policy_chosen_logps": -8.232358932495117, "debug/policy_rejected_logits": 1.7384799718856812, "debug/policy_rejected_logps": -15.885915756225586, "debug/reference_chosen_logps": -8.285252571105957, "debug/reference_rejected_logps": -15.81109619140625, "epoch": 4.166666666666667, "grad_norm": 66.96190806715386, "learning_rate": 5.018975332068311e-08, "logits/chosen": 1.3623902797698975, "logits/rejected": 1.7384799718856812, "logps/chosen": -8.232358932495117, "logps/rejected": -15.885915756225586, "loss": 0.6513, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.026447156444191933, "rewards/margins": 0.06385638564825058, "rewards/rejected": -0.037409231066703796, "step": 1150 }, { "debug/policy_chosen_logits": 1.4879833459854126, "debug/policy_chosen_logps": -65.3663101196289, "debug/policy_rejected_logits": 1.9051059484481812, "debug/policy_rejected_logps": -58.30918502807617, "debug/reference_chosen_logps": -65.75659942626953, "debug/reference_rejected_logps": -58.31553268432617, "epoch": 4.184782608695652, "grad_norm": 424.02063610582934, "learning_rate": 4.995256166982922e-08, "logits/chosen": 1.4879833459854126, "logits/rejected": 1.9051059484481812, "logps/chosen": -65.3663101196289, "logps/rejected": -58.30918502807617, "loss": 0.6706, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.19514571130275726, "rewards/margins": 0.19197407364845276, "rewards/rejected": 0.003171634627506137, "step": 1155 }, { "debug/policy_chosen_logits": 1.346236228942871, "debug/policy_chosen_logps": -6.153543472290039, "debug/policy_rejected_logits": 1.91078782081604, "debug/policy_rejected_logps": -9.87269401550293, "debug/reference_chosen_logps": -6.167381286621094, "debug/reference_rejected_logps": -9.863636016845703, "epoch": 4.202898550724638, "grad_norm": 32.94724596843832, "learning_rate": 4.971537001897533e-08, "logits/chosen": 1.346236228942871, "logits/rejected": 1.91078782081604, "logps/chosen": -6.153543472290039, "logps/rejected": -9.87269401550293, "loss": 0.6699, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.006918889470398426, "rewards/margins": 0.011447494849562645, "rewards/rejected": -0.004528605844825506, "step": 1160 }, { "debug/policy_chosen_logits": 1.2568588256835938, "debug/policy_chosen_logps": -8.91445255279541, "debug/policy_rejected_logits": 1.684737205505371, "debug/policy_rejected_logps": -43.46324920654297, "debug/reference_chosen_logps": -9.041211128234863, "debug/reference_rejected_logps": -43.405174255371094, "epoch": 4.221014492753623, "grad_norm": 66.87440886018562, "learning_rate": 4.9478178368121444e-08, "logits/chosen": 1.2568588256835938, "logits/rejected": 1.684737205505371, "logps/chosen": -8.91445255279541, "logps/rejected": -43.46324920654297, "loss": 0.6705, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.06337939947843552, "rewards/margins": 0.09241970628499985, "rewards/rejected": -0.02904030680656433, "step": 1165 }, { "debug/policy_chosen_logits": 1.4423856735229492, "debug/policy_chosen_logps": -219.54678344726562, "debug/policy_rejected_logits": 1.4422798156738281, "debug/policy_rejected_logps": -7.929970741271973, "debug/reference_chosen_logps": -220.4656524658203, "debug/reference_rejected_logps": -7.890463829040527, "epoch": 4.239130434782608, "grad_norm": 202.2835673655268, "learning_rate": 4.924098671726755e-08, "logits/chosen": 1.4423856735229492, "logits/rejected": 1.4422798156738281, "logps/chosen": -219.54678344726562, "logps/rejected": -7.929970741271973, "loss": 0.659, "rewards/accuracies": 0.5, "rewards/chosen": 0.4594428539276123, "rewards/margins": 0.47919636964797974, "rewards/rejected": -0.01975351944565773, "step": 1170 }, { "debug/policy_chosen_logits": 1.3197758197784424, "debug/policy_chosen_logps": -36.545494079589844, "debug/policy_rejected_logits": 1.3392329216003418, "debug/policy_rejected_logps": -5.226871490478516, "debug/reference_chosen_logps": -36.75517654418945, "debug/reference_rejected_logps": -5.078423976898193, "epoch": 4.257246376811594, "grad_norm": 34.11317502134507, "learning_rate": 4.900379506641366e-08, "logits/chosen": 1.3197758197784424, "logits/rejected": 1.3392329216003418, "logps/chosen": -36.545494079589844, "logps/rejected": -5.226871490478516, "loss": 0.6832, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.10483930259943008, "rewards/margins": 0.17906302213668823, "rewards/rejected": -0.07422370463609695, "step": 1175 }, { "debug/policy_chosen_logits": 1.4831788539886475, "debug/policy_chosen_logps": -5.93057918548584, "debug/policy_rejected_logits": 1.6806341409683228, "debug/policy_rejected_logps": -50.401573181152344, "debug/reference_chosen_logps": -5.976899147033691, "debug/reference_rejected_logps": -50.49010467529297, "epoch": 4.27536231884058, "grad_norm": 39.75133074458651, "learning_rate": 4.876660341555977e-08, "logits/chosen": 1.4831788539886475, "logits/rejected": 1.6806341409683228, "logps/chosen": -5.93057918548584, "logps/rejected": -50.401573181152344, "loss": 0.6734, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.023159796372056007, "rewards/margins": -0.021107176318764687, "rewards/rejected": 0.04426697641611099, "step": 1180 }, { "debug/policy_chosen_logits": 1.0257833003997803, "debug/policy_chosen_logps": -6.121421813964844, "debug/policy_rejected_logits": 1.430450439453125, "debug/policy_rejected_logps": -6.698253631591797, "debug/reference_chosen_logps": -6.171689033508301, "debug/reference_rejected_logps": -6.672621726989746, "epoch": 4.293478260869565, "grad_norm": 343.04457109368485, "learning_rate": 4.852941176470588e-08, "logits/chosen": 1.0257833003997803, "logits/rejected": 1.430450439453125, "logps/chosen": -6.121421813964844, "logps/rejected": -6.698253631591797, "loss": 0.6668, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.02513359859585762, "rewards/margins": 0.03794936090707779, "rewards/rejected": -0.012815764173865318, "step": 1185 }, { "debug/policy_chosen_logits": 1.4630863666534424, "debug/policy_chosen_logps": -12.607038497924805, "debug/policy_rejected_logits": 1.6148658990859985, "debug/policy_rejected_logps": -8.732985496520996, "debug/reference_chosen_logps": -12.709831237792969, "debug/reference_rejected_logps": -8.655733108520508, "epoch": 4.311594202898551, "grad_norm": 38.05024074291088, "learning_rate": 4.829222011385199e-08, "logits/chosen": 1.4630863666534424, "logits/rejected": 1.6148658990859985, "logps/chosen": -12.607038497924805, "logps/rejected": -8.732985496520996, "loss": 0.6728, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.05139613896608353, "rewards/margins": 0.09002222120761871, "rewards/rejected": -0.038626085966825485, "step": 1190 }, { "debug/policy_chosen_logits": 1.2072805166244507, "debug/policy_chosen_logps": -3.519702911376953, "debug/policy_rejected_logits": 1.527407169342041, "debug/policy_rejected_logps": -111.81815338134766, "debug/reference_chosen_logps": -3.5323424339294434, "debug/reference_rejected_logps": -111.6783676147461, "epoch": 4.329710144927536, "grad_norm": 59.54592772621333, "learning_rate": 4.80550284629981e-08, "logits/chosen": 1.2072805166244507, "logits/rejected": 1.527407169342041, "logps/chosen": -3.519702911376953, "logps/rejected": -111.81815338134766, "loss": 0.6711, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.006319747772067785, "rewards/margins": 0.07621678709983826, "rewards/rejected": -0.06989704072475433, "step": 1195 }, { "debug/policy_chosen_logits": 1.8271970748901367, "debug/policy_chosen_logps": -9.577889442443848, "debug/policy_rejected_logits": 2.2353129386901855, "debug/policy_rejected_logps": -93.31974029541016, "debug/reference_chosen_logps": -9.619321823120117, "debug/reference_rejected_logps": -92.8711166381836, "epoch": 4.3478260869565215, "grad_norm": 37.55325101050477, "learning_rate": 4.781783681214421e-08, "logits/chosen": 1.8271970748901367, "logits/rejected": 2.2353129386901855, "logps/chosen": -9.577889442443848, "logps/rejected": -93.31974029541016, "loss": 0.6658, "rewards/accuracies": 0.375, "rewards/chosen": 0.020716574043035507, "rewards/margins": 0.24502527713775635, "rewards/rejected": -0.22430872917175293, "step": 1200 }, { "epoch": 4.3478260869565215, "eval_debug/policy_chosen_logits": 1.6381882429122925, "eval_debug/policy_chosen_logps": -122.7149887084961, "eval_debug/policy_rejected_logits": 1.6985112428665161, "eval_debug/policy_rejected_logps": -63.96799087524414, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6381882429122925, "eval_logits/rejected": 1.6985112428665161, "eval_logps/chosen": -122.7149887084961, "eval_logps/rejected": -63.96799087524414, "eval_loss": 0.6950154900550842, "eval_rewards/accuracies": 0.3552631437778473, "eval_rewards/chosen": 0.21652783453464508, "eval_rewards/margins": 0.2569939196109772, "eval_rewards/rejected": -0.0404660701751709, "eval_runtime": 28.6074, "eval_samples_per_second": 20.974, "eval_steps_per_second": 0.664, "step": 1200 }, { "debug/policy_chosen_logits": 1.2374722957611084, "debug/policy_chosen_logps": -104.5480728149414, "debug/policy_rejected_logits": 1.490818738937378, "debug/policy_rejected_logps": -119.3241958618164, "debug/reference_chosen_logps": -104.79878234863281, "debug/reference_rejected_logps": -119.53370666503906, "epoch": 4.365942028985507, "grad_norm": 248.791988897393, "learning_rate": 4.7580645161290323e-08, "logits/chosen": 1.2374722957611084, "logits/rejected": 1.490818738937378, "logps/chosen": -104.5480728149414, "logps/rejected": -119.3241958618164, "loss": 0.673, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.1253504455089569, "rewards/margins": 0.020594747737050056, "rewards/rejected": 0.104755699634552, "step": 1205 }, { "debug/policy_chosen_logits": 1.539039969444275, "debug/policy_chosen_logps": -138.03907775878906, "debug/policy_rejected_logits": 1.9208202362060547, "debug/policy_rejected_logps": -105.87274169921875, "debug/reference_chosen_logps": -138.21644592285156, "debug/reference_rejected_logps": -105.77984619140625, "epoch": 4.384057971014493, "grad_norm": 154.8737028118711, "learning_rate": 4.734345351043643e-08, "logits/chosen": 1.539039969444275, "logits/rejected": 1.9208202362060547, "logps/chosen": -138.03907775878906, "logps/rejected": -105.87274169921875, "loss": 0.6672, "rewards/accuracies": 0.375, "rewards/chosen": 0.08868501335382462, "rewards/margins": 0.13513876497745514, "rewards/rejected": -0.04645375534892082, "step": 1210 }, { "debug/policy_chosen_logits": 1.2730329036712646, "debug/policy_chosen_logps": -6.225523948669434, "debug/policy_rejected_logits": 1.3485596179962158, "debug/policy_rejected_logps": -2.8757498264312744, "debug/reference_chosen_logps": -6.278141021728516, "debug/reference_rejected_logps": -2.8032212257385254, "epoch": 4.4021739130434785, "grad_norm": 31.474765216150416, "learning_rate": 4.710626185958254e-08, "logits/chosen": 1.2730329036712646, "logits/rejected": 1.3485596179962158, "logps/chosen": -6.225523948669434, "logps/rejected": -2.8757498264312744, "loss": 0.6608, "rewards/accuracies": 0.375, "rewards/chosen": 0.026308486238121986, "rewards/margins": 0.06257269531488419, "rewards/rejected": -0.03626420348882675, "step": 1215 }, { "debug/policy_chosen_logits": 0.9204782247543335, "debug/policy_chosen_logps": -3.2213845252990723, "debug/policy_rejected_logits": 1.246864676475525, "debug/policy_rejected_logps": -6.587350368499756, "debug/reference_chosen_logps": -3.2298636436462402, "debug/reference_rejected_logps": -6.5863037109375, "epoch": 4.420289855072464, "grad_norm": 325.25604824734904, "learning_rate": 4.686907020872865e-08, "logits/chosen": 0.9204782247543335, "logits/rejected": 1.246864676475525, "logps/chosen": -3.2213845252990723, "logps/rejected": -6.587350368499756, "loss": 0.827, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.004239474423229694, "rewards/margins": 0.004762648139148951, "rewards/rejected": -0.000523173832334578, "step": 1220 }, { "debug/policy_chosen_logits": 1.2885674238204956, "debug/policy_chosen_logps": -102.65574645996094, "debug/policy_rejected_logits": 1.410846471786499, "debug/policy_rejected_logps": -66.18922424316406, "debug/reference_chosen_logps": -98.9661865234375, "debug/reference_rejected_logps": -63.7095832824707, "epoch": 4.438405797101449, "grad_norm": 466.7565878060821, "learning_rate": 4.6631878557874757e-08, "logits/chosen": 1.2885674238204956, "logits/rejected": 1.410846471786499, "logps/chosen": -102.65574645996094, "logps/rejected": -66.18922424316406, "loss": 1.599, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -1.8447787761688232, "rewards/margins": -0.6049600839614868, "rewards/rejected": -1.2398183345794678, "step": 1225 }, { "debug/policy_chosen_logits": 1.2755794525146484, "debug/policy_chosen_logps": -10.91869068145752, "debug/policy_rejected_logits": 1.6430355310440063, "debug/policy_rejected_logps": -42.71623992919922, "debug/reference_chosen_logps": -10.939241409301758, "debug/reference_rejected_logps": -44.20545196533203, "epoch": 4.456521739130435, "grad_norm": 625.7387752557272, "learning_rate": 4.639468690702087e-08, "logits/chosen": 1.2755794525146484, "logits/rejected": 1.6430355310440063, "logps/chosen": -10.91869068145752, "logps/rejected": -42.71623992919922, "loss": 1.3544, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.010275465436279774, "rewards/margins": -0.7343301177024841, "rewards/rejected": 0.7446056604385376, "step": 1230 }, { "debug/policy_chosen_logits": 1.3107882738113403, "debug/policy_chosen_logps": -44.55481719970703, "debug/policy_rejected_logits": 1.6620744466781616, "debug/policy_rejected_logps": -10.315018653869629, "debug/reference_chosen_logps": -43.48090744018555, "debug/reference_rejected_logps": -10.295641899108887, "epoch": 4.47463768115942, "grad_norm": 320.72407639820364, "learning_rate": 4.615749525616698e-08, "logits/chosen": 1.3107882738113403, "logits/rejected": 1.6620744466781616, "logps/chosen": -44.55481719970703, "logps/rejected": -10.315018653869629, "loss": 0.8607, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.5369553565979004, "rewards/margins": -0.5272666215896606, "rewards/rejected": -0.009688710793852806, "step": 1235 }, { "debug/policy_chosen_logits": 1.0817210674285889, "debug/policy_chosen_logps": -2.185075283050537, "debug/policy_rejected_logits": 1.7000091075897217, "debug/policy_rejected_logps": -7.065483093261719, "debug/reference_chosen_logps": -2.2591500282287598, "debug/reference_rejected_logps": -7.0815110206604, "epoch": 4.492753623188406, "grad_norm": 36.21387486534125, "learning_rate": 4.5920303605313095e-08, "logits/chosen": 1.0817210674285889, "logits/rejected": 1.7000091075897217, "logps/chosen": -2.185075283050537, "logps/rejected": -7.065483093261719, "loss": 0.6645, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.037037331610918045, "rewards/margins": 0.029023170471191406, "rewards/rejected": 0.008014163002371788, "step": 1240 }, { "debug/policy_chosen_logits": 1.407456636428833, "debug/policy_chosen_logps": -8.593276023864746, "debug/policy_rejected_logits": 1.58327054977417, "debug/policy_rejected_logps": -6.20754337310791, "debug/reference_chosen_logps": -8.682868957519531, "debug/reference_rejected_logps": -6.076182842254639, "epoch": 4.510869565217392, "grad_norm": 200.53393488140983, "learning_rate": 4.56831119544592e-08, "logits/chosen": 1.407456636428833, "logits/rejected": 1.58327054977417, "logps/chosen": -8.593276023864746, "logps/rejected": -6.20754337310791, "loss": 0.8141, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.04479644075036049, "rewards/margins": 0.11047659814357758, "rewards/rejected": -0.06568016111850739, "step": 1245 }, { "debug/policy_chosen_logits": 0.9900293350219727, "debug/policy_chosen_logps": -222.2896270751953, "debug/policy_rejected_logits": 1.4614320993423462, "debug/policy_rejected_logps": -3.5678131580352783, "debug/reference_chosen_logps": -222.7086181640625, "debug/reference_rejected_logps": -3.507493257522583, "epoch": 4.528985507246377, "grad_norm": 77.45340335491315, "learning_rate": 4.544592030360531e-08, "logits/chosen": 0.9900293350219727, "logits/rejected": 1.4614320993423462, "logps/chosen": -222.2896270751953, "logps/rejected": -3.5678131580352783, "loss": 0.6596, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.2095002830028534, "rewards/margins": 0.23966006934642792, "rewards/rejected": -0.03015979193150997, "step": 1250 }, { "debug/policy_chosen_logits": 1.2872966527938843, "debug/policy_chosen_logps": -60.451316833496094, "debug/policy_rejected_logits": 1.385683536529541, "debug/policy_rejected_logps": -75.03933715820312, "debug/reference_chosen_logps": -60.365997314453125, "debug/reference_rejected_logps": -74.52059173583984, "epoch": 4.547101449275362, "grad_norm": 114.82292193495736, "learning_rate": 4.520872865275142e-08, "logits/chosen": 1.2872966527938843, "logits/rejected": 1.385683536529541, "logps/chosen": -60.451316833496094, "logps/rejected": -75.03933715820312, "loss": 0.6819, "rewards/accuracies": 0.375, "rewards/chosen": -0.042658619582653046, "rewards/margins": 0.21671228110790253, "rewards/rejected": -0.259370893239975, "step": 1255 }, { "debug/policy_chosen_logits": 1.3871732950210571, "debug/policy_chosen_logps": -82.07464599609375, "debug/policy_rejected_logits": 1.4439096450805664, "debug/policy_rejected_logps": -46.68411636352539, "debug/reference_chosen_logps": -82.198974609375, "debug/reference_rejected_logps": -46.564697265625, "epoch": 4.565217391304348, "grad_norm": 520.7962064156875, "learning_rate": 4.497153700189753e-08, "logits/chosen": 1.3871732950210571, "logits/rejected": 1.4439096450805664, "logps/chosen": -82.07464599609375, "logps/rejected": -46.68411636352539, "loss": 0.7219, "rewards/accuracies": 0.375, "rewards/chosen": 0.062163155525922775, "rewards/margins": 0.12187274545431137, "rewards/rejected": -0.05970959737896919, "step": 1260 }, { "debug/policy_chosen_logits": 1.553961992263794, "debug/policy_chosen_logps": -60.15752029418945, "debug/policy_rejected_logits": 1.7329633235931396, "debug/policy_rejected_logps": -5.356936454772949, "debug/reference_chosen_logps": -60.350914001464844, "debug/reference_rejected_logps": -5.316500186920166, "epoch": 4.583333333333333, "grad_norm": 271.58111081234784, "learning_rate": 4.473434535104364e-08, "logits/chosen": 1.553961992263794, "logits/rejected": 1.7329633235931396, "logps/chosen": -60.15752029418945, "logps/rejected": -5.356936454772949, "loss": 0.6706, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.09669642150402069, "rewards/margins": 0.11691449582576752, "rewards/rejected": -0.02021809294819832, "step": 1265 }, { "debug/policy_chosen_logits": 1.5647668838500977, "debug/policy_chosen_logps": -7.702832221984863, "debug/policy_rejected_logits": 1.6691343784332275, "debug/policy_rejected_logps": -9.375310897827148, "debug/reference_chosen_logps": -7.772337436676025, "debug/reference_rejected_logps": -9.234907150268555, "epoch": 4.601449275362318, "grad_norm": 35.39778197739084, "learning_rate": 4.449715370018975e-08, "logits/chosen": 1.5647668838500977, "logits/rejected": 1.6691343784332275, "logps/chosen": -7.702832221984863, "logps/rejected": -9.375310897827148, "loss": 0.6685, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.03475252538919449, "rewards/margins": 0.10495438426733017, "rewards/rejected": -0.07020185887813568, "step": 1270 }, { "debug/policy_chosen_logits": 1.3239275217056274, "debug/policy_chosen_logps": -41.2127799987793, "debug/policy_rejected_logits": 1.6437804698944092, "debug/policy_rejected_logps": -171.32662963867188, "debug/reference_chosen_logps": -41.317928314208984, "debug/reference_rejected_logps": -171.03085327148438, "epoch": 4.619565217391305, "grad_norm": 31.499818331750344, "learning_rate": 4.4259962049335866e-08, "logits/chosen": 1.3239275217056274, "logits/rejected": 1.6437804698944092, "logps/chosen": -41.2127799987793, "logps/rejected": -171.32662963867188, "loss": 0.6591, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.05257601663470268, "rewards/margins": 0.20046229660511017, "rewards/rejected": -0.1478862762451172, "step": 1275 }, { "debug/policy_chosen_logits": 1.6109020709991455, "debug/policy_chosen_logps": -4.701529026031494, "debug/policy_rejected_logits": 1.505475640296936, "debug/policy_rejected_logps": -86.47359466552734, "debug/reference_chosen_logps": -4.780890464782715, "debug/reference_rejected_logps": -86.65788269042969, "epoch": 4.63768115942029, "grad_norm": 42.0055648248997, "learning_rate": 4.4022770398481974e-08, "logits/chosen": 1.6109020709991455, "logits/rejected": 1.505475640296936, "logps/chosen": -4.701529026031494, "logps/rejected": -86.47359466552734, "loss": 0.6811, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.03968062624335289, "rewards/margins": -0.05246232822537422, "rewards/rejected": 0.09214296191930771, "step": 1280 }, { "debug/policy_chosen_logits": 1.360306739807129, "debug/policy_chosen_logps": -175.38632202148438, "debug/policy_rejected_logits": 1.79587721824646, "debug/policy_rejected_logps": -4.262625217437744, "debug/reference_chosen_logps": -176.03570556640625, "debug/reference_rejected_logps": -4.204698085784912, "epoch": 4.655797101449275, "grad_norm": 276.0881144754807, "learning_rate": 4.378557874762808e-08, "logits/chosen": 1.360306739807129, "logits/rejected": 1.79587721824646, "logps/chosen": -175.38632202148438, "logps/rejected": -4.262625217437744, "loss": 0.6706, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.3246857225894928, "rewards/margins": 0.3536491394042969, "rewards/rejected": -0.02896338701248169, "step": 1285 }, { "debug/policy_chosen_logits": 1.234985113143921, "debug/policy_chosen_logps": -52.16318893432617, "debug/policy_rejected_logits": 1.650697946548462, "debug/policy_rejected_logps": -110.412109375, "debug/reference_chosen_logps": -52.22807693481445, "debug/reference_rejected_logps": -110.21329498291016, "epoch": 4.673913043478261, "grad_norm": 118.26822998265335, "learning_rate": 4.354838709677419e-08, "logits/chosen": 1.234985113143921, "logits/rejected": 1.650697946548462, "logps/chosen": -52.16318893432617, "logps/rejected": -110.412109375, "loss": 0.654, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.03244481980800629, "rewards/margins": 0.1318475902080536, "rewards/rejected": -0.0994027629494667, "step": 1290 }, { "debug/policy_chosen_logits": 1.392626166343689, "debug/policy_chosen_logps": -55.760215759277344, "debug/policy_rejected_logits": 1.910886526107788, "debug/policy_rejected_logps": -5.741093635559082, "debug/reference_chosen_logps": -55.77851486206055, "debug/reference_rejected_logps": -5.727991580963135, "epoch": 4.692028985507246, "grad_norm": 186.35174717784878, "learning_rate": 4.33111954459203e-08, "logits/chosen": 1.392626166343689, "logits/rejected": 1.910886526107788, "logps/chosen": -55.760215759277344, "logps/rejected": -5.741093635559082, "loss": 0.6758, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0091487942263484, "rewards/margins": 0.0156997200101614, "rewards/rejected": -0.006550925783813, "step": 1295 }, { "debug/policy_chosen_logits": 1.4467806816101074, "debug/policy_chosen_logps": -7.973025321960449, "debug/policy_rejected_logits": 1.5456690788269043, "debug/policy_rejected_logps": -103.21405029296875, "debug/reference_chosen_logps": -8.032278060913086, "debug/reference_rejected_logps": -102.90438079833984, "epoch": 4.710144927536232, "grad_norm": 40.87029015439756, "learning_rate": 4.307400379506641e-08, "logits/chosen": 1.4467806816101074, "logits/rejected": 1.5456690788269043, "logps/chosen": -7.973025321960449, "logps/rejected": -103.21405029296875, "loss": 0.6774, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.029626810923218727, "rewards/margins": 0.18445703387260437, "rewards/rejected": -0.1548302322626114, "step": 1300 }, { "epoch": 4.710144927536232, "eval_debug/policy_chosen_logits": 1.6370888948440552, "eval_debug/policy_chosen_logps": -122.5047836303711, "eval_debug/policy_rejected_logits": 1.6955863237380981, "eval_debug/policy_rejected_logps": -63.81239700317383, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6370888948440552, "eval_logits/rejected": 1.6955863237380981, "eval_logps/chosen": -122.5047836303711, "eval_logps/rejected": -63.81239700317383, "eval_loss": 0.6833045482635498, "eval_rewards/accuracies": 0.32894736528396606, "eval_rewards/chosen": 0.3216411769390106, "eval_rewards/margins": 0.28431209921836853, "eval_rewards/rejected": 0.03732903301715851, "eval_runtime": 28.5502, "eval_samples_per_second": 21.016, "eval_steps_per_second": 0.665, "step": 1300 }, { "debug/policy_chosen_logits": 1.2079617977142334, "debug/policy_chosen_logps": -40.498085021972656, "debug/policy_rejected_logits": 1.6058070659637451, "debug/policy_rejected_logps": -63.61951446533203, "debug/reference_chosen_logps": -40.53647994995117, "debug/reference_rejected_logps": -63.32228469848633, "epoch": 4.728260869565218, "grad_norm": 65.80241307393712, "learning_rate": 4.283681214421252e-08, "logits/chosen": 1.2079617977142334, "logits/rejected": 1.6058070659637451, "logps/chosen": -40.498085021972656, "logps/rejected": -63.61951446533203, "loss": 0.6443, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.019195493310689926, "rewards/margins": 0.16780886054039001, "rewards/rejected": -0.14861339330673218, "step": 1305 }, { "debug/policy_chosen_logits": 1.428055763244629, "debug/policy_chosen_logps": -3.3297653198242188, "debug/policy_rejected_logits": 1.6681495904922485, "debug/policy_rejected_logps": -5.380585670471191, "debug/reference_chosen_logps": -3.3413052558898926, "debug/reference_rejected_logps": -5.367804527282715, "epoch": 4.746376811594203, "grad_norm": 404.3967406317029, "learning_rate": 4.259962049335864e-08, "logits/chosen": 1.428055763244629, "logits/rejected": 1.6681495904922485, "logps/chosen": -3.3297653198242188, "logps/rejected": -5.380585670471191, "loss": 0.6791, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.005769933573901653, "rewards/margins": 0.01216028444468975, "rewards/rejected": -0.006390350870788097, "step": 1310 }, { "debug/policy_chosen_logits": 1.6879383325576782, "debug/policy_chosen_logps": -117.51094818115234, "debug/policy_rejected_logits": 1.998164176940918, "debug/policy_rejected_logps": -140.36463928222656, "debug/reference_chosen_logps": -118.21076965332031, "debug/reference_rejected_logps": -140.56314086914062, "epoch": 4.7644927536231885, "grad_norm": 52.44052231637535, "learning_rate": 4.2362428842504745e-08, "logits/chosen": 1.6879383325576782, "logits/rejected": 1.998164176940918, "logps/chosen": -117.51094818115234, "logps/rejected": -140.36463928222656, "loss": 0.6815, "rewards/accuracies": 0.375, "rewards/chosen": 0.349907785654068, "rewards/margins": 0.25066617131233215, "rewards/rejected": 0.09924156218767166, "step": 1315 }, { "debug/policy_chosen_logits": 1.2812038660049438, "debug/policy_chosen_logps": -106.45469665527344, "debug/policy_rejected_logits": 1.632421851158142, "debug/policy_rejected_logps": -87.32609558105469, "debug/reference_chosen_logps": -106.98687744140625, "debug/reference_rejected_logps": -87.4347152709961, "epoch": 4.782608695652174, "grad_norm": 58.73337330266924, "learning_rate": 4.2125237191650854e-08, "logits/chosen": 1.2812038660049438, "logits/rejected": 1.632421851158142, "logps/chosen": -106.45469665527344, "logps/rejected": -87.32609558105469, "loss": 0.7082, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.2660979926586151, "rewards/margins": 0.21178345382213593, "rewards/rejected": 0.054314516484737396, "step": 1320 }, { "debug/policy_chosen_logits": 1.34615957736969, "debug/policy_chosen_logps": -87.20893096923828, "debug/policy_rejected_logits": 1.5617353916168213, "debug/policy_rejected_logps": -4.647953987121582, "debug/reference_chosen_logps": -87.5677719116211, "debug/reference_rejected_logps": -4.6185102462768555, "epoch": 4.800724637681159, "grad_norm": 41.687748070055676, "learning_rate": 4.188804554079696e-08, "logits/chosen": 1.34615957736969, "logits/rejected": 1.5617353916168213, "logps/chosen": -87.20893096923828, "logps/rejected": -4.647953987121582, "loss": 0.6525, "rewards/accuracies": 0.375, "rewards/chosen": 0.17941974103450775, "rewards/margins": 0.19414183497428894, "rewards/rejected": -0.014722108840942383, "step": 1325 }, { "debug/policy_chosen_logits": 1.199203372001648, "debug/policy_chosen_logps": -9.825940132141113, "debug/policy_rejected_logits": 1.5372501611709595, "debug/policy_rejected_logps": -4.859073162078857, "debug/reference_chosen_logps": -9.850369453430176, "debug/reference_rejected_logps": -4.753626823425293, "epoch": 4.818840579710145, "grad_norm": 27.14671598468104, "learning_rate": 4.165085388994307e-08, "logits/chosen": 1.199203372001648, "logits/rejected": 1.5372501611709595, "logps/chosen": -9.825940132141113, "logps/rejected": -4.859073162078857, "loss": 0.6548, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.012214923277497292, "rewards/margins": 0.06493789702653885, "rewards/rejected": -0.05272297188639641, "step": 1330 }, { "debug/policy_chosen_logits": 1.4724206924438477, "debug/policy_chosen_logps": -64.66626739501953, "debug/policy_rejected_logits": 1.8178075551986694, "debug/policy_rejected_logps": -123.71893310546875, "debug/reference_chosen_logps": -64.95429992675781, "debug/reference_rejected_logps": -123.97188568115234, "epoch": 4.836956521739131, "grad_norm": 34.54144779807464, "learning_rate": 4.141366223908918e-08, "logits/chosen": 1.4724206924438477, "logits/rejected": 1.8178075551986694, "logps/chosen": -64.66626739501953, "logps/rejected": -123.71893310546875, "loss": 0.6754, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.14402173459529877, "rewards/margins": 0.01753334142267704, "rewards/rejected": 0.12648838758468628, "step": 1335 }, { "debug/policy_chosen_logits": 1.2929631471633911, "debug/policy_chosen_logps": -6.545241355895996, "debug/policy_rejected_logits": 1.7286803722381592, "debug/policy_rejected_logps": -41.2450065612793, "debug/reference_chosen_logps": -6.516819953918457, "debug/reference_rejected_logps": -41.23200225830078, "epoch": 4.855072463768116, "grad_norm": 60.522885921497654, "learning_rate": 4.1176470588235293e-08, "logits/chosen": 1.2929631471633911, "logits/rejected": 1.7286803722381592, "logps/chosen": -6.545241355895996, "logps/rejected": -41.2450065612793, "loss": 0.6718, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.01421084813773632, "rewards/margins": -0.007708053104579449, "rewards/rejected": -0.006502795033156872, "step": 1340 }, { "debug/policy_chosen_logits": 1.5119786262512207, "debug/policy_chosen_logps": -45.999019622802734, "debug/policy_rejected_logits": 1.855688452720642, "debug/policy_rejected_logps": -163.92227172851562, "debug/reference_chosen_logps": -46.10194778442383, "debug/reference_rejected_logps": -163.77232360839844, "epoch": 4.8731884057971016, "grad_norm": 165.89402673133958, "learning_rate": 4.09392789373814e-08, "logits/chosen": 1.5119786262512207, "logits/rejected": 1.855688452720642, "logps/chosen": -45.999019622802734, "logps/rejected": -163.92227172851562, "loss": 0.67, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.05146535113453865, "rewards/margins": 0.1264403611421585, "rewards/rejected": -0.07497499883174896, "step": 1345 }, { "debug/policy_chosen_logits": 1.3068609237670898, "debug/policy_chosen_logps": -87.71485137939453, "debug/policy_rejected_logits": 1.6485017538070679, "debug/policy_rejected_logps": -87.09721374511719, "debug/reference_chosen_logps": -87.97621154785156, "debug/reference_rejected_logps": -86.86080932617188, "epoch": 4.891304347826087, "grad_norm": 29.12471615653062, "learning_rate": 4.0702087286527517e-08, "logits/chosen": 1.3068609237670898, "logits/rejected": 1.6485017538070679, "logps/chosen": -87.71485137939453, "logps/rejected": -87.09721374511719, "loss": 0.6671, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.1306859850883484, "rewards/margins": 0.24888797104358673, "rewards/rejected": -0.11820198595523834, "step": 1350 }, { "debug/policy_chosen_logits": 1.4119784832000732, "debug/policy_chosen_logps": -90.49473571777344, "debug/policy_rejected_logits": 1.714742660522461, "debug/policy_rejected_logps": -48.9465446472168, "debug/reference_chosen_logps": -91.06925964355469, "debug/reference_rejected_logps": -48.99866485595703, "epoch": 4.909420289855072, "grad_norm": 34.97749829423541, "learning_rate": 4.0464895635673625e-08, "logits/chosen": 1.4119784832000732, "logits/rejected": 1.714742660522461, "logps/chosen": -90.49473571777344, "logps/rejected": -48.9465446472168, "loss": 0.6683, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.2872655987739563, "rewards/margins": 0.2612074017524719, "rewards/rejected": 0.026058191433548927, "step": 1355 }, { "debug/policy_chosen_logits": 1.213944435119629, "debug/policy_chosen_logps": -6.545996189117432, "debug/policy_rejected_logits": 1.7774651050567627, "debug/policy_rejected_logps": -223.02078247070312, "debug/reference_chosen_logps": -6.552236080169678, "debug/reference_rejected_logps": -222.52499389648438, "epoch": 4.927536231884058, "grad_norm": 37.1673091812925, "learning_rate": 4.022770398481973e-08, "logits/chosen": 1.213944435119629, "logits/rejected": 1.7774651050567627, "logps/chosen": -6.545996189117432, "logps/rejected": -223.02078247070312, "loss": 0.6522, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0031195133924484253, "rewards/margins": 0.25099384784698486, "rewards/rejected": -0.24787434935569763, "step": 1360 }, { "debug/policy_chosen_logits": 1.4161075353622437, "debug/policy_chosen_logps": -52.0767822265625, "debug/policy_rejected_logits": 1.5926083326339722, "debug/policy_rejected_logps": -10.879626274108887, "debug/reference_chosen_logps": -52.23685836791992, "debug/reference_rejected_logps": -10.776144027709961, "epoch": 4.945652173913043, "grad_norm": 28.589183013865505, "learning_rate": 3.999051233396584e-08, "logits/chosen": 1.4161075353622437, "logits/rejected": 1.5926083326339722, "logps/chosen": -52.0767822265625, "logps/rejected": -10.879626274108887, "loss": 0.6731, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.08003643155097961, "rewards/margins": 0.13177812099456787, "rewards/rejected": -0.05174170061945915, "step": 1365 }, { "debug/policy_chosen_logits": 1.2914502620697021, "debug/policy_chosen_logps": -3.4077377319335938, "debug/policy_rejected_logits": 1.732967734336853, "debug/policy_rejected_logps": -10.682971000671387, "debug/reference_chosen_logps": -3.437901258468628, "debug/reference_rejected_logps": -10.604125022888184, "epoch": 4.963768115942029, "grad_norm": 48.6679041368137, "learning_rate": 3.975332068311195e-08, "logits/chosen": 1.2914502620697021, "logits/rejected": 1.732967734336853, "logps/chosen": -3.4077377319335938, "logps/rejected": -10.682971000671387, "loss": 0.6516, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.015081921592354774, "rewards/margins": 0.05450447276234627, "rewards/rejected": -0.039422549307346344, "step": 1370 }, { "debug/policy_chosen_logits": 1.6448627710342407, "debug/policy_chosen_logps": -93.75178527832031, "debug/policy_rejected_logits": 1.877089262008667, "debug/policy_rejected_logps": -7.381670951843262, "debug/reference_chosen_logps": -93.96662139892578, "debug/reference_rejected_logps": -7.362084865570068, "epoch": 4.981884057971015, "grad_norm": 708.6077735385884, "learning_rate": 3.951612903225806e-08, "logits/chosen": 1.6448627710342407, "logits/rejected": 1.877089262008667, "logps/chosen": -93.75178527832031, "logps/rejected": -7.381670951843262, "loss": 0.668, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.10741843283176422, "rewards/margins": 0.11721190065145493, "rewards/rejected": -0.009793472476303577, "step": 1375 }, { "debug/policy_chosen_logits": 1.5920040607452393, "debug/policy_chosen_logps": -42.423702239990234, "debug/policy_rejected_logits": 1.7869011163711548, "debug/policy_rejected_logps": -42.71479034423828, "debug/reference_chosen_logps": -42.54169464111328, "debug/reference_rejected_logps": -42.27654266357422, "epoch": 5.0, "grad_norm": 86.19366466774764, "learning_rate": 3.927893738140417e-08, "logits/chosen": 1.5920040607452393, "logits/rejected": 1.7869011163711548, "logps/chosen": -42.423702239990234, "logps/rejected": -42.71479034423828, "loss": 0.6719, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.05899656563997269, "rewards/margins": 0.27812159061431885, "rewards/rejected": -0.21912503242492676, "step": 1380 }, { "debug/policy_chosen_logits": 1.2387285232543945, "debug/policy_chosen_logps": -42.7496337890625, "debug/policy_rejected_logits": 1.5183712244033813, "debug/policy_rejected_logps": -80.9970474243164, "debug/reference_chosen_logps": -42.970298767089844, "debug/reference_rejected_logps": -80.71112060546875, "epoch": 5.018115942028985, "grad_norm": 78.79995467296504, "learning_rate": 3.904174573055029e-08, "logits/chosen": 1.2387285232543945, "logits/rejected": 1.5183712244033813, "logps/chosen": -42.7496337890625, "logps/rejected": -80.9970474243164, "loss": 0.6557, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.11033084243535995, "rewards/margins": 0.2532910704612732, "rewards/rejected": -0.14296022057533264, "step": 1385 }, { "debug/policy_chosen_logits": 1.4641830921173096, "debug/policy_chosen_logps": -118.42779541015625, "debug/policy_rejected_logits": 1.8877878189086914, "debug/policy_rejected_logps": -59.512481689453125, "debug/reference_chosen_logps": -119.01092529296875, "debug/reference_rejected_logps": -59.480979919433594, "epoch": 5.036231884057971, "grad_norm": 48.06557089086541, "learning_rate": 3.8804554079696396e-08, "logits/chosen": 1.4641830921173096, "logits/rejected": 1.8877878189086914, "logps/chosen": -118.42779541015625, "logps/rejected": -59.512481689453125, "loss": 0.6545, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.2915671169757843, "rewards/margins": 0.30731692910194397, "rewards/rejected": -0.015749800950288773, "step": 1390 }, { "debug/policy_chosen_logits": 1.5759700536727905, "debug/policy_chosen_logps": -81.65776062011719, "debug/policy_rejected_logits": 2.0366058349609375, "debug/policy_rejected_logps": -69.65290069580078, "debug/reference_chosen_logps": -81.94474792480469, "debug/reference_rejected_logps": -69.47213745117188, "epoch": 5.054347826086956, "grad_norm": 24.834071702049574, "learning_rate": 3.8567362428842504e-08, "logits/chosen": 1.5759700536727905, "logits/rejected": 2.0366058349609375, "logps/chosen": -81.65776062011719, "logps/rejected": -69.65290069580078, "loss": 0.6564, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.14349479973316193, "rewards/margins": 0.2338804006576538, "rewards/rejected": -0.09038563072681427, "step": 1395 }, { "debug/policy_chosen_logits": 0.9414035677909851, "debug/policy_chosen_logps": -6.669172763824463, "debug/policy_rejected_logits": 1.4601836204528809, "debug/policy_rejected_logps": -8.895085334777832, "debug/reference_chosen_logps": -6.740177154541016, "debug/reference_rejected_logps": -8.824418067932129, "epoch": 5.072463768115942, "grad_norm": 64.90134619120597, "learning_rate": 3.833017077798861e-08, "logits/chosen": 0.9414035677909851, "logits/rejected": 1.4601836204528809, "logps/chosen": -6.669172763824463, "logps/rejected": -8.895085334777832, "loss": 0.6553, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.03550233691930771, "rewards/margins": 0.0708359107375145, "rewards/rejected": -0.035333577543497086, "step": 1400 }, { "epoch": 5.072463768115942, "eval_debug/policy_chosen_logits": 1.6324462890625, "eval_debug/policy_chosen_logps": -122.25032043457031, "eval_debug/policy_rejected_logits": 1.6925644874572754, "eval_debug/policy_rejected_logps": -63.86791229248047, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6324462890625, "eval_logits/rejected": 1.6925644874572754, "eval_logps/chosen": -122.25032043457031, "eval_logps/rejected": -63.86791229248047, "eval_loss": 0.6870798468589783, "eval_rewards/accuracies": 0.34210526943206787, "eval_rewards/chosen": 0.44886454939842224, "eval_rewards/margins": 0.4392945170402527, "eval_rewards/rejected": 0.009570048190653324, "eval_runtime": 28.5004, "eval_samples_per_second": 21.052, "eval_steps_per_second": 0.667, "step": 1400 }, { "debug/policy_chosen_logits": 1.2388232946395874, "debug/policy_chosen_logps": -7.173973083496094, "debug/policy_rejected_logits": 1.3482537269592285, "debug/policy_rejected_logps": -12.929254531860352, "debug/reference_chosen_logps": -7.235052585601807, "debug/reference_rejected_logps": -12.837717056274414, "epoch": 5.090579710144928, "grad_norm": 60.799049209895536, "learning_rate": 3.809297912713472e-08, "logits/chosen": 1.2388232946395874, "logits/rejected": 1.3482537269592285, "logps/chosen": -7.173973083496094, "logps/rejected": -12.929254531860352, "loss": 0.6569, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.030539561063051224, "rewards/margins": 0.07630815356969833, "rewards/rejected": -0.04576859250664711, "step": 1405 }, { "debug/policy_chosen_logits": 1.4136666059494019, "debug/policy_chosen_logps": -7.416630744934082, "debug/policy_rejected_logits": 1.695948839187622, "debug/policy_rejected_logps": -49.10722351074219, "debug/reference_chosen_logps": -7.446101188659668, "debug/reference_rejected_logps": -48.94523239135742, "epoch": 5.108695652173913, "grad_norm": 237.50980330912327, "learning_rate": 3.785578747628083e-08, "logits/chosen": 1.4136666059494019, "logits/rejected": 1.695948839187622, "logps/chosen": -7.416630744934082, "logps/rejected": -49.10722351074219, "loss": 0.6604, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.014735421165823936, "rewards/margins": 0.09573069959878922, "rewards/rejected": -0.08099526911973953, "step": 1410 }, { "debug/policy_chosen_logits": 1.4948642253875732, "debug/policy_chosen_logps": -37.65853500366211, "debug/policy_rejected_logits": 1.5326335430145264, "debug/policy_rejected_logps": -7.317802429199219, "debug/reference_chosen_logps": -37.77952194213867, "debug/reference_rejected_logps": -7.246865749359131, "epoch": 5.1268115942028984, "grad_norm": 36.29439316661414, "learning_rate": 3.7618595825426944e-08, "logits/chosen": 1.4948642253875732, "logits/rejected": 1.5326335430145264, "logps/chosen": -37.65853500366211, "logps/rejected": -7.317802429199219, "loss": 0.6627, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.06049184873700142, "rewards/margins": 0.09596039354801178, "rewards/rejected": -0.03546854108572006, "step": 1415 }, { "debug/policy_chosen_logits": 1.4747118949890137, "debug/policy_chosen_logps": -44.89406204223633, "debug/policy_rejected_logits": 1.751103401184082, "debug/policy_rejected_logps": -3.6775214672088623, "debug/reference_chosen_logps": -45.08138656616211, "debug/reference_rejected_logps": -3.631639003753662, "epoch": 5.144927536231884, "grad_norm": 61.5142584389856, "learning_rate": 3.738140417457305e-08, "logits/chosen": 1.4747118949890137, "logits/rejected": 1.751103401184082, "logps/chosen": -44.89406204223633, "logps/rejected": -3.6775214672088623, "loss": 0.6686, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.09366239607334137, "rewards/margins": 0.11660374701023102, "rewards/rejected": -0.02294134721159935, "step": 1420 }, { "debug/policy_chosen_logits": 1.1936651468276978, "debug/policy_chosen_logps": -9.350784301757812, "debug/policy_rejected_logits": 1.3846832513809204, "debug/policy_rejected_logps": -57.17119216918945, "debug/reference_chosen_logps": -9.44443416595459, "debug/reference_rejected_logps": -57.086700439453125, "epoch": 5.163043478260869, "grad_norm": 47.589017114917674, "learning_rate": 3.714421252371917e-08, "logits/chosen": 1.1936651468276978, "logits/rejected": 1.3846832513809204, "logps/chosen": -9.350784301757812, "logps/rejected": -57.17119216918945, "loss": 0.6515, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0468253456056118, "rewards/margins": 0.08907244354486465, "rewards/rejected": -0.04224709793925285, "step": 1425 }, { "debug/policy_chosen_logits": 1.2141838073730469, "debug/policy_chosen_logps": -136.44961547851562, "debug/policy_rejected_logits": 1.6318451166152954, "debug/policy_rejected_logps": -3.9245381355285645, "debug/reference_chosen_logps": -137.0669403076172, "debug/reference_rejected_logps": -3.9132537841796875, "epoch": 5.181159420289855, "grad_norm": 48.564234029835816, "learning_rate": 3.6907020872865276e-08, "logits/chosen": 1.2141838073730469, "logits/rejected": 1.6318451166152954, "logps/chosen": -136.44961547851562, "logps/rejected": -3.9245381355285645, "loss": 0.6544, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.308660089969635, "rewards/margins": 0.31430208683013916, "rewards/rejected": -0.0056419880129396915, "step": 1430 }, { "debug/policy_chosen_logits": 0.8902009725570679, "debug/policy_chosen_logps": -88.27565002441406, "debug/policy_rejected_logits": 1.3336741924285889, "debug/policy_rejected_logps": -114.299560546875, "debug/reference_chosen_logps": -88.36367797851562, "debug/reference_rejected_logps": -114.4028549194336, "epoch": 5.199275362318841, "grad_norm": 70.17721593231215, "learning_rate": 3.6669829222011384e-08, "logits/chosen": 0.8902009725570679, "logits/rejected": 1.3336741924285889, "logps/chosen": -88.27565002441406, "logps/rejected": -114.299560546875, "loss": 0.6616, "rewards/accuracies": 0.25, "rewards/chosen": 0.044019222259521484, "rewards/margins": -0.007624474354088306, "rewards/rejected": 0.05164368823170662, "step": 1435 }, { "debug/policy_chosen_logits": 1.372544527053833, "debug/policy_chosen_logps": -38.13840866088867, "debug/policy_rejected_logits": 1.4774370193481445, "debug/policy_rejected_logps": -98.13192749023438, "debug/reference_chosen_logps": -38.32720947265625, "debug/reference_rejected_logps": -97.77687072753906, "epoch": 5.217391304347826, "grad_norm": 36.558145359827364, "learning_rate": 3.643263757115749e-08, "logits/chosen": 1.372544527053833, "logits/rejected": 1.4774370193481445, "logps/chosen": -38.13840866088867, "logps/rejected": -98.13192749023438, "loss": 0.6557, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.09439970552921295, "rewards/margins": 0.2719293236732483, "rewards/rejected": -0.17752960324287415, "step": 1440 }, { "debug/policy_chosen_logits": 1.1531798839569092, "debug/policy_chosen_logps": -305.1312561035156, "debug/policy_rejected_logits": 1.8629705905914307, "debug/policy_rejected_logps": -75.52597045898438, "debug/reference_chosen_logps": -306.47906494140625, "debug/reference_rejected_logps": -73.67118835449219, "epoch": 5.2355072463768115, "grad_norm": 424.1473975862814, "learning_rate": 3.61954459203036e-08, "logits/chosen": 1.1531798839569092, "logits/rejected": 1.8629705905914307, "logps/chosen": -305.1312561035156, "logps/rejected": -75.52597045898438, "loss": 0.6592, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.6739075779914856, "rewards/margins": 1.6012996435165405, "rewards/rejected": -0.9273921251296997, "step": 1445 }, { "debug/policy_chosen_logits": 1.4649162292480469, "debug/policy_chosen_logps": -6.299330711364746, "debug/policy_rejected_logits": 1.802556037902832, "debug/policy_rejected_logps": -32.06026077270508, "debug/reference_chosen_logps": -6.336493492126465, "debug/reference_rejected_logps": -31.913803100585938, "epoch": 5.253623188405797, "grad_norm": 48.92537620517908, "learning_rate": 3.595825426944971e-08, "logits/chosen": 1.4649162292480469, "logits/rejected": 1.802556037902832, "logps/chosen": -6.299330711364746, "logps/rejected": -32.06026077270508, "loss": 0.6591, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.018581295385956764, "rewards/margins": 0.09181151539087296, "rewards/rejected": -0.07323022186756134, "step": 1450 }, { "debug/policy_chosen_logits": 1.0758881568908691, "debug/policy_chosen_logps": -5.92312479019165, "debug/policy_rejected_logits": 1.4213136434555054, "debug/policy_rejected_logps": -3.8541884422302246, "debug/reference_chosen_logps": -5.939286231994629, "debug/reference_rejected_logps": -3.8238983154296875, "epoch": 5.271739130434782, "grad_norm": 69.29316154490202, "learning_rate": 3.5721062618595824e-08, "logits/chosen": 1.0758881568908691, "logits/rejected": 1.4213136434555054, "logps/chosen": -5.92312479019165, "logps/rejected": -3.8541884422302246, "loss": 0.676, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.008080867119133472, "rewards/margins": 0.023225897923111916, "rewards/rejected": -0.015145030803978443, "step": 1455 }, { "debug/policy_chosen_logits": 1.149516224861145, "debug/policy_chosen_logps": -155.43740844726562, "debug/policy_rejected_logits": 1.4444801807403564, "debug/policy_rejected_logps": -46.121192932128906, "debug/reference_chosen_logps": -155.37582397460938, "debug/reference_rejected_logps": -45.943355560302734, "epoch": 5.2898550724637685, "grad_norm": 55.72586009874707, "learning_rate": 3.548387096774194e-08, "logits/chosen": 1.149516224861145, "logits/rejected": 1.4444801807403564, "logps/chosen": -155.43740844726562, "logps/rejected": -46.121192932128906, "loss": 0.6775, "rewards/accuracies": 0.375, "rewards/chosen": -0.030793100595474243, "rewards/margins": 0.05812593549489975, "rewards/rejected": -0.08891903609037399, "step": 1460 }, { "debug/policy_chosen_logits": 1.471667766571045, "debug/policy_chosen_logps": -5.1590576171875, "debug/policy_rejected_logits": 1.7072460651397705, "debug/policy_rejected_logps": -7.834895133972168, "debug/reference_chosen_logps": -5.220461368560791, "debug/reference_rejected_logps": -7.778662204742432, "epoch": 5.307971014492754, "grad_norm": 38.475221932596455, "learning_rate": 3.524667931688805e-08, "logits/chosen": 1.471667766571045, "logits/rejected": 1.7072460651397705, "logps/chosen": -5.1590576171875, "logps/rejected": -7.834895133972168, "loss": 0.661, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0307016484439373, "rewards/margins": 0.058818183839321136, "rewards/rejected": -0.028116535395383835, "step": 1465 }, { "debug/policy_chosen_logits": 1.2105424404144287, "debug/policy_chosen_logps": -4.774018287658691, "debug/policy_rejected_logits": 2.0813844203948975, "debug/policy_rejected_logps": -39.55461502075195, "debug/reference_chosen_logps": -4.839789390563965, "debug/reference_rejected_logps": -39.348365783691406, "epoch": 5.326086956521739, "grad_norm": 39.18317359085667, "learning_rate": 3.5009487666034155e-08, "logits/chosen": 1.2105424404144287, "logits/rejected": 2.0813844203948975, "logps/chosen": -4.774018287658691, "logps/rejected": -39.55461502075195, "loss": 0.648, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.03288550674915314, "rewards/margins": 0.1360102742910385, "rewards/rejected": -0.10312476009130478, "step": 1470 }, { "debug/policy_chosen_logits": 1.3450971841812134, "debug/policy_chosen_logps": -6.853333950042725, "debug/policy_rejected_logits": 1.6441389322280884, "debug/policy_rejected_logps": -11.630078315734863, "debug/reference_chosen_logps": -6.857815742492676, "debug/reference_rejected_logps": -11.570013999938965, "epoch": 5.344202898550725, "grad_norm": 37.47750078788712, "learning_rate": 3.4772296015180263e-08, "logits/chosen": 1.3450971841812134, "logits/rejected": 1.6441389322280884, "logps/chosen": -6.853333950042725, "logps/rejected": -11.630078315734863, "loss": 0.6608, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0022404552437365055, "rewards/margins": 0.03227285295724869, "rewards/rejected": -0.03003239631652832, "step": 1475 }, { "debug/policy_chosen_logits": 1.505319356918335, "debug/policy_chosen_logps": -91.32258605957031, "debug/policy_rejected_logits": 1.8216221332550049, "debug/policy_rejected_logps": -7.684873104095459, "debug/reference_chosen_logps": -91.68228912353516, "debug/reference_rejected_logps": -7.644742488861084, "epoch": 5.36231884057971, "grad_norm": 49.5349643329943, "learning_rate": 3.453510436432637e-08, "logits/chosen": 1.505319356918335, "logits/rejected": 1.8216221332550049, "logps/chosen": -91.32258605957031, "logps/rejected": -7.684873104095459, "loss": 0.6673, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.17985357344150543, "rewards/margins": 0.1999189853668213, "rewards/rejected": -0.02006540820002556, "step": 1480 }, { "debug/policy_chosen_logits": 1.2321274280548096, "debug/policy_chosen_logps": -1.5475294589996338, "debug/policy_rejected_logits": 1.423863172531128, "debug/policy_rejected_logps": -6.345267295837402, "debug/reference_chosen_logps": -1.591352939605713, "debug/reference_rejected_logps": -6.144689083099365, "epoch": 5.380434782608695, "grad_norm": 34.20840885445535, "learning_rate": 3.429791271347248e-08, "logits/chosen": 1.2321274280548096, "logits/rejected": 1.423863172531128, "logps/chosen": -1.5475294589996338, "logps/rejected": -6.345267295837402, "loss": 0.6563, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.02191174402832985, "rewards/margins": 0.12220092117786407, "rewards/rejected": -0.10028918087482452, "step": 1485 }, { "debug/policy_chosen_logits": 1.4869940280914307, "debug/policy_chosen_logps": -3.731186628341675, "debug/policy_rejected_logits": 1.7644668817520142, "debug/policy_rejected_logps": -5.1883063316345215, "debug/reference_chosen_logps": -3.7803280353546143, "debug/reference_rejected_logps": -5.147512435913086, "epoch": 5.398550724637682, "grad_norm": 183.2811967111881, "learning_rate": 3.4060721062618595e-08, "logits/chosen": 1.4869940280914307, "logits/rejected": 1.7644668817520142, "logps/chosen": -3.731186628341675, "logps/rejected": -5.1883063316345215, "loss": 0.6627, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.024570565670728683, "rewards/margins": 0.044967398047447205, "rewards/rejected": -0.02039683237671852, "step": 1490 }, { "debug/policy_chosen_logits": 1.4407232999801636, "debug/policy_chosen_logps": -98.67827606201172, "debug/policy_rejected_logits": 1.5844844579696655, "debug/policy_rejected_logps": -103.06256103515625, "debug/reference_chosen_logps": -98.66349792480469, "debug/reference_rejected_logps": -102.67134094238281, "epoch": 5.416666666666667, "grad_norm": 30.9356711395197, "learning_rate": 3.38235294117647e-08, "logits/chosen": 1.4407232999801636, "logits/rejected": 1.5844844579696655, "logps/chosen": -98.67827606201172, "logps/rejected": -103.06256103515625, "loss": 0.6804, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.007392148487269878, "rewards/margins": 0.18821898102760315, "rewards/rejected": -0.1956111490726471, "step": 1495 }, { "debug/policy_chosen_logits": 1.2020397186279297, "debug/policy_chosen_logps": -67.10565185546875, "debug/policy_rejected_logits": 1.4833542108535767, "debug/policy_rejected_logps": -3.8966896533966064, "debug/reference_chosen_logps": -67.42205810546875, "debug/reference_rejected_logps": -3.8296828269958496, "epoch": 5.434782608695652, "grad_norm": 260.687445882403, "learning_rate": 3.358633776091082e-08, "logits/chosen": 1.2020397186279297, "logits/rejected": 1.4833542108535767, "logps/chosen": -67.10565185546875, "logps/rejected": -3.8966896533966064, "loss": 0.655, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.1582057625055313, "rewards/margins": 0.19170907139778137, "rewards/rejected": -0.03350331634283066, "step": 1500 }, { "epoch": 5.434782608695652, "eval_debug/policy_chosen_logits": 1.6446142196655273, "eval_debug/policy_chosen_logps": -122.37462615966797, "eval_debug/policy_rejected_logits": 1.703676700592041, "eval_debug/policy_rejected_logps": -63.886287689208984, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6446142196655273, "eval_logits/rejected": 1.703676700592041, "eval_logps/chosen": -122.37462615966797, "eval_logps/rejected": -63.886287689208984, "eval_loss": 0.6900221705436707, "eval_rewards/accuracies": 0.3552631437778473, "eval_rewards/chosen": 0.38670632243156433, "eval_rewards/margins": 0.3863199055194855, "eval_rewards/rejected": 0.00038638082332909107, "eval_runtime": 28.5865, "eval_samples_per_second": 20.989, "eval_steps_per_second": 0.665, "step": 1500 }, { "debug/policy_chosen_logits": 1.3616105318069458, "debug/policy_chosen_logps": -92.06291198730469, "debug/policy_rejected_logits": 1.628366231918335, "debug/policy_rejected_logps": -9.634321212768555, "debug/reference_chosen_logps": -92.66380310058594, "debug/reference_rejected_logps": -9.599748611450195, "epoch": 5.452898550724638, "grad_norm": 91.62496513943545, "learning_rate": 3.3349146110056926e-08, "logits/chosen": 1.3616105318069458, "logits/rejected": 1.628366231918335, "logps/chosen": -92.06291198730469, "logps/rejected": -9.634321212768555, "loss": 0.6589, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.3004438281059265, "rewards/margins": 0.31772980093955994, "rewards/rejected": -0.017285970970988274, "step": 1505 }, { "debug/policy_chosen_logits": 1.332423448562622, "debug/policy_chosen_logps": -118.2754135131836, "debug/policy_rejected_logits": 1.7691797018051147, "debug/policy_rejected_logps": -3.7979729175567627, "debug/reference_chosen_logps": -119.2478256225586, "debug/reference_rejected_logps": -3.7696566581726074, "epoch": 5.471014492753623, "grad_norm": 36.92206401851464, "learning_rate": 3.3111954459203035e-08, "logits/chosen": 1.332423448562622, "logits/rejected": 1.7691797018051147, "logps/chosen": -118.2754135131836, "logps/rejected": -3.7979729175567627, "loss": 0.6613, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.48620694875717163, "rewards/margins": 0.5003648996353149, "rewards/rejected": -0.014157896861433983, "step": 1510 }, { "debug/policy_chosen_logits": 1.2442333698272705, "debug/policy_chosen_logps": -136.5766143798828, "debug/policy_rejected_logits": 1.6991926431655884, "debug/policy_rejected_logps": -3.1117019653320312, "debug/reference_chosen_logps": -137.1483917236328, "debug/reference_rejected_logps": -3.143246650695801, "epoch": 5.489130434782608, "grad_norm": 37.63687205429341, "learning_rate": 3.287476280834914e-08, "logits/chosen": 1.2442333698272705, "logits/rejected": 1.6991926431655884, "logps/chosen": -136.5766143798828, "logps/rejected": -3.1117019653320312, "loss": 0.661, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.2858816385269165, "rewards/margins": 0.2701093256473541, "rewards/rejected": 0.015772301703691483, "step": 1515 }, { "debug/policy_chosen_logits": 1.7487331628799438, "debug/policy_chosen_logps": -40.07701873779297, "debug/policy_rejected_logits": 2.039294958114624, "debug/policy_rejected_logps": -38.04768371582031, "debug/reference_chosen_logps": -40.2617301940918, "debug/reference_rejected_logps": -38.05076599121094, "epoch": 5.507246376811594, "grad_norm": 24.985282114418585, "learning_rate": 3.263757115749525e-08, "logits/chosen": 1.7487331628799438, "logits/rejected": 2.039294958114624, "logps/chosen": -40.07701873779297, "logps/rejected": -38.04768371582031, "loss": 0.6752, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0923568606376648, "rewards/margins": 0.09081549942493439, "rewards/rejected": 0.0015413642395287752, "step": 1520 }, { "debug/policy_chosen_logits": 1.422887921333313, "debug/policy_chosen_logps": -131.58580017089844, "debug/policy_rejected_logits": 1.6982561349868774, "debug/policy_rejected_logps": -5.473758697509766, "debug/reference_chosen_logps": -132.44015502929688, "debug/reference_rejected_logps": -5.4024763107299805, "epoch": 5.52536231884058, "grad_norm": 86.50074266664166, "learning_rate": 3.240037950664136e-08, "logits/chosen": 1.422887921333313, "logits/rejected": 1.6982561349868774, "logps/chosen": -131.58580017089844, "logps/rejected": -5.473758697509766, "loss": 0.6443, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.42716890573501587, "rewards/margins": 0.46280989050865173, "rewards/rejected": -0.03564102575182915, "step": 1525 }, { "debug/policy_chosen_logits": 0.9620069265365601, "debug/policy_chosen_logps": -3.563978672027588, "debug/policy_rejected_logits": 1.0636909008026123, "debug/policy_rejected_logps": -3.118100166320801, "debug/reference_chosen_logps": -3.6078414916992188, "debug/reference_rejected_logps": -3.0336270332336426, "epoch": 5.543478260869565, "grad_norm": 352.9411109661715, "learning_rate": 3.2163187855787474e-08, "logits/chosen": 0.9620069265365601, "logits/rejected": 1.0636909008026123, "logps/chosen": -3.563978672027588, "logps/rejected": -3.118100166320801, "loss": 0.6609, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.02193145826458931, "rewards/margins": 0.06416809558868408, "rewards/rejected": -0.04223664104938507, "step": 1530 }, { "debug/policy_chosen_logits": 1.2095394134521484, "debug/policy_chosen_logps": -5.086410999298096, "debug/policy_rejected_logits": 1.5882301330566406, "debug/policy_rejected_logps": -5.110785961151123, "debug/reference_chosen_logps": -5.104105472564697, "debug/reference_rejected_logps": -5.033654689788818, "epoch": 5.561594202898551, "grad_norm": 94.45741349755835, "learning_rate": 3.192599620493359e-08, "logits/chosen": 1.2095394134521484, "logits/rejected": 1.5882301330566406, "logps/chosen": -5.086410999298096, "logps/rejected": -5.110785961151123, "loss": 0.6603, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.00884726457297802, "rewards/margins": 0.0474126935005188, "rewards/rejected": -0.03856542706489563, "step": 1535 }, { "debug/policy_chosen_logits": 1.2204310894012451, "debug/policy_chosen_logps": -3.1711525917053223, "debug/policy_rejected_logits": 1.787411093711853, "debug/policy_rejected_logps": -39.81066131591797, "debug/reference_chosen_logps": -3.233112335205078, "debug/reference_rejected_logps": -39.82624816894531, "epoch": 5.579710144927536, "grad_norm": 83.44934938085363, "learning_rate": 3.16888045540797e-08, "logits/chosen": 1.2204310894012451, "logits/rejected": 1.787411093711853, "logps/chosen": -3.1711525917053223, "logps/rejected": -39.81066131591797, "loss": 0.6689, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.030979936942458153, "rewards/margins": 0.023183239623904228, "rewards/rejected": 0.0077966987155377865, "step": 1540 }, { "debug/policy_chosen_logits": 1.2455536127090454, "debug/policy_chosen_logps": -5.565741062164307, "debug/policy_rejected_logits": 1.5199828147888184, "debug/policy_rejected_logps": -82.37709045410156, "debug/reference_chosen_logps": -5.5925612449646, "debug/reference_rejected_logps": -82.02436828613281, "epoch": 5.5978260869565215, "grad_norm": 49.84846033971187, "learning_rate": 3.1451612903225806e-08, "logits/chosen": 1.2455536127090454, "logits/rejected": 1.5199828147888184, "logps/chosen": -5.565741062164307, "logps/rejected": -82.37709045410156, "loss": 0.6488, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.013410058803856373, "rewards/margins": 0.1897648572921753, "rewards/rejected": -0.1763547956943512, "step": 1545 }, { "debug/policy_chosen_logits": 1.3855087757110596, "debug/policy_chosen_logps": -44.71533966064453, "debug/policy_rejected_logits": 1.9909900426864624, "debug/policy_rejected_logps": -57.854209899902344, "debug/reference_chosen_logps": -44.77842712402344, "debug/reference_rejected_logps": -57.672447204589844, "epoch": 5.615942028985507, "grad_norm": 85.69575098502474, "learning_rate": 3.1214421252371914e-08, "logits/chosen": 1.3855087757110596, "logits/rejected": 1.9909900426864624, "logps/chosen": -44.71533966064453, "logps/rejected": -57.854209899902344, "loss": 0.6569, "rewards/accuracies": 0.25, "rewards/chosen": 0.03154348209500313, "rewards/margins": 0.12242207676172256, "rewards/rejected": -0.09087859839200974, "step": 1550 }, { "debug/policy_chosen_logits": 1.3234268426895142, "debug/policy_chosen_logps": -104.77967834472656, "debug/policy_rejected_logits": 1.7090914249420166, "debug/policy_rejected_logps": -5.021101951599121, "debug/reference_chosen_logps": -104.84139251708984, "debug/reference_rejected_logps": -4.836784362792969, "epoch": 5.634057971014493, "grad_norm": 43.41272359821495, "learning_rate": 3.097722960151802e-08, "logits/chosen": 1.3234268426895142, "logits/rejected": 1.7090914249420166, "logps/chosen": -104.77967834472656, "logps/rejected": -5.021101951599121, "loss": 0.666, "rewards/accuracies": 0.375, "rewards/chosen": 0.03086104616522789, "rewards/margins": 0.12301985919475555, "rewards/rejected": -0.09215881675481796, "step": 1555 }, { "debug/policy_chosen_logits": 1.4496822357177734, "debug/policy_chosen_logps": -144.7053680419922, "debug/policy_rejected_logits": 1.8354533910751343, "debug/policy_rejected_logps": -6.74813175201416, "debug/reference_chosen_logps": -144.6988067626953, "debug/reference_rejected_logps": -6.722146034240723, "epoch": 5.6521739130434785, "grad_norm": 356.96236099987294, "learning_rate": 3.074003795066413e-08, "logits/chosen": 1.4496822357177734, "logits/rejected": 1.8354533910751343, "logps/chosen": -144.7053680419922, "logps/rejected": -6.74813175201416, "loss": 0.6641, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.0032808303367346525, "rewards/margins": 0.009712534956634045, "rewards/rejected": -0.01299336552619934, "step": 1560 }, { "debug/policy_chosen_logits": 0.9579411745071411, "debug/policy_chosen_logps": -143.39065551757812, "debug/policy_rejected_logits": 1.2373861074447632, "debug/policy_rejected_logps": -3.99231219291687, "debug/reference_chosen_logps": -143.37928771972656, "debug/reference_rejected_logps": -3.9735684394836426, "epoch": 5.670289855072464, "grad_norm": 49.90162372299298, "learning_rate": 3.0502846299810246e-08, "logits/chosen": 0.9579411745071411, "logits/rejected": 1.2373861074447632, "logps/chosen": -143.39065551757812, "logps/rejected": -3.99231219291687, "loss": 0.6703, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.005685743875801563, "rewards/margins": 0.003686320735141635, "rewards/rejected": -0.009372064843773842, "step": 1565 }, { "debug/policy_chosen_logits": 1.5043195486068726, "debug/policy_chosen_logps": -36.65690231323242, "debug/policy_rejected_logits": 1.8350836038589478, "debug/policy_rejected_logps": -10.769933700561523, "debug/reference_chosen_logps": -36.733680725097656, "debug/reference_rejected_logps": -10.768438339233398, "epoch": 5.688405797101449, "grad_norm": 48.19204110573151, "learning_rate": 3.0265654648956354e-08, "logits/chosen": 1.5043195486068726, "logits/rejected": 1.8350836038589478, "logps/chosen": -36.65690231323242, "logps/rejected": -10.769933700561523, "loss": 0.6569, "rewards/accuracies": 0.375, "rewards/chosen": 0.03838967904448509, "rewards/margins": 0.0391378290951252, "rewards/rejected": -0.0007481470820493996, "step": 1570 }, { "debug/policy_chosen_logits": 1.6632654666900635, "debug/policy_chosen_logps": -127.93687438964844, "debug/policy_rejected_logits": 1.8194118738174438, "debug/policy_rejected_logps": -12.835029602050781, "debug/reference_chosen_logps": -128.343994140625, "debug/reference_rejected_logps": -12.645792007446289, "epoch": 5.706521739130435, "grad_norm": 31.49401959810334, "learning_rate": 3.002846299810247e-08, "logits/chosen": 1.6632654666900635, "logits/rejected": 1.8194118738174438, "logps/chosen": -127.93687438964844, "logps/rejected": -12.835029602050781, "loss": 0.6607, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.20356428623199463, "rewards/margins": 0.2981840670108795, "rewards/rejected": -0.09461978822946548, "step": 1575 }, { "debug/policy_chosen_logits": 1.511162281036377, "debug/policy_chosen_logps": -29.363107681274414, "debug/policy_rejected_logits": 1.8363853693008423, "debug/policy_rejected_logps": -112.23433685302734, "debug/reference_chosen_logps": -29.465951919555664, "debug/reference_rejected_logps": -111.66182708740234, "epoch": 5.72463768115942, "grad_norm": 33.83219527357827, "learning_rate": 2.9791271347248577e-08, "logits/chosen": 1.511162281036377, "logits/rejected": 1.8363853693008423, "logps/chosen": -29.363107681274414, "logps/rejected": -112.23433685302734, "loss": 0.6522, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.05142257362604141, "rewards/margins": 0.3376769423484802, "rewards/rejected": -0.286254346370697, "step": 1580 }, { "debug/policy_chosen_logits": 1.1692798137664795, "debug/policy_chosen_logps": -8.56626033782959, "debug/policy_rejected_logits": 1.626138687133789, "debug/policy_rejected_logps": -5.798595905303955, "debug/reference_chosen_logps": -8.66431999206543, "debug/reference_rejected_logps": -5.612049102783203, "epoch": 5.742753623188406, "grad_norm": 70.51924790694046, "learning_rate": 2.9554079696394685e-08, "logits/chosen": 1.1692798137664795, "logits/rejected": 1.626138687133789, "logps/chosen": -8.56626033782959, "logps/rejected": -5.798595905303955, "loss": 0.6522, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0490298718214035, "rewards/margins": 0.14230301976203918, "rewards/rejected": -0.09327313303947449, "step": 1585 }, { "debug/policy_chosen_logits": 1.1633007526397705, "debug/policy_chosen_logps": -4.2419962882995605, "debug/policy_rejected_logits": 1.8023815155029297, "debug/policy_rejected_logps": -4.186429023742676, "debug/reference_chosen_logps": -4.240781784057617, "debug/reference_rejected_logps": -4.2187275886535645, "epoch": 5.760869565217392, "grad_norm": 127.8087042746356, "learning_rate": 2.9316888045540794e-08, "logits/chosen": 1.1633007526397705, "logits/rejected": 1.8023815155029297, "logps/chosen": -4.2419962882995605, "logps/rejected": -4.186429023742676, "loss": 0.6667, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.0006070107338018715, "rewards/margins": -0.016756299883127213, "rewards/rejected": 0.016149288043379784, "step": 1590 }, { "debug/policy_chosen_logits": 1.1937873363494873, "debug/policy_chosen_logps": -5.156860828399658, "debug/policy_rejected_logits": 1.3993984460830688, "debug/policy_rejected_logps": -39.990150451660156, "debug/reference_chosen_logps": -5.168766021728516, "debug/reference_rejected_logps": -39.74993896484375, "epoch": 5.778985507246377, "grad_norm": 247.66666417372528, "learning_rate": 2.9079696394686902e-08, "logits/chosen": 1.1937873363494873, "logits/rejected": 1.3993984460830688, "logps/chosen": -5.156860828399658, "logps/rejected": -39.990150451660156, "loss": 0.665, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.005952669773250818, "rewards/margins": 0.12605887651443481, "rewards/rejected": -0.12010620534420013, "step": 1595 }, { "debug/policy_chosen_logits": 1.1740655899047852, "debug/policy_chosen_logps": -115.88630676269531, "debug/policy_rejected_logits": 1.5919690132141113, "debug/policy_rejected_logps": -38.514408111572266, "debug/reference_chosen_logps": -116.2649154663086, "debug/reference_rejected_logps": -38.35938262939453, "epoch": 5.797101449275362, "grad_norm": 35.38213998761934, "learning_rate": 2.8842504743833017e-08, "logits/chosen": 1.1740655899047852, "logits/rejected": 1.5919690132141113, "logps/chosen": -115.88630676269531, "logps/rejected": -38.514408111572266, "loss": 0.6552, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.18931356072425842, "rewards/margins": 0.266826331615448, "rewards/rejected": -0.07751278579235077, "step": 1600 }, { "epoch": 5.797101449275362, "eval_debug/policy_chosen_logits": 1.6342072486877441, "eval_debug/policy_chosen_logps": -122.5849380493164, "eval_debug/policy_rejected_logits": 1.6935113668441772, "eval_debug/policy_rejected_logps": -64.02359771728516, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6342072486877441, "eval_logits/rejected": 1.6935113668441772, "eval_logps/chosen": -122.5849380493164, "eval_logps/rejected": -64.02359771728516, "eval_loss": 0.698072075843811, "eval_rewards/accuracies": 0.31578946113586426, "eval_rewards/chosen": 0.2815607786178589, "eval_rewards/margins": 0.3498329222202301, "eval_rewards/rejected": -0.06827213615179062, "eval_runtime": 28.5734, "eval_samples_per_second": 20.999, "eval_steps_per_second": 0.665, "step": 1600 }, { "debug/policy_chosen_logits": 1.6218411922454834, "debug/policy_chosen_logps": -7.398769378662109, "debug/policy_rejected_logits": 1.6293132305145264, "debug/policy_rejected_logps": -3.3103396892547607, "debug/reference_chosen_logps": -7.392036437988281, "debug/reference_rejected_logps": -3.246826648712158, "epoch": 5.815217391304348, "grad_norm": 43.77144553745299, "learning_rate": 2.860531309297913e-08, "logits/chosen": 1.6218411922454834, "logits/rejected": 1.6293132305145264, "logps/chosen": -7.398769378662109, "logps/rejected": -3.3103396892547607, "loss": 0.6551, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.0033666223753243685, "rewards/margins": 0.028390079736709595, "rewards/rejected": -0.031756702810525894, "step": 1605 }, { "debug/policy_chosen_logits": 1.4865918159484863, "debug/policy_chosen_logps": -6.791438102722168, "debug/policy_rejected_logits": 1.654322862625122, "debug/policy_rejected_logps": -12.433537483215332, "debug/reference_chosen_logps": -6.868844509124756, "debug/reference_rejected_logps": -12.332254409790039, "epoch": 5.833333333333333, "grad_norm": 52.53108285967263, "learning_rate": 2.8368121442125237e-08, "logits/chosen": 1.4865918159484863, "logits/rejected": 1.654322862625122, "logps/chosen": -6.791438102722168, "logps/rejected": -12.433537483215332, "loss": 0.6588, "rewards/accuracies": 0.375, "rewards/chosen": 0.0387030653655529, "rewards/margins": 0.08934410661458969, "rewards/rejected": -0.050641048699617386, "step": 1610 }, { "debug/policy_chosen_logits": 1.2854838371276855, "debug/policy_chosen_logps": -181.76376342773438, "debug/policy_rejected_logits": 1.4061797857284546, "debug/policy_rejected_logps": -6.738494873046875, "debug/reference_chosen_logps": -182.20623779296875, "debug/reference_rejected_logps": -6.71105432510376, "epoch": 5.851449275362318, "grad_norm": 43.75071440435272, "learning_rate": 2.8130929791271345e-08, "logits/chosen": 1.2854838371276855, "logits/rejected": 1.4061797857284546, "logps/chosen": -181.76376342773438, "logps/rejected": -6.738494873046875, "loss": 0.6584, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.22123070061206818, "rewards/margins": 0.23495125770568848, "rewards/rejected": -0.013720574788749218, "step": 1615 }, { "debug/policy_chosen_logits": 1.1859219074249268, "debug/policy_chosen_logps": -81.61519622802734, "debug/policy_rejected_logits": 1.4593770503997803, "debug/policy_rejected_logps": -7.561013698577881, "debug/reference_chosen_logps": -82.38662719726562, "debug/reference_rejected_logps": -7.408883571624756, "epoch": 5.869565217391305, "grad_norm": 160.76692452433764, "learning_rate": 2.7893738140417457e-08, "logits/chosen": 1.1859219074249268, "logits/rejected": 1.4593770503997803, "logps/chosen": -81.61519622802734, "logps/rejected": -7.561013698577881, "loss": 0.6551, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.3857176899909973, "rewards/margins": 0.4617827832698822, "rewards/rejected": -0.0760650783777237, "step": 1620 }, { "debug/policy_chosen_logits": 1.4994169473648071, "debug/policy_chosen_logps": -3.7514541149139404, "debug/policy_rejected_logits": 1.7846410274505615, "debug/policy_rejected_logps": -3.720341444015503, "debug/reference_chosen_logps": -3.66951060295105, "debug/reference_rejected_logps": -3.7241806983947754, "epoch": 5.88768115942029, "grad_norm": 35.366220205795024, "learning_rate": 2.7656546489563565e-08, "logits/chosen": 1.4994169473648071, "logits/rejected": 1.7846410274505615, "logps/chosen": -3.7514541149139404, "logps/rejected": -3.720341444015503, "loss": 0.6679, "rewards/accuracies": 0.125, "rewards/chosen": -0.04097171127796173, "rewards/margins": -0.042891182005405426, "rewards/rejected": 0.0019194722408428788, "step": 1625 }, { "debug/policy_chosen_logits": 1.1910721063613892, "debug/policy_chosen_logps": -5.033883571624756, "debug/policy_rejected_logits": 1.8654645681381226, "debug/policy_rejected_logps": -7.776394844055176, "debug/reference_chosen_logps": -5.09005069732666, "debug/reference_rejected_logps": -7.698686122894287, "epoch": 5.905797101449275, "grad_norm": 58.68056363836774, "learning_rate": 2.7419354838709673e-08, "logits/chosen": 1.1910721063613892, "logits/rejected": 1.8654645681381226, "logps/chosen": -5.033883571624756, "logps/rejected": -7.776394844055176, "loss": 0.6822, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.028083667159080505, "rewards/margins": 0.06693761050701141, "rewards/rejected": -0.03885394334793091, "step": 1630 }, { "debug/policy_chosen_logits": 1.203753113746643, "debug/policy_chosen_logps": -104.66374206542969, "debug/policy_rejected_logits": 1.6841144561767578, "debug/policy_rejected_logps": -3.7886364459991455, "debug/reference_chosen_logps": -105.0621337890625, "debug/reference_rejected_logps": -3.808579921722412, "epoch": 5.923913043478261, "grad_norm": 42.054086060517086, "learning_rate": 2.7182163187855788e-08, "logits/chosen": 1.203753113746643, "logits/rejected": 1.6841144561767578, "logps/chosen": -104.66374206542969, "logps/rejected": -3.7886364459991455, "loss": 0.6647, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.19919805228710175, "rewards/margins": 0.1892261952161789, "rewards/rejected": 0.009971851482987404, "step": 1635 }, { "debug/policy_chosen_logits": 1.0750402212142944, "debug/policy_chosen_logps": -10.779031753540039, "debug/policy_rejected_logits": 1.147640585899353, "debug/policy_rejected_logps": -6.88559103012085, "debug/reference_chosen_logps": -10.813862800598145, "debug/reference_rejected_logps": -6.787085056304932, "epoch": 5.942028985507246, "grad_norm": 33.10862492424325, "learning_rate": 2.69449715370019e-08, "logits/chosen": 1.0750402212142944, "logits/rejected": 1.147640585899353, "logps/chosen": -10.779031753540039, "logps/rejected": -6.88559103012085, "loss": 0.6549, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.01741555891931057, "rewards/margins": 0.06666828691959381, "rewards/rejected": -0.04925272613763809, "step": 1640 }, { "debug/policy_chosen_logits": 1.478532075881958, "debug/policy_chosen_logps": -6.058830738067627, "debug/policy_rejected_logits": 1.9522411823272705, "debug/policy_rejected_logps": -7.184481620788574, "debug/reference_chosen_logps": -6.074235439300537, "debug/reference_rejected_logps": -7.147464752197266, "epoch": 5.960144927536232, "grad_norm": 84.67337647709797, "learning_rate": 2.6707779886148008e-08, "logits/chosen": 1.478532075881958, "logits/rejected": 1.9522411823272705, "logps/chosen": -6.058830738067627, "logps/rejected": -7.184481620788574, "loss": 0.6799, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.007702639792114496, "rewards/margins": 0.02621082402765751, "rewards/rejected": -0.0185081847012043, "step": 1645 }, { "debug/policy_chosen_logits": 1.3560972213745117, "debug/policy_chosen_logps": -187.38829040527344, "debug/policy_rejected_logits": 1.7562698125839233, "debug/policy_rejected_logps": -9.193543434143066, "debug/reference_chosen_logps": -187.60826110839844, "debug/reference_rejected_logps": -9.161067008972168, "epoch": 5.978260869565218, "grad_norm": 144.61657175961227, "learning_rate": 2.6470588235294116e-08, "logits/chosen": 1.3560972213745117, "logits/rejected": 1.7562698125839233, "logps/chosen": -187.38829040527344, "logps/rejected": -9.193543434143066, "loss": 0.6795, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.10998935997486115, "rewards/margins": 0.12622733414173126, "rewards/rejected": -0.016237974166870117, "step": 1650 }, { "debug/policy_chosen_logits": 1.181376338005066, "debug/policy_chosen_logps": -7.381121635437012, "debug/policy_rejected_logits": 1.3099758625030518, "debug/policy_rejected_logps": -85.78623962402344, "debug/reference_chosen_logps": -7.433223724365234, "debug/reference_rejected_logps": -85.72587585449219, "epoch": 5.996376811594203, "grad_norm": 40.41270296889673, "learning_rate": 2.6233396584440225e-08, "logits/chosen": 1.181376338005066, "logits/rejected": 1.3099758625030518, "logps/chosen": -7.381121635437012, "logps/rejected": -85.78623962402344, "loss": 0.6738, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.026050686836242676, "rewards/margins": 0.056230735033750534, "rewards/rejected": -0.03018004633486271, "step": 1655 }, { "debug/policy_chosen_logits": 1.5152583122253418, "debug/policy_chosen_logps": -66.83251190185547, "debug/policy_rejected_logits": 1.6339941024780273, "debug/policy_rejected_logps": -7.1771650314331055, "debug/reference_chosen_logps": -66.9842300415039, "debug/reference_rejected_logps": -7.152446746826172, "epoch": 6.0144927536231885, "grad_norm": 35.07248539252083, "learning_rate": 2.5996204933586336e-08, "logits/chosen": 1.5152583122253418, "logits/rejected": 1.6339941024780273, "logps/chosen": -66.83251190185547, "logps/rejected": -7.1771650314331055, "loss": 0.662, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.07585398852825165, "rewards/margins": 0.08821350336074829, "rewards/rejected": -0.012359514832496643, "step": 1660 }, { "debug/policy_chosen_logits": 1.3242194652557373, "debug/policy_chosen_logps": -84.23370361328125, "debug/policy_rejected_logits": 1.4188930988311768, "debug/policy_rejected_logps": -5.60372257232666, "debug/reference_chosen_logps": -84.40177154541016, "debug/reference_rejected_logps": -5.586115837097168, "epoch": 6.032608695652174, "grad_norm": 57.70589436197753, "learning_rate": 2.5759013282732444e-08, "logits/chosen": 1.3242194652557373, "logits/rejected": 1.4188930988311768, "logps/chosen": -84.23370361328125, "logps/rejected": -5.60372257232666, "loss": 0.6776, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.08403295278549194, "rewards/margins": 0.09283626079559326, "rewards/rejected": -0.008803312666714191, "step": 1665 }, { "debug/policy_chosen_logits": 1.185652732849121, "debug/policy_chosen_logps": -8.77514934539795, "debug/policy_rejected_logits": 1.5694773197174072, "debug/policy_rejected_logps": -9.982881546020508, "debug/reference_chosen_logps": -8.831872940063477, "debug/reference_rejected_logps": -9.832635879516602, "epoch": 6.050724637681159, "grad_norm": 36.34414879900724, "learning_rate": 2.5521821631878553e-08, "logits/chosen": 1.185652732849121, "logits/rejected": 1.5694773197174072, "logps/chosen": -8.77514934539795, "logps/rejected": -9.982881546020508, "loss": 0.6721, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.028361553326249123, "rewards/margins": 0.10348429530858994, "rewards/rejected": -0.07512273639440536, "step": 1670 }, { "debug/policy_chosen_logits": 1.2835001945495605, "debug/policy_chosen_logps": -8.858372688293457, "debug/policy_rejected_logits": 1.5510776042938232, "debug/policy_rejected_logps": -105.2295913696289, "debug/reference_chosen_logps": -8.899450302124023, "debug/reference_rejected_logps": -105.12371826171875, "epoch": 6.068840579710145, "grad_norm": 28.282549974961164, "learning_rate": 2.5284629981024668e-08, "logits/chosen": 1.2835001945495605, "logits/rejected": 1.5510776042938232, "logps/chosen": -8.858372688293457, "logps/rejected": -105.2295913696289, "loss": 0.6539, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.02053852006793022, "rewards/margins": 0.07347148656845093, "rewards/rejected": -0.052932966500520706, "step": 1675 }, { "debug/policy_chosen_logits": 1.1395282745361328, "debug/policy_chosen_logps": -34.65678787231445, "debug/policy_rejected_logits": 1.3821098804473877, "debug/policy_rejected_logps": -6.025293827056885, "debug/reference_chosen_logps": -34.69602584838867, "debug/reference_rejected_logps": -5.9452972412109375, "epoch": 6.086956521739131, "grad_norm": 359.3602317936761, "learning_rate": 2.504743833017078e-08, "logits/chosen": 1.1395282745361328, "logits/rejected": 1.3821098804473877, "logps/chosen": -34.65678787231445, "logps/rejected": -6.025293827056885, "loss": 0.6978, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.01961865834891796, "rewards/margins": 0.05961690470576286, "rewards/rejected": -0.03999824821949005, "step": 1680 }, { "debug/policy_chosen_logits": 1.256869912147522, "debug/policy_chosen_logps": -7.5263471603393555, "debug/policy_rejected_logits": 1.7314449548721313, "debug/policy_rejected_logps": -7.651271820068359, "debug/reference_chosen_logps": -7.592371463775635, "debug/reference_rejected_logps": -7.573750972747803, "epoch": 6.105072463768116, "grad_norm": 39.70251223938036, "learning_rate": 2.4810246679316887e-08, "logits/chosen": 1.256869912147522, "logits/rejected": 1.7314449548721313, "logps/chosen": -7.5263471603393555, "logps/rejected": -7.651271820068359, "loss": 0.6539, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.03301204368472099, "rewards/margins": 0.07177215814590454, "rewards/rejected": -0.03876011073589325, "step": 1685 }, { "debug/policy_chosen_logits": 1.3765307664871216, "debug/policy_chosen_logps": -122.90785217285156, "debug/policy_rejected_logits": 1.5451319217681885, "debug/policy_rejected_logps": -5.471409797668457, "debug/reference_chosen_logps": -123.18696594238281, "debug/reference_rejected_logps": -5.408583641052246, "epoch": 6.1231884057971016, "grad_norm": 385.9590751586315, "learning_rate": 2.4573055028462996e-08, "logits/chosen": 1.3765307664871216, "logits/rejected": 1.5451319217681885, "logps/chosen": -122.90785217285156, "logps/rejected": -5.471409797668457, "loss": 0.6875, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.13954779505729675, "rewards/margins": 0.1709609031677246, "rewards/rejected": -0.03141311928629875, "step": 1690 }, { "debug/policy_chosen_logits": 1.327089548110962, "debug/policy_chosen_logps": -5.8684401512146, "debug/policy_rejected_logits": 1.753175973892212, "debug/policy_rejected_logps": -4.861193656921387, "debug/reference_chosen_logps": -5.891690254211426, "debug/reference_rejected_logps": -4.724798679351807, "epoch": 6.141304347826087, "grad_norm": 33.73435101622891, "learning_rate": 2.4335863377609107e-08, "logits/chosen": 1.327089548110962, "logits/rejected": 1.753175973892212, "logps/chosen": -5.8684401512146, "logps/rejected": -4.861193656921387, "loss": 0.6445, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.011624842882156372, "rewards/margins": 0.07982231676578522, "rewards/rejected": -0.06819747388362885, "step": 1695 }, { "debug/policy_chosen_logits": 1.22604501247406, "debug/policy_chosen_logps": -5.180497646331787, "debug/policy_rejected_logits": 1.398335576057434, "debug/policy_rejected_logps": -5.645544528961182, "debug/reference_chosen_logps": -5.239500999450684, "debug/reference_rejected_logps": -5.618521213531494, "epoch": 6.159420289855072, "grad_norm": 31.831117523045197, "learning_rate": 2.409867172675522e-08, "logits/chosen": 1.22604501247406, "logits/rejected": 1.398335576057434, "logps/chosen": -5.180497646331787, "logps/rejected": -5.645544528961182, "loss": 0.6471, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.029501402750611305, "rewards/margins": 0.04301312938332558, "rewards/rejected": -0.013511726632714272, "step": 1700 }, { "epoch": 6.159420289855072, "eval_debug/policy_chosen_logits": 1.6384843587875366, "eval_debug/policy_chosen_logps": -122.4115219116211, "eval_debug/policy_rejected_logits": 1.6991726160049438, "eval_debug/policy_rejected_logps": -63.84629440307617, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6384843587875366, "eval_logits/rejected": 1.6991726160049438, "eval_logps/chosen": -122.4115219116211, "eval_logps/rejected": -63.84629440307617, "eval_loss": 0.7016617059707642, "eval_rewards/accuracies": 0.3552631437778473, "eval_rewards/chosen": 0.368269681930542, "eval_rewards/margins": 0.3478873670101166, "eval_rewards/rejected": 0.020382316783070564, "eval_runtime": 28.6888, "eval_samples_per_second": 20.914, "eval_steps_per_second": 0.662, "step": 1700 }, { "debug/policy_chosen_logits": 1.3456484079360962, "debug/policy_chosen_logps": -82.4546890258789, "debug/policy_rejected_logits": 1.7750904560089111, "debug/policy_rejected_logps": -16.662784576416016, "debug/reference_chosen_logps": -82.63568878173828, "debug/reference_rejected_logps": -16.59540557861328, "epoch": 6.177536231884058, "grad_norm": 35.71287906547146, "learning_rate": 2.3861480075901327e-08, "logits/chosen": 1.3456484079360962, "logits/rejected": 1.7750904560089111, "logps/chosen": -82.4546890258789, "logps/rejected": -16.662784576416016, "loss": 0.6539, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.09049810469150543, "rewards/margins": 0.12418758869171143, "rewards/rejected": -0.033689484000205994, "step": 1705 }, { "debug/policy_chosen_logits": 1.2774298191070557, "debug/policy_chosen_logps": -12.122432708740234, "debug/policy_rejected_logits": 1.7372058629989624, "debug/policy_rejected_logps": -27.708694458007812, "debug/reference_chosen_logps": -12.21338176727295, "debug/reference_rejected_logps": -27.601428985595703, "epoch": 6.195652173913044, "grad_norm": 93.82403978865119, "learning_rate": 2.3624288425047436e-08, "logits/chosen": 1.2774298191070557, "logits/rejected": 1.7372058629989624, "logps/chosen": -12.122432708740234, "logps/rejected": -27.708694458007812, "loss": 0.671, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.04547480493783951, "rewards/margins": 0.09910771250724792, "rewards/rejected": -0.05363290756940842, "step": 1710 }, { "debug/policy_chosen_logits": 1.0689574480056763, "debug/policy_chosen_logps": -93.77153778076172, "debug/policy_rejected_logits": 1.5067087411880493, "debug/policy_rejected_logps": -2.8954646587371826, "debug/reference_chosen_logps": -94.5040283203125, "debug/reference_rejected_logps": -2.808331251144409, "epoch": 6.213768115942029, "grad_norm": 382.40289768751535, "learning_rate": 2.3387096774193547e-08, "logits/chosen": 1.0689574480056763, "logits/rejected": 1.5067087411880493, "logps/chosen": -93.77153778076172, "logps/rejected": -2.8954646587371826, "loss": 0.6612, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.366253137588501, "rewards/margins": 0.4098196029663086, "rewards/rejected": -0.043566472828388214, "step": 1715 }, { "debug/policy_chosen_logits": 1.4415522813796997, "debug/policy_chosen_logps": -47.072845458984375, "debug/policy_rejected_logits": 1.559846043586731, "debug/policy_rejected_logps": -6.159904956817627, "debug/reference_chosen_logps": -47.269874572753906, "debug/reference_rejected_logps": -6.084237098693848, "epoch": 6.231884057971015, "grad_norm": 98.4758063577198, "learning_rate": 2.314990512333966e-08, "logits/chosen": 1.4415522813796997, "logits/rejected": 1.559846043586731, "logps/chosen": -47.072845458984375, "logps/rejected": -6.159904956817627, "loss": 0.6568, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.09851451218128204, "rewards/margins": 0.13634836673736572, "rewards/rejected": -0.03783385455608368, "step": 1720 }, { "debug/policy_chosen_logits": 1.4300625324249268, "debug/policy_chosen_logps": -5.330075263977051, "debug/policy_rejected_logits": 1.7048680782318115, "debug/policy_rejected_logps": -11.85203742980957, "debug/reference_chosen_logps": -5.359372138977051, "debug/reference_rejected_logps": -11.723997116088867, "epoch": 6.25, "grad_norm": 34.14986941334215, "learning_rate": 2.2912713472485767e-08, "logits/chosen": 1.4300625324249268, "logits/rejected": 1.7048680782318115, "logps/chosen": -5.330075263977051, "logps/rejected": -11.85203742980957, "loss": 0.6604, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.014648301526904106, "rewards/margins": 0.07866859436035156, "rewards/rejected": -0.06402029097080231, "step": 1725 }, { "debug/policy_chosen_logits": 1.4155908823013306, "debug/policy_chosen_logps": -6.200887203216553, "debug/policy_rejected_logits": 1.5919172763824463, "debug/policy_rejected_logps": -6.469831943511963, "debug/reference_chosen_logps": -6.293431758880615, "debug/reference_rejected_logps": -6.414093971252441, "epoch": 6.268115942028985, "grad_norm": 62.403481890316144, "learning_rate": 2.2675521821631875e-08, "logits/chosen": 1.4155908823013306, "logits/rejected": 1.5919172763824463, "logps/chosen": -6.200887203216553, "logps/rejected": -6.469831943511963, "loss": 0.6578, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.04627244547009468, "rewards/margins": 0.07414165884256363, "rewards/rejected": -0.02786921337246895, "step": 1730 }, { "debug/policy_chosen_logits": 1.2085551023483276, "debug/policy_chosen_logps": -9.333691596984863, "debug/policy_rejected_logits": 1.5498554706573486, "debug/policy_rejected_logps": -171.72164916992188, "debug/reference_chosen_logps": -9.423843383789062, "debug/reference_rejected_logps": -171.1486053466797, "epoch": 6.286231884057971, "grad_norm": 30.11318690226037, "learning_rate": 2.243833017077799e-08, "logits/chosen": 1.2085551023483276, "logits/rejected": 1.5498554706573486, "logps/chosen": -9.333691596984863, "logps/rejected": -171.72164916992188, "loss": 0.6492, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.04507599025964737, "rewards/margins": 0.3315882682800293, "rewards/rejected": -0.2865122854709625, "step": 1735 }, { "debug/policy_chosen_logits": 1.4618535041809082, "debug/policy_chosen_logps": -92.94352722167969, "debug/policy_rejected_logits": 1.8357000350952148, "debug/policy_rejected_logps": -47.09352493286133, "debug/reference_chosen_logps": -93.40971374511719, "debug/reference_rejected_logps": -47.21750259399414, "epoch": 6.304347826086957, "grad_norm": 31.748740995893296, "learning_rate": 2.22011385199241e-08, "logits/chosen": 1.4618535041809082, "logits/rejected": 1.8357000350952148, "logps/chosen": -92.94352722167969, "logps/rejected": -47.09352493286133, "loss": 0.6553, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.23309071362018585, "rewards/margins": 0.1711045503616333, "rewards/rejected": 0.06198614835739136, "step": 1740 }, { "debug/policy_chosen_logits": 1.0506945848464966, "debug/policy_chosen_logps": -28.91946792602539, "debug/policy_rejected_logits": 1.25243079662323, "debug/policy_rejected_logps": -7.529149532318115, "debug/reference_chosen_logps": -29.042627334594727, "debug/reference_rejected_logps": -7.397076606750488, "epoch": 6.322463768115942, "grad_norm": 37.47797922668427, "learning_rate": 2.1963946869070207e-08, "logits/chosen": 1.0506945848464966, "logits/rejected": 1.25243079662323, "logps/chosen": -28.91946792602539, "logps/rejected": -7.529149532318115, "loss": 0.6543, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.06157945841550827, "rewards/margins": 0.12761655449867249, "rewards/rejected": -0.06603709608316422, "step": 1745 }, { "debug/policy_chosen_logits": 1.4275844097137451, "debug/policy_chosen_logps": -67.59367370605469, "debug/policy_rejected_logits": 1.6424779891967773, "debug/policy_rejected_logps": -63.01472091674805, "debug/reference_chosen_logps": -67.7783203125, "debug/reference_rejected_logps": -62.94593048095703, "epoch": 6.340579710144928, "grad_norm": 127.64657723572913, "learning_rate": 2.1726755218216315e-08, "logits/chosen": 1.4275844097137451, "logits/rejected": 1.6424779891967773, "logps/chosen": -67.59367370605469, "logps/rejected": -63.01472091674805, "loss": 0.645, "rewards/accuracies": 0.375, "rewards/chosen": 0.09232205897569656, "rewards/margins": 0.12671837210655212, "rewards/rejected": -0.03439630940556526, "step": 1750 }, { "debug/policy_chosen_logits": 1.1399704217910767, "debug/policy_chosen_logps": -93.69328308105469, "debug/policy_rejected_logits": 1.3987802267074585, "debug/policy_rejected_logps": -7.957698822021484, "debug/reference_chosen_logps": -94.22650909423828, "debug/reference_rejected_logps": -7.949918270111084, "epoch": 6.358695652173913, "grad_norm": 38.64524095367147, "learning_rate": 2.148956356736243e-08, "logits/chosen": 1.1399704217910767, "logits/rejected": 1.3987802267074585, "logps/chosen": -93.69328308105469, "logps/rejected": -7.957698822021484, "loss": 0.664, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.26661059260368347, "rewards/margins": 0.27050095796585083, "rewards/rejected": -0.0038903176318854094, "step": 1755 }, { "debug/policy_chosen_logits": 1.3165624141693115, "debug/policy_chosen_logps": -77.65309143066406, "debug/policy_rejected_logits": 1.5861561298370361, "debug/policy_rejected_logps": -3.9581093788146973, "debug/reference_chosen_logps": -78.01481628417969, "debug/reference_rejected_logps": -3.9095966815948486, "epoch": 6.3768115942028984, "grad_norm": 53.145094977493976, "learning_rate": 2.1252371916508538e-08, "logits/chosen": 1.3165624141693115, "logits/rejected": 1.5861561298370361, "logps/chosen": -77.65309143066406, "logps/rejected": -3.9581093788146973, "loss": 0.6636, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.18086755275726318, "rewards/margins": 0.20512373745441437, "rewards/rejected": -0.024256208911538124, "step": 1760 }, { "debug/policy_chosen_logits": 1.5248353481292725, "debug/policy_chosen_logps": -81.12098693847656, "debug/policy_rejected_logits": 1.5897893905639648, "debug/policy_rejected_logps": -4.35104513168335, "debug/reference_chosen_logps": -81.39645385742188, "debug/reference_rejected_logps": -4.305384635925293, "epoch": 6.394927536231884, "grad_norm": 39.05867706167761, "learning_rate": 2.1015180265654647e-08, "logits/chosen": 1.5248353481292725, "logits/rejected": 1.5897893905639648, "logps/chosen": -81.12098693847656, "logps/rejected": -4.35104513168335, "loss": 0.6758, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.13773062825202942, "rewards/margins": 0.16056084632873535, "rewards/rejected": -0.022830242291092873, "step": 1765 }, { "debug/policy_chosen_logits": 1.283276081085205, "debug/policy_chosen_logps": -71.31474304199219, "debug/policy_rejected_logits": 1.7061437368392944, "debug/policy_rejected_logps": -136.7980194091797, "debug/reference_chosen_logps": -71.41020202636719, "debug/reference_rejected_logps": -136.6486053466797, "epoch": 6.413043478260869, "grad_norm": 103.31428318944523, "learning_rate": 2.0777988614800758e-08, "logits/chosen": 1.283276081085205, "logits/rejected": 1.7061437368392944, "logps/chosen": -71.31474304199219, "logps/rejected": -136.7980194091797, "loss": 0.6589, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.04772891849279404, "rewards/margins": 0.12242750078439713, "rewards/rejected": -0.07469858229160309, "step": 1770 }, { "debug/policy_chosen_logits": 1.4341970682144165, "debug/policy_chosen_logps": -112.47139739990234, "debug/policy_rejected_logits": 1.4577664136886597, "debug/policy_rejected_logps": -28.52488136291504, "debug/reference_chosen_logps": -112.68605041503906, "debug/reference_rejected_logps": -28.50644874572754, "epoch": 6.431159420289855, "grad_norm": 36.13179718197164, "learning_rate": 2.054079696394687e-08, "logits/chosen": 1.4341970682144165, "logits/rejected": 1.4577664136886597, "logps/chosen": -112.47139739990234, "logps/rejected": -28.52488136291504, "loss": 0.6574, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.10731756687164307, "rewards/margins": 0.11653383821249008, "rewards/rejected": -0.00921627413481474, "step": 1775 }, { "debug/policy_chosen_logits": 1.0640902519226074, "debug/policy_chosen_logps": -3.746899127960205, "debug/policy_rejected_logits": 1.4140526056289673, "debug/policy_rejected_logps": -10.00014591217041, "debug/reference_chosen_logps": -3.734755039215088, "debug/reference_rejected_logps": -9.857994079589844, "epoch": 6.449275362318841, "grad_norm": 91.7104352047138, "learning_rate": 2.0303605313092978e-08, "logits/chosen": 1.0640902519226074, "logits/rejected": 1.4140526056289673, "logps/chosen": -3.746899127960205, "logps/rejected": -10.00014591217041, "loss": 0.6657, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.0060721696354448795, "rewards/margins": 0.06500352174043655, "rewards/rejected": -0.0710756927728653, "step": 1780 }, { "debug/policy_chosen_logits": 1.6819137334823608, "debug/policy_chosen_logps": -57.0432014465332, "debug/policy_rejected_logits": 1.747323751449585, "debug/policy_rejected_logps": -109.24458312988281, "debug/reference_chosen_logps": -57.20923614501953, "debug/reference_rejected_logps": -108.96925354003906, "epoch": 6.467391304347826, "grad_norm": 80.44062538825038, "learning_rate": 2.0066413662239086e-08, "logits/chosen": 1.6819137334823608, "logits/rejected": 1.747323751449585, "logps/chosen": -57.0432014465332, "logps/rejected": -109.24458312988281, "loss": 0.6667, "rewards/accuracies": 0.375, "rewards/chosen": 0.08301866054534912, "rewards/margins": 0.22067275643348694, "rewards/rejected": -0.13765409588813782, "step": 1785 }, { "debug/policy_chosen_logits": 1.0586025714874268, "debug/policy_chosen_logps": -114.2347183227539, "debug/policy_rejected_logits": 1.410879135131836, "debug/policy_rejected_logps": -16.744754791259766, "debug/reference_chosen_logps": -114.78114318847656, "debug/reference_rejected_logps": -16.70585060119629, "epoch": 6.4855072463768115, "grad_norm": 43.13984071611652, "learning_rate": 1.9829222011385198e-08, "logits/chosen": 1.0586025714874268, "logits/rejected": 1.410879135131836, "logps/chosen": -114.2347183227539, "logps/rejected": -16.744754791259766, "loss": 0.6643, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.2732129395008087, "rewards/margins": 0.2926652431488037, "rewards/rejected": -0.019452307373285294, "step": 1790 }, { "debug/policy_chosen_logits": 1.3839597702026367, "debug/policy_chosen_logps": -176.83563232421875, "debug/policy_rejected_logits": 1.555235505104065, "debug/policy_rejected_logps": -196.59803771972656, "debug/reference_chosen_logps": -178.44227600097656, "debug/reference_rejected_logps": -196.77088928222656, "epoch": 6.503623188405797, "grad_norm": 75.41154936052064, "learning_rate": 1.959203036053131e-08, "logits/chosen": 1.3839597702026367, "logits/rejected": 1.555235505104065, "logps/chosen": -176.83563232421875, "logps/rejected": -196.59803771972656, "loss": 0.657, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.8033260107040405, "rewards/margins": 0.7168967723846436, "rewards/rejected": 0.08642923086881638, "step": 1795 }, { "debug/policy_chosen_logits": 1.3572170734405518, "debug/policy_chosen_logps": -134.22207641601562, "debug/policy_rejected_logits": 1.6565492153167725, "debug/policy_rejected_logps": -5.04932975769043, "debug/reference_chosen_logps": -134.6995391845703, "debug/reference_rejected_logps": -4.946457862854004, "epoch": 6.521739130434782, "grad_norm": 28.52698104844771, "learning_rate": 1.9354838709677418e-08, "logits/chosen": 1.3572170734405518, "logits/rejected": 1.6565492153167725, "logps/chosen": -134.22207641601562, "logps/rejected": -5.04932975769043, "loss": 0.6557, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.23872795701026917, "rewards/margins": 0.29016393423080444, "rewards/rejected": -0.05143599957227707, "step": 1800 }, { "epoch": 6.521739130434782, "eval_debug/policy_chosen_logits": 1.6336705684661865, "eval_debug/policy_chosen_logps": -122.61048126220703, "eval_debug/policy_rejected_logits": 1.6946860551834106, "eval_debug/policy_rejected_logps": -64.0820083618164, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6336705684661865, "eval_logits/rejected": 1.6946860551834106, "eval_logps/chosen": -122.61048126220703, "eval_logps/rejected": -64.0820083618164, "eval_loss": 0.6956868767738342, "eval_rewards/accuracies": 0.30263158679008484, "eval_rewards/chosen": 0.26878225803375244, "eval_rewards/margins": 0.3662572503089905, "eval_rewards/rejected": -0.09747497737407684, "eval_runtime": 28.6264, "eval_samples_per_second": 20.96, "eval_steps_per_second": 0.664, "step": 1800 }, { "debug/policy_chosen_logits": 1.2778838872909546, "debug/policy_chosen_logps": -103.6480941772461, "debug/policy_rejected_logits": 1.3229701519012451, "debug/policy_rejected_logps": -4.542574882507324, "debug/reference_chosen_logps": -104.10665130615234, "debug/reference_rejected_logps": -4.467912197113037, "epoch": 6.539855072463768, "grad_norm": 387.01878320961083, "learning_rate": 1.9117647058823526e-08, "logits/chosen": 1.2778838872909546, "logits/rejected": 1.3229701519012451, "logps/chosen": -103.6480941772461, "logps/rejected": -4.542574882507324, "loss": 0.6587, "rewards/accuracies": 0.375, "rewards/chosen": 0.22927561402320862, "rewards/margins": 0.26660701632499695, "rewards/rejected": -0.037331365048885345, "step": 1805 }, { "debug/policy_chosen_logits": 1.318593978881836, "debug/policy_chosen_logps": -94.01082611083984, "debug/policy_rejected_logits": 1.8537782430648804, "debug/policy_rejected_logps": -97.25322723388672, "debug/reference_chosen_logps": -94.25801849365234, "debug/reference_rejected_logps": -96.91804504394531, "epoch": 6.557971014492754, "grad_norm": 36.10907282037337, "learning_rate": 1.888045540796964e-08, "logits/chosen": 1.318593978881836, "logits/rejected": 1.8537782430648804, "logps/chosen": -94.01082611083984, "logps/rejected": -97.25322723388672, "loss": 0.6472, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.12359496206045151, "rewards/margins": 0.29118940234184265, "rewards/rejected": -0.16759443283081055, "step": 1810 }, { "debug/policy_chosen_logits": 1.7237064838409424, "debug/policy_chosen_logps": -6.793422698974609, "debug/policy_rejected_logits": 1.9384024143218994, "debug/policy_rejected_logps": -160.19265747070312, "debug/reference_chosen_logps": -6.899423122406006, "debug/reference_rejected_logps": -159.81626892089844, "epoch": 6.576086956521739, "grad_norm": 42.23756545844281, "learning_rate": 1.864326375711575e-08, "logits/chosen": 1.7237064838409424, "logits/rejected": 1.9384024143218994, "logps/chosen": -6.793422698974609, "logps/rejected": -160.19265747070312, "loss": 0.654, "rewards/accuracies": 0.5, "rewards/chosen": 0.053000371903181076, "rewards/margins": 0.2411905825138092, "rewards/rejected": -0.18819019198417664, "step": 1815 }, { "debug/policy_chosen_logits": 1.189234972000122, "debug/policy_chosen_logps": -6.713139533996582, "debug/policy_rejected_logits": 1.6182925701141357, "debug/policy_rejected_logps": -1.441576361656189, "debug/reference_chosen_logps": -6.754980564117432, "debug/reference_rejected_logps": -1.4245069026947021, "epoch": 6.594202898550725, "grad_norm": 199.27918547200593, "learning_rate": 1.8406072106261857e-08, "logits/chosen": 1.189234972000122, "logits/rejected": 1.6182925701141357, "logps/chosen": -6.713139533996582, "logps/rejected": -1.441576361656189, "loss": 0.6536, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.020920375362038612, "rewards/margins": 0.02945505641400814, "rewards/rejected": -0.008534681983292103, "step": 1820 }, { "debug/policy_chosen_logits": 1.125486135482788, "debug/policy_chosen_logps": -117.1451416015625, "debug/policy_rejected_logits": 1.5688053369522095, "debug/policy_rejected_logps": -29.204456329345703, "debug/reference_chosen_logps": -117.39445495605469, "debug/reference_rejected_logps": -28.985509872436523, "epoch": 6.61231884057971, "grad_norm": 37.05309043940236, "learning_rate": 1.816888045540797e-08, "logits/chosen": 1.125486135482788, "logits/rejected": 1.5688053369522095, "logps/chosen": -117.1451416015625, "logps/rejected": -29.204456329345703, "loss": 0.6447, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.12464497238397598, "rewards/margins": 0.23411846160888672, "rewards/rejected": -0.10947350412607193, "step": 1825 }, { "debug/policy_chosen_logits": 1.4344699382781982, "debug/policy_chosen_logps": -57.752655029296875, "debug/policy_rejected_logits": 1.7739328145980835, "debug/policy_rejected_logps": -11.272838592529297, "debug/reference_chosen_logps": -57.94880294799805, "debug/reference_rejected_logps": -11.025108337402344, "epoch": 6.630434782608695, "grad_norm": 37.78542105000992, "learning_rate": 1.793168880455408e-08, "logits/chosen": 1.4344699382781982, "logits/rejected": 1.7739328145980835, "logps/chosen": -57.752655029296875, "logps/rejected": -11.272838592529297, "loss": 0.6517, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.09807337075471878, "rewards/margins": 0.22193865478038788, "rewards/rejected": -0.1238652914762497, "step": 1830 }, { "debug/policy_chosen_logits": 1.6551039218902588, "debug/policy_chosen_logps": -186.6226043701172, "debug/policy_rejected_logits": 2.07128643989563, "debug/policy_rejected_logps": -39.71226501464844, "debug/reference_chosen_logps": -187.2967987060547, "debug/reference_rejected_logps": -39.620033264160156, "epoch": 6.648550724637682, "grad_norm": 30.169955240541245, "learning_rate": 1.769449715370019e-08, "logits/chosen": 1.6551039218902588, "logits/rejected": 2.07128643989563, "logps/chosen": -186.6226043701172, "logps/rejected": -39.71226501464844, "loss": 0.6585, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.3370964527130127, "rewards/margins": 0.3832097053527832, "rewards/rejected": -0.04611321911215782, "step": 1835 }, { "debug/policy_chosen_logits": 1.5274724960327148, "debug/policy_chosen_logps": -9.325301170349121, "debug/policy_rejected_logits": 1.7058852910995483, "debug/policy_rejected_logps": -60.226837158203125, "debug/reference_chosen_logps": -9.433355331420898, "debug/reference_rejected_logps": -60.15769577026367, "epoch": 6.666666666666667, "grad_norm": 34.759565309616235, "learning_rate": 1.7457305502846297e-08, "logits/chosen": 1.5274724960327148, "logits/rejected": 1.7058852910995483, "logps/chosen": -9.325301170349121, "logps/rejected": -60.226837158203125, "loss": 0.6457, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.054027192294597626, "rewards/margins": 0.0885983407497406, "rewards/rejected": -0.034571148455142975, "step": 1840 }, { "debug/policy_chosen_logits": 1.3474111557006836, "debug/policy_chosen_logps": -8.886063575744629, "debug/policy_rejected_logits": 1.3526195287704468, "debug/policy_rejected_logps": -49.64582443237305, "debug/reference_chosen_logps": -8.973352432250977, "debug/reference_rejected_logps": -49.62647247314453, "epoch": 6.684782608695652, "grad_norm": 79.47010292754901, "learning_rate": 1.722011385199241e-08, "logits/chosen": 1.3474111557006836, "logits/rejected": 1.3526195287704468, "logps/chosen": -8.886063575744629, "logps/rejected": -49.64582443237305, "loss": 0.644, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.04364483803510666, "rewards/margins": 0.053318120539188385, "rewards/rejected": -0.009673279710114002, "step": 1845 }, { "debug/policy_chosen_logits": 1.5600453615188599, "debug/policy_chosen_logps": -8.149175643920898, "debug/policy_rejected_logits": 1.695662260055542, "debug/policy_rejected_logps": -48.94252395629883, "debug/reference_chosen_logps": -8.133310317993164, "debug/reference_rejected_logps": -48.77833938598633, "epoch": 6.702898550724638, "grad_norm": 59.28313411237688, "learning_rate": 1.698292220113852e-08, "logits/chosen": 1.5600453615188599, "logits/rejected": 1.695662260055542, "logps/chosen": -8.149175643920898, "logps/rejected": -48.94252395629883, "loss": 0.6523, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.007933027110993862, "rewards/margins": 0.07415995001792908, "rewards/rejected": -0.08209298551082611, "step": 1850 }, { "debug/policy_chosen_logits": 1.237030267715454, "debug/policy_chosen_logps": -207.984375, "debug/policy_rejected_logits": 1.2847999334335327, "debug/policy_rejected_logps": -50.04624557495117, "debug/reference_chosen_logps": -207.99520874023438, "debug/reference_rejected_logps": -49.724403381347656, "epoch": 6.721014492753623, "grad_norm": 81.47516797791837, "learning_rate": 1.674573055028463e-08, "logits/chosen": 1.237030267715454, "logits/rejected": 1.2847999334335327, "logps/chosen": -207.984375, "logps/rejected": -50.04624557495117, "loss": 0.6586, "rewards/accuracies": 0.375, "rewards/chosen": 0.0054070292972028255, "rewards/margins": 0.16632738709449768, "rewards/rejected": -0.16092035174369812, "step": 1855 }, { "debug/policy_chosen_logits": 1.2854220867156982, "debug/policy_chosen_logps": -6.273829460144043, "debug/policy_rejected_logits": 1.474238395690918, "debug/policy_rejected_logps": -84.76202392578125, "debug/reference_chosen_logps": -6.3128814697265625, "debug/reference_rejected_logps": -84.64005279541016, "epoch": 6.739130434782608, "grad_norm": 214.65318966807024, "learning_rate": 1.650853889943074e-08, "logits/chosen": 1.2854220867156982, "logits/rejected": 1.474238395690918, "logps/chosen": -6.273829460144043, "logps/rejected": -84.76202392578125, "loss": 0.6642, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.019526397809386253, "rewards/margins": 0.08051058650016785, "rewards/rejected": -0.060984186828136444, "step": 1860 }, { "debug/policy_chosen_logits": 1.4894376993179321, "debug/policy_chosen_logps": -12.38526725769043, "debug/policy_rejected_logits": 1.6733038425445557, "debug/policy_rejected_logps": -31.791147232055664, "debug/reference_chosen_logps": -12.45335865020752, "debug/reference_rejected_logps": -31.73923683166504, "epoch": 6.757246376811594, "grad_norm": 46.41498347596156, "learning_rate": 1.627134724857685e-08, "logits/chosen": 1.4894376993179321, "logits/rejected": 1.6733038425445557, "logps/chosen": -12.38526725769043, "logps/rejected": -31.791147232055664, "loss": 0.6634, "rewards/accuracies": 0.375, "rewards/chosen": 0.03404510021209717, "rewards/margins": 0.06000005081295967, "rewards/rejected": -0.025954946875572205, "step": 1865 }, { "debug/policy_chosen_logits": 1.490206241607666, "debug/policy_chosen_logps": -6.490980625152588, "debug/policy_rejected_logits": 1.9461854696273804, "debug/policy_rejected_logps": -53.45317840576172, "debug/reference_chosen_logps": -6.544236183166504, "debug/reference_rejected_logps": -53.29387283325195, "epoch": 6.77536231884058, "grad_norm": 42.68539660626054, "learning_rate": 1.603415559772296e-08, "logits/chosen": 1.490206241607666, "logits/rejected": 1.9461854696273804, "logps/chosen": -6.490980625152588, "logps/rejected": -53.45317840576172, "loss": 0.652, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.02662719413638115, "rewards/margins": 0.10628092288970947, "rewards/rejected": -0.07965372502803802, "step": 1870 }, { "debug/policy_chosen_logits": 1.3330042362213135, "debug/policy_chosen_logps": -100.5584487915039, "debug/policy_rejected_logits": 1.5773370265960693, "debug/policy_rejected_logps": -68.95774841308594, "debug/reference_chosen_logps": -101.13746643066406, "debug/reference_rejected_logps": -68.70438385009766, "epoch": 6.793478260869565, "grad_norm": 52.27852614993406, "learning_rate": 1.579696394686907e-08, "logits/chosen": 1.3330042362213135, "logits/rejected": 1.5773370265960693, "logps/chosen": -100.5584487915039, "logps/rejected": -68.95774841308594, "loss": 0.6428, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.289511114358902, "rewards/margins": 0.41619640588760376, "rewards/rejected": -0.12668530642986298, "step": 1875 }, { "debug/policy_chosen_logits": 1.4957469701766968, "debug/policy_chosen_logps": -11.446759223937988, "debug/policy_rejected_logits": 1.7021844387054443, "debug/policy_rejected_logps": -5.8381757736206055, "debug/reference_chosen_logps": -11.4380521774292, "debug/reference_rejected_logps": -5.718893527984619, "epoch": 6.811594202898551, "grad_norm": 35.369623500615475, "learning_rate": 1.555977229601518e-08, "logits/chosen": 1.4957469701766968, "logits/rejected": 1.7021844387054443, "logps/chosen": -11.446759223937988, "logps/rejected": -5.8381757736206055, "loss": 0.6555, "rewards/accuracies": 0.375, "rewards/chosen": -0.00435307901352644, "rewards/margins": 0.05528787523508072, "rewards/rejected": -0.059640951454639435, "step": 1880 }, { "debug/policy_chosen_logits": 1.4213391542434692, "debug/policy_chosen_logps": -4.652745246887207, "debug/policy_rejected_logits": 1.5516124963760376, "debug/policy_rejected_logps": -5.292793273925781, "debug/reference_chosen_logps": -4.665145397186279, "debug/reference_rejected_logps": -5.2398681640625, "epoch": 6.829710144927536, "grad_norm": 45.74579267359909, "learning_rate": 1.532258064516129e-08, "logits/chosen": 1.4213391542434692, "logits/rejected": 1.5516124963760376, "logps/chosen": -4.652745246887207, "logps/rejected": -5.292793273925781, "loss": 0.6493, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.006199912633746862, "rewards/margins": 0.03266260772943497, "rewards/rejected": -0.026462692767381668, "step": 1885 }, { "debug/policy_chosen_logits": 1.4781081676483154, "debug/policy_chosen_logps": -106.99214172363281, "debug/policy_rejected_logits": 1.7082345485687256, "debug/policy_rejected_logps": -3.2737228870391846, "debug/reference_chosen_logps": -107.45467376708984, "debug/reference_rejected_logps": -3.288224458694458, "epoch": 6.8478260869565215, "grad_norm": 34.85206480735712, "learning_rate": 1.50853889943074e-08, "logits/chosen": 1.4781081676483154, "logits/rejected": 1.7082345485687256, "logps/chosen": -106.99214172363281, "logps/rejected": -3.2737228870391846, "loss": 0.6537, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.23126187920570374, "rewards/margins": 0.22401121258735657, "rewards/rejected": 0.007250678725540638, "step": 1890 }, { "debug/policy_chosen_logits": 1.52455735206604, "debug/policy_chosen_logps": -58.93330001831055, "debug/policy_rejected_logits": 1.7860231399536133, "debug/policy_rejected_logps": -65.43986511230469, "debug/reference_chosen_logps": -59.187416076660156, "debug/reference_rejected_logps": -65.53862762451172, "epoch": 6.865942028985507, "grad_norm": 38.43122998656183, "learning_rate": 1.4848197343453508e-08, "logits/chosen": 1.52455735206604, "logits/rejected": 1.7860231399536133, "logps/chosen": -58.93330001831055, "logps/rejected": -65.43986511230469, "loss": 0.6563, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.12705127894878387, "rewards/margins": 0.07767374813556671, "rewards/rejected": 0.04937754198908806, "step": 1895 }, { "debug/policy_chosen_logits": 1.5747568607330322, "debug/policy_chosen_logps": -7.633500576019287, "debug/policy_rejected_logits": 1.8503799438476562, "debug/policy_rejected_logps": -9.252070426940918, "debug/reference_chosen_logps": -7.718649387359619, "debug/reference_rejected_logps": -9.06689739227295, "epoch": 6.884057971014493, "grad_norm": 72.93004664720283, "learning_rate": 1.461100569259962e-08, "logits/chosen": 1.5747568607330322, "logits/rejected": 1.8503799438476562, "logps/chosen": -7.633500576019287, "logps/rejected": -9.252070426940918, "loss": 0.6516, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.04257441312074661, "rewards/margins": 0.13516095280647278, "rewards/rejected": -0.09258653223514557, "step": 1900 }, { "epoch": 6.884057971014493, "eval_debug/policy_chosen_logits": 1.6399821043014526, "eval_debug/policy_chosen_logps": -122.36705780029297, "eval_debug/policy_rejected_logits": 1.7001841068267822, "eval_debug/policy_rejected_logps": -63.87035369873047, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6399821043014526, "eval_logits/rejected": 1.7001841068267822, "eval_logps/chosen": -122.36705780029297, "eval_logps/rejected": -63.87035369873047, "eval_loss": 0.6872497797012329, "eval_rewards/accuracies": 0.3552631437778473, "eval_rewards/chosen": 0.3905019760131836, "eval_rewards/margins": 0.38214704394340515, "eval_rewards/rejected": 0.008354968391358852, "eval_runtime": 28.5555, "eval_samples_per_second": 21.012, "eval_steps_per_second": 0.665, "step": 1900 }, { "debug/policy_chosen_logits": 1.2882137298583984, "debug/policy_chosen_logps": -197.25997924804688, "debug/policy_rejected_logits": 1.6784976720809937, "debug/policy_rejected_logps": -8.449228286743164, "debug/reference_chosen_logps": -197.6197967529297, "debug/reference_rejected_logps": -8.366586685180664, "epoch": 6.9021739130434785, "grad_norm": 488.03790116979616, "learning_rate": 1.437381404174573e-08, "logits/chosen": 1.2882137298583984, "logits/rejected": 1.6784976720809937, "logps/chosen": -197.25997924804688, "logps/rejected": -8.449228286743164, "loss": 0.6677, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.17990939319133759, "rewards/margins": 0.22123003005981445, "rewards/rejected": -0.041320621967315674, "step": 1905 }, { "debug/policy_chosen_logits": 1.4524803161621094, "debug/policy_chosen_logps": -18.3839054107666, "debug/policy_rejected_logits": 1.8204071521759033, "debug/policy_rejected_logps": -9.69588851928711, "debug/reference_chosen_logps": -18.452165603637695, "debug/reference_rejected_logps": -9.541703224182129, "epoch": 6.920289855072464, "grad_norm": 46.21606320607923, "learning_rate": 1.413662239089184e-08, "logits/chosen": 1.4524803161621094, "logits/rejected": 1.8204071521759033, "logps/chosen": -18.3839054107666, "logps/rejected": -9.69588851928711, "loss": 0.6736, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.03413000702857971, "rewards/margins": 0.11122287809848785, "rewards/rejected": -0.07709287106990814, "step": 1910 }, { "debug/policy_chosen_logits": 1.1719087362289429, "debug/policy_chosen_logps": -115.2168960571289, "debug/policy_rejected_logits": 1.4648483991622925, "debug/policy_rejected_logps": -3.8881168365478516, "debug/reference_chosen_logps": -115.864013671875, "debug/reference_rejected_logps": -3.795975923538208, "epoch": 6.938405797101449, "grad_norm": 37.46778747856099, "learning_rate": 1.3899430740037951e-08, "logits/chosen": 1.1719087362289429, "logits/rejected": 1.4648483991622925, "logps/chosen": -115.2168960571289, "logps/rejected": -3.8881168365478516, "loss": 0.6474, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.32355502247810364, "rewards/margins": 0.369625449180603, "rewards/rejected": -0.04607042297720909, "step": 1915 }, { "debug/policy_chosen_logits": 1.3939597606658936, "debug/policy_chosen_logps": -7.001612663269043, "debug/policy_rejected_logits": 1.378764033317566, "debug/policy_rejected_logps": -103.198974609375, "debug/reference_chosen_logps": -6.958459377288818, "debug/reference_rejected_logps": -103.35543060302734, "epoch": 6.956521739130435, "grad_norm": 36.49138119652407, "learning_rate": 1.3662239089184061e-08, "logits/chosen": 1.3939597606658936, "logits/rejected": 1.378764033317566, "logps/chosen": -7.001612663269043, "logps/rejected": -103.198974609375, "loss": 0.6783, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.021576717495918274, "rewards/margins": -0.09981317818164825, "rewards/rejected": 0.07823643833398819, "step": 1920 }, { "debug/policy_chosen_logits": 1.6214466094970703, "debug/policy_chosen_logps": -114.3349380493164, "debug/policy_rejected_logits": 1.9548161029815674, "debug/policy_rejected_logps": -11.46960163116455, "debug/reference_chosen_logps": -114.66383361816406, "debug/reference_rejected_logps": -11.365381240844727, "epoch": 6.97463768115942, "grad_norm": 31.9262823573379, "learning_rate": 1.342504743833017e-08, "logits/chosen": 1.6214466094970703, "logits/rejected": 1.9548161029815674, "logps/chosen": -114.3349380493164, "logps/rejected": -11.46960163116455, "loss": 0.6471, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.16445250809192657, "rewards/margins": 0.21656255424022675, "rewards/rejected": -0.05211006477475166, "step": 1925 }, { "debug/policy_chosen_logits": 1.5135091543197632, "debug/policy_chosen_logps": -7.79807186126709, "debug/policy_rejected_logits": 1.948464035987854, "debug/policy_rejected_logps": -9.86213207244873, "debug/reference_chosen_logps": -7.876543998718262, "debug/reference_rejected_logps": -9.774301528930664, "epoch": 6.992753623188406, "grad_norm": 62.027667859664746, "learning_rate": 1.318785578747628e-08, "logits/chosen": 1.5135091543197632, "logits/rejected": 1.948464035987854, "logps/chosen": -7.79807186126709, "logps/rejected": -9.86213207244873, "loss": 0.6535, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.03923585265874863, "rewards/margins": 0.08315081894397736, "rewards/rejected": -0.04391496628522873, "step": 1930 }, { "debug/policy_chosen_logits": 1.1846020221710205, "debug/policy_chosen_logps": -75.94058990478516, "debug/policy_rejected_logits": 1.396977424621582, "debug/policy_rejected_logps": -95.08988952636719, "debug/reference_chosen_logps": -76.09022521972656, "debug/reference_rejected_logps": -94.5392074584961, "epoch": 7.010869565217392, "grad_norm": 104.15026287538193, "learning_rate": 1.2950664136622391e-08, "logits/chosen": 1.1846020221710205, "logits/rejected": 1.396977424621582, "logps/chosen": -75.94058990478516, "logps/rejected": -95.08988952636719, "loss": 0.643, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.07481960952281952, "rewards/margins": 0.3501637876033783, "rewards/rejected": -0.27534419298171997, "step": 1935 }, { "debug/policy_chosen_logits": 1.5505082607269287, "debug/policy_chosen_logps": -8.709111213684082, "debug/policy_rejected_logits": 1.6560828685760498, "debug/policy_rejected_logps": -11.377399444580078, "debug/reference_chosen_logps": -8.80463981628418, "debug/reference_rejected_logps": -11.33636474609375, "epoch": 7.028985507246377, "grad_norm": 34.642870917001055, "learning_rate": 1.2713472485768501e-08, "logits/chosen": 1.5505082607269287, "logits/rejected": 1.6560828685760498, "logps/chosen": -8.709111213684082, "logps/rejected": -11.377399444580078, "loss": 0.6551, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.04776417464017868, "rewards/margins": 0.0682816207408905, "rewards/rejected": -0.020517444238066673, "step": 1940 }, { "debug/policy_chosen_logits": 1.3253029584884644, "debug/policy_chosen_logps": -71.88716125488281, "debug/policy_rejected_logits": 1.733841896057129, "debug/policy_rejected_logps": -24.40646743774414, "debug/reference_chosen_logps": -72.51323699951172, "debug/reference_rejected_logps": -24.24868392944336, "epoch": 7.047101449275362, "grad_norm": 40.04789211738999, "learning_rate": 1.247628083491461e-08, "logits/chosen": 1.3253029584884644, "logits/rejected": 1.733841896057129, "logps/chosen": -71.88716125488281, "logps/rejected": -24.40646743774414, "loss": 0.6472, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.3130418360233307, "rewards/margins": 0.39193350076675415, "rewards/rejected": -0.07889164239168167, "step": 1945 }, { "debug/policy_chosen_logits": 1.3019014596939087, "debug/policy_chosen_logps": -3.2647480964660645, "debug/policy_rejected_logits": 1.6650381088256836, "debug/policy_rejected_logps": -24.002193450927734, "debug/reference_chosen_logps": -3.290344715118408, "debug/reference_rejected_logps": -23.97683334350586, "epoch": 7.065217391304348, "grad_norm": 38.839525995590684, "learning_rate": 1.2239089184060721e-08, "logits/chosen": 1.3019014596939087, "logits/rejected": 1.6650381088256836, "logps/chosen": -3.2647480964660645, "logps/rejected": -24.002193450927734, "loss": 0.6438, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.01279846765100956, "rewards/margins": 0.025479713454842567, "rewards/rejected": -0.012681245803833008, "step": 1950 }, { "debug/policy_chosen_logits": 1.564024806022644, "debug/policy_chosen_logps": -6.324842929840088, "debug/policy_rejected_logits": 1.5677435398101807, "debug/policy_rejected_logps": -71.68635559082031, "debug/reference_chosen_logps": -6.436964988708496, "debug/reference_rejected_logps": -71.70610046386719, "epoch": 7.083333333333333, "grad_norm": 28.299198368622815, "learning_rate": 1.200189753320683e-08, "logits/chosen": 1.564024806022644, "logits/rejected": 1.5677435398101807, "logps/chosen": -6.324842929840088, "logps/rejected": -71.68635559082031, "loss": 0.6715, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.056061286479234695, "rewards/margins": 0.04618590325117111, "rewards/rejected": 0.009875382296741009, "step": 1955 }, { "debug/policy_chosen_logits": 1.5397911071777344, "debug/policy_chosen_logps": -93.92724609375, "debug/policy_rejected_logits": 1.6229177713394165, "debug/policy_rejected_logps": -5.184260368347168, "debug/reference_chosen_logps": -94.31362915039062, "debug/reference_rejected_logps": -5.15993070602417, "epoch": 7.101449275362318, "grad_norm": 36.637876837055245, "learning_rate": 1.176470588235294e-08, "logits/chosen": 1.5397911071777344, "logits/rejected": 1.6229177713394165, "logps/chosen": -93.92724609375, "logps/rejected": -5.184260368347168, "loss": 0.6503, "rewards/accuracies": 0.375, "rewards/chosen": 0.19319573044776917, "rewards/margins": 0.20536045730113983, "rewards/rejected": -0.012164726853370667, "step": 1960 }, { "debug/policy_chosen_logits": 1.4585140943527222, "debug/policy_chosen_logps": -7.235345363616943, "debug/policy_rejected_logits": 1.91153085231781, "debug/policy_rejected_logps": -8.29022216796875, "debug/reference_chosen_logps": -7.292272090911865, "debug/reference_rejected_logps": -8.146242141723633, "epoch": 7.119565217391305, "grad_norm": 35.2731965980762, "learning_rate": 1.152751423149905e-08, "logits/chosen": 1.4585140943527222, "logits/rejected": 1.91153085231781, "logps/chosen": -7.235345363616943, "logps/rejected": -8.29022216796875, "loss": 0.6591, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.028463352471590042, "rewards/margins": 0.10045270621776581, "rewards/rejected": -0.07198936492204666, "step": 1965 }, { "debug/policy_chosen_logits": 1.0448970794677734, "debug/policy_chosen_logps": -69.42283630371094, "debug/policy_rejected_logits": 1.5620346069335938, "debug/policy_rejected_logps": -6.179040431976318, "debug/reference_chosen_logps": -69.42707824707031, "debug/reference_rejected_logps": -6.121747016906738, "epoch": 7.13768115942029, "grad_norm": 35.95554247445267, "learning_rate": 1.129032258064516e-08, "logits/chosen": 1.0448970794677734, "logits/rejected": 1.5620346069335938, "logps/chosen": -69.42283630371094, "logps/rejected": -6.179040431976318, "loss": 0.6749, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.0021192640997469425, "rewards/margins": 0.030765961855649948, "rewards/rejected": -0.028646698221564293, "step": 1970 }, { "debug/policy_chosen_logits": 1.6691560745239258, "debug/policy_chosen_logps": -28.891870498657227, "debug/policy_rejected_logits": 1.7167425155639648, "debug/policy_rejected_logps": -7.132044792175293, "debug/reference_chosen_logps": -28.961029052734375, "debug/reference_rejected_logps": -6.966675758361816, "epoch": 7.155797101449275, "grad_norm": 36.188248691083224, "learning_rate": 1.105313092979127e-08, "logits/chosen": 1.6691560745239258, "logits/rejected": 1.7167425155639648, "logps/chosen": -28.891870498657227, "logps/rejected": -7.132044792175293, "loss": 0.6617, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.03457950800657272, "rewards/margins": 0.11726430803537369, "rewards/rejected": -0.08268480002880096, "step": 1975 }, { "debug/policy_chosen_logits": 1.3721542358398438, "debug/policy_chosen_logps": -5.316123008728027, "debug/policy_rejected_logits": 1.7160942554473877, "debug/policy_rejected_logps": -7.058150291442871, "debug/reference_chosen_logps": -5.378409385681152, "debug/reference_rejected_logps": -6.849114894866943, "epoch": 7.173913043478261, "grad_norm": 32.10733499327613, "learning_rate": 1.081593927893738e-08, "logits/chosen": 1.3721542358398438, "logits/rejected": 1.7160942554473877, "logps/chosen": -5.316123008728027, "logps/rejected": -7.058150291442871, "loss": 0.6624, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.031143110245466232, "rewards/margins": 0.13566073775291443, "rewards/rejected": -0.1045176237821579, "step": 1980 }, { "debug/policy_chosen_logits": 1.5946528911590576, "debug/policy_chosen_logps": -10.34161376953125, "debug/policy_rejected_logits": 1.633715033531189, "debug/policy_rejected_logps": -80.63815307617188, "debug/reference_chosen_logps": -10.41511058807373, "debug/reference_rejected_logps": -80.3069839477539, "epoch": 7.192028985507246, "grad_norm": 35.402104600767366, "learning_rate": 1.057874762808349e-08, "logits/chosen": 1.5946528911590576, "logits/rejected": 1.633715033531189, "logps/chosen": -10.34161376953125, "logps/rejected": -80.63815307617188, "loss": 0.6487, "rewards/accuracies": 0.5, "rewards/chosen": 0.03674789518117905, "rewards/margins": 0.20233604311943054, "rewards/rejected": -0.1655881404876709, "step": 1985 }, { "debug/policy_chosen_logits": 1.1120822429656982, "debug/policy_chosen_logps": -31.399667739868164, "debug/policy_rejected_logits": 1.7031781673431396, "debug/policy_rejected_logps": -2.312849521636963, "debug/reference_chosen_logps": -31.72493553161621, "debug/reference_rejected_logps": -2.3256211280822754, "epoch": 7.2101449275362315, "grad_norm": 40.91816792412739, "learning_rate": 1.03415559772296e-08, "logits/chosen": 1.1120822429656982, "logits/rejected": 1.7031781673431396, "logps/chosen": -31.399667739868164, "logps/rejected": -2.312849521636963, "loss": 0.6462, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.16263499855995178, "rewards/margins": 0.15624922513961792, "rewards/rejected": 0.006385767366737127, "step": 1990 }, { "debug/policy_chosen_logits": 1.5316581726074219, "debug/policy_chosen_logps": -7.786660194396973, "debug/policy_rejected_logits": 1.7022438049316406, "debug/policy_rejected_logps": -41.573402404785156, "debug/reference_chosen_logps": -7.837969779968262, "debug/reference_rejected_logps": -41.54066848754883, "epoch": 7.228260869565218, "grad_norm": 41.80574788145537, "learning_rate": 1.010436432637571e-08, "logits/chosen": 1.5316581726074219, "logits/rejected": 1.7022438049316406, "logps/chosen": -7.786660194396973, "logps/rejected": -41.573402404785156, "loss": 0.6591, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.02565532922744751, "rewards/margins": 0.042021624743938446, "rewards/rejected": -0.016366295516490936, "step": 1995 }, { "debug/policy_chosen_logits": 1.3563371896743774, "debug/policy_chosen_logps": -68.27490234375, "debug/policy_rejected_logits": 1.6342350244522095, "debug/policy_rejected_logps": -2.9619014263153076, "debug/reference_chosen_logps": -68.45893096923828, "debug/reference_rejected_logps": -2.8858752250671387, "epoch": 7.246376811594203, "grad_norm": 35.26206209511059, "learning_rate": 9.867172675521822e-09, "logits/chosen": 1.3563371896743774, "logits/rejected": 1.6342350244522095, "logps/chosen": -68.27490234375, "logps/rejected": -2.9619014263153076, "loss": 0.6542, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.09201527386903763, "rewards/margins": 0.13002833724021912, "rewards/rejected": -0.03801308199763298, "step": 2000 }, { "epoch": 7.246376811594203, "eval_debug/policy_chosen_logits": 1.632000207901001, "eval_debug/policy_chosen_logps": -122.46611785888672, "eval_debug/policy_rejected_logits": 1.691506266593933, "eval_debug/policy_rejected_logps": -63.88637161254883, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.632000207901001, "eval_logits/rejected": 1.691506266593933, "eval_logps/chosen": -122.46611785888672, "eval_logps/rejected": -63.88637161254883, "eval_loss": 0.6910152435302734, "eval_rewards/accuracies": 0.32894736528396606, "eval_rewards/chosen": 0.3409661054611206, "eval_rewards/margins": 0.3406268358230591, "eval_rewards/rejected": 0.0003392900398466736, "eval_runtime": 28.5803, "eval_samples_per_second": 20.994, "eval_steps_per_second": 0.665, "step": 2000 }, { "debug/policy_chosen_logits": 1.3101251125335693, "debug/policy_chosen_logps": -47.13117218017578, "debug/policy_rejected_logits": 1.657314658164978, "debug/policy_rejected_logps": -84.49040985107422, "debug/reference_chosen_logps": -47.29503631591797, "debug/reference_rejected_logps": -84.16597747802734, "epoch": 7.2644927536231885, "grad_norm": 43.196677498862265, "learning_rate": 9.629981024667932e-09, "logits/chosen": 1.3101251125335693, "logits/rejected": 1.657314658164978, "logps/chosen": -47.13117218017578, "logps/rejected": -84.49040985107422, "loss": 0.6602, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0819304957985878, "rewards/margins": 0.24414435029029846, "rewards/rejected": -0.16221384704113007, "step": 2005 }, { "debug/policy_chosen_logits": 1.2839982509613037, "debug/policy_chosen_logps": -5.64115571975708, "debug/policy_rejected_logits": 1.6193948984146118, "debug/policy_rejected_logps": -16.42999267578125, "debug/reference_chosen_logps": -5.665097236633301, "debug/reference_rejected_logps": -16.296802520751953, "epoch": 7.282608695652174, "grad_norm": 212.69381126304614, "learning_rate": 9.392789373814042e-09, "logits/chosen": 1.2839982509613037, "logits/rejected": 1.6193948984146118, "logps/chosen": -5.64115571975708, "logps/rejected": -16.42999267578125, "loss": 0.6511, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.011970730498433113, "rewards/margins": 0.07856658846139908, "rewards/rejected": -0.06659585237503052, "step": 2010 }, { "debug/policy_chosen_logits": 1.2851697206497192, "debug/policy_chosen_logps": -9.873387336730957, "debug/policy_rejected_logits": 1.5647236108779907, "debug/policy_rejected_logps": -7.3395843505859375, "debug/reference_chosen_logps": -10.036648750305176, "debug/reference_rejected_logps": -7.27008056640625, "epoch": 7.300724637681159, "grad_norm": 39.04651821308786, "learning_rate": 9.155597722960152e-09, "logits/chosen": 1.2851697206497192, "logits/rejected": 1.5647236108779907, "logps/chosen": -9.873387336730957, "logps/rejected": -7.3395843505859375, "loss": 0.6552, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.08163096755743027, "rewards/margins": 0.11638306081295013, "rewards/rejected": -0.03475208953022957, "step": 2015 }, { "debug/policy_chosen_logits": 1.631627082824707, "debug/policy_chosen_logps": -3.869621753692627, "debug/policy_rejected_logits": 1.8790578842163086, "debug/policy_rejected_logps": -54.247642517089844, "debug/reference_chosen_logps": -3.9204821586608887, "debug/reference_rejected_logps": -54.2545280456543, "epoch": 7.318840579710145, "grad_norm": 45.97880009743909, "learning_rate": 8.918406072106262e-09, "logits/chosen": 1.631627082824707, "logits/rejected": 1.8790578842163086, "logps/chosen": -3.869621753692627, "logps/rejected": -54.247642517089844, "loss": 0.6621, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.025430312380194664, "rewards/margins": 0.021985823288559914, "rewards/rejected": 0.0034444897901266813, "step": 2020 }, { "debug/policy_chosen_logits": 1.2944848537445068, "debug/policy_chosen_logps": -107.83905029296875, "debug/policy_rejected_logits": 1.507871389389038, "debug/policy_rejected_logps": -67.22627258300781, "debug/reference_chosen_logps": -108.31185150146484, "debug/reference_rejected_logps": -67.40676879882812, "epoch": 7.336956521739131, "grad_norm": 29.154362056849124, "learning_rate": 8.681214421252372e-09, "logits/chosen": 1.2944848537445068, "logits/rejected": 1.507871389389038, "logps/chosen": -107.83905029296875, "logps/rejected": -67.22627258300781, "loss": 0.646, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.23640838265419006, "rewards/margins": 0.14616259932518005, "rewards/rejected": 0.09024576842784882, "step": 2025 }, { "debug/policy_chosen_logits": 1.5653653144836426, "debug/policy_chosen_logps": -58.257667541503906, "debug/policy_rejected_logits": 1.5012786388397217, "debug/policy_rejected_logps": -166.61129760742188, "debug/reference_chosen_logps": -58.72990036010742, "debug/reference_rejected_logps": -166.75540161132812, "epoch": 7.355072463768116, "grad_norm": 27.061187936486384, "learning_rate": 8.444022770398482e-09, "logits/chosen": 1.5653653144836426, "logits/rejected": 1.5012786388397217, "logps/chosen": -58.257667541503906, "logps/rejected": -166.61129760742188, "loss": 0.6599, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.23611393570899963, "rewards/margins": 0.1640617996454239, "rewards/rejected": 0.07205210626125336, "step": 2030 }, { "debug/policy_chosen_logits": 1.2250094413757324, "debug/policy_chosen_logps": -4.4164018630981445, "debug/policy_rejected_logits": 1.8114646673202515, "debug/policy_rejected_logps": -3.9573020935058594, "debug/reference_chosen_logps": -4.469573020935059, "debug/reference_rejected_logps": -3.8946938514709473, "epoch": 7.3731884057971016, "grad_norm": 36.64645276863105, "learning_rate": 8.206831119544591e-09, "logits/chosen": 1.2250094413757324, "logits/rejected": 1.8114646673202515, "logps/chosen": -4.4164018630981445, "logps/rejected": -3.9573020935058594, "loss": 0.6512, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.02658577635884285, "rewards/margins": 0.05788997560739517, "rewards/rejected": -0.03130420297384262, "step": 2035 }, { "debug/policy_chosen_logits": 1.1348648071289062, "debug/policy_chosen_logps": -7.984068393707275, "debug/policy_rejected_logits": 1.6135408878326416, "debug/policy_rejected_logps": -7.341626167297363, "debug/reference_chosen_logps": -8.049311637878418, "debug/reference_rejected_logps": -7.240313529968262, "epoch": 7.391304347826087, "grad_norm": 32.29299248324973, "learning_rate": 7.969639468690701e-09, "logits/chosen": 1.1348648071289062, "logits/rejected": 1.6135408878326416, "logps/chosen": -7.984068393707275, "logps/rejected": -7.341626167297363, "loss": 0.6637, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.032621435821056366, "rewards/margins": 0.08327746391296387, "rewards/rejected": -0.0506560280919075, "step": 2040 }, { "debug/policy_chosen_logits": 1.6667225360870361, "debug/policy_chosen_logps": -194.6750030517578, "debug/policy_rejected_logits": 1.7397089004516602, "debug/policy_rejected_logps": -72.59608459472656, "debug/reference_chosen_logps": -195.5371551513672, "debug/reference_rejected_logps": -71.1019058227539, "epoch": 7.409420289855072, "grad_norm": 27.85164912466528, "learning_rate": 7.732447817836813e-09, "logits/chosen": 1.6667225360870361, "logits/rejected": 1.7397089004516602, "logps/chosen": -194.6750030517578, "logps/rejected": -72.59608459472656, "loss": 0.65, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.4310840666294098, "rewards/margins": 1.1781728267669678, "rewards/rejected": -0.7470887303352356, "step": 2045 }, { "debug/policy_chosen_logits": 1.5480074882507324, "debug/policy_chosen_logps": -43.2056884765625, "debug/policy_rejected_logits": 1.6676868200302124, "debug/policy_rejected_logps": -43.060569763183594, "debug/reference_chosen_logps": -43.37819290161133, "debug/reference_rejected_logps": -42.858863830566406, "epoch": 7.427536231884058, "grad_norm": 35.39393280724955, "learning_rate": 7.495256166982921e-09, "logits/chosen": 1.5480074882507324, "logits/rejected": 1.6676868200302124, "logps/chosen": -43.2056884765625, "logps/rejected": -43.060569763183594, "loss": 0.6603, "rewards/accuracies": 0.5, "rewards/chosen": 0.08625027537345886, "rewards/margins": 0.18710538744926453, "rewards/rejected": -0.10085508972406387, "step": 2050 }, { "debug/policy_chosen_logits": 1.3798162937164307, "debug/policy_chosen_logps": -85.08789825439453, "debug/policy_rejected_logits": 1.3964811563491821, "debug/policy_rejected_logps": -4.442984580993652, "debug/reference_chosen_logps": -85.24625396728516, "debug/reference_rejected_logps": -4.37467622756958, "epoch": 7.445652173913043, "grad_norm": 50.367891618503464, "learning_rate": 7.258064516129032e-09, "logits/chosen": 1.3798162937164307, "logits/rejected": 1.3964811563491821, "logps/chosen": -85.08789825439453, "logps/rejected": -4.442984580993652, "loss": 0.6502, "rewards/accuracies": 0.375, "rewards/chosen": 0.07917936146259308, "rewards/margins": 0.11333368718624115, "rewards/rejected": -0.03415432944893837, "step": 2055 }, { "debug/policy_chosen_logits": 1.644296646118164, "debug/policy_chosen_logps": -87.42615509033203, "debug/policy_rejected_logits": 1.9322481155395508, "debug/policy_rejected_logps": -67.15447235107422, "debug/reference_chosen_logps": -87.58280944824219, "debug/reference_rejected_logps": -67.03447723388672, "epoch": 7.463768115942029, "grad_norm": 103.68228857435442, "learning_rate": 7.020872865275142e-09, "logits/chosen": 1.644296646118164, "logits/rejected": 1.9322481155395508, "logps/chosen": -87.42615509033203, "logps/rejected": -67.15447235107422, "loss": 0.6473, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.07832814753055573, "rewards/margins": 0.13832548260688782, "rewards/rejected": -0.059997331351041794, "step": 2060 }, { "debug/policy_chosen_logits": 1.5305650234222412, "debug/policy_chosen_logps": -102.69795989990234, "debug/policy_rejected_logits": 1.8510377407073975, "debug/policy_rejected_logps": -7.974301338195801, "debug/reference_chosen_logps": -102.98133850097656, "debug/reference_rejected_logps": -7.96884822845459, "epoch": 7.481884057971015, "grad_norm": 97.25071804957805, "learning_rate": 6.783681214421253e-09, "logits/chosen": 1.5305650234222412, "logits/rejected": 1.8510377407073975, "logps/chosen": -102.69795989990234, "logps/rejected": -7.974301338195801, "loss": 0.6487, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.14168180525302887, "rewards/margins": 0.14440837502479553, "rewards/rejected": -0.0027265697717666626, "step": 2065 }, { "debug/policy_chosen_logits": 1.4482511281967163, "debug/policy_chosen_logps": -4.886277198791504, "debug/policy_rejected_logits": 1.870830774307251, "debug/policy_rejected_logps": -8.20402717590332, "debug/reference_chosen_logps": -4.927979469299316, "debug/reference_rejected_logps": -8.165047645568848, "epoch": 7.5, "grad_norm": 27.49284390578646, "learning_rate": 6.546489563567362e-09, "logits/chosen": 1.4482511281967163, "logits/rejected": 1.870830774307251, "logps/chosen": -4.886277198791504, "logps/rejected": -8.20402717590332, "loss": 0.6631, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.02085092104971409, "rewards/margins": 0.04034087061882019, "rewards/rejected": -0.019489949569106102, "step": 2070 }, { "debug/policy_chosen_logits": 1.5376430749893188, "debug/policy_chosen_logps": -17.58883285522461, "debug/policy_rejected_logits": 1.8304096460342407, "debug/policy_rejected_logps": -3.94201922416687, "debug/reference_chosen_logps": -17.670480728149414, "debug/reference_rejected_logps": -3.907533645629883, "epoch": 7.518115942028985, "grad_norm": 34.15174936214955, "learning_rate": 6.309297912713473e-09, "logits/chosen": 1.5376430749893188, "logits/rejected": 1.8304096460342407, "logps/chosen": -17.58883285522461, "logps/rejected": -3.94201922416687, "loss": 0.6406, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.04082300886511803, "rewards/margins": 0.05806582048535347, "rewards/rejected": -0.017242809757590294, "step": 2075 }, { "debug/policy_chosen_logits": 1.3218053579330444, "debug/policy_chosen_logps": -5.300800323486328, "debug/policy_rejected_logits": 1.2787723541259766, "debug/policy_rejected_logps": -52.17490768432617, "debug/reference_chosen_logps": -5.416479587554932, "debug/reference_rejected_logps": -52.12139129638672, "epoch": 7.536231884057971, "grad_norm": 90.57213851526198, "learning_rate": 6.0721062618595826e-09, "logits/chosen": 1.3218053579330444, "logits/rejected": 1.2787723541259766, "logps/chosen": -5.300800323486328, "logps/rejected": -52.17490768432617, "loss": 0.6473, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0578392930328846, "rewards/margins": 0.08459631353616714, "rewards/rejected": -0.026757020503282547, "step": 2080 }, { "debug/policy_chosen_logits": 1.1222273111343384, "debug/policy_chosen_logps": -160.9003448486328, "debug/policy_rejected_logits": 1.6895637512207031, "debug/policy_rejected_logps": -55.912818908691406, "debug/reference_chosen_logps": -161.3338165283203, "debug/reference_rejected_logps": -55.968605041503906, "epoch": 7.554347826086957, "grad_norm": 33.49327924799915, "learning_rate": 5.8349146110056925e-09, "logits/chosen": 1.1222273111343384, "logits/rejected": 1.6895637512207031, "logps/chosen": -160.9003448486328, "logps/rejected": -55.912818908691406, "loss": 0.6582, "rewards/accuracies": 0.375, "rewards/chosen": 0.216732457280159, "rewards/margins": 0.18883684277534485, "rewards/rejected": 0.027895618230104446, "step": 2085 }, { "debug/policy_chosen_logits": 1.1496461629867554, "debug/policy_chosen_logps": -8.303511619567871, "debug/policy_rejected_logits": 1.7243238687515259, "debug/policy_rejected_logps": -4.816485404968262, "debug/reference_chosen_logps": -8.452306747436523, "debug/reference_rejected_logps": -4.780450820922852, "epoch": 7.572463768115942, "grad_norm": 45.69908585540078, "learning_rate": 5.5977229601518025e-09, "logits/chosen": 1.1496461629867554, "logits/rejected": 1.7243238687515259, "logps/chosen": -8.303511619567871, "logps/rejected": -4.816485404968262, "loss": 0.6658, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.07439769804477692, "rewards/margins": 0.09241487085819244, "rewards/rejected": -0.018017178401350975, "step": 2090 }, { "debug/policy_chosen_logits": 1.0077733993530273, "debug/policy_chosen_logps": -223.5562286376953, "debug/policy_rejected_logits": 1.460754632949829, "debug/policy_rejected_logps": -85.72068786621094, "debug/reference_chosen_logps": -224.5519256591797, "debug/reference_rejected_logps": -85.32658386230469, "epoch": 7.590579710144928, "grad_norm": 28.061269953673687, "learning_rate": 5.360531309297912e-09, "logits/chosen": 1.0077733993530273, "logits/rejected": 1.460754632949829, "logps/chosen": -223.5562286376953, "logps/rejected": -85.72068786621094, "loss": 0.6615, "rewards/accuracies": 0.375, "rewards/chosen": 0.4978490471839905, "rewards/margins": 0.6948975324630737, "rewards/rejected": -0.19704854488372803, "step": 2095 }, { "debug/policy_chosen_logits": 1.5016510486602783, "debug/policy_chosen_logps": -4.516307830810547, "debug/policy_rejected_logits": 1.6163917779922485, "debug/policy_rejected_logps": -10.154683113098145, "debug/reference_chosen_logps": -4.5533342361450195, "debug/reference_rejected_logps": -10.117715835571289, "epoch": 7.608695652173913, "grad_norm": 37.69622330456993, "learning_rate": 5.123339658444022e-09, "logits/chosen": 1.5016510486602783, "logits/rejected": 1.6163917779922485, "logps/chosen": -4.516307830810547, "logps/rejected": -10.154683113098145, "loss": 0.6629, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.018513206392526627, "rewards/margins": 0.03699735924601555, "rewards/rejected": -0.018484150990843773, "step": 2100 }, { "epoch": 7.608695652173913, "eval_debug/policy_chosen_logits": 1.6375880241394043, "eval_debug/policy_chosen_logps": -122.29913330078125, "eval_debug/policy_rejected_logits": 1.696777105331421, "eval_debug/policy_rejected_logps": -63.825870513916016, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6375880241394043, "eval_logits/rejected": 1.696777105331421, "eval_logps/chosen": -122.29913330078125, "eval_logps/rejected": -63.825870513916016, "eval_loss": 0.6930306553840637, "eval_rewards/accuracies": 0.30263158679008484, "eval_rewards/chosen": 0.4244559407234192, "eval_rewards/margins": 0.39386186003685, "eval_rewards/rejected": 0.03059404157102108, "eval_runtime": 28.5422, "eval_samples_per_second": 21.022, "eval_steps_per_second": 0.666, "step": 2100 }, { "debug/policy_chosen_logits": 0.866378903388977, "debug/policy_chosen_logps": -151.85739135742188, "debug/policy_rejected_logits": 1.1311208009719849, "debug/policy_rejected_logps": -153.0951385498047, "debug/reference_chosen_logps": -152.16748046875, "debug/reference_rejected_logps": -152.31759643554688, "epoch": 7.6268115942028984, "grad_norm": 35.14630211332276, "learning_rate": 4.886148007590132e-09, "logits/chosen": 0.866378903388977, "logits/rejected": 1.1311208009719849, "logps/chosen": -151.85739135742188, "logps/rejected": -153.0951385498047, "loss": 0.651, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.15503665804862976, "rewards/margins": 0.5438076853752136, "rewards/rejected": -0.38877105712890625, "step": 2105 }, { "debug/policy_chosen_logits": 1.4329841136932373, "debug/policy_chosen_logps": -92.68058776855469, "debug/policy_rejected_logits": 1.606961965560913, "debug/policy_rejected_logps": -28.344751358032227, "debug/reference_chosen_logps": -93.01536560058594, "debug/reference_rejected_logps": -28.119426727294922, "epoch": 7.644927536231884, "grad_norm": 37.77593570746261, "learning_rate": 4.648956356736242e-09, "logits/chosen": 1.4329841136932373, "logits/rejected": 1.606961965560913, "logps/chosen": -92.68058776855469, "logps/rejected": -28.344751358032227, "loss": 0.6611, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.1673946976661682, "rewards/margins": 0.28005701303482056, "rewards/rejected": -0.11266227811574936, "step": 2110 }, { "debug/policy_chosen_logits": 1.2792384624481201, "debug/policy_chosen_logps": -5.244724750518799, "debug/policy_rejected_logits": 1.406163215637207, "debug/policy_rejected_logps": -50.28623580932617, "debug/reference_chosen_logps": -5.272663593292236, "debug/reference_rejected_logps": -50.06426239013672, "epoch": 7.663043478260869, "grad_norm": 68.69239929417682, "learning_rate": 4.411764705882353e-09, "logits/chosen": 1.2792384624481201, "logits/rejected": 1.406163215637207, "logps/chosen": -5.244724750518799, "logps/rejected": -50.28623580932617, "loss": 0.6438, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.013969654217362404, "rewards/margins": 0.12495739758014679, "rewards/rejected": -0.11098773777484894, "step": 2115 }, { "debug/policy_chosen_logits": 1.1968486309051514, "debug/policy_chosen_logps": -120.29439544677734, "debug/policy_rejected_logits": 1.5515402555465698, "debug/policy_rejected_logps": -7.528843879699707, "debug/reference_chosen_logps": -120.79359436035156, "debug/reference_rejected_logps": -7.519468784332275, "epoch": 7.681159420289855, "grad_norm": 32.94288411352006, "learning_rate": 4.174573055028463e-09, "logits/chosen": 1.1968486309051514, "logits/rejected": 1.5515402555465698, "logps/chosen": -120.29439544677734, "logps/rejected": -7.528843879699707, "loss": 0.6557, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.24959564208984375, "rewards/margins": 0.2542833089828491, "rewards/rejected": -0.004687666893005371, "step": 2120 }, { "debug/policy_chosen_logits": 1.1708781719207764, "debug/policy_chosen_logps": -323.01861572265625, "debug/policy_rejected_logits": 1.601980447769165, "debug/policy_rejected_logps": -123.80934143066406, "debug/reference_chosen_logps": -324.15594482421875, "debug/reference_rejected_logps": -123.63587951660156, "epoch": 7.699275362318841, "grad_norm": 45.61566837404442, "learning_rate": 3.937381404174573e-09, "logits/chosen": 1.1708781719207764, "logits/rejected": 1.601980447769165, "logps/chosen": -323.01861572265625, "logps/rejected": -123.80934143066406, "loss": 0.6455, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.5686324238777161, "rewards/margins": 0.6553633809089661, "rewards/rejected": -0.08673091977834702, "step": 2125 }, { "debug/policy_chosen_logits": 1.527826189994812, "debug/policy_chosen_logps": -59.58167266845703, "debug/policy_rejected_logits": 1.8827228546142578, "debug/policy_rejected_logps": -11.152032852172852, "debug/reference_chosen_logps": -59.7724494934082, "debug/reference_rejected_logps": -11.075045585632324, "epoch": 7.717391304347826, "grad_norm": 46.1775427493181, "learning_rate": 3.700189753320683e-09, "logits/chosen": 1.527826189994812, "logits/rejected": 1.8827228546142578, "logps/chosen": -59.58167266845703, "logps/rejected": -11.152032852172852, "loss": 0.6627, "rewards/accuracies": 0.375, "rewards/chosen": 0.0953909382224083, "rewards/margins": 0.13388440012931824, "rewards/rejected": -0.038493454456329346, "step": 2130 }, { "debug/policy_chosen_logits": 1.3597345352172852, "debug/policy_chosen_logps": -6.96924352645874, "debug/policy_rejected_logits": 1.784441590309143, "debug/policy_rejected_logps": -44.47954177856445, "debug/reference_chosen_logps": -6.974306583404541, "debug/reference_rejected_logps": -44.45081329345703, "epoch": 7.7355072463768115, "grad_norm": 88.70789123543379, "learning_rate": 3.462998102466793e-09, "logits/chosen": 1.3597345352172852, "logits/rejected": 1.784441590309143, "logps/chosen": -6.96924352645874, "logps/rejected": -44.47954177856445, "loss": 0.6816, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0025312243960797787, "rewards/margins": 0.016896400600671768, "rewards/rejected": -0.014365175738930702, "step": 2135 }, { "debug/policy_chosen_logits": 1.4395967721939087, "debug/policy_chosen_logps": -46.809654235839844, "debug/policy_rejected_logits": 1.8417460918426514, "debug/policy_rejected_logps": -110.1465835571289, "debug/reference_chosen_logps": -46.85969161987305, "debug/reference_rejected_logps": -109.70494079589844, "epoch": 7.753623188405797, "grad_norm": 205.98557573215828, "learning_rate": 3.225806451612903e-09, "logits/chosen": 1.4395967721939087, "logits/rejected": 1.8417460918426514, "logps/chosen": -46.809654235839844, "logps/rejected": -110.1465835571289, "loss": 0.6559, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.025019049644470215, "rewards/margins": 0.24583573639392853, "rewards/rejected": -0.2208166867494583, "step": 2140 }, { "debug/policy_chosen_logits": 1.5244606733322144, "debug/policy_chosen_logps": -24.133201599121094, "debug/policy_rejected_logits": 1.5809597969055176, "debug/policy_rejected_logps": -5.4411211013793945, "debug/reference_chosen_logps": -24.1997013092041, "debug/reference_rejected_logps": -5.325526237487793, "epoch": 7.771739130434782, "grad_norm": 48.73476298141665, "learning_rate": 2.988614800759013e-09, "logits/chosen": 1.5244606733322144, "logits/rejected": 1.5809597969055176, "logps/chosen": -24.133201599121094, "logps/rejected": -5.4411211013793945, "loss": 0.6478, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.033248938620090485, "rewards/margins": 0.09104625135660172, "rewards/rejected": -0.057797305285930634, "step": 2145 }, { "debug/policy_chosen_logits": 1.2399160861968994, "debug/policy_chosen_logps": -117.43072509765625, "debug/policy_rejected_logits": 1.360144853591919, "debug/policy_rejected_logps": -5.555869102478027, "debug/reference_chosen_logps": -117.74879455566406, "debug/reference_rejected_logps": -5.538172721862793, "epoch": 7.789855072463768, "grad_norm": 32.4328554727727, "learning_rate": 2.7514231499051234e-09, "logits/chosen": 1.2399160861968994, "logits/rejected": 1.360144853591919, "logps/chosen": -117.43072509765625, "logps/rejected": -5.555869102478027, "loss": 0.6439, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.15903374552726746, "rewards/margins": 0.16788175702095032, "rewards/rejected": -0.008848002180457115, "step": 2150 }, { "debug/policy_chosen_logits": 1.2963535785675049, "debug/policy_chosen_logps": -54.22887420654297, "debug/policy_rejected_logits": 1.428857684135437, "debug/policy_rejected_logps": -115.9268569946289, "debug/reference_chosen_logps": -54.441184997558594, "debug/reference_rejected_logps": -115.64637756347656, "epoch": 7.807971014492754, "grad_norm": 140.9728422392306, "learning_rate": 2.5142314990512333e-09, "logits/chosen": 1.2963535785675049, "logits/rejected": 1.428857684135437, "logps/chosen": -54.22887420654297, "logps/rejected": -115.9268569946289, "loss": 0.6466, "rewards/accuracies": 0.375, "rewards/chosen": 0.10615917295217514, "rewards/margins": 0.24639901518821716, "rewards/rejected": -0.1402398645877838, "step": 2155 }, { "debug/policy_chosen_logits": 1.3899633884429932, "debug/policy_chosen_logps": -99.61604309082031, "debug/policy_rejected_logits": 2.1701138019561768, "debug/policy_rejected_logps": -12.003734588623047, "debug/reference_chosen_logps": -99.80180358886719, "debug/reference_rejected_logps": -11.882006645202637, "epoch": 7.826086956521739, "grad_norm": 37.080349441208895, "learning_rate": 2.2770398481973433e-09, "logits/chosen": 1.3899633884429932, "logits/rejected": 2.1701138019561768, "logps/chosen": -99.61604309082031, "logps/rejected": -12.003734588623047, "loss": 0.65, "rewards/accuracies": 0.375, "rewards/chosen": 0.09288475662469864, "rewards/margins": 0.15374809503555298, "rewards/rejected": -0.06086335331201553, "step": 2160 }, { "debug/policy_chosen_logits": 1.4681205749511719, "debug/policy_chosen_logps": -71.65458679199219, "debug/policy_rejected_logits": 1.4492331743240356, "debug/policy_rejected_logps": -3.07924222946167, "debug/reference_chosen_logps": -71.87583923339844, "debug/reference_rejected_logps": -3.0672378540039062, "epoch": 7.844202898550725, "grad_norm": 26.81304162425296, "learning_rate": 2.039848197343453e-09, "logits/chosen": 1.4681205749511719, "logits/rejected": 1.4492331743240356, "logps/chosen": -71.65458679199219, "logps/rejected": -3.07924222946167, "loss": 0.6616, "rewards/accuracies": 0.375, "rewards/chosen": 0.11062222719192505, "rewards/margins": 0.11662455648183823, "rewards/rejected": -0.006002339534461498, "step": 2165 }, { "debug/policy_chosen_logits": 1.2745212316513062, "debug/policy_chosen_logps": -79.49455261230469, "debug/policy_rejected_logits": 1.465950846672058, "debug/policy_rejected_logps": -91.65544128417969, "debug/reference_chosen_logps": -79.83101654052734, "debug/reference_rejected_logps": -91.58612060546875, "epoch": 7.86231884057971, "grad_norm": 31.469073621881474, "learning_rate": 1.8026565464895636e-09, "logits/chosen": 1.2745212316513062, "logits/rejected": 1.465950846672058, "logps/chosen": -79.49455261230469, "logps/rejected": -91.65544128417969, "loss": 0.6573, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.1682339608669281, "rewards/margins": 0.20289640128612518, "rewards/rejected": -0.034662432968616486, "step": 2170 }, { "debug/policy_chosen_logits": 1.5723754167556763, "debug/policy_chosen_logps": -67.89265441894531, "debug/policy_rejected_logits": 1.7329862117767334, "debug/policy_rejected_logps": -4.517541408538818, "debug/reference_chosen_logps": -68.33360290527344, "debug/reference_rejected_logps": -4.408754348754883, "epoch": 7.880434782608695, "grad_norm": 48.04650327384561, "learning_rate": 1.5654648956356737e-09, "logits/chosen": 1.5723754167556763, "logits/rejected": 1.7329862117767334, "logps/chosen": -67.89265441894531, "logps/rejected": -4.517541408538818, "loss": 0.6611, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.22047337889671326, "rewards/margins": 0.2748670279979706, "rewards/rejected": -0.05439364165067673, "step": 2175 }, { "debug/policy_chosen_logits": 1.205492377281189, "debug/policy_chosen_logps": -7.135344505310059, "debug/policy_rejected_logits": 1.4717100858688354, "debug/policy_rejected_logps": -5.174375534057617, "debug/reference_chosen_logps": -7.189650535583496, "debug/reference_rejected_logps": -5.031564235687256, "epoch": 7.898550724637682, "grad_norm": 54.462097770988215, "learning_rate": 1.3282732447817836e-09, "logits/chosen": 1.205492377281189, "logits/rejected": 1.4717100858688354, "logps/chosen": -7.135344505310059, "logps/rejected": -5.174375534057617, "loss": 0.6633, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.02715294435620308, "rewards/margins": 0.09855826944112778, "rewards/rejected": -0.071405328810215, "step": 2180 }, { "debug/policy_chosen_logits": 1.5712878704071045, "debug/policy_chosen_logps": -7.730282783508301, "debug/policy_rejected_logits": 1.7779029607772827, "debug/policy_rejected_logps": -8.776229858398438, "debug/reference_chosen_logps": -7.779355525970459, "debug/reference_rejected_logps": -8.762332916259766, "epoch": 7.916666666666667, "grad_norm": 57.985497629055196, "learning_rate": 1.0910815939278936e-09, "logits/chosen": 1.5712878704071045, "logits/rejected": 1.7779029607772827, "logps/chosen": -7.730282783508301, "logps/rejected": -8.776229858398438, "loss": 0.6598, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.02453657053411007, "rewards/margins": 0.03148510307073593, "rewards/rejected": -0.006948533467948437, "step": 2185 }, { "debug/policy_chosen_logits": 0.9796692132949829, "debug/policy_chosen_logps": -171.66966247558594, "debug/policy_rejected_logits": 1.4385249614715576, "debug/policy_rejected_logps": -45.73806381225586, "debug/reference_chosen_logps": -172.30654907226562, "debug/reference_rejected_logps": -45.706825256347656, "epoch": 7.934782608695652, "grad_norm": 85.23213570668042, "learning_rate": 8.538899430740038e-10, "logits/chosen": 0.9796692132949829, "logits/rejected": 1.4385249614715576, "logps/chosen": -171.66966247558594, "logps/rejected": -45.73806381225586, "loss": 0.659, "rewards/accuracies": 0.25, "rewards/chosen": 0.31843605637550354, "rewards/margins": 0.334055632352829, "rewards/rejected": -0.015619602985680103, "step": 2190 }, { "debug/policy_chosen_logits": 1.5003211498260498, "debug/policy_chosen_logps": -186.5579376220703, "debug/policy_rejected_logits": 1.6400268077850342, "debug/policy_rejected_logps": -12.129293441772461, "debug/reference_chosen_logps": -187.01004028320312, "debug/reference_rejected_logps": -12.076559066772461, "epoch": 7.952898550724638, "grad_norm": 35.957413344748325, "learning_rate": 6.166982922201139e-10, "logits/chosen": 1.5003211498260498, "logits/rejected": 1.6400268077850342, "logps/chosen": -186.5579376220703, "logps/rejected": -12.129293441772461, "loss": 0.6513, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.22606050968170166, "rewards/margins": 0.2524270713329315, "rewards/rejected": -0.0263665858656168, "step": 2195 }, { "debug/policy_chosen_logits": 1.3417085409164429, "debug/policy_chosen_logps": -7.31008768081665, "debug/policy_rejected_logits": 1.7232770919799805, "debug/policy_rejected_logps": -5.647335052490234, "debug/reference_chosen_logps": -7.411065578460693, "debug/reference_rejected_logps": -5.5189714431762695, "epoch": 7.971014492753623, "grad_norm": 38.49049740473527, "learning_rate": 3.795066413662239e-10, "logits/chosen": 1.3417085409164429, "logits/rejected": 1.7232770919799805, "logps/chosen": -7.31008768081665, "logps/rejected": -5.647335052490234, "loss": 0.6427, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.050489313900470734, "rewards/margins": 0.11467085033655167, "rewards/rejected": -0.06418152898550034, "step": 2200 }, { "epoch": 7.971014492753623, "eval_debug/policy_chosen_logits": 1.6350387334823608, "eval_debug/policy_chosen_logps": -122.4480209350586, "eval_debug/policy_rejected_logits": 1.6952074766159058, "eval_debug/policy_rejected_logps": -63.8274040222168, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_logits/chosen": 1.6350387334823608, "eval_logits/rejected": 1.6952074766159058, "eval_logps/chosen": -122.4480209350586, "eval_logps/rejected": -63.8274040222168, "eval_loss": 0.7008652687072754, "eval_rewards/accuracies": 0.32894736528396606, "eval_rewards/chosen": 0.35002413392066956, "eval_rewards/margins": 0.3201960623264313, "eval_rewards/rejected": 0.029828067868947983, "eval_runtime": 28.5695, "eval_samples_per_second": 21.001, "eval_steps_per_second": 0.665, "step": 2200 }, { "debug/policy_chosen_logits": 1.5036356449127197, "debug/policy_chosen_logps": -41.09565734863281, "debug/policy_rejected_logits": 1.8106467723846436, "debug/policy_rejected_logps": -6.002691268920898, "debug/reference_chosen_logps": -41.25218200683594, "debug/reference_rejected_logps": -5.933625221252441, "epoch": 7.989130434782608, "grad_norm": 55.62179740087847, "learning_rate": 1.4231499051233395e-10, "logits/chosen": 1.5036356449127197, "logits/rejected": 1.8106467723846436, "logps/chosen": -41.09565734863281, "logps/rejected": -6.002691268920898, "loss": 0.6569, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.07826316356658936, "rewards/margins": 0.11279622465372086, "rewards/rejected": -0.0345330573618412, "step": 2205 }, { "epoch": 8.0, "step": 2208, "total_flos": 0.0, "train_loss": 0.6995941666157349, "train_runtime": 15519.0477, "train_samples_per_second": 9.086, "train_steps_per_second": 0.142 } ], "logging_steps": 5, "max_steps": 2208, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }