{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 100, "global_step": 2208, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": 0.8079685568809509, "debug/policy_chosen_logps": -298.0812683105469, "debug/policy_rejected_logits": 0.6268295645713806, "debug/policy_rejected_logps": -240.20742797851562, "debug/reference_chosen_logps": -298.0812683105469, "debug/reference_rejected_logps": -240.20742797851562, "debug/sppo_chosen_loss": 2500.0, "debug/sppo_chosen_reward_in_loss": 0.0, "debug/sppo_rej_reward_in_loss": 0.0, "debug/sppo_reject_loss": 2500.0, "epoch": 0.0036231884057971015, "grad_norm": 63517.32525931037, "learning_rate": 1e-09, "logits/chosen": 0.8079685568809509, "logits/rejected": 0.6268295645713806, "logps/chosen": -298.0812683105469, "logps/rejected": -240.20742797851562, "loss": 5000.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": 1.1300876140594482, "debug/policy_chosen_logps": -262.69927978515625, "debug/policy_rejected_logits": 1.369264841079712, "debug/policy_rejected_logps": -296.08197021484375, "debug/reference_chosen_logps": -262.58026123046875, "debug/reference_rejected_logps": -296.0579528808594, "debug/sppo_chosen_loss": 2512.595703125, "debug/sppo_chosen_reward_in_loss": -0.11899030208587646, "debug/sppo_rej_reward_in_loss": -0.024042129516601562, "debug/sppo_reject_loss": 2498.0830078125, "epoch": 0.018115942028985508, "grad_norm": 59714.293458203625, "learning_rate": 5e-09, "logits/chosen": 1.1300876140594482, "logits/rejected": 1.369264841079712, "logps/chosen": -262.69927978515625, "logps/rejected": -296.08197021484375, "loss": 4993.04, "rewards/accuracies": 0.40625, "rewards/chosen": -0.001189903006888926, "rewards/margins": -0.0009494817350059748, "rewards/rejected": -0.00024042127188295126, "step": 5 }, { "debug/policy_chosen_logits": 1.3408454656600952, "debug/policy_chosen_logps": -282.72808837890625, "debug/policy_rejected_logits": 1.6083786487579346, "debug/policy_rejected_logps": -287.5859375, "debug/reference_chosen_logps": -282.7684631347656, "debug/reference_rejected_logps": -287.81396484375, "debug/sppo_chosen_loss": 2496.753173828125, "debug/sppo_chosen_reward_in_loss": 0.040346525609493256, "debug/sppo_rej_reward_in_loss": 0.22803974151611328, "debug/sppo_reject_loss": 2523.411376953125, "epoch": 0.036231884057971016, "grad_norm": 65270.98699704099, "learning_rate": 1e-08, "logits/chosen": 1.3408454656600952, "logits/rejected": 1.6083786487579346, "logps/chosen": -282.72808837890625, "logps/rejected": -287.5859375, "loss": 5005.3031, "rewards/accuracies": 0.375, "rewards/chosen": 0.00040346532477997243, "rewards/margins": -0.001876931986771524, "rewards/rejected": 0.0022803975734859705, "step": 10 }, { "debug/policy_chosen_logits": 1.2799644470214844, "debug/policy_chosen_logps": -247.2631072998047, "debug/policy_rejected_logits": 1.6324832439422607, "debug/policy_rejected_logps": -285.818115234375, "debug/reference_chosen_logps": -247.40646362304688, "debug/reference_rejected_logps": -286.155029296875, "debug/sppo_chosen_loss": 2486.138916015625, "debug/sppo_chosen_reward_in_loss": 0.143341064453125, "debug/sppo_rej_reward_in_loss": 0.3368995785713196, "debug/sppo_reject_loss": 2534.625732421875, "epoch": 0.05434782608695652, "grad_norm": 58361.99119140576, "learning_rate": 1.5e-08, "logits/chosen": 1.2799644470214844, "logits/rejected": 1.6324832439422607, "logps/chosen": -247.2631072998047, "logps/rejected": -285.818115234375, "loss": 5003.5852, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0014334104489535093, "rewards/margins": -0.0019355848198756576, "rewards/rejected": 0.0033689953852444887, "step": 15 }, { "debug/policy_chosen_logits": 1.254523515701294, "debug/policy_chosen_logps": -275.05377197265625, "debug/policy_rejected_logits": 1.6007152795791626, "debug/policy_rejected_logps": -277.05108642578125, "debug/reference_chosen_logps": -275.2658386230469, "debug/reference_rejected_logps": -277.7287292480469, "debug/sppo_chosen_loss": 2479.42138671875, "debug/sppo_chosen_reward_in_loss": 0.21210822463035583, "debug/sppo_rej_reward_in_loss": 0.677642822265625, "debug/sppo_reject_loss": 2569.156982421875, "epoch": 0.07246376811594203, "grad_norm": 57681.84213169881, "learning_rate": 2e-08, "logits/chosen": 1.254523515701294, "logits/rejected": 1.6007152795791626, "logps/chosen": -275.05377197265625, "logps/rejected": -277.05108642578125, "loss": 5022.8664, "rewards/accuracies": 0.375, "rewards/chosen": 0.0021210822742432356, "rewards/margins": -0.004655345343053341, "rewards/rejected": 0.006776427384465933, "step": 20 }, { "debug/policy_chosen_logits": 1.3303930759429932, "debug/policy_chosen_logps": -256.53955078125, "debug/policy_rejected_logits": 1.7013771533966064, "debug/policy_rejected_logps": -268.72650146484375, "debug/reference_chosen_logps": -257.0242614746094, "debug/reference_rejected_logps": -269.0206604003906, "debug/sppo_chosen_loss": 2452.55615234375, "debug/sppo_chosen_reward_in_loss": 0.48469600081443787, "debug/sppo_rej_reward_in_loss": 0.29415759444236755, "debug/sppo_reject_loss": 2530.29052734375, "epoch": 0.09057971014492754, "grad_norm": 60693.91690484042, "learning_rate": 2.5e-08, "logits/chosen": 1.3303930759429932, "logits/rejected": 1.7013771533966064, "logps/chosen": -256.53955078125, "logps/rejected": -268.72650146484375, "loss": 4996.8453, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.004846959840506315, "rewards/margins": 0.0019053838914260268, "rewards/rejected": 0.0029415760654956102, "step": 25 }, { "debug/policy_chosen_logits": 1.4970160722732544, "debug/policy_chosen_logps": -225.7395477294922, "debug/policy_rejected_logits": 1.8431037664413452, "debug/policy_rejected_logps": -272.37225341796875, "debug/reference_chosen_logps": -226.3221893310547, "debug/reference_rejected_logps": -272.57330322265625, "debug/sppo_chosen_loss": 2442.823486328125, "debug/sppo_chosen_reward_in_loss": 0.5826284289360046, "debug/sppo_rej_reward_in_loss": 0.201019287109375, "debug/sppo_reject_loss": 2520.621337890625, "epoch": 0.10869565217391304, "grad_norm": 60646.590277693744, "learning_rate": 3e-08, "logits/chosen": 1.4970160722732544, "logits/rejected": 1.8431037664413452, "logps/chosen": -225.7395477294922, "logps/rejected": -272.37225341796875, "loss": 4999.957, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.005826284177601337, "rewards/margins": 0.003816091688349843, "rewards/rejected": 0.0020101929549127817, "step": 30 }, { "debug/policy_chosen_logits": 1.5928256511688232, "debug/policy_chosen_logps": -250.4981689453125, "debug/policy_rejected_logits": 1.9174392223358154, "debug/policy_rejected_logps": -278.39886474609375, "debug/reference_chosen_logps": -250.8056640625, "debug/reference_rejected_logps": -278.73834228515625, "debug/sppo_chosen_loss": 2470.23095703125, "debug/sppo_chosen_reward_in_loss": 0.30749550461769104, "debug/sppo_rej_reward_in_loss": 0.3394853472709656, "debug/sppo_reject_loss": 2535.08154296875, "epoch": 0.12681159420289856, "grad_norm": 60096.07003172631, "learning_rate": 3.4999999999999996e-08, "logits/chosen": 1.5928256511688232, "logits/rejected": 1.9174392223358154, "logps/chosen": -250.4981689453125, "logps/rejected": -278.39886474609375, "loss": 5004.3445, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0030749549623578787, "rewards/margins": -0.00031989876879379153, "rewards/rejected": 0.003394853789359331, "step": 35 }, { "debug/policy_chosen_logits": 1.490484595298767, "debug/policy_chosen_logps": -258.9418029785156, "debug/policy_rejected_logits": 1.817221999168396, "debug/policy_rejected_logps": -306.5070495605469, "debug/reference_chosen_logps": -259.2444152832031, "debug/reference_rejected_logps": -306.8253479003906, "debug/sppo_chosen_loss": 2470.703369140625, "debug/sppo_chosen_reward_in_loss": 0.3025951385498047, "debug/sppo_rej_reward_in_loss": 0.31828880310058594, "debug/sppo_reject_loss": 2532.748046875, "epoch": 0.14492753623188406, "grad_norm": 60616.997755284705, "learning_rate": 4e-08, "logits/chosen": 1.490484595298767, "logits/rejected": 1.817221999168396, "logps/chosen": -258.9418029785156, "logps/rejected": -306.5070495605469, "loss": 5000.4187, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0030259513296186924, "rewards/margins": -0.00015693642490077764, "rewards/rejected": 0.0031828880310058594, "step": 40 }, { "debug/policy_chosen_logits": 1.4415340423583984, "debug/policy_chosen_logps": -251.431884765625, "debug/policy_rejected_logits": 1.6230499744415283, "debug/policy_rejected_logps": -283.16357421875, "debug/reference_chosen_logps": -251.584228515625, "debug/reference_rejected_logps": -283.26763916015625, "debug/sppo_chosen_loss": 2485.560302734375, "debug/sppo_chosen_reward_in_loss": 0.1523365080356598, "debug/sppo_rej_reward_in_loss": 0.10411567986011505, "debug/sppo_reject_loss": 2511.12841796875, "epoch": 0.16304347826086957, "grad_norm": 73372.47642524718, "learning_rate": 4.5e-08, "logits/chosen": 1.4415340423583984, "logits/rejected": 1.6230499744415283, "logps/chosen": -251.431884765625, "logps/rejected": -283.16357421875, "loss": 5000.591, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0015233650337904692, "rewards/margins": 0.0004822082701139152, "rewards/rejected": 0.0010411568218842149, "step": 45 }, { "debug/policy_chosen_logits": 1.3510771989822388, "debug/policy_chosen_logps": -244.49349975585938, "debug/policy_rejected_logits": 1.787461519241333, "debug/policy_rejected_logps": -291.9498596191406, "debug/reference_chosen_logps": -244.6476287841797, "debug/reference_rejected_logps": -292.14788818359375, "debug/sppo_chosen_loss": 2485.240234375, "debug/sppo_chosen_reward_in_loss": 0.154139906167984, "debug/sppo_rej_reward_in_loss": 0.1980735808610916, "debug/sppo_reject_loss": 2520.5009765625, "epoch": 0.18115942028985507, "grad_norm": 59287.09347582178, "learning_rate": 5e-08, "logits/chosen": 1.3510771989822388, "logits/rejected": 1.787461519241333, "logps/chosen": -244.49349975585938, "logps/rejected": -291.9498596191406, "loss": 5008.6457, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0015413989312946796, "rewards/margins": -0.00043933698907494545, "rewards/rejected": 0.0019807356875389814, "step": 50 }, { "debug/policy_chosen_logits": 1.3307476043701172, "debug/policy_chosen_logps": -256.0205078125, "debug/policy_rejected_logits": 1.8728010654449463, "debug/policy_rejected_logps": -305.50537109375, "debug/reference_chosen_logps": -255.957275390625, "debug/reference_rejected_logps": -305.47186279296875, "debug/sppo_chosen_loss": 2507.0, "debug/sppo_chosen_reward_in_loss": -0.06325797736644745, "debug/sppo_rej_reward_in_loss": -0.03349952772259712, "debug/sppo_reject_loss": 2497.160888671875, "epoch": 0.19927536231884058, "grad_norm": 67813.2875580145, "learning_rate": 5.5e-08, "logits/chosen": 1.3307476043701172, "logits/rejected": 1.8728010654449463, "logps/chosen": -256.0205078125, "logps/rejected": -305.50537109375, "loss": 4999.1008, "rewards/accuracies": 0.5, "rewards/chosen": -0.0006325797294266522, "rewards/margins": -0.00029758457094430923, "rewards/rejected": -0.0003349951875861734, "step": 55 }, { "debug/policy_chosen_logits": 1.5255582332611084, "debug/policy_chosen_logps": -259.21923828125, "debug/policy_rejected_logits": 1.826495885848999, "debug/policy_rejected_logps": -309.2088928222656, "debug/reference_chosen_logps": -259.2587585449219, "debug/reference_rejected_logps": -309.2173767089844, "debug/sppo_chosen_loss": 2496.88720703125, "debug/sppo_chosen_reward_in_loss": 0.039513397961854935, "debug/sppo_rej_reward_in_loss": 0.00846939068287611, "debug/sppo_reject_loss": 2501.68115234375, "epoch": 0.21739130434782608, "grad_norm": 64102.46291248613, "learning_rate": 6e-08, "logits/chosen": 1.5255582332611084, "logits/rejected": 1.826495885848999, "logps/chosen": -259.21923828125, "logps/rejected": -309.2088928222656, "loss": 4999.0242, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.00039513400406576693, "rewards/margins": 0.0003104399947915226, "rewards/rejected": 8.469391468679532e-05, "step": 60 }, { "debug/policy_chosen_logits": 1.478100299835205, "debug/policy_chosen_logps": -272.63275146484375, "debug/policy_rejected_logits": 1.856560468673706, "debug/policy_rejected_logps": -295.95751953125, "debug/reference_chosen_logps": -272.57183837890625, "debug/reference_rejected_logps": -295.93487548828125, "debug/sppo_chosen_loss": 2506.627685546875, "debug/sppo_chosen_reward_in_loss": -0.06086692959070206, "debug/sppo_rej_reward_in_loss": -0.022687530145049095, "debug/sppo_reject_loss": 2498.60009765625, "epoch": 0.23550724637681159, "grad_norm": 68724.79615678519, "learning_rate": 6.5e-08, "logits/chosen": 1.478100299835205, "logits/rejected": 1.856560468673706, "logps/chosen": -272.63275146484375, "logps/rejected": -295.95751953125, "loss": 4994.9328, "rewards/accuracies": 0.375, "rewards/chosen": -0.0006086693028919399, "rewards/margins": -0.0003817938850261271, "rewards/rejected": -0.00022687530145049095, "step": 65 }, { "debug/policy_chosen_logits": 1.642473816871643, "debug/policy_chosen_logps": -266.3983459472656, "debug/policy_rejected_logits": 1.629185676574707, "debug/policy_rejected_logps": -279.23944091796875, "debug/reference_chosen_logps": -266.346435546875, "debug/reference_rejected_logps": -279.0445861816406, "debug/sppo_chosen_loss": 2505.703857421875, "debug/sppo_chosen_reward_in_loss": -0.05189533159136772, "debug/sppo_rej_reward_in_loss": -0.1948680877685547, "debug/sppo_reject_loss": 2481.307861328125, "epoch": 0.2536231884057971, "grad_norm": 57009.90149130299, "learning_rate": 6.999999999999999e-08, "logits/chosen": 1.642473816871643, "logits/rejected": 1.629185676574707, "logps/chosen": -266.3983459472656, "logps/rejected": -279.23944091796875, "loss": 4986.9828, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0005189533694647253, "rewards/margins": 0.0014297275338321924, "rewards/rejected": -0.0019486809615045786, "step": 70 }, { "debug/policy_chosen_logits": 1.3951247930526733, "debug/policy_chosen_logps": -241.9928436279297, "debug/policy_rejected_logits": 1.7822593450546265, "debug/policy_rejected_logps": -272.57305908203125, "debug/reference_chosen_logps": -241.78036499023438, "debug/reference_rejected_logps": -272.10552978515625, "debug/sppo_chosen_loss": 2522.02294921875, "debug/sppo_chosen_reward_in_loss": -0.2124733030796051, "debug/sppo_rej_reward_in_loss": -0.46752509474754333, "debug/sppo_reject_loss": 2454.223388671875, "epoch": 0.2717391304347826, "grad_norm": 56506.439537798804, "learning_rate": 7.5e-08, "logits/chosen": 1.3951247930526733, "logits/rejected": 1.7822593450546265, "logps/chosen": -241.9928436279297, "logps/rejected": -272.57305908203125, "loss": 4982.825, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0021247328259050846, "rewards/margins": 0.0025505179073661566, "rewards/rejected": -0.0046752505004405975, "step": 75 }, { "debug/policy_chosen_logits": 1.358420491218567, "debug/policy_chosen_logps": -263.78192138671875, "debug/policy_rejected_logits": 1.7411441802978516, "debug/policy_rejected_logps": -292.9356689453125, "debug/reference_chosen_logps": -263.3627014160156, "debug/reference_rejected_logps": -292.67999267578125, "debug/sppo_chosen_loss": 2542.97607421875, "debug/sppo_chosen_reward_in_loss": -0.4192062318325043, "debug/sppo_rej_reward_in_loss": -0.2556942105293274, "debug/sppo_reject_loss": 2475.31298828125, "epoch": 0.2898550724637681, "grad_norm": 58766.825046566, "learning_rate": 8e-08, "logits/chosen": 1.358420491218567, "logits/rejected": 1.7411441802978516, "logps/chosen": -263.78192138671875, "logps/rejected": -292.9356689453125, "loss": 4985.0809, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00419206265360117, "rewards/margins": -0.0016351204831153154, "rewards/rejected": -0.0025569419376552105, "step": 80 }, { "debug/policy_chosen_logits": 1.4952151775360107, "debug/policy_chosen_logps": -239.63442993164062, "debug/policy_rejected_logits": 2.120744228363037, "debug/policy_rejected_logps": -303.7784729003906, "debug/reference_chosen_logps": -239.67935180664062, "debug/reference_rejected_logps": -303.4689636230469, "debug/sppo_chosen_loss": 2496.201171875, "debug/sppo_chosen_reward_in_loss": 0.04491062089800835, "debug/sppo_rej_reward_in_loss": -0.30949363112449646, "debug/sppo_reject_loss": 2469.991455078125, "epoch": 0.3079710144927536, "grad_norm": 70435.71301625106, "learning_rate": 8.5e-08, "logits/chosen": 1.4952151775360107, "logits/rejected": 2.120744228363037, "logps/chosen": -239.63442993164062, "logps/rejected": -303.7784729003906, "loss": 4982.057, "rewards/accuracies": 0.625, "rewards/chosen": 0.0004491062427405268, "rewards/margins": 0.0035440423525869846, "rewards/rejected": -0.00309493625536561, "step": 85 }, { "debug/policy_chosen_logits": 1.2319533824920654, "debug/policy_chosen_logps": -252.02392578125, "debug/policy_rejected_logits": 1.4156310558319092, "debug/policy_rejected_logps": -281.8111267089844, "debug/reference_chosen_logps": -251.71041870117188, "debug/reference_rejected_logps": -281.41546630859375, "debug/sppo_chosen_loss": 2532.412109375, "debug/sppo_chosen_reward_in_loss": -0.31351929903030396, "debug/sppo_rej_reward_in_loss": -0.3956874907016754, "debug/sppo_reject_loss": 2461.616455078125, "epoch": 0.32608695652173914, "grad_norm": 62105.83167382246, "learning_rate": 9e-08, "logits/chosen": 1.2319533824920654, "logits/rejected": 1.4156310558319092, "logps/chosen": -252.02392578125, "logps/rejected": -281.8111267089844, "loss": 4979.9766, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.003135192673653364, "rewards/margins": 0.0008216820424422622, "rewards/rejected": -0.003956874832510948, "step": 90 }, { "debug/policy_chosen_logits": 1.4233776330947876, "debug/policy_chosen_logps": -250.4988250732422, "debug/policy_rejected_logits": 1.7451362609863281, "debug/policy_rejected_logps": -265.4994201660156, "debug/reference_chosen_logps": -250.3922119140625, "debug/reference_rejected_logps": -265.2495422363281, "debug/sppo_chosen_loss": 2511.2119140625, "debug/sppo_chosen_reward_in_loss": -0.10661010444164276, "debug/sppo_rej_reward_in_loss": -0.24988651275634766, "debug/sppo_reject_loss": 2475.833251953125, "epoch": 0.3442028985507246, "grad_norm": 73268.9034996652, "learning_rate": 9.499999999999999e-08, "logits/chosen": 1.4233776330947876, "logits/rejected": 1.7451362609863281, "logps/chosen": -250.4988250732422, "logps/rejected": -265.4994201660156, "loss": 4966.9859, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0010661011328920722, "rewards/margins": 0.0014327641110867262, "rewards/rejected": -0.00249886536039412, "step": 95 }, { "debug/policy_chosen_logits": 1.1827932596206665, "debug/policy_chosen_logps": -229.2421875, "debug/policy_rejected_logits": 1.8669897317886353, "debug/policy_rejected_logps": -291.1219787597656, "debug/reference_chosen_logps": -229.11221313476562, "debug/reference_rejected_logps": -290.591552734375, "debug/sppo_chosen_loss": 2513.80908203125, "debug/sppo_chosen_reward_in_loss": -0.1299573928117752, "debug/sppo_rej_reward_in_loss": -0.5303934216499329, "debug/sppo_reject_loss": 2448.19775390625, "epoch": 0.36231884057971014, "grad_norm": 56415.03063351695, "learning_rate": 1e-07, "logits/chosen": 1.1827932596206665, "logits/rejected": 1.8669897317886353, "logps/chosen": -229.2421875, "logps/rejected": -291.1219787597656, "loss": 4975.3273, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0012995738070458174, "rewards/margins": 0.004004360176622868, "rewards/rejected": -0.005303933285176754, "step": 100 }, { "epoch": 0.36231884057971014, "eval_debug/policy_chosen_logits": 1.637236475944519, "eval_debug/policy_chosen_logps": -253.25315856933594, "eval_debug/policy_rejected_logits": 1.7010170221328735, "eval_debug/policy_rejected_logps": -260.0372619628906, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2534.367919921875, "eval_debug/sppo_chosen_reward_in_loss": -0.33469125628471375, "eval_debug/sppo_rej_reward_in_loss": -0.37861743569374084, "eval_debug/sppo_reject_loss": 2463.385986328125, "eval_logits/chosen": 1.637236475944519, "eval_logits/rejected": 1.7010170221328735, "eval_logps/chosen": -253.25315856933594, "eval_logps/rejected": -260.0372619628906, "eval_loss": 4981.64892578125, "eval_rewards/accuracies": 0.46052631735801697, "eval_rewards/chosen": -0.0033469130285084248, "eval_rewards/margins": 0.0004392618138808757, "eval_rewards/rejected": -0.00378617481328547, "eval_runtime": 28.5662, "eval_samples_per_second": 21.004, "eval_steps_per_second": 0.665, "step": 100 }, { "debug/policy_chosen_logits": 1.1154356002807617, "debug/policy_chosen_logps": -245.5152587890625, "debug/policy_rejected_logits": 1.2336546182632446, "debug/policy_rejected_logps": -293.0606994628906, "debug/reference_chosen_logps": -246.10745239257812, "debug/reference_rejected_logps": -293.17578125, "debug/sppo_chosen_loss": 2442.190673828125, "debug/sppo_chosen_reward_in_loss": 0.5921922922134399, "debug/sppo_rej_reward_in_loss": 0.11505775153636932, "debug/sppo_reject_loss": 2512.83349609375, "epoch": 0.3804347826086957, "grad_norm": 63705.659163213895, "learning_rate": 9.97628083491461e-08, "logits/chosen": 1.1154356002807617, "logits/rejected": 1.2336546182632446, "logps/chosen": -245.5152587890625, "logps/rejected": -293.0606994628906, "loss": 4948.6871, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0059219226241111755, "rewards/margins": 0.004771345295011997, "rewards/rejected": 0.001150577561929822, "step": 105 }, { "debug/policy_chosen_logits": 1.2632733583450317, "debug/policy_chosen_logps": -251.89804077148438, "debug/policy_rejected_logits": 1.5536165237426758, "debug/policy_rejected_logps": -277.15899658203125, "debug/reference_chosen_logps": -251.8131561279297, "debug/reference_rejected_logps": -276.50152587890625, "debug/sppo_chosen_loss": 2509.01318359375, "debug/sppo_chosen_reward_in_loss": -0.08488330990076065, "debug/sppo_rej_reward_in_loss": -0.6574586629867554, "debug/sppo_reject_loss": 2435.74853515625, "epoch": 0.39855072463768115, "grad_norm": 62131.983983858336, "learning_rate": 9.952561669829221e-08, "logits/chosen": 1.2632733583450317, "logits/rejected": 1.5536165237426758, "logps/chosen": -251.89804077148438, "logps/rejected": -277.15899658203125, "loss": 4955.5793, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0008488328894600272, "rewards/margins": 0.005725753493607044, "rewards/rejected": -0.006574586965143681, "step": 110 }, { "debug/policy_chosen_logits": 1.5175397396087646, "debug/policy_chosen_logps": -240.87155151367188, "debug/policy_rejected_logits": 1.90229070186615, "debug/policy_rejected_logps": -302.7718811035156, "debug/reference_chosen_logps": -240.13211059570312, "debug/reference_rejected_logps": -301.23028564453125, "debug/sppo_chosen_loss": 2575.89111328125, "debug/sppo_chosen_reward_in_loss": -0.7394704818725586, "debug/sppo_rej_reward_in_loss": -1.5416189432144165, "debug/sppo_reject_loss": 2350.080810546875, "epoch": 0.4166666666666667, "grad_norm": 59914.79467332017, "learning_rate": 9.928842504743833e-08, "logits/chosen": 1.5175397396087646, "logits/rejected": 1.90229070186615, "logps/chosen": -240.87155151367188, "logps/rejected": -302.7718811035156, "loss": 4954.3, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007394704967737198, "rewards/margins": 0.00802148412913084, "rewards/rejected": -0.015416188165545464, "step": 115 }, { "debug/policy_chosen_logits": 1.3039615154266357, "debug/policy_chosen_logps": -239.6048583984375, "debug/policy_rejected_logits": 1.6626259088516235, "debug/policy_rejected_logps": -276.4693908691406, "debug/reference_chosen_logps": -238.712158203125, "debug/reference_rejected_logps": -275.2547302246094, "debug/sppo_chosen_loss": 2591.43701171875, "debug/sppo_chosen_reward_in_loss": -0.8927196264266968, "debug/sppo_rej_reward_in_loss": -1.2145980596542358, "debug/sppo_reject_loss": 2381.885009765625, "epoch": 0.43478260869565216, "grad_norm": 72052.94180783679, "learning_rate": 9.905123339658443e-08, "logits/chosen": 1.3039615154266357, "logits/rejected": 1.6626259088516235, "logps/chosen": -239.6048583984375, "logps/rejected": -276.4693908691406, "loss": 4962.4031, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.008927195332944393, "rewards/margins": 0.003218784462660551, "rewards/rejected": -0.012145979329943657, "step": 120 }, { "debug/policy_chosen_logits": 1.3840808868408203, "debug/policy_chosen_logps": -240.65328979492188, "debug/policy_rejected_logits": 1.6251329183578491, "debug/policy_rejected_logps": -272.1733703613281, "debug/reference_chosen_logps": -240.1095428466797, "debug/reference_rejected_logps": -271.2296447753906, "debug/sppo_chosen_loss": 2555.42919921875, "debug/sppo_chosen_reward_in_loss": -0.5437628030776978, "debug/sppo_rej_reward_in_loss": -0.9437026977539062, "debug/sppo_reject_loss": 2408.338623046875, "epoch": 0.4528985507246377, "grad_norm": 142634.81996945245, "learning_rate": 9.881404174573055e-08, "logits/chosen": 1.3840808868408203, "logits/rejected": 1.6251329183578491, "logps/chosen": -240.65328979492188, "logps/rejected": -272.1733703613281, "loss": 4961.8859, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005437628366053104, "rewards/margins": 0.003999399021267891, "rewards/rejected": -0.00943702645599842, "step": 125 }, { "debug/policy_chosen_logits": 1.1763654947280884, "debug/policy_chosen_logps": -251.47268676757812, "debug/policy_rejected_logits": 1.6029510498046875, "debug/policy_rejected_logps": -297.7487487792969, "debug/reference_chosen_logps": -250.86929321289062, "debug/reference_rejected_logps": -296.35479736328125, "debug/sppo_chosen_loss": 2562.08056640625, "debug/sppo_chosen_reward_in_loss": -0.6033927798271179, "debug/sppo_rej_reward_in_loss": -1.3939769268035889, "debug/sppo_reject_loss": 2363.89013671875, "epoch": 0.47101449275362317, "grad_norm": 62548.25971245636, "learning_rate": 9.857685009487665e-08, "logits/chosen": 1.1763654947280884, "logits/rejected": 1.6029510498046875, "logps/chosen": -251.47268676757812, "logps/rejected": -297.7487487792969, "loss": 4954.1355, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.006033928133547306, "rewards/margins": 0.007905842736363411, "rewards/rejected": -0.013939769938588142, "step": 130 }, { "debug/policy_chosen_logits": 1.3112818002700806, "debug/policy_chosen_logps": -248.6862335205078, "debug/policy_rejected_logits": 1.7784364223480225, "debug/policy_rejected_logps": -299.6939392089844, "debug/reference_chosen_logps": -247.79171752929688, "debug/reference_rejected_logps": -298.0932312011719, "debug/sppo_chosen_loss": 2592.22607421875, "debug/sppo_chosen_reward_in_loss": -0.8945201635360718, "debug/sppo_rej_reward_in_loss": -1.6007232666015625, "debug/sppo_reject_loss": 2344.56787109375, "epoch": 0.4891304347826087, "grad_norm": 55460.956888231085, "learning_rate": 9.833965844402277e-08, "logits/chosen": 1.3112818002700806, "logits/rejected": 1.7784364223480225, "logps/chosen": -248.6862335205078, "logps/rejected": -299.6939392089844, "loss": 4941.0723, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.008945201523602009, "rewards/margins": 0.0070620314218103886, "rewards/rejected": -0.016007233411073685, "step": 135 }, { "debug/policy_chosen_logits": 1.6897640228271484, "debug/policy_chosen_logps": -267.25457763671875, "debug/policy_rejected_logits": 1.9849485158920288, "debug/policy_rejected_logps": -309.496826171875, "debug/reference_chosen_logps": -266.6741638183594, "debug/reference_rejected_logps": -308.5638427734375, "debug/sppo_chosen_loss": 2559.904541015625, "debug/sppo_chosen_reward_in_loss": -0.5804330706596375, "debug/sppo_rej_reward_in_loss": -0.9329582452774048, "debug/sppo_reject_loss": 2409.60546875, "epoch": 0.5072463768115942, "grad_norm": 92713.31204857262, "learning_rate": 9.810246679316887e-08, "logits/chosen": 1.6897640228271484, "logits/rejected": 1.9849485158920288, "logps/chosen": -267.25457763671875, "logps/rejected": -309.496826171875, "loss": 4952.791, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005804331041872501, "rewards/margins": 0.003525251056998968, "rewards/rejected": -0.009329581633210182, "step": 140 }, { "debug/policy_chosen_logits": 1.2088998556137085, "debug/policy_chosen_logps": -241.20230102539062, "debug/policy_rejected_logits": 1.5338783264160156, "debug/policy_rejected_logps": -282.41192626953125, "debug/reference_chosen_logps": -240.69192504882812, "debug/reference_rejected_logps": -281.6559753417969, "debug/sppo_chosen_loss": 2552.27294921875, "debug/sppo_chosen_reward_in_loss": -0.5103633999824524, "debug/sppo_rej_reward_in_loss": -0.7559680938720703, "debug/sppo_reject_loss": 2426.20654296875, "epoch": 0.5253623188405797, "grad_norm": 62190.797245739384, "learning_rate": 9.786527514231498e-08, "logits/chosen": 1.2088998556137085, "logits/rejected": 1.5338783264160156, "logps/chosen": -241.20230102539062, "logps/rejected": -282.41192626953125, "loss": 4959.4836, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.005103633739054203, "rewards/margins": 0.0024560471065342426, "rewards/rejected": -0.007559680845588446, "step": 145 }, { "debug/policy_chosen_logits": 1.5628173351287842, "debug/policy_chosen_logps": -267.08331298828125, "debug/policy_rejected_logits": 2.0287530422210693, "debug/policy_rejected_logps": -296.8517150878906, "debug/reference_chosen_logps": -266.7818298339844, "debug/reference_rejected_logps": -295.389892578125, "debug/sppo_chosen_loss": 2531.56689453125, "debug/sppo_chosen_reward_in_loss": -0.30144691467285156, "debug/sppo_rej_reward_in_loss": -1.4618279933929443, "debug/sppo_reject_loss": 2357.379638671875, "epoch": 0.5434782608695652, "grad_norm": 115434.88549840715, "learning_rate": 9.76280834914611e-08, "logits/chosen": 1.5628173351287842, "logits/rejected": 2.0287530422210693, "logps/chosen": -267.08331298828125, "logps/rejected": -296.8517150878906, "loss": 4940.8695, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.003014469053596258, "rewards/margins": 0.011603811755776405, "rewards/rejected": -0.014618280343711376, "step": 150 }, { "debug/policy_chosen_logits": 1.3465659618377686, "debug/policy_chosen_logps": -241.53561401367188, "debug/policy_rejected_logits": 1.6948206424713135, "debug/policy_rejected_logps": -277.7395324707031, "debug/reference_chosen_logps": -240.8543701171875, "debug/reference_rejected_logps": -276.788818359375, "debug/sppo_chosen_loss": 2569.73583984375, "debug/sppo_chosen_reward_in_loss": -0.6812616586685181, "debug/sppo_rej_reward_in_loss": -0.9507232904434204, "debug/sppo_reject_loss": 2407.25927734375, "epoch": 0.5615942028985508, "grad_norm": 73209.56469136247, "learning_rate": 9.73908918406072e-08, "logits/chosen": 1.3465659618377686, "logits/rejected": 1.6948206424713135, "logps/chosen": -241.53561401367188, "logps/rejected": -277.7395324707031, "loss": 4947.9406, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.006812616251409054, "rewards/margins": 0.002694616327062249, "rewards/rejected": -0.009507233276963234, "step": 155 }, { "debug/policy_chosen_logits": 1.0786606073379517, "debug/policy_chosen_logps": -254.95944213867188, "debug/policy_rejected_logits": 1.536789894104004, "debug/policy_rejected_logps": -297.4348449707031, "debug/reference_chosen_logps": -254.5387420654297, "debug/reference_rejected_logps": -295.9001770019531, "debug/sppo_chosen_loss": 2543.41015625, "debug/sppo_chosen_reward_in_loss": -0.4207225739955902, "debug/sppo_rej_reward_in_loss": -1.5346710681915283, "debug/sppo_reject_loss": 2351.199951171875, "epoch": 0.5797101449275363, "grad_norm": 122373.82444624687, "learning_rate": 9.715370018975332e-08, "logits/chosen": 1.0786606073379517, "logits/rejected": 1.536789894104004, "logps/chosen": -254.95944213867188, "logps/rejected": -297.4348449707031, "loss": 4959.1637, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.004207225516438484, "rewards/margins": 0.011139484122395515, "rewards/rejected": -0.015346708707511425, "step": 160 }, { "debug/policy_chosen_logits": 1.5776184797286987, "debug/policy_chosen_logps": -269.88970947265625, "debug/policy_rejected_logits": 1.4022572040557861, "debug/policy_rejected_logps": -263.2962951660156, "debug/reference_chosen_logps": -269.4091491699219, "debug/reference_rejected_logps": -262.3013610839844, "debug/sppo_chosen_loss": 2550.29736328125, "debug/sppo_chosen_reward_in_loss": -0.4805648922920227, "debug/sppo_rej_reward_in_loss": -0.9949352145195007, "debug/sppo_reject_loss": 2403.582275390625, "epoch": 0.5978260869565217, "grad_norm": 85426.22464427097, "learning_rate": 9.691650853889942e-08, "logits/chosen": 1.5776184797286987, "logits/rejected": 1.4022572040557861, "logps/chosen": -269.88970947265625, "logps/rejected": -263.2962951660156, "loss": 4934.6766, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0048056491650640965, "rewards/margins": 0.005143703427165747, "rewards/rejected": -0.009949352592229843, "step": 165 }, { "debug/policy_chosen_logits": 1.2946946620941162, "debug/policy_chosen_logps": -237.68826293945312, "debug/policy_rejected_logits": 1.6014455556869507, "debug/policy_rejected_logps": -291.9438781738281, "debug/reference_chosen_logps": -238.73080444335938, "debug/reference_rejected_logps": -293.11700439453125, "debug/sppo_chosen_loss": 2399.82275390625, "debug/sppo_chosen_reward_in_loss": 1.042567253112793, "debug/sppo_rej_reward_in_loss": 1.1731163263320923, "debug/sppo_reject_loss": 2622.353759765625, "epoch": 0.6159420289855072, "grad_norm": 63590.22812250166, "learning_rate": 9.667931688804554e-08, "logits/chosen": 1.2946946620941162, "logits/rejected": 1.6014455556869507, "logps/chosen": -237.68826293945312, "logps/rejected": -291.9438781738281, "loss": 4977.4523, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.010425671935081482, "rewards/margins": -0.0013054904993623495, "rewards/rejected": 0.011731162667274475, "step": 170 }, { "debug/policy_chosen_logits": 1.42464280128479, "debug/policy_chosen_logps": -262.2657165527344, "debug/policy_rejected_logits": 1.8595415353775024, "debug/policy_rejected_logps": -308.47412109375, "debug/reference_chosen_logps": -263.7447814941406, "debug/reference_rejected_logps": -309.0677795410156, "debug/sppo_chosen_loss": 2357.06689453125, "debug/sppo_chosen_reward_in_loss": 1.4790408611297607, "debug/sppo_rej_reward_in_loss": 0.5936603546142578, "debug/sppo_reject_loss": 2565.474609375, "epoch": 0.6340579710144928, "grad_norm": 64425.147966590055, "learning_rate": 9.644212523719165e-08, "logits/chosen": 1.42464280128479, "logits/rejected": 1.8595415353775024, "logps/chosen": -262.2657165527344, "logps/rejected": -308.47412109375, "loss": 4937.791, "rewards/accuracies": 0.625, "rewards/chosen": 0.014790408313274384, "rewards/margins": 0.008853806182742119, "rewards/rejected": 0.005936603061854839, "step": 175 }, { "debug/policy_chosen_logits": 1.2175884246826172, "debug/policy_chosen_logps": -246.80810546875, "debug/policy_rejected_logits": 1.5828646421432495, "debug/policy_rejected_logps": -289.2925720214844, "debug/reference_chosen_logps": -246.3313446044922, "debug/reference_rejected_logps": -288.1348876953125, "debug/sppo_chosen_loss": 2550.687255859375, "debug/sppo_chosen_reward_in_loss": -0.4767661988735199, "debug/sppo_rej_reward_in_loss": -1.15768301486969, "debug/sppo_reject_loss": 2392.44384765625, "epoch": 0.6521739130434783, "grad_norm": 74341.59346639444, "learning_rate": 9.620493358633775e-08, "logits/chosen": 1.2175884246826172, "logits/rejected": 1.5828646421432495, "logps/chosen": -246.80810546875, "logps/rejected": -289.2925720214844, "loss": 4913.027, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00476766237989068, "rewards/margins": 0.006809167563915253, "rewards/rejected": -0.011576829478144646, "step": 180 }, { "debug/policy_chosen_logits": 1.1927402019500732, "debug/policy_chosen_logps": -248.07931518554688, "debug/policy_rejected_logits": 1.6495721340179443, "debug/policy_rejected_logps": -322.205810546875, "debug/reference_chosen_logps": -246.93405151367188, "debug/reference_rejected_logps": -319.759765625, "debug/sppo_chosen_loss": 2617.726318359375, "debug/sppo_chosen_reward_in_loss": -1.1452611684799194, "debug/sppo_rej_reward_in_loss": -2.4460387229919434, "debug/sppo_reject_loss": 2267.54248046875, "epoch": 0.6702898550724637, "grad_norm": 61789.91322959805, "learning_rate": 9.596774193548388e-08, "logits/chosen": 1.1927402019500732, "logits/rejected": 1.6495721340179443, "logps/chosen": -248.07931518554688, "logps/rejected": -322.205810546875, "loss": 4906.6523, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.011452612467110157, "rewards/margins": 0.013007773086428642, "rewards/rejected": -0.024460386484861374, "step": 185 }, { "debug/policy_chosen_logits": 1.205572485923767, "debug/policy_chosen_logps": -251.21682739257812, "debug/policy_rejected_logits": 1.27708101272583, "debug/policy_rejected_logps": -275.42572021484375, "debug/reference_chosen_logps": -250.4452362060547, "debug/reference_rejected_logps": -273.4372863769531, "debug/sppo_chosen_loss": 2580.40380859375, "debug/sppo_chosen_reward_in_loss": -0.7715753316879272, "debug/sppo_rej_reward_in_loss": -1.9884198904037476, "debug/sppo_reject_loss": 2313.635498046875, "epoch": 0.6884057971014492, "grad_norm": 58225.69051231545, "learning_rate": 9.573055028462997e-08, "logits/chosen": 1.205572485923767, "logits/rejected": 1.27708101272583, "logps/chosen": -251.21682739257812, "logps/rejected": -275.42572021484375, "loss": 4925.2672, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.007715752813965082, "rewards/margins": 0.012168444693088531, "rewards/rejected": -0.019884198904037476, "step": 190 }, { "debug/policy_chosen_logits": 1.7537024021148682, "debug/policy_chosen_logps": -272.2340087890625, "debug/policy_rejected_logits": 2.1274123191833496, "debug/policy_rejected_logps": -313.24383544921875, "debug/reference_chosen_logps": -270.6104736328125, "debug/reference_rejected_logps": -310.8916320800781, "debug/sppo_chosen_loss": 2673.464111328125, "debug/sppo_chosen_reward_in_loss": -1.6235431432724, "debug/sppo_rej_reward_in_loss": -2.352172374725342, "debug/sppo_reject_loss": 2276.525390625, "epoch": 0.7065217391304348, "grad_norm": 59780.27359617854, "learning_rate": 9.549335863377609e-08, "logits/chosen": 1.7537024021148682, "logits/rejected": 2.1274123191833496, "logps/chosen": -272.2340087890625, "logps/rejected": -313.24383544921875, "loss": 4934.2148, "rewards/accuracies": 0.625, "rewards/chosen": -0.016235431656241417, "rewards/margins": 0.007286292966455221, "rewards/rejected": -0.023521723225712776, "step": 195 }, { "debug/policy_chosen_logits": 1.2133315801620483, "debug/policy_chosen_logps": -283.06036376953125, "debug/policy_rejected_logits": 1.5332649946212769, "debug/policy_rejected_logps": -289.94671630859375, "debug/reference_chosen_logps": -282.74090576171875, "debug/reference_rejected_logps": -288.225341796875, "debug/sppo_chosen_loss": 2535.31103515625, "debug/sppo_chosen_reward_in_loss": -0.31944578886032104, "debug/sppo_rej_reward_in_loss": -1.7213935852050781, "debug/sppo_reject_loss": 2334.736328125, "epoch": 0.7246376811594203, "grad_norm": 59559.644946658744, "learning_rate": 9.525616698292219e-08, "logits/chosen": 1.2133315801620483, "logits/rejected": 1.5332649946212769, "logps/chosen": -283.06036376953125, "logps/rejected": -289.94671630859375, "loss": 4930.2141, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0031944583170115948, "rewards/margins": 0.014019479043781757, "rewards/rejected": -0.017213935032486916, "step": 200 }, { "epoch": 0.7246376811594203, "eval_debug/policy_chosen_logits": 1.6069599390029907, "eval_debug/policy_chosen_logps": -253.04759216308594, "eval_debug/policy_rejected_logits": 1.668046236038208, "eval_debug/policy_rejected_logps": -260.25958251953125, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2514.630859375, "eval_debug/sppo_chosen_reward_in_loss": -0.12910039722919464, "eval_debug/sppo_rej_reward_in_loss": -0.6009387969970703, "eval_debug/sppo_reject_loss": 2444.321044921875, "eval_logits/chosen": 1.6069599390029907, "eval_logits/rejected": 1.668046236038208, "eval_logps/chosen": -253.04759216308594, "eval_logps/rejected": -260.25958251953125, "eval_loss": 4924.06494140625, "eval_rewards/accuracies": 0.5789473652839661, "eval_rewards/chosen": -0.0012910040095448494, "eval_rewards/margins": 0.004718384239822626, "eval_rewards/rejected": -0.006009387783706188, "eval_runtime": 28.4556, "eval_samples_per_second": 21.085, "eval_steps_per_second": 0.668, "step": 200 }, { "debug/policy_chosen_logits": 1.3175169229507446, "debug/policy_chosen_logps": -251.24765014648438, "debug/policy_rejected_logits": 1.6718851327896118, "debug/policy_rejected_logps": -307.59417724609375, "debug/reference_chosen_logps": -250.2000274658203, "debug/reference_rejected_logps": -305.8931579589844, "debug/sppo_chosen_loss": 2608.48291015625, "debug/sppo_chosen_reward_in_loss": -1.0476433038711548, "debug/sppo_rej_reward_in_loss": -1.7010328769683838, "debug/sppo_reject_loss": 2337.03125, "epoch": 0.7427536231884058, "grad_norm": 68762.81932461073, "learning_rate": 9.501897533206831e-08, "logits/chosen": 1.3175169229507446, "logits/rejected": 1.6718851327896118, "logps/chosen": -251.24765014648438, "logps/rejected": -307.59417724609375, "loss": 4924.4742, "rewards/accuracies": 0.625, "rewards/chosen": -0.010476434603333473, "rewards/margins": 0.0065338946878910065, "rewards/rejected": -0.01701032742857933, "step": 205 }, { "debug/policy_chosen_logits": 1.1597055196762085, "debug/policy_chosen_logps": -259.6463317871094, "debug/policy_rejected_logits": 1.4596357345581055, "debug/policy_rejected_logps": -255.8791046142578, "debug/reference_chosen_logps": -258.54693603515625, "debug/reference_rejected_logps": -254.0518035888672, "debug/sppo_chosen_loss": 2614.744873046875, "debug/sppo_chosen_reward_in_loss": -1.099359154701233, "debug/sppo_rej_reward_in_loss": -1.8272931575775146, "debug/sppo_reject_loss": 2325.46875, "epoch": 0.7608695652173914, "grad_norm": 56683.52495823318, "learning_rate": 9.478178368121442e-08, "logits/chosen": 1.1597055196762085, "logits/rejected": 1.4596357345581055, "logps/chosen": -259.6463317871094, "logps/rejected": -255.8791046142578, "loss": 4912.3742, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010993590578436852, "rewards/margins": 0.007279340177774429, "rewards/rejected": -0.01827293261885643, "step": 210 }, { "debug/policy_chosen_logits": 1.431215524673462, "debug/policy_chosen_logps": -268.44384765625, "debug/policy_rejected_logits": 1.8261162042617798, "debug/policy_rejected_logps": -298.30633544921875, "debug/reference_chosen_logps": -267.0002136230469, "debug/reference_rejected_logps": -295.2477111816406, "debug/sppo_chosen_loss": 2651.647216796875, "debug/sppo_chosen_reward_in_loss": -1.4436086416244507, "debug/sppo_rej_reward_in_loss": -3.0586190223693848, "debug/sppo_reject_loss": 2214.093505859375, "epoch": 0.7789855072463768, "grad_norm": 57024.00722800529, "learning_rate": 9.454459203036053e-08, "logits/chosen": 1.431215524673462, "logits/rejected": 1.8261162042617798, "logps/chosen": -268.44384765625, "logps/rejected": -298.30633544921875, "loss": 4875.9406, "rewards/accuracies": 0.625, "rewards/chosen": -0.01443608570843935, "rewards/margins": 0.016150105744600296, "rewards/rejected": -0.030586188659071922, "step": 215 }, { "debug/policy_chosen_logits": 0.9344885945320129, "debug/policy_chosen_logps": -250.80398559570312, "debug/policy_rejected_logits": 1.2634727954864502, "debug/policy_rejected_logps": -276.9325866699219, "debug/reference_chosen_logps": -249.66796875, "debug/reference_rejected_logps": -274.86138916015625, "debug/sppo_chosen_loss": 2621.491943359375, "debug/sppo_chosen_reward_in_loss": -1.1359920501708984, "debug/sppo_rej_reward_in_loss": -2.0712063312530518, "debug/sppo_reject_loss": 2306.584716796875, "epoch": 0.7971014492753623, "grad_norm": 84242.43895561656, "learning_rate": 9.430740037950665e-08, "logits/chosen": 0.9344885945320129, "logits/rejected": 1.2634727954864502, "logps/chosen": -250.80398559570312, "logps/rejected": -276.9325866699219, "loss": 4890.2953, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.011359919793903828, "rewards/margins": 0.009352141991257668, "rewards/rejected": -0.02071206271648407, "step": 220 }, { "debug/policy_chosen_logits": 1.310813307762146, "debug/policy_chosen_logps": -283.4052429199219, "debug/policy_rejected_logits": 1.6033828258514404, "debug/policy_rejected_logps": -291.384033203125, "debug/reference_chosen_logps": -282.43328857421875, "debug/reference_rejected_logps": -289.2051696777344, "debug/sppo_chosen_loss": 2601.934326171875, "debug/sppo_chosen_reward_in_loss": -0.9719875454902649, "debug/sppo_rej_reward_in_loss": -2.17887544631958, "debug/sppo_reject_loss": 2294.928466796875, "epoch": 0.8152173913043478, "grad_norm": 67401.35089765802, "learning_rate": 9.407020872865274e-08, "logits/chosen": 1.310813307762146, "logits/rejected": 1.6033828258514404, "logps/chosen": -283.4052429199219, "logps/rejected": -291.384033203125, "loss": 4889.4094, "rewards/accuracies": 0.625, "rewards/chosen": -0.009719875641167164, "rewards/margins": 0.012068879790604115, "rewards/rejected": -0.02178875356912613, "step": 225 }, { "debug/policy_chosen_logits": 1.164006233215332, "debug/policy_chosen_logps": -233.8202362060547, "debug/policy_rejected_logits": 1.3819098472595215, "debug/policy_rejected_logps": -276.9102783203125, "debug/reference_chosen_logps": -233.37783813476562, "debug/reference_rejected_logps": -275.42218017578125, "debug/sppo_chosen_loss": 2548.687744140625, "debug/sppo_chosen_reward_in_loss": -0.44242897629737854, "debug/sppo_rej_reward_in_loss": -1.4880669116973877, "debug/sppo_reject_loss": 2358.2958984375, "epoch": 0.8333333333333334, "grad_norm": 63025.956451539554, "learning_rate": 9.383301707779886e-08, "logits/chosen": 1.164006233215332, "logits/rejected": 1.3819098472595215, "logps/chosen": -233.8202362060547, "logps/rejected": -276.9102783203125, "loss": 4892.3969, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004424289800226688, "rewards/margins": 0.010456378571689129, "rewards/rejected": -0.014880669303238392, "step": 230 }, { "debug/policy_chosen_logits": 1.3427813053131104, "debug/policy_chosen_logps": -252.94656372070312, "debug/policy_rejected_logits": 1.5185219049453735, "debug/policy_rejected_logps": -298.92926025390625, "debug/reference_chosen_logps": -251.4791259765625, "debug/reference_rejected_logps": -296.46527099609375, "debug/sppo_chosen_loss": 2654.405029296875, "debug/sppo_chosen_reward_in_loss": -1.4674131870269775, "debug/sppo_rej_reward_in_loss": -2.4640157222747803, "debug/sppo_reject_loss": 2269.97998046875, "epoch": 0.8514492753623188, "grad_norm": 100170.56994988552, "learning_rate": 9.359582542694496e-08, "logits/chosen": 1.3427813053131104, "logits/rejected": 1.5185219049453735, "logps/chosen": -252.94656372070312, "logps/rejected": -298.92926025390625, "loss": 4865.6945, "rewards/accuracies": 0.625, "rewards/chosen": -0.014674129895865917, "rewards/margins": 0.009966026991605759, "rewards/rejected": -0.024640154093503952, "step": 235 }, { "debug/policy_chosen_logits": 1.3361910581588745, "debug/policy_chosen_logps": -244.4169464111328, "debug/policy_rejected_logits": 1.9270985126495361, "debug/policy_rejected_logps": -300.205078125, "debug/reference_chosen_logps": -242.9955291748047, "debug/reference_rejected_logps": -297.2693786621094, "debug/sppo_chosen_loss": 2649.09912109375, "debug/sppo_chosen_reward_in_loss": -1.4214222431182861, "debug/sppo_rej_reward_in_loss": -2.935706377029419, "debug/sppo_reject_loss": 2222.06396484375, "epoch": 0.8695652173913043, "grad_norm": 58927.00519280313, "learning_rate": 9.335863377609108e-08, "logits/chosen": 1.3361910581588745, "logits/rejected": 1.9270985126495361, "logps/chosen": -244.4169464111328, "logps/rejected": -300.205078125, "loss": 4894.9539, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.014214222319424152, "rewards/margins": 0.015142843127250671, "rewards/rejected": -0.02935706451535225, "step": 240 }, { "debug/policy_chosen_logits": 1.0650449991226196, "debug/policy_chosen_logps": -233.67684936523438, "debug/policy_rejected_logits": 1.4125347137451172, "debug/policy_rejected_logps": -283.5912170410156, "debug/reference_chosen_logps": -233.43692016601562, "debug/reference_rejected_logps": -282.2871398925781, "debug/sppo_chosen_loss": 2530.022705078125, "debug/sppo_chosen_reward_in_loss": -0.2399454116821289, "debug/sppo_rej_reward_in_loss": -1.3040904998779297, "debug/sppo_reject_loss": 2381.484130859375, "epoch": 0.8876811594202898, "grad_norm": 64978.28173618321, "learning_rate": 9.312144212523719e-08, "logits/chosen": 1.0650449991226196, "logits/rejected": 1.4125347137451172, "logps/chosen": -233.67684936523438, "logps/rejected": -283.5912170410156, "loss": 4873.1453, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.002399453893303871, "rewards/margins": 0.010641450062394142, "rewards/rejected": -0.013040904887020588, "step": 245 }, { "debug/policy_chosen_logits": 1.1842572689056396, "debug/policy_chosen_logps": -276.47589111328125, "debug/policy_rejected_logits": 1.5513187646865845, "debug/policy_rejected_logps": -282.89385986328125, "debug/reference_chosen_logps": -276.27154541015625, "debug/reference_rejected_logps": -282.2914123535156, "debug/sppo_chosen_loss": 2526.110107421875, "debug/sppo_chosen_reward_in_loss": -0.20432281494140625, "debug/sppo_rej_reward_in_loss": -0.602450966835022, "debug/sppo_reject_loss": 2445.01513671875, "epoch": 0.9057971014492754, "grad_norm": 56707.006055481135, "learning_rate": 9.28842504743833e-08, "logits/chosen": 1.1842572689056396, "logits/rejected": 1.5513187646865845, "logps/chosen": -276.47589111328125, "logps/rejected": -282.89385986328125, "loss": 4922.093, "rewards/accuracies": 0.5, "rewards/chosen": -0.002043228130787611, "rewards/margins": 0.003981282003223896, "rewards/rejected": -0.006024509202688932, "step": 250 }, { "debug/policy_chosen_logits": 1.3690376281738281, "debug/policy_chosen_logps": -235.6128387451172, "debug/policy_rejected_logits": 1.5393130779266357, "debug/policy_rejected_logps": -271.02557373046875, "debug/reference_chosen_logps": -235.9077606201172, "debug/reference_rejected_logps": -269.55755615234375, "debug/sppo_chosen_loss": 2474.778076171875, "debug/sppo_chosen_reward_in_loss": 0.29494571685791016, "debug/sppo_rej_reward_in_loss": -1.468017339706421, "debug/sppo_reject_loss": 2364.58349609375, "epoch": 0.9239130434782609, "grad_norm": 67012.64835888152, "learning_rate": 9.26470588235294e-08, "logits/chosen": 1.3690376281738281, "logits/rejected": 1.5393130779266357, "logps/chosen": -235.6128387451172, "logps/rejected": -271.02557373046875, "loss": 4864.6945, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.00294945714995265, "rewards/margins": 0.017629629001021385, "rewards/rejected": -0.014680172316730022, "step": 255 }, { "debug/policy_chosen_logits": 1.2896627187728882, "debug/policy_chosen_logps": -252.8654327392578, "debug/policy_rejected_logits": 1.8506171703338623, "debug/policy_rejected_logps": -307.50738525390625, "debug/reference_chosen_logps": -253.3511199951172, "debug/reference_rejected_logps": -305.5581359863281, "debug/sppo_chosen_loss": 2454.99755859375, "debug/sppo_chosen_reward_in_loss": 0.4856864809989929, "debug/sppo_rej_reward_in_loss": -1.9492313861846924, "debug/sppo_reject_loss": 2318.29833984375, "epoch": 0.9420289855072463, "grad_norm": 87514.05648018401, "learning_rate": 9.240986717267551e-08, "logits/chosen": 1.2896627187728882, "logits/rejected": 1.8506171703338623, "logps/chosen": -252.8654327392578, "logps/rejected": -307.50738525390625, "loss": 4825.3953, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.004856864456087351, "rewards/margins": 0.02434917725622654, "rewards/rejected": -0.019492313265800476, "step": 260 }, { "debug/policy_chosen_logits": 1.499426007270813, "debug/policy_chosen_logps": -304.4361267089844, "debug/policy_rejected_logits": 1.6462829113006592, "debug/policy_rejected_logps": -293.2527770996094, "debug/reference_chosen_logps": -304.60321044921875, "debug/reference_rejected_logps": -291.7633056640625, "debug/sppo_chosen_loss": 2487.141845703125, "debug/sppo_chosen_reward_in_loss": 0.16708603501319885, "debug/sppo_rej_reward_in_loss": -1.4894893169403076, "debug/sppo_reject_loss": 2362.33544921875, "epoch": 0.9601449275362319, "grad_norm": 57539.219252727984, "learning_rate": 9.217267552182164e-08, "logits/chosen": 1.499426007270813, "logits/rejected": 1.6462829113006592, "logps/chosen": -304.4361267089844, "logps/rejected": -293.2527770996094, "loss": 4845.3477, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0016708601033315063, "rewards/margins": 0.016565751284360886, "rewards/rejected": -0.01489489246159792, "step": 265 }, { "debug/policy_chosen_logits": 1.454056978225708, "debug/policy_chosen_logps": -268.58087158203125, "debug/policy_rejected_logits": 1.9709457159042358, "debug/policy_rejected_logps": -277.7257385253906, "debug/reference_chosen_logps": -267.8411560058594, "debug/reference_rejected_logps": -275.6684875488281, "debug/sppo_chosen_loss": 2581.456298828125, "debug/sppo_chosen_reward_in_loss": -0.7396841049194336, "debug/sppo_rej_reward_in_loss": -2.057248830795288, "debug/sppo_reject_loss": 2311.698486328125, "epoch": 0.9782608695652174, "grad_norm": 62921.546115813035, "learning_rate": 9.193548387096773e-08, "logits/chosen": 1.454056978225708, "logits/rejected": 1.9709457159042358, "logps/chosen": -268.58087158203125, "logps/rejected": -277.7257385253906, "loss": 4868.1102, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0073968409560620785, "rewards/margins": 0.013175646774470806, "rewards/rejected": -0.020572489127516747, "step": 270 }, { "debug/policy_chosen_logits": 1.0943603515625, "debug/policy_chosen_logps": -257.5702819824219, "debug/policy_rejected_logits": 1.3593322038650513, "debug/policy_rejected_logps": -256.6277160644531, "debug/reference_chosen_logps": -257.44097900390625, "debug/reference_rejected_logps": -254.19161987304688, "debug/sppo_chosen_loss": 2516.85693359375, "debug/sppo_chosen_reward_in_loss": -0.129298597574234, "debug/sppo_rej_reward_in_loss": -2.4360697269439697, "debug/sppo_reject_loss": 2271.108642578125, "epoch": 0.9963768115942029, "grad_norm": 65985.17028025891, "learning_rate": 9.169829222011385e-08, "logits/chosen": 1.0943603515625, "logits/rejected": 1.3593322038650513, "logps/chosen": -257.5702819824219, "logps/rejected": -256.6277160644531, "loss": 4832.943, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0012929860968142748, "rewards/margins": 0.023067709058523178, "rewards/rejected": -0.024360693991184235, "step": 275 }, { "debug/policy_chosen_logits": 1.3448100090026855, "debug/policy_chosen_logps": -261.7139892578125, "debug/policy_rejected_logits": 1.799532175064087, "debug/policy_rejected_logps": -297.2242431640625, "debug/reference_chosen_logps": -260.2370300292969, "debug/reference_rejected_logps": -293.6090393066406, "debug/sppo_chosen_loss": 2657.1240234375, "debug/sppo_chosen_reward_in_loss": -1.4769624471664429, "debug/sppo_rej_reward_in_loss": -3.6152305603027344, "debug/sppo_reject_loss": 2166.08203125, "epoch": 1.0144927536231885, "grad_norm": 67112.90954910344, "learning_rate": 9.146110056925995e-08, "logits/chosen": 1.3448100090026855, "logits/rejected": 1.799532175064087, "logps/chosen": -261.7139892578125, "logps/rejected": -297.2242431640625, "loss": 4841.2109, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01476962398737669, "rewards/margins": 0.021382678300142288, "rewards/rejected": -0.03615230321884155, "step": 280 }, { "debug/policy_chosen_logits": 1.2284828424453735, "debug/policy_chosen_logps": -259.97393798828125, "debug/policy_rejected_logits": 1.4659395217895508, "debug/policy_rejected_logps": -285.306396484375, "debug/reference_chosen_logps": -259.2257385253906, "debug/reference_rejected_logps": -282.3316955566406, "debug/sppo_chosen_loss": 2581.87353515625, "debug/sppo_chosen_reward_in_loss": -0.7482067346572876, "debug/sppo_rej_reward_in_loss": -2.974637508392334, "debug/sppo_reject_loss": 2224.20751953125, "epoch": 1.0326086956521738, "grad_norm": 68280.43853432951, "learning_rate": 9.122390891840607e-08, "logits/chosen": 1.2284828424453735, "logits/rejected": 1.4659395217895508, "logps/chosen": -259.97393798828125, "logps/rejected": -285.306396484375, "loss": 4821.0039, "rewards/accuracies": 0.75, "rewards/chosen": -0.007482066750526428, "rewards/margins": 0.022264307364821434, "rewards/rejected": -0.02974637784063816, "step": 285 }, { "debug/policy_chosen_logits": 1.5147180557250977, "debug/policy_chosen_logps": -274.5523986816406, "debug/policy_rejected_logits": 1.640385627746582, "debug/policy_rejected_logps": -274.20904541015625, "debug/reference_chosen_logps": -274.32757568359375, "debug/reference_rejected_logps": -271.07623291015625, "debug/sppo_chosen_loss": 2526.9208984375, "debug/sppo_chosen_reward_in_loss": -0.22480659186840057, "debug/sppo_rej_reward_in_loss": -3.132810115814209, "debug/sppo_reject_loss": 2209.643310546875, "epoch": 1.0507246376811594, "grad_norm": 79336.72299299274, "learning_rate": 9.098671726755218e-08, "logits/chosen": 1.5147180557250977, "logits/rejected": 1.640385627746582, "logps/chosen": -274.5523986816406, "logps/rejected": -274.20904541015625, "loss": 4840.0047, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0022480657789856195, "rewards/margins": 0.029080037027597427, "rewards/rejected": -0.031328100711107254, "step": 290 }, { "debug/policy_chosen_logits": 1.2543970346450806, "debug/policy_chosen_logps": -246.2348175048828, "debug/policy_rejected_logits": 1.6862668991088867, "debug/policy_rejected_logps": -314.8777770996094, "debug/reference_chosen_logps": -245.95834350585938, "debug/reference_rejected_logps": -312.770751953125, "debug/sppo_chosen_loss": 2532.136474609375, "debug/sppo_chosen_reward_in_loss": -0.2764921188354492, "debug/sppo_rej_reward_in_loss": -2.107025146484375, "debug/sppo_reject_loss": 2303.393310546875, "epoch": 1.068840579710145, "grad_norm": 58487.104903453896, "learning_rate": 9.074952561669828e-08, "logits/chosen": 1.2543970346450806, "logits/rejected": 1.6862668991088867, "logps/chosen": -246.2348175048828, "logps/rejected": -314.8777770996094, "loss": 4845.9082, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0027649211697280407, "rewards/margins": 0.018305329605937004, "rewards/rejected": -0.021070251241326332, "step": 295 }, { "debug/policy_chosen_logits": 1.0411510467529297, "debug/policy_chosen_logps": -253.15823364257812, "debug/policy_rejected_logits": 1.3232452869415283, "debug/policy_rejected_logps": -298.0408630371094, "debug/reference_chosen_logps": -251.8573455810547, "debug/reference_rejected_logps": -295.08416748046875, "debug/sppo_chosen_loss": 2639.18310546875, "debug/sppo_chosen_reward_in_loss": -1.300882339477539, "debug/sppo_rej_reward_in_loss": -2.9567086696624756, "debug/sppo_reject_loss": 2226.27587890625, "epoch": 1.0869565217391304, "grad_norm": 59150.33220000485, "learning_rate": 9.05123339658444e-08, "logits/chosen": 1.0411510467529297, "logits/rejected": 1.3232452869415283, "logps/chosen": -253.15823364257812, "logps/rejected": -298.0408630371094, "loss": 4841.2859, "rewards/accuracies": 0.625, "rewards/chosen": -0.013008822686970234, "rewards/margins": 0.016558263450860977, "rewards/rejected": -0.029567087069153786, "step": 300 }, { "epoch": 1.0869565217391304, "eval_debug/policy_chosen_logits": 1.5925791263580322, "eval_debug/policy_chosen_logps": -253.87155151367188, "eval_debug/policy_rejected_logits": 1.6499637365341187, "eval_debug/policy_rejected_logps": -261.5046691894531, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2603.546142578125, "eval_debug/sppo_chosen_reward_in_loss": -0.9530732035636902, "eval_debug/sppo_rej_reward_in_loss": -1.8460270166397095, "eval_debug/sppo_reject_loss": 2331.751953125, "eval_logits/chosen": 1.5925791263580322, "eval_logits/rejected": 1.6499637365341187, "eval_logps/chosen": -253.87155151367188, "eval_logps/rejected": -261.5046691894531, "eval_loss": 4866.08642578125, "eval_rewards/accuracies": 0.5394737124443054, "eval_rewards/chosen": -0.009530730545520782, "eval_rewards/margins": 0.0089295394718647, "eval_rewards/rejected": -0.018460270017385483, "eval_runtime": 28.4234, "eval_samples_per_second": 21.109, "eval_steps_per_second": 0.668, "step": 300 }, { "debug/policy_chosen_logits": 1.676957130432129, "debug/policy_chosen_logps": -274.81304931640625, "debug/policy_rejected_logits": 1.871455430984497, "debug/policy_rejected_logps": -293.260009765625, "debug/reference_chosen_logps": -273.9171142578125, "debug/reference_rejected_logps": -290.7630920410156, "debug/sppo_chosen_loss": 2594.29736328125, "debug/sppo_chosen_reward_in_loss": -0.8959203958511353, "debug/sppo_rej_reward_in_loss": -2.4969019889831543, "debug/sppo_reject_loss": 2271.0869140625, "epoch": 1.105072463768116, "grad_norm": 67936.39119331828, "learning_rate": 9.02751423149905e-08, "logits/chosen": 1.676957130432129, "logits/rejected": 1.871455430984497, "logps/chosen": -274.81304931640625, "logps/rejected": -293.260009765625, "loss": 4872.5703, "rewards/accuracies": 0.625, "rewards/chosen": -0.008959203958511353, "rewards/margins": 0.016009816899895668, "rewards/rejected": -0.02496902085840702, "step": 305 }, { "debug/policy_chosen_logits": 1.385887622833252, "debug/policy_chosen_logps": -289.3854064941406, "debug/policy_rejected_logits": 1.636156678199768, "debug/policy_rejected_logps": -310.3551025390625, "debug/reference_chosen_logps": -288.9293212890625, "debug/reference_rejected_logps": -308.27606201171875, "debug/sppo_chosen_loss": 2548.999267578125, "debug/sppo_chosen_reward_in_loss": -0.45608481764793396, "debug/sppo_rej_reward_in_loss": -2.0790188312530518, "debug/sppo_reject_loss": 2303.059814453125, "epoch": 1.1231884057971016, "grad_norm": 60225.99966281642, "learning_rate": 9.003795066413662e-08, "logits/chosen": 1.385887622833252, "logits/rejected": 1.636156678199768, "logps/chosen": -289.3854064941406, "logps/rejected": -310.3551025390625, "loss": 4828.8195, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.004560848232358694, "rewards/margins": 0.016229338943958282, "rewards/rejected": -0.020790185779333115, "step": 310 }, { "debug/policy_chosen_logits": 1.0067355632781982, "debug/policy_chosen_logps": -228.10665893554688, "debug/policy_rejected_logits": 1.3971359729766846, "debug/policy_rejected_logps": -286.5841064453125, "debug/reference_chosen_logps": -227.9397430419922, "debug/reference_rejected_logps": -283.32073974609375, "debug/sppo_chosen_loss": 2522.895263671875, "debug/sppo_chosen_reward_in_loss": -0.1669052094221115, "debug/sppo_rej_reward_in_loss": -3.263331651687622, "debug/sppo_reject_loss": 2199.813232421875, "epoch": 1.141304347826087, "grad_norm": 66050.06938057746, "learning_rate": 8.980075901328272e-08, "logits/chosen": 1.0067355632781982, "logits/rejected": 1.3971359729766846, "logps/chosen": -228.10665893554688, "logps/rejected": -286.5841064453125, "loss": 4823.1551, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.0016690522897988558, "rewards/margins": 0.030964260920882225, "rewards/rejected": -0.03263331577181816, "step": 315 }, { "debug/policy_chosen_logits": 1.4893410205841064, "debug/policy_chosen_logps": -250.9482879638672, "debug/policy_rejected_logits": 1.7037372589111328, "debug/policy_rejected_logps": -303.536376953125, "debug/reference_chosen_logps": -250.67617797851562, "debug/reference_rejected_logps": -301.82122802734375, "debug/sppo_chosen_loss": 2533.998779296875, "debug/sppo_chosen_reward_in_loss": -0.2721099853515625, "debug/sppo_rej_reward_in_loss": -1.715135931968689, "debug/sppo_reject_loss": 2340.74755859375, "epoch": 1.1594202898550725, "grad_norm": 60007.491746256324, "learning_rate": 8.956356736242884e-08, "logits/chosen": 1.4893410205841064, "logits/rejected": 1.7037372589111328, "logps/chosen": -250.9482879638672, "logps/rejected": -303.536376953125, "loss": 4834.3234, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0027210998814553022, "rewards/margins": 0.014430259354412556, "rewards/rejected": -0.0171513594686985, "step": 320 }, { "debug/policy_chosen_logits": 0.9663621187210083, "debug/policy_chosen_logps": -235.44802856445312, "debug/policy_rejected_logits": 1.2461779117584229, "debug/policy_rejected_logps": -293.0070495605469, "debug/reference_chosen_logps": -239.85531616210938, "debug/reference_rejected_logps": -296.14971923828125, "debug/sppo_chosen_loss": 2119.17578125, "debug/sppo_chosen_reward_in_loss": 4.407319068908691, "debug/sppo_rej_reward_in_loss": 3.142679214477539, "debug/sppo_reject_loss": 2876.73046875, "epoch": 1.177536231884058, "grad_norm": 83467.76517961797, "learning_rate": 8.932637571157495e-08, "logits/chosen": 0.9663621187210083, "logits/rejected": 1.2461779117584229, "logps/chosen": -235.44802856445312, "logps/rejected": -293.0070495605469, "loss": 4921.284, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.044073186814785004, "rewards/margins": 0.012646397575736046, "rewards/rejected": 0.031426794826984406, "step": 325 }, { "debug/policy_chosen_logits": 0.8228054046630859, "debug/policy_chosen_logps": -235.9485626220703, "debug/policy_rejected_logits": 1.2900307178497314, "debug/policy_rejected_logps": -289.92755126953125, "debug/reference_chosen_logps": -240.9998321533203, "debug/reference_rejected_logps": -293.64276123046875, "debug/sppo_chosen_loss": 2093.779296875, "debug/sppo_chosen_reward_in_loss": 5.051271915435791, "debug/sppo_rej_reward_in_loss": 3.7152011394500732, "debug/sppo_reject_loss": 2924.75927734375, "epoch": 1.1956521739130435, "grad_norm": 116522.53161008158, "learning_rate": 8.908918406072106e-08, "logits/chosen": 0.8228054046630859, "logits/rejected": 1.2900307178497314, "logps/chosen": -235.9485626220703, "logps/rejected": -289.92755126953125, "loss": 5036.2301, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.05051271244883537, "rewards/margins": 0.01336070615798235, "rewards/rejected": 0.037152014672756195, "step": 330 }, { "debug/policy_chosen_logits": 1.2111567258834839, "debug/policy_chosen_logps": -248.19058227539062, "debug/policy_rejected_logits": 1.3974757194519043, "debug/policy_rejected_logps": -283.38409423828125, "debug/reference_chosen_logps": -247.591064453125, "debug/reference_rejected_logps": -282.10699462890625, "debug/sppo_chosen_loss": 2599.230224609375, "debug/sppo_chosen_reward_in_loss": -0.5995389819145203, "debug/sppo_rej_reward_in_loss": -1.2771251201629639, "debug/sppo_reject_loss": 2396.506591796875, "epoch": 1.213768115942029, "grad_norm": 66068.79268195969, "learning_rate": 8.885199240986718e-08, "logits/chosen": 1.2111567258834839, "logits/rejected": 1.3974757194519043, "logps/chosen": -248.19058227539062, "logps/rejected": -283.38409423828125, "loss": 4755.3961, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0059953900054097176, "rewards/margins": 0.006775862071663141, "rewards/rejected": -0.012771251611411572, "step": 335 }, { "debug/policy_chosen_logits": 1.5844733715057373, "debug/policy_chosen_logps": -252.624755859375, "debug/policy_rejected_logits": 1.750051498413086, "debug/policy_rejected_logps": -273.21343994140625, "debug/reference_chosen_logps": -252.81381225585938, "debug/reference_rejected_logps": -270.9890441894531, "debug/sppo_chosen_loss": 2487.39501953125, "debug/sppo_chosen_reward_in_loss": 0.18907126784324646, "debug/sppo_rej_reward_in_loss": -2.224409580230713, "debug/sppo_reject_loss": 2306.11279296875, "epoch": 1.2318840579710144, "grad_norm": 70240.39147815418, "learning_rate": 8.861480075901327e-08, "logits/chosen": 1.5844733715057373, "logits/rejected": 1.750051498413086, "logps/chosen": -252.624755859375, "logps/rejected": -273.21343994140625, "loss": 4810.2625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0018907127669081092, "rewards/margins": 0.024134809151291847, "rewards/rejected": -0.02224409580230713, "step": 340 }, { "debug/policy_chosen_logits": 1.237799882888794, "debug/policy_chosen_logps": -269.93609619140625, "debug/policy_rejected_logits": 1.455768346786499, "debug/policy_rejected_logps": -274.3055725097656, "debug/reference_chosen_logps": -269.8506774902344, "debug/reference_rejected_logps": -272.15484619140625, "debug/sppo_chosen_loss": 2517.87939453125, "debug/sppo_chosen_reward_in_loss": -0.08542633056640625, "debug/sppo_rej_reward_in_loss": -2.1507396697998047, "debug/sppo_reject_loss": 2308.344482421875, "epoch": 1.25, "grad_norm": 59892.172559735984, "learning_rate": 8.83776091081594e-08, "logits/chosen": 1.237799882888794, "logits/rejected": 1.455768346786499, "logps/chosen": -269.93609619140625, "logps/rejected": -274.3055725097656, "loss": 4810.1699, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0008542636642232537, "rewards/margins": 0.0206531323492527, "rewards/rejected": -0.021507395431399345, "step": 345 }, { "debug/policy_chosen_logits": 1.025648832321167, "debug/policy_chosen_logps": -243.3861846923828, "debug/policy_rejected_logits": 1.387263298034668, "debug/policy_rejected_logps": -290.004150390625, "debug/reference_chosen_logps": -243.61593627929688, "debug/reference_rejected_logps": -288.1432189941406, "debug/sppo_chosen_loss": 2482.4091796875, "debug/sppo_chosen_reward_in_loss": 0.22976569831371307, "debug/sppo_rej_reward_in_loss": -1.8609302043914795, "debug/sppo_reject_loss": 2337.25732421875, "epoch": 1.2681159420289856, "grad_norm": 94678.24831742006, "learning_rate": 8.814041745730549e-08, "logits/chosen": 1.025648832321167, "logits/rejected": 1.387263298034668, "logps/chosen": -243.3861846923828, "logps/rejected": -290.004150390625, "loss": 4830.8406, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.002297657076269388, "rewards/margins": 0.020906958729028702, "rewards/rejected": -0.01860930025577545, "step": 350 }, { "debug/policy_chosen_logits": 1.498515009880066, "debug/policy_chosen_logps": -260.25372314453125, "debug/policy_rejected_logits": 1.6761505603790283, "debug/policy_rejected_logps": -296.8304138183594, "debug/reference_chosen_logps": -260.3787841796875, "debug/reference_rejected_logps": -293.5425109863281, "debug/sppo_chosen_loss": 2492.62060546875, "debug/sppo_chosen_reward_in_loss": 0.12508754432201385, "debug/sppo_rej_reward_in_loss": -3.2878963947296143, "debug/sppo_reject_loss": 2213.01025390625, "epoch": 1.286231884057971, "grad_norm": 59432.02810994338, "learning_rate": 8.790322580645161e-08, "logits/chosen": 1.498515009880066, "logits/rejected": 1.6761505603790283, "logps/chosen": -260.25372314453125, "logps/rejected": -296.8304138183594, "loss": 4806.2641, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0012508754152804613, "rewards/margins": 0.03412983566522598, "rewards/rejected": -0.03287896513938904, "step": 355 }, { "debug/policy_chosen_logits": 0.8536568880081177, "debug/policy_chosen_logps": -229.35403442382812, "debug/policy_rejected_logits": 1.1173298358917236, "debug/policy_rejected_logps": -260.71868896484375, "debug/reference_chosen_logps": -229.55331420898438, "debug/reference_rejected_logps": -256.6519470214844, "debug/sppo_chosen_loss": 2490.58154296875, "debug/sppo_chosen_reward_in_loss": 0.1992940902709961, "debug/sppo_rej_reward_in_loss": -4.0667619705200195, "debug/sppo_reject_loss": 2131.237548828125, "epoch": 1.3043478260869565, "grad_norm": 59957.90901353401, "learning_rate": 8.766603415559772e-08, "logits/chosen": 0.8536568880081177, "logits/rejected": 1.1173298358917236, "logps/chosen": -229.35403442382812, "logps/rejected": -260.71868896484375, "loss": 4774.7418, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.001992940902709961, "rewards/margins": 0.042660560458898544, "rewards/rejected": -0.04066761955618858, "step": 360 }, { "debug/policy_chosen_logits": 1.294481635093689, "debug/policy_chosen_logps": -230.01528930664062, "debug/policy_rejected_logits": 1.8574059009552002, "debug/policy_rejected_logps": -298.5569763183594, "debug/reference_chosen_logps": -229.5476531982422, "debug/reference_rejected_logps": -296.23944091796875, "debug/sppo_chosen_loss": 2556.991455078125, "debug/sppo_chosen_reward_in_loss": -0.4675941467285156, "debug/sppo_rej_reward_in_loss": -2.3175415992736816, "debug/sppo_reject_loss": 2288.79833984375, "epoch": 1.322463768115942, "grad_norm": 65423.33628598978, "learning_rate": 8.742884250474383e-08, "logits/chosen": 1.294481635093689, "logits/rejected": 1.8574059009552002, "logps/chosen": -230.01528930664062, "logps/rejected": -298.5569763183594, "loss": 4846.5555, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.004675941076129675, "rewards/margins": 0.018499473109841347, "rewards/rejected": -0.02317541465163231, "step": 365 }, { "debug/policy_chosen_logits": 1.3842321634292603, "debug/policy_chosen_logps": -261.2808532714844, "debug/policy_rejected_logits": 1.626103162765503, "debug/policy_rejected_logps": -302.74468994140625, "debug/reference_chosen_logps": -260.66339111328125, "debug/reference_rejected_logps": -298.43231201171875, "debug/sppo_chosen_loss": 2570.486083984375, "debug/sppo_chosen_reward_in_loss": -0.6174880862236023, "debug/sppo_rej_reward_in_loss": -4.312387466430664, "debug/sppo_reject_loss": 2119.960205078125, "epoch": 1.3405797101449275, "grad_norm": 60727.91021750688, "learning_rate": 8.719165085388994e-08, "logits/chosen": 1.3842321634292603, "logits/rejected": 1.626103162765503, "logps/chosen": -261.2808532714844, "logps/rejected": -302.74468994140625, "loss": 4776.4918, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006174880545586348, "rewards/margins": 0.03694899380207062, "rewards/rejected": -0.04312387481331825, "step": 370 }, { "debug/policy_chosen_logits": 1.1844645738601685, "debug/policy_chosen_logps": -255.6688995361328, "debug/policy_rejected_logits": 1.4649739265441895, "debug/policy_rejected_logps": -263.22125244140625, "debug/reference_chosen_logps": -254.73141479492188, "debug/reference_rejected_logps": -260.73736572265625, "debug/sppo_chosen_loss": 2600.70068359375, "debug/sppo_chosen_reward_in_loss": -0.937482476234436, "debug/sppo_rej_reward_in_loss": -2.4838500022888184, "debug/sppo_reject_loss": 2272.923583984375, "epoch": 1.358695652173913, "grad_norm": 67056.46670846178, "learning_rate": 8.695445920303604e-08, "logits/chosen": 1.1844645738601685, "logits/rejected": 1.4649739265441895, "logps/chosen": -255.6688995361328, "logps/rejected": -263.22125244140625, "loss": 4855.8852, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009374824352562428, "rewards/margins": 0.015463674440979958, "rewards/rejected": -0.02483849972486496, "step": 375 }, { "debug/policy_chosen_logits": 1.1452186107635498, "debug/policy_chosen_logps": -235.8422088623047, "debug/policy_rejected_logits": 1.4934955835342407, "debug/policy_rejected_logps": -260.0475158691406, "debug/reference_chosen_logps": -235.4017333984375, "debug/reference_rejected_logps": -256.94818115234375, "debug/sppo_chosen_loss": 2548.05517578125, "debug/sppo_chosen_reward_in_loss": -0.44045963883399963, "debug/sppo_rej_reward_in_loss": -3.099336624145508, "debug/sppo_reject_loss": 2221.05517578125, "epoch": 1.3768115942028984, "grad_norm": 74208.24811451357, "learning_rate": 8.671726755218217e-08, "logits/chosen": 1.1452186107635498, "logits/rejected": 1.4934955835342407, "logps/chosen": -235.8422088623047, "logps/rejected": -260.0475158691406, "loss": 4766.4734, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0044045960530638695, "rewards/margins": 0.026588771492242813, "rewards/rejected": -0.030993366613984108, "step": 380 }, { "debug/policy_chosen_logits": 1.5168814659118652, "debug/policy_chosen_logps": -264.10906982421875, "debug/policy_rejected_logits": 1.6211068630218506, "debug/policy_rejected_logps": -294.58306884765625, "debug/reference_chosen_logps": -262.984375, "debug/reference_rejected_logps": -292.22747802734375, "debug/sppo_chosen_loss": 2623.75537109375, "debug/sppo_chosen_reward_in_loss": -1.12469482421875, "debug/sppo_rej_reward_in_loss": -2.3556056022644043, "debug/sppo_reject_loss": 2286.559326171875, "epoch": 1.394927536231884, "grad_norm": 62559.73726564714, "learning_rate": 8.648007590132826e-08, "logits/chosen": 1.5168814659118652, "logits/rejected": 1.6211068630218506, "logps/chosen": -264.10906982421875, "logps/rejected": -294.58306884765625, "loss": 4816.043, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.011246947571635246, "rewards/margins": 0.012309107929468155, "rewards/rejected": -0.0235560555011034, "step": 385 }, { "debug/policy_chosen_logits": 1.5912892818450928, "debug/policy_chosen_logps": -296.7356262207031, "debug/policy_rejected_logits": 1.3841075897216797, "debug/policy_rejected_logps": -283.91851806640625, "debug/reference_chosen_logps": -296.08233642578125, "debug/reference_rejected_logps": -280.37335205078125, "debug/sppo_chosen_loss": 2574.815185546875, "debug/sppo_chosen_reward_in_loss": -0.6533054113388062, "debug/sppo_rej_reward_in_loss": -3.545187473297119, "debug/sppo_reject_loss": 2180.56982421875, "epoch": 1.4130434782608696, "grad_norm": 69130.9325794301, "learning_rate": 8.624288425047438e-08, "logits/chosen": 1.5912892818450928, "logits/rejected": 1.3841075897216797, "logps/chosen": -296.7356262207031, "logps/rejected": -283.91851806640625, "loss": 4745.0805, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.006533053703606129, "rewards/margins": 0.02891881763935089, "rewards/rejected": -0.035451870411634445, "step": 390 }, { "debug/policy_chosen_logits": 0.9788148999214172, "debug/policy_chosen_logps": -274.5755310058594, "debug/policy_rejected_logits": 1.2344077825546265, "debug/policy_rejected_logps": -305.6072692871094, "debug/reference_chosen_logps": -273.4296875, "debug/reference_rejected_logps": -303.46014404296875, "debug/sppo_chosen_loss": 2628.5703125, "debug/sppo_chosen_reward_in_loss": -1.1458460092544556, "debug/sppo_rej_reward_in_loss": -2.147127389907837, "debug/sppo_reject_loss": 2307.91162109375, "epoch": 1.431159420289855, "grad_norm": 79195.9127279263, "learning_rate": 8.600569259962049e-08, "logits/chosen": 0.9788148999214172, "logits/rejected": 1.2344077825546265, "logps/chosen": -274.5755310058594, "logps/rejected": -305.6072692871094, "loss": 4804.4148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.011458461172878742, "rewards/margins": 0.010012813843786716, "rewards/rejected": -0.02147127129137516, "step": 395 }, { "debug/policy_chosen_logits": 1.2316147089004517, "debug/policy_chosen_logps": -283.9236755371094, "debug/policy_rejected_logits": 1.3582853078842163, "debug/policy_rejected_logps": -315.0761413574219, "debug/reference_chosen_logps": -283.3064270019531, "debug/reference_rejected_logps": -312.33880615234375, "debug/sppo_chosen_loss": 2571.54736328125, "debug/sppo_chosen_reward_in_loss": -0.6172817349433899, "debug/sppo_rej_reward_in_loss": -2.7373268604278564, "debug/sppo_reject_loss": 2251.958251953125, "epoch": 1.4492753623188406, "grad_norm": 57382.841652196665, "learning_rate": 8.57685009487666e-08, "logits/chosen": 1.2316147089004517, "logits/rejected": 1.3582853078842163, "logps/chosen": -283.9236755371094, "logps/rejected": -315.0761413574219, "loss": 4822.266, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.006172817200422287, "rewards/margins": 0.021200451999902725, "rewards/rejected": -0.027373269200325012, "step": 400 }, { "epoch": 1.4492753623188406, "eval_debug/policy_chosen_logits": 1.5602922439575195, "eval_debug/policy_chosen_logps": -254.64974975585938, "eval_debug/policy_rejected_logits": 1.616196632385254, "eval_debug/policy_rejected_logps": -262.6080322265625, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2692.540771484375, "eval_debug/sppo_chosen_reward_in_loss": -1.7312862873077393, "eval_debug/sppo_rej_reward_in_loss": -2.949385643005371, "eval_debug/sppo_reject_loss": 2243.4091796875, "eval_logits/chosen": 1.5602922439575195, "eval_logits/rejected": 1.616196632385254, "eval_logps/chosen": -254.64974975585938, "eval_logps/rejected": -262.6080322265625, "eval_loss": 4827.97607421875, "eval_rewards/accuracies": 0.5394737124443054, "eval_rewards/chosen": -0.01731286197900772, "eval_rewards/margins": 0.012180991470813751, "eval_rewards/rejected": -0.029493853449821472, "eval_runtime": 29.1239, "eval_samples_per_second": 20.602, "eval_steps_per_second": 0.652, "step": 400 }, { "debug/policy_chosen_logits": 1.3524808883666992, "debug/policy_chosen_logps": -249.79495239257812, "debug/policy_rejected_logits": 1.8515160083770752, "debug/policy_rejected_logps": -278.3388366699219, "debug/reference_chosen_logps": -250.1258544921875, "debug/reference_rejected_logps": -276.1705627441406, "debug/sppo_chosen_loss": 2470.27587890625, "debug/sppo_chosen_reward_in_loss": 0.3309038281440735, "debug/sppo_rej_reward_in_loss": -2.1682965755462646, "debug/sppo_reject_loss": 2303.71728515625, "epoch": 1.4673913043478262, "grad_norm": 68740.79188760545, "learning_rate": 8.553130929791271e-08, "logits/chosen": 1.3524808883666992, "logits/rejected": 1.8515160083770752, "logps/chosen": -249.79495239257812, "logps/rejected": -278.3388366699219, "loss": 4746.2109, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.003309038234874606, "rewards/margins": 0.02499200403690338, "rewards/rejected": -0.021682966500520706, "step": 405 }, { "debug/policy_chosen_logits": 1.38738214969635, "debug/policy_chosen_logps": -244.7665557861328, "debug/policy_rejected_logits": 1.6158170700073242, "debug/policy_rejected_logps": -292.2993469238281, "debug/reference_chosen_logps": -244.8103485107422, "debug/reference_rejected_logps": -288.8402404785156, "debug/sppo_chosen_loss": 2498.940185546875, "debug/sppo_chosen_reward_in_loss": 0.04381828382611275, "debug/sppo_rej_reward_in_loss": -3.459094524383545, "debug/sppo_reject_loss": 2181.8310546875, "epoch": 1.4855072463768115, "grad_norm": 65059.93380067381, "learning_rate": 8.529411764705881e-08, "logits/chosen": 1.38738214969635, "logits/rejected": 1.6158170700073242, "logps/chosen": -244.7665557861328, "logps/rejected": -292.2993469238281, "loss": 4786.5965, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0004381827893666923, "rewards/margins": 0.03502912446856499, "rewards/rejected": -0.034590944647789, "step": 410 }, { "debug/policy_chosen_logits": 0.9360378980636597, "debug/policy_chosen_logps": -243.22348022460938, "debug/policy_rejected_logits": 1.3336433172225952, "debug/policy_rejected_logps": -292.2352600097656, "debug/reference_chosen_logps": -243.03857421875, "debug/reference_rejected_logps": -289.7604064941406, "debug/sppo_chosen_loss": 2535.86376953125, "debug/sppo_chosen_reward_in_loss": -0.18492908775806427, "debug/sppo_rej_reward_in_loss": -2.474832057952881, "debug/sppo_reject_loss": 2279.22998046875, "epoch": 1.5036231884057971, "grad_norm": 86383.3999472384, "learning_rate": 8.505692599620494e-08, "logits/chosen": 0.9360378980636597, "logits/rejected": 1.3336433172225952, "logps/chosen": -243.22348022460938, "logps/rejected": -292.2352600097656, "loss": 4772.184, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0018492921954020858, "rewards/margins": 0.022899029776453972, "rewards/rejected": -0.024748321622610092, "step": 415 }, { "debug/policy_chosen_logits": 1.5796959400177002, "debug/policy_chosen_logps": -256.4622497558594, "debug/policy_rejected_logits": 2.1471669673919678, "debug/policy_rejected_logps": -306.14581298828125, "debug/reference_chosen_logps": -255.08334350585938, "debug/reference_rejected_logps": -302.2669372558594, "debug/sppo_chosen_loss": 2663.369873046875, "debug/sppo_chosen_reward_in_loss": -1.3789360523223877, "debug/sppo_rej_reward_in_loss": -3.8788833618164062, "debug/sppo_reject_loss": 2161.091064453125, "epoch": 1.5217391304347827, "grad_norm": 103555.38066318117, "learning_rate": 8.481973434535103e-08, "logits/chosen": 1.5796959400177002, "logits/rejected": 2.1471669673919678, "logps/chosen": -256.4622497558594, "logps/rejected": -306.14581298828125, "loss": 4849.4711, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.013789360411465168, "rewards/margins": 0.024999473243951797, "rewards/rejected": -0.03878883272409439, "step": 420 }, { "debug/policy_chosen_logits": 1.395512580871582, "debug/policy_chosen_logps": -255.1446075439453, "debug/policy_rejected_logits": 1.9408433437347412, "debug/policy_rejected_logps": -310.40679931640625, "debug/reference_chosen_logps": -254.27566528320312, "debug/reference_rejected_logps": -306.13055419921875, "debug/sppo_chosen_loss": 2605.943115234375, "debug/sppo_chosen_reward_in_loss": -0.8689578771591187, "debug/sppo_rej_reward_in_loss": -4.276240348815918, "debug/sppo_reject_loss": 2132.437744140625, "epoch": 1.539855072463768, "grad_norm": 94976.66848562153, "learning_rate": 8.458254269449715e-08, "logits/chosen": 1.395512580871582, "logits/rejected": 1.9408433437347412, "logps/chosen": -255.1446075439453, "logps/rejected": -310.40679931640625, "loss": 4717.8863, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.008689578622579575, "rewards/margins": 0.03407282009720802, "rewards/rejected": -0.0427623987197876, "step": 425 }, { "debug/policy_chosen_logits": 0.9035994410514832, "debug/policy_chosen_logps": -243.0633544921875, "debug/policy_rejected_logits": 1.2840200662612915, "debug/policy_rejected_logps": -295.56982421875, "debug/reference_chosen_logps": -243.094970703125, "debug/reference_rejected_logps": -292.0074462890625, "debug/sppo_chosen_loss": 2514.469482421875, "debug/sppo_chosen_reward_in_loss": 0.03161926195025444, "debug/sppo_rej_reward_in_loss": -3.5623831748962402, "debug/sppo_reject_loss": 2200.54833984375, "epoch": 1.5579710144927537, "grad_norm": 73425.9725077612, "learning_rate": 8.434535104364326e-08, "logits/chosen": 0.9035994410514832, "logits/rejected": 1.2840200662612915, "logps/chosen": -243.0633544921875, "logps/rejected": -295.56982421875, "loss": 4730.8953, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00031619257060810924, "rewards/margins": 0.0359400250017643, "rewards/rejected": -0.03562382981181145, "step": 430 }, { "debug/policy_chosen_logits": 1.3114192485809326, "debug/policy_chosen_logps": -236.48046875, "debug/policy_rejected_logits": 1.5032637119293213, "debug/policy_rejected_logps": -269.6251525878906, "debug/reference_chosen_logps": -235.29800415039062, "debug/reference_rejected_logps": -266.57958984375, "debug/sppo_chosen_loss": 2641.09228515625, "debug/sppo_chosen_reward_in_loss": -1.1824610233306885, "debug/sppo_rej_reward_in_loss": -3.045569896697998, "debug/sppo_reject_loss": 2232.681884765625, "epoch": 1.5760869565217392, "grad_norm": 80173.06237989027, "learning_rate": 8.410815939278937e-08, "logits/chosen": 1.3114192485809326, "logits/rejected": 1.5032637119293213, "logps/chosen": -236.48046875, "logps/rejected": -269.6251525878906, "loss": 4775.4113, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.011824609711766243, "rewards/margins": 0.018631089478731155, "rewards/rejected": -0.0304556991904974, "step": 435 }, { "debug/policy_chosen_logits": 1.307786226272583, "debug/policy_chosen_logps": -248.123291015625, "debug/policy_rejected_logits": 1.7721586227416992, "debug/policy_rejected_logps": -306.8035583496094, "debug/reference_chosen_logps": -246.77597045898438, "debug/reference_rejected_logps": -302.6429138183594, "debug/sppo_chosen_loss": 2649.71875, "debug/sppo_chosen_reward_in_loss": -1.3473155498504639, "debug/sppo_rej_reward_in_loss": -4.160645484924316, "debug/sppo_reject_loss": 2137.823486328125, "epoch": 1.5942028985507246, "grad_norm": 81922.45013113407, "learning_rate": 8.387096774193548e-08, "logits/chosen": 1.307786226272583, "logits/rejected": 1.7721586227416992, "logps/chosen": -248.123291015625, "logps/rejected": -306.8035583496094, "loss": 4801.7691, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.013473155908286572, "rewards/margins": 0.028133297339081764, "rewards/rejected": -0.04160645231604576, "step": 440 }, { "debug/policy_chosen_logits": 1.322139024734497, "debug/policy_chosen_logps": -262.4474792480469, "debug/policy_rejected_logits": 1.4987752437591553, "debug/policy_rejected_logps": -287.6851501464844, "debug/reference_chosen_logps": -260.79449462890625, "debug/reference_rejected_logps": -283.6168518066406, "debug/sppo_chosen_loss": 2697.68798828125, "debug/sppo_chosen_reward_in_loss": -1.653011679649353, "debug/sppo_rej_reward_in_loss": -4.068353176116943, "debug/sppo_reject_loss": 2149.60986328125, "epoch": 1.6123188405797102, "grad_norm": 67662.40188182965, "learning_rate": 8.363377609108159e-08, "logits/chosen": 1.322139024734497, "logits/rejected": 1.4987752437591553, "logps/chosen": -262.4474792480469, "logps/rejected": -287.6851501464844, "loss": 4734.0082, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.016530117020010948, "rewards/margins": 0.024153415113687515, "rewards/rejected": -0.040683530271053314, "step": 445 }, { "debug/policy_chosen_logits": 1.206435203552246, "debug/policy_chosen_logps": -262.6929626464844, "debug/policy_rejected_logits": 1.3074095249176025, "debug/policy_rejected_logps": -260.0146484375, "debug/reference_chosen_logps": -261.75518798828125, "debug/reference_rejected_logps": -257.6763916015625, "debug/sppo_chosen_loss": 2608.10009765625, "debug/sppo_chosen_reward_in_loss": -0.9377906918525696, "debug/sppo_rej_reward_in_loss": -2.3382537364959717, "debug/sppo_reject_loss": 2294.22265625, "epoch": 1.6304347826086958, "grad_norm": 60879.715605682446, "learning_rate": 8.339658444022771e-08, "logits/chosen": 1.206435203552246, "logits/rejected": 1.3074095249176025, "logps/chosen": -262.6929626464844, "logps/rejected": -260.0146484375, "loss": 4802.3266, "rewards/accuracies": 0.625, "rewards/chosen": -0.00937790609896183, "rewards/margins": 0.014004630967974663, "rewards/rejected": -0.023382537066936493, "step": 450 }, { "debug/policy_chosen_logits": 1.1662284135818481, "debug/policy_chosen_logps": -261.43780517578125, "debug/policy_rejected_logits": 1.441433310508728, "debug/policy_rejected_logps": -309.4091796875, "debug/reference_chosen_logps": -260.24456787109375, "debug/reference_rejected_logps": -304.6410827636719, "debug/sppo_chosen_loss": 2641.141357421875, "debug/sppo_chosen_reward_in_loss": -1.1932071447372437, "debug/sppo_rej_reward_in_loss": -4.768072128295898, "debug/sppo_reject_loss": 2096.729736328125, "epoch": 1.6485507246376812, "grad_norm": 63926.71558778056, "learning_rate": 8.31593927893738e-08, "logits/chosen": 1.1662284135818481, "logits/rejected": 1.441433310508728, "logps/chosen": -261.43780517578125, "logps/rejected": -309.4091796875, "loss": 4744.3574, "rewards/accuracies": 0.625, "rewards/chosen": -0.0119320722296834, "rewards/margins": 0.03574864938855171, "rewards/rejected": -0.04768072068691254, "step": 455 }, { "debug/policy_chosen_logits": 1.3488881587982178, "debug/policy_chosen_logps": -230.0709991455078, "debug/policy_rejected_logits": 1.55966055393219, "debug/policy_rejected_logps": -313.56268310546875, "debug/reference_chosen_logps": -229.99716186523438, "debug/reference_rejected_logps": -310.053955078125, "debug/sppo_chosen_loss": 2527.13623046875, "debug/sppo_chosen_reward_in_loss": -0.07382412254810333, "debug/sppo_rej_reward_in_loss": -3.508708953857422, "debug/sppo_reject_loss": 2203.64306640625, "epoch": 1.6666666666666665, "grad_norm": 67371.8390482823, "learning_rate": 8.292220113851992e-08, "logits/chosen": 1.3488881587982178, "logits/rejected": 1.55966055393219, "logps/chosen": -230.0709991455078, "logps/rejected": -313.56268310546875, "loss": 4756.7312, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0007382409530691803, "rewards/margins": 0.03434884548187256, "rewards/rejected": -0.035087086260318756, "step": 460 }, { "debug/policy_chosen_logits": 1.2849664688110352, "debug/policy_chosen_logps": -256.2344665527344, "debug/policy_rejected_logits": 1.563281536102295, "debug/policy_rejected_logps": -262.84503173828125, "debug/reference_chosen_logps": -254.5850372314453, "debug/reference_rejected_logps": -259.1615905761719, "debug/sppo_chosen_loss": 2687.16943359375, "debug/sppo_chosen_reward_in_loss": -1.6494476795196533, "debug/sppo_rej_reward_in_loss": -3.6834373474121094, "debug/sppo_reject_loss": 2171.74755859375, "epoch": 1.6847826086956523, "grad_norm": 62844.11782916527, "learning_rate": 8.268500948766603e-08, "logits/chosen": 1.2849664688110352, "logits/rejected": 1.563281536102295, "logps/chosen": -256.2344665527344, "logps/rejected": -262.84503173828125, "loss": 4777.1547, "rewards/accuracies": 0.625, "rewards/chosen": -0.016494475305080414, "rewards/margins": 0.02033989690244198, "rewards/rejected": -0.03683437407016754, "step": 465 }, { "debug/policy_chosen_logits": 1.2436730861663818, "debug/policy_chosen_logps": -234.88925170898438, "debug/policy_rejected_logits": 1.8374286890029907, "debug/policy_rejected_logps": -292.4685363769531, "debug/reference_chosen_logps": -234.3294677734375, "debug/reference_rejected_logps": -288.486083984375, "debug/sppo_chosen_loss": 2570.38134765625, "debug/sppo_chosen_reward_in_loss": -0.5597907900810242, "debug/sppo_rej_reward_in_loss": -3.982456684112549, "debug/sppo_reject_loss": 2164.974609375, "epoch": 1.7028985507246377, "grad_norm": 59488.42390716384, "learning_rate": 8.244781783681214e-08, "logits/chosen": 1.2436730861663818, "logits/rejected": 1.8374286890029907, "logps/chosen": -234.88925170898438, "logps/rejected": -292.4685363769531, "loss": 4830.7699, "rewards/accuracies": 0.75, "rewards/chosen": -0.005597907118499279, "rewards/margins": 0.034226659685373306, "rewards/rejected": -0.03982456773519516, "step": 470 }, { "debug/policy_chosen_logits": 1.3744897842407227, "debug/policy_chosen_logps": -270.1221618652344, "debug/policy_rejected_logits": 1.56520676612854, "debug/policy_rejected_logps": -281.8283996582031, "debug/reference_chosen_logps": -268.66082763671875, "debug/reference_rejected_logps": -276.802490234375, "debug/sppo_chosen_loss": 2671.38916015625, "debug/sppo_chosen_reward_in_loss": -1.4612995386123657, "debug/sppo_rej_reward_in_loss": -5.025888442993164, "debug/sppo_reject_loss": 2074.87451171875, "epoch": 1.721014492753623, "grad_norm": 61733.89657727192, "learning_rate": 8.221062618595825e-08, "logits/chosen": 1.3744897842407227, "logits/rejected": 1.56520676612854, "logps/chosen": -270.1221618652344, "logps/rejected": -281.8283996582031, "loss": 4732.2094, "rewards/accuracies": 0.625, "rewards/chosen": -0.014612993225455284, "rewards/margins": 0.03564589098095894, "rewards/rejected": -0.05025888606905937, "step": 475 }, { "debug/policy_chosen_logits": 1.2366863489151, "debug/policy_chosen_logps": -257.65899658203125, "debug/policy_rejected_logits": 1.40576171875, "debug/policy_rejected_logps": -261.4308166503906, "debug/reference_chosen_logps": -257.324951171875, "debug/reference_rejected_logps": -257.91546630859375, "debug/sppo_chosen_loss": 2543.58203125, "debug/sppo_chosen_reward_in_loss": -0.33407631516456604, "debug/sppo_rej_reward_in_loss": -3.515354633331299, "debug/sppo_reject_loss": 2219.15380859375, "epoch": 1.7391304347826086, "grad_norm": 85372.05720389963, "learning_rate": 8.197343453510436e-08, "logits/chosen": 1.2366863489151, "logits/rejected": 1.40576171875, "logps/chosen": -257.65899658203125, "logps/rejected": -261.4308166503906, "loss": 4729.5922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.003340762807056308, "rewards/margins": 0.03181277960538864, "rewards/rejected": -0.03515354543924332, "step": 480 }, { "debug/policy_chosen_logits": 1.4717175960540771, "debug/policy_chosen_logps": -275.98895263671875, "debug/policy_rejected_logits": 1.719726324081421, "debug/policy_rejected_logps": -323.21142578125, "debug/reference_chosen_logps": -275.3488464355469, "debug/reference_rejected_logps": -318.627685546875, "debug/sppo_chosen_loss": 2575.63134765625, "debug/sppo_chosen_reward_in_loss": -0.640114963054657, "debug/sppo_rej_reward_in_loss": -4.583746910095215, "debug/sppo_reject_loss": 2115.492919921875, "epoch": 1.7572463768115942, "grad_norm": 61192.717509951464, "learning_rate": 8.173624288425048e-08, "logits/chosen": 1.4717175960540771, "logits/rejected": 1.719726324081421, "logps/chosen": -275.98895263671875, "logps/rejected": -323.21142578125, "loss": 4720.6652, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.006401150021702051, "rewards/margins": 0.03943631798028946, "rewards/rejected": -0.04583746939897537, "step": 485 }, { "debug/policy_chosen_logits": 1.4655859470367432, "debug/policy_chosen_logps": -259.05706787109375, "debug/policy_rejected_logits": 1.7517127990722656, "debug/policy_rejected_logps": -293.79473876953125, "debug/reference_chosen_logps": -258.397216796875, "debug/reference_rejected_logps": -290.43145751953125, "debug/sppo_chosen_loss": 2576.09619140625, "debug/sppo_chosen_reward_in_loss": -0.6598647832870483, "debug/sppo_rej_reward_in_loss": -3.363311290740967, "debug/sppo_reject_loss": 2198.055419921875, "epoch": 1.7753623188405796, "grad_norm": 70991.7781718097, "learning_rate": 8.149905123339657e-08, "logits/chosen": 1.4655859470367432, "logits/rejected": 1.7517127990722656, "logps/chosen": -259.05706787109375, "logps/rejected": -293.79473876953125, "loss": 4664.007, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006598648615181446, "rewards/margins": 0.027034465223550797, "rewards/rejected": -0.03363311290740967, "step": 490 }, { "debug/policy_chosen_logits": 1.1926593780517578, "debug/policy_chosen_logps": -232.2982177734375, "debug/policy_rejected_logits": 1.476413369178772, "debug/policy_rejected_logps": -290.04180908203125, "debug/reference_chosen_logps": -231.7759246826172, "debug/reference_rejected_logps": -286.0237121582031, "debug/sppo_chosen_loss": 2567.42236328125, "debug/sppo_chosen_reward_in_loss": -0.5222911834716797, "debug/sppo_rej_reward_in_loss": -4.018117904663086, "debug/sppo_reject_loss": 2148.363037109375, "epoch": 1.7934782608695652, "grad_norm": 75873.7432547816, "learning_rate": 8.12618595825427e-08, "logits/chosen": 1.1926593780517578, "logits/rejected": 1.476413369178772, "logps/chosen": -232.2982177734375, "logps/rejected": -290.04180908203125, "loss": 4757.4133, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.005222911946475506, "rewards/margins": 0.03495826572179794, "rewards/rejected": -0.040181178599596024, "step": 495 }, { "debug/policy_chosen_logits": 1.215562343597412, "debug/policy_chosen_logps": -261.1588439941406, "debug/policy_rejected_logits": 1.4616596698760986, "debug/policy_rejected_logps": -267.5830383300781, "debug/reference_chosen_logps": -260.53173828125, "debug/reference_rejected_logps": -265.97222900390625, "debug/sppo_chosen_loss": 2579.6640625, "debug/sppo_chosen_reward_in_loss": -0.6271156072616577, "debug/sppo_rej_reward_in_loss": -1.6108038425445557, "debug/sppo_reject_loss": 2358.9453125, "epoch": 1.8115942028985508, "grad_norm": 63602.03255502694, "learning_rate": 8.10246679316888e-08, "logits/chosen": 1.215562343597412, "logits/rejected": 1.4616596698760986, "logps/chosen": -261.1588439941406, "logps/rejected": -267.5830383300781, "loss": 4715.0469, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00627115648239851, "rewards/margins": 0.009836881421506405, "rewards/rejected": -0.016108037903904915, "step": 500 }, { "epoch": 1.8115942028985508, "eval_debug/policy_chosen_logits": 1.5341249704360962, "eval_debug/policy_chosen_logps": -252.98873901367188, "eval_debug/policy_rejected_logits": 1.5897997617721558, "eval_debug/policy_rejected_logps": -261.42193603515625, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2529.298095703125, "eval_debug/sppo_chosen_reward_in_loss": -0.07025247067213058, "eval_debug/sppo_rej_reward_in_loss": -1.7633041143417358, "eval_debug/sppo_reject_loss": 2376.3818359375, "eval_logits/chosen": 1.5341249704360962, "eval_logits/rejected": 1.5897997617721558, "eval_logps/chosen": -252.98873901367188, "eval_logps/rejected": -261.42193603515625, "eval_loss": 4771.205078125, "eval_rewards/accuracies": 0.4868420958518982, "eval_rewards/chosen": -0.0007025245577096939, "eval_rewards/margins": 0.016930514946579933, "eval_rewards/rejected": -0.017633043229579926, "eval_runtime": 28.3879, "eval_samples_per_second": 21.136, "eval_steps_per_second": 0.669, "step": 500 }, { "debug/policy_chosen_logits": 1.2810986042022705, "debug/policy_chosen_logps": -248.2319793701172, "debug/policy_rejected_logits": 1.3891974687576294, "debug/policy_rejected_logps": -281.86248779296875, "debug/reference_chosen_logps": -250.66650390625, "debug/reference_rejected_logps": -280.2650451660156, "debug/sppo_chosen_loss": 2272.516845703125, "debug/sppo_chosen_reward_in_loss": 2.434523344039917, "debug/sppo_rej_reward_in_loss": -1.5974708795547485, "debug/sppo_reject_loss": 2386.367919921875, "epoch": 1.8297101449275361, "grad_norm": 174079.79406977983, "learning_rate": 8.078747628083491e-08, "logits/chosen": 1.2810986042022705, "logits/rejected": 1.3891974687576294, "logps/chosen": -248.2319793701172, "logps/rejected": -281.86248779296875, "loss": 4673.1523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.024345234036445618, "rewards/margins": 0.04031994193792343, "rewards/rejected": -0.015974709764122963, "step": 505 }, { "debug/policy_chosen_logits": 1.3941171169281006, "debug/policy_chosen_logps": -253.9849090576172, "debug/policy_rejected_logits": 1.5818657875061035, "debug/policy_rejected_logps": -296.9537658691406, "debug/reference_chosen_logps": -257.995361328125, "debug/reference_rejected_logps": -295.6236267089844, "debug/sppo_chosen_loss": 2126.829345703125, "debug/sppo_chosen_reward_in_loss": 4.010450839996338, "debug/sppo_rej_reward_in_loss": -1.3301254510879517, "debug/sppo_reject_loss": 2436.87939453125, "epoch": 1.8478260869565217, "grad_norm": 66900.73837481005, "learning_rate": 8.055028462998102e-08, "logits/chosen": 1.3941171169281006, "logits/rejected": 1.5818657875061035, "logps/chosen": -253.9849090576172, "logps/rejected": -296.9537658691406, "loss": 4619.4895, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.04010450467467308, "rewards/margins": 0.05340576171875, "rewards/rejected": -0.013301253318786621, "step": 510 }, { "debug/policy_chosen_logits": 1.4030792713165283, "debug/policy_chosen_logps": -254.7285614013672, "debug/policy_rejected_logits": 1.5502079725265503, "debug/policy_rejected_logps": -269.3714904785156, "debug/reference_chosen_logps": -255.5850830078125, "debug/reference_rejected_logps": -265.862548828125, "debug/sppo_chosen_loss": 2444.778076171875, "debug/sppo_chosen_reward_in_loss": 0.8565012216567993, "debug/sppo_rej_reward_in_loss": -3.5089497566223145, "debug/sppo_reject_loss": 2245.17041015625, "epoch": 1.8659420289855073, "grad_norm": 63764.355992774814, "learning_rate": 8.031309297912713e-08, "logits/chosen": 1.4030792713165283, "logits/rejected": 1.5502079725265503, "logps/chosen": -254.7285614013672, "logps/rejected": -269.3714904785156, "loss": 4775.2594, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.008565010502934456, "rewards/margins": 0.043654508888721466, "rewards/rejected": -0.03508950024843216, "step": 515 }, { "debug/policy_chosen_logits": 1.5248899459838867, "debug/policy_chosen_logps": -266.8809509277344, "debug/policy_rejected_logits": 1.595474123954773, "debug/policy_rejected_logps": -281.7724609375, "debug/reference_chosen_logps": -267.4107360839844, "debug/reference_rejected_logps": -279.47845458984375, "debug/sppo_chosen_loss": 2485.14501953125, "debug/sppo_chosen_reward_in_loss": 0.5297962427139282, "debug/sppo_rej_reward_in_loss": -2.294001340866089, "debug/sppo_reject_loss": 2315.60498046875, "epoch": 1.8840579710144927, "grad_norm": 59459.300418523795, "learning_rate": 8.007590132827324e-08, "logits/chosen": 1.5248899459838867, "logits/rejected": 1.595474123954773, "logps/chosen": -266.8809509277344, "logps/rejected": -281.7724609375, "loss": 4767.0531, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.005297960247844458, "rewards/margins": 0.028237977996468544, "rewards/rejected": -0.0229400135576725, "step": 520 }, { "debug/policy_chosen_logits": 1.1980397701263428, "debug/policy_chosen_logps": -254.56338500976562, "debug/policy_rejected_logits": 1.636426568031311, "debug/policy_rejected_logps": -288.3833312988281, "debug/reference_chosen_logps": -254.3158416748047, "debug/reference_rejected_logps": -288.393310546875, "debug/sppo_chosen_loss": 2600.60791015625, "debug/sppo_chosen_reward_in_loss": -0.247528076171875, "debug/sppo_rej_reward_in_loss": 0.009991263970732689, "debug/sppo_reject_loss": 2526.95849609375, "epoch": 1.9021739130434783, "grad_norm": 68061.50597181973, "learning_rate": 7.983870967741935e-08, "logits/chosen": 1.1980397701263428, "logits/rejected": 1.636426568031311, "logps/chosen": -254.56338500976562, "logps/rejected": -288.3833312988281, "loss": 4760.2711, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.002475281711667776, "rewards/margins": -0.002575193764641881, "rewards/rejected": 9.991265687858686e-05, "step": 525 }, { "debug/policy_chosen_logits": 1.156363606452942, "debug/policy_chosen_logps": -255.1627197265625, "debug/policy_rejected_logits": 1.2697885036468506, "debug/policy_rejected_logps": -289.9837951660156, "debug/reference_chosen_logps": -253.6742401123047, "debug/reference_rejected_logps": -286.6373596191406, "debug/sppo_chosen_loss": 2699.996826171875, "debug/sppo_chosen_reward_in_loss": -1.488465666770935, "debug/sppo_rej_reward_in_loss": -3.346395492553711, "debug/sppo_reject_loss": 2233.798583984375, "epoch": 1.9202898550724639, "grad_norm": 67627.61997292053, "learning_rate": 7.960151802656547e-08, "logits/chosen": 1.156363606452942, "logits/rejected": 1.2697885036468506, "logps/chosen": -255.1627197265625, "logps/rejected": -289.9837951660156, "loss": 4778.8504, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.01488465629518032, "rewards/margins": 0.01857929676771164, "rewards/rejected": -0.03346395865082741, "step": 530 }, { "debug/policy_chosen_logits": 1.4285439252853394, "debug/policy_chosen_logps": -235.1498565673828, "debug/policy_rejected_logits": 1.6913135051727295, "debug/policy_rejected_logps": -264.16796875, "debug/reference_chosen_logps": -234.9927520751953, "debug/reference_rejected_logps": -260.7703552246094, "debug/sppo_chosen_loss": 2532.578125, "debug/sppo_chosen_reward_in_loss": -0.1570919007062912, "debug/sppo_rej_reward_in_loss": -3.3975937366485596, "debug/sppo_reject_loss": 2232.462158203125, "epoch": 1.9384057971014492, "grad_norm": 64721.88566770589, "learning_rate": 7.936432637571158e-08, "logits/chosen": 1.4285439252853394, "logits/rejected": 1.6913135051727295, "logps/chosen": -235.1498565673828, "logps/rejected": -264.16796875, "loss": 4679.4508, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0015709183644503355, "rewards/margins": 0.03240501880645752, "rewards/rejected": -0.033975936472415924, "step": 535 }, { "debug/policy_chosen_logits": 1.0421147346496582, "debug/policy_chosen_logps": -249.03768920898438, "debug/policy_rejected_logits": 1.3673218488693237, "debug/policy_rejected_logps": -278.5938415527344, "debug/reference_chosen_logps": -248.1215362548828, "debug/reference_rejected_logps": -276.47796630859375, "debug/sppo_chosen_loss": 2632.06884765625, "debug/sppo_chosen_reward_in_loss": -0.9161418676376343, "debug/sppo_rej_reward_in_loss": -2.1158878803253174, "debug/sppo_reject_loss": 2319.23876953125, "epoch": 1.9565217391304348, "grad_norm": 65137.93491170249, "learning_rate": 7.912713472485768e-08, "logits/chosen": 1.0421147346496582, "logits/rejected": 1.3673218488693237, "logps/chosen": -249.03768920898438, "logps/rejected": -278.5938415527344, "loss": 4721.9492, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009161418303847313, "rewards/margins": 0.011997459456324577, "rewards/rejected": -0.02115887776017189, "step": 540 }, { "debug/policy_chosen_logits": 1.181572675704956, "debug/policy_chosen_logps": -262.95751953125, "debug/policy_rejected_logits": 1.416203260421753, "debug/policy_rejected_logps": -295.14556884765625, "debug/reference_chosen_logps": -261.45489501953125, "debug/reference_rejected_logps": -292.7149658203125, "debug/sppo_chosen_loss": 2690.55615234375, "debug/sppo_chosen_reward_in_loss": -1.5026168823242188, "debug/sppo_rej_reward_in_loss": -2.4306282997131348, "debug/sppo_reject_loss": 2303.905517578125, "epoch": 1.9746376811594204, "grad_norm": 60074.502863660986, "learning_rate": 7.888994307400379e-08, "logits/chosen": 1.181572675704956, "logits/rejected": 1.416203260421753, "logps/chosen": -262.95751953125, "logps/rejected": -295.14556884765625, "loss": 4639.1148, "rewards/accuracies": 0.5, "rewards/chosen": -0.015026169829070568, "rewards/margins": 0.009280113503336906, "rewards/rejected": -0.0243062824010849, "step": 545 }, { "debug/policy_chosen_logits": 0.9538512229919434, "debug/policy_chosen_logps": -247.245849609375, "debug/policy_rejected_logits": 1.0228263139724731, "debug/policy_rejected_logps": -283.17242431640625, "debug/reference_chosen_logps": -247.67520141601562, "debug/reference_rejected_logps": -279.9501647949219, "debug/sppo_chosen_loss": 2469.059814453125, "debug/sppo_chosen_reward_in_loss": 0.42936667799949646, "debug/sppo_rej_reward_in_loss": -3.222252607345581, "debug/sppo_reject_loss": 2227.85546875, "epoch": 1.9927536231884058, "grad_norm": 92853.93150276357, "learning_rate": 7.86527514231499e-08, "logits/chosen": 0.9538512229919434, "logits/rejected": 1.0228263139724731, "logps/chosen": -247.245849609375, "logps/rejected": -283.17242431640625, "loss": 4769.9555, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.00429366622120142, "rewards/margins": 0.03651618957519531, "rewards/rejected": -0.032222528010606766, "step": 550 }, { "debug/policy_chosen_logits": 1.0036228895187378, "debug/policy_chosen_logps": -235.4061279296875, "debug/policy_rejected_logits": 1.6517353057861328, "debug/policy_rejected_logps": -286.16009521484375, "debug/reference_chosen_logps": -234.8700714111328, "debug/reference_rejected_logps": -281.872314453125, "debug/sppo_chosen_loss": 2582.186767578125, "debug/sppo_chosen_reward_in_loss": -0.5360620617866516, "debug/sppo_rej_reward_in_loss": -4.287774085998535, "debug/sppo_reject_loss": 2132.886962890625, "epoch": 2.010869565217391, "grad_norm": 109379.56012644431, "learning_rate": 7.841555977229601e-08, "logits/chosen": 1.0036228895187378, "logits/rejected": 1.6517353057861328, "logps/chosen": -235.4061279296875, "logps/rejected": -286.16009521484375, "loss": 4814.3406, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005360620096325874, "rewards/margins": 0.037517111748456955, "rewards/rejected": -0.04287773743271828, "step": 555 }, { "debug/policy_chosen_logits": 1.530487298965454, "debug/policy_chosen_logps": -235.81600952148438, "debug/policy_rejected_logits": 1.7859302759170532, "debug/policy_rejected_logps": -273.2572021484375, "debug/reference_chosen_logps": -236.3900604248047, "debug/reference_rejected_logps": -269.92254638671875, "debug/sppo_chosen_loss": 2455.73974609375, "debug/sppo_chosen_reward_in_loss": 0.5740633010864258, "debug/sppo_rej_reward_in_loss": -3.334653377532959, "debug/sppo_reject_loss": 2221.925048828125, "epoch": 2.028985507246377, "grad_norm": 62415.07071691425, "learning_rate": 7.817836812144212e-08, "logits/chosen": 1.530487298965454, "logits/rejected": 1.7859302759170532, "logps/chosen": -235.81600952148438, "logps/rejected": -273.2572021484375, "loss": 4682.6898, "rewards/accuracies": 0.75, "rewards/chosen": 0.005740632768720388, "rewards/margins": 0.039087168872356415, "rewards/rejected": -0.03334653005003929, "step": 560 }, { "debug/policy_chosen_logits": 1.340097188949585, "debug/policy_chosen_logps": -251.3999481201172, "debug/policy_rejected_logits": 1.6633167266845703, "debug/policy_rejected_logps": -294.04168701171875, "debug/reference_chosen_logps": -251.97714233398438, "debug/reference_rejected_logps": -290.07403564453125, "debug/sppo_chosen_loss": 2452.09716796875, "debug/sppo_chosen_reward_in_loss": 0.5772041082382202, "debug/sppo_rej_reward_in_loss": -3.9676425457000732, "debug/sppo_reject_loss": 2170.06494140625, "epoch": 2.0471014492753623, "grad_norm": 62738.402195404444, "learning_rate": 7.794117647058824e-08, "logits/chosen": 1.340097188949585, "logits/rejected": 1.6633167266845703, "logps/chosen": -251.3999481201172, "logps/rejected": -294.04168701171875, "loss": 4677.343, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.005772041156888008, "rewards/margins": 0.04544846713542938, "rewards/rejected": -0.03967642784118652, "step": 565 }, { "debug/policy_chosen_logits": 1.314124345779419, "debug/policy_chosen_logps": -256.4507751464844, "debug/policy_rejected_logits": 1.669500708580017, "debug/policy_rejected_logps": -293.2616271972656, "debug/reference_chosen_logps": -254.9364776611328, "debug/reference_rejected_logps": -289.62884521484375, "debug/sppo_chosen_loss": 2704.539794921875, "debug/sppo_chosen_reward_in_loss": -1.5142793655395508, "debug/sppo_rej_reward_in_loss": -3.632741928100586, "debug/sppo_reject_loss": 2197.43212890625, "epoch": 2.0652173913043477, "grad_norm": 70807.76281188594, "learning_rate": 7.770398481973435e-08, "logits/chosen": 1.314124345779419, "logits/rejected": 1.669500708580017, "logps/chosen": -256.4507751464844, "logps/rejected": -293.2616271972656, "loss": 4727.8828, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.015142792835831642, "rewards/margins": 0.02118462324142456, "rewards/rejected": -0.03632742166519165, "step": 570 }, { "debug/policy_chosen_logits": 1.2620753049850464, "debug/policy_chosen_logps": -254.2039337158203, "debug/policy_rejected_logits": 1.5737316608428955, "debug/policy_rejected_logps": -275.38507080078125, "debug/reference_chosen_logps": -253.73959350585938, "debug/reference_rejected_logps": -272.0418395996094, "debug/sppo_chosen_loss": 2558.74169921875, "debug/sppo_chosen_reward_in_loss": -0.464329332113266, "debug/sppo_rej_reward_in_loss": -3.343224048614502, "debug/sppo_reject_loss": 2219.814453125, "epoch": 2.0833333333333335, "grad_norm": 67907.9702061991, "learning_rate": 7.746679316888045e-08, "logits/chosen": 1.2620753049850464, "logits/rejected": 1.5737316608428955, "logps/chosen": -254.2039337158203, "logps/rejected": -275.38507080078125, "loss": 4651.4105, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.004643293563276529, "rewards/margins": 0.028788944706320763, "rewards/rejected": -0.033432237803936005, "step": 575 }, { "debug/policy_chosen_logits": 0.8385118246078491, "debug/policy_chosen_logps": -239.9090118408203, "debug/policy_rejected_logits": 1.2391111850738525, "debug/policy_rejected_logps": -296.7748718261719, "debug/reference_chosen_logps": -238.2715606689453, "debug/reference_rejected_logps": -293.7236022949219, "debug/sppo_chosen_loss": 2700.380126953125, "debug/sppo_chosen_reward_in_loss": -1.6374528408050537, "debug/sppo_rej_reward_in_loss": -3.051283597946167, "debug/sppo_reject_loss": 2222.885986328125, "epoch": 2.101449275362319, "grad_norm": 65820.6351652557, "learning_rate": 7.722960151802656e-08, "logits/chosen": 0.8385118246078491, "logits/rejected": 1.2391111850738525, "logps/chosen": -239.9090118408203, "logps/rejected": -296.7748718261719, "loss": 4717.6727, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.016374528408050537, "rewards/margins": 0.014138306491076946, "rewards/rejected": -0.030512835830450058, "step": 580 }, { "debug/policy_chosen_logits": 1.3985741138458252, "debug/policy_chosen_logps": -268.7876892089844, "debug/policy_rejected_logits": 1.6396198272705078, "debug/policy_rejected_logps": -270.4892883300781, "debug/reference_chosen_logps": -268.8836975097656, "debug/reference_rejected_logps": -266.7135314941406, "debug/sppo_chosen_loss": 2514.469970703125, "debug/sppo_chosen_reward_in_loss": 0.09603653103113174, "debug/sppo_rej_reward_in_loss": -3.7757625579833984, "debug/sppo_reject_loss": 2180.996826171875, "epoch": 2.119565217391304, "grad_norm": 80117.82386176355, "learning_rate": 7.699240986717267e-08, "logits/chosen": 1.3985741138458252, "logits/rejected": 1.6396198272705078, "logps/chosen": -268.7876892089844, "logps/rejected": -270.4892883300781, "loss": 4624.8102, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0009603651124052703, "rewards/margins": 0.03871799260377884, "rewards/rejected": -0.03775762394070625, "step": 585 }, { "debug/policy_chosen_logits": 1.3296515941619873, "debug/policy_chosen_logps": -250.7174072265625, "debug/policy_rejected_logits": 1.7048896551132202, "debug/policy_rejected_logps": -320.5135803222656, "debug/reference_chosen_logps": -250.12686157226562, "debug/reference_rejected_logps": -315.68890380859375, "debug/sppo_chosen_loss": 2578.149658203125, "debug/sppo_chosen_reward_in_loss": -0.5905593633651733, "debug/sppo_rej_reward_in_loss": -4.824676990509033, "debug/sppo_reject_loss": 2108.701171875, "epoch": 2.13768115942029, "grad_norm": 70670.80338203225, "learning_rate": 7.675521821631878e-08, "logits/chosen": 1.3296515941619873, "logits/rejected": 1.7048896551132202, "logps/chosen": -250.7174072265625, "logps/rejected": -320.5135803222656, "loss": 4676.8914, "rewards/accuracies": 0.75, "rewards/chosen": -0.005905593745410442, "rewards/margins": 0.04234117642045021, "rewards/rejected": -0.04824677109718323, "step": 590 }, { "debug/policy_chosen_logits": 1.0052751302719116, "debug/policy_chosen_logps": -235.5624542236328, "debug/policy_rejected_logits": 1.3447738885879517, "debug/policy_rejected_logps": -293.3815002441406, "debug/reference_chosen_logps": -235.8861541748047, "debug/reference_rejected_logps": -288.54498291015625, "debug/sppo_chosen_loss": 2478.3037109375, "debug/sppo_chosen_reward_in_loss": 0.3237054944038391, "debug/sppo_rej_reward_in_loss": -4.836480140686035, "debug/sppo_reject_loss": 2113.497314453125, "epoch": 2.1557971014492754, "grad_norm": 129873.41028710218, "learning_rate": 7.651802656546489e-08, "logits/chosen": 1.0052751302719116, "logits/rejected": 1.3447738885879517, "logps/chosen": -235.5624542236328, "logps/rejected": -293.3815002441406, "loss": 4652.7609, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0032370544504374266, "rewards/margins": 0.05160186439752579, "rewards/rejected": -0.048364803194999695, "step": 595 }, { "debug/policy_chosen_logits": 1.1809848546981812, "debug/policy_chosen_logps": -241.6732177734375, "debug/policy_rejected_logits": 1.4068124294281006, "debug/policy_rejected_logps": -263.0619201660156, "debug/reference_chosen_logps": -241.9515838623047, "debug/reference_rejected_logps": -258.27471923828125, "debug/sppo_chosen_loss": 2485.47607421875, "debug/sppo_chosen_reward_in_loss": 0.2783866822719574, "debug/sppo_rej_reward_in_loss": -4.787167072296143, "debug/sppo_reject_loss": 2087.11767578125, "epoch": 2.1739130434782608, "grad_norm": 66137.45733045375, "learning_rate": 7.628083491461101e-08, "logits/chosen": 1.1809848546981812, "logits/rejected": 1.4068124294281006, "logps/chosen": -241.6732177734375, "logps/rejected": -263.0619201660156, "loss": 4665.2648, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.0027838668320327997, "rewards/margins": 0.050655532628297806, "rewards/rejected": -0.0478716678917408, "step": 600 }, { "epoch": 2.1739130434782608, "eval_debug/policy_chosen_logits": 1.5147371292114258, "eval_debug/policy_chosen_logps": -252.8381805419922, "eval_debug/policy_rejected_logits": 1.568827748298645, "eval_debug/policy_rejected_logps": -261.77886962890625, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2515.5927734375, "eval_debug/sppo_chosen_reward_in_loss": 0.08029516041278839, "eval_debug/sppo_rej_reward_in_loss": -2.1202282905578613, "eval_debug/sppo_reject_loss": 2344.70947265625, "eval_logits/chosen": 1.5147371292114258, "eval_logits/rejected": 1.568827748298645, "eval_logps/chosen": -252.8381805419922, "eval_logps/rejected": -261.77886962890625, "eval_loss": 4749.77978515625, "eval_rewards/accuracies": 0.5394737124443054, "eval_rewards/chosen": 0.000802951049990952, "eval_rewards/margins": 0.0220052320510149, "eval_rewards/rejected": -0.02120228111743927, "eval_runtime": 28.4881, "eval_samples_per_second": 21.061, "eval_steps_per_second": 0.667, "step": 600 }, { "debug/policy_chosen_logits": 1.2405939102172852, "debug/policy_chosen_logps": -245.2891082763672, "debug/policy_rejected_logits": 1.8459336757659912, "debug/policy_rejected_logps": -305.0096435546875, "debug/reference_chosen_logps": -245.0672607421875, "debug/reference_rejected_logps": -303.65692138671875, "debug/sppo_chosen_loss": 2543.26708984375, "debug/sppo_chosen_reward_in_loss": -0.22184638679027557, "debug/sppo_rej_reward_in_loss": -1.3526891469955444, "debug/sppo_reject_loss": 2400.721435546875, "epoch": 2.1920289855072466, "grad_norm": 65166.39708030864, "learning_rate": 7.60436432637571e-08, "logits/chosen": 1.2405939102172852, "logits/rejected": 1.8459336757659912, "logps/chosen": -245.2891082763672, "logps/rejected": -305.0096435546875, "loss": 4732.8301, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.002218464156612754, "rewards/margins": 0.011308426037430763, "rewards/rejected": -0.013526889495551586, "step": 605 }, { "debug/policy_chosen_logits": 1.2397502660751343, "debug/policy_chosen_logps": -251.1930389404297, "debug/policy_rejected_logits": 1.3279813528060913, "debug/policy_rejected_logps": -264.08154296875, "debug/reference_chosen_logps": -251.7701873779297, "debug/reference_rejected_logps": -263.05694580078125, "debug/sppo_chosen_loss": 2459.10986328125, "debug/sppo_chosen_reward_in_loss": 0.577130913734436, "debug/sppo_rej_reward_in_loss": -1.0246174335479736, "debug/sppo_reject_loss": 2430.546630859375, "epoch": 2.210144927536232, "grad_norm": 93243.4743662712, "learning_rate": 7.580645161290323e-08, "logits/chosen": 1.2397502660751343, "logits/rejected": 1.3279813528060913, "logps/chosen": -251.1930389404297, "logps/rejected": -264.08154296875, "loss": 4748.1934, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.005771309603005648, "rewards/margins": 0.016017483547329903, "rewards/rejected": -0.010246174409985542, "step": 610 }, { "debug/policy_chosen_logits": 1.268547534942627, "debug/policy_chosen_logps": -281.13580322265625, "debug/policy_rejected_logits": 1.4606815576553345, "debug/policy_rejected_logps": -297.2588195800781, "debug/reference_chosen_logps": -281.66461181640625, "debug/reference_rejected_logps": -295.0270080566406, "debug/sppo_chosen_loss": 2465.239501953125, "debug/sppo_chosen_reward_in_loss": 0.5288352966308594, "debug/sppo_rej_reward_in_loss": -2.23179292678833, "debug/sppo_reject_loss": 2320.22265625, "epoch": 2.2282608695652173, "grad_norm": 75780.24249252361, "learning_rate": 7.556925996204933e-08, "logits/chosen": 1.268547534942627, "logits/rejected": 1.4606815576553345, "logps/chosen": -281.13580322265625, "logps/rejected": -297.2588195800781, "loss": 4721.5188, "rewards/accuracies": 0.75, "rewards/chosen": 0.005288353189826012, "rewards/margins": 0.027606278657913208, "rewards/rejected": -0.022317929193377495, "step": 615 }, { "debug/policy_chosen_logits": 0.9408689737319946, "debug/policy_chosen_logps": -233.3802490234375, "debug/policy_rejected_logits": 1.4490621089935303, "debug/policy_rejected_logps": -315.6219787597656, "debug/reference_chosen_logps": -233.95346069335938, "debug/reference_rejected_logps": -311.99176025390625, "debug/sppo_chosen_loss": 2459.3095703125, "debug/sppo_chosen_reward_in_loss": 0.5732139348983765, "debug/sppo_rej_reward_in_loss": -3.630209445953369, "debug/sppo_reject_loss": 2203.49853515625, "epoch": 2.246376811594203, "grad_norm": 73362.93084263414, "learning_rate": 7.533206831119544e-08, "logits/chosen": 0.9408689737319946, "logits/rejected": 1.4490621089935303, "logps/chosen": -233.3802490234375, "logps/rejected": -315.6219787597656, "loss": 4701.6234, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.005732138641178608, "rewards/margins": 0.04203423112630844, "rewards/rejected": -0.03630208969116211, "step": 620 }, { "debug/policy_chosen_logits": 1.0855820178985596, "debug/policy_chosen_logps": -250.072509765625, "debug/policy_rejected_logits": 1.2474204301834106, "debug/policy_rejected_logps": -271.643310546875, "debug/reference_chosen_logps": -250.57568359375, "debug/reference_rejected_logps": -266.9862976074219, "debug/sppo_chosen_loss": 2479.56201171875, "debug/sppo_chosen_reward_in_loss": 0.503182053565979, "debug/sppo_rej_reward_in_loss": -4.657000541687012, "debug/sppo_reject_loss": 2133.64794921875, "epoch": 2.2644927536231885, "grad_norm": 61326.51730148805, "learning_rate": 7.509487666034155e-08, "logits/chosen": 1.0855820178985596, "logits/rejected": 1.2474204301834106, "logps/chosen": -250.072509765625, "logps/rejected": -271.643310546875, "loss": 4698.2883, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.005031820386648178, "rewards/margins": 0.0516018271446228, "rewards/rejected": -0.04657001048326492, "step": 625 }, { "debug/policy_chosen_logits": 1.3660329580307007, "debug/policy_chosen_logps": -284.219970703125, "debug/policy_rejected_logits": 1.4941619634628296, "debug/policy_rejected_logps": -262.6479187011719, "debug/reference_chosen_logps": -283.0180969238281, "debug/reference_rejected_logps": -259.73748779296875, "debug/sppo_chosen_loss": 2677.47314453125, "debug/sppo_chosen_reward_in_loss": -1.2018907070159912, "debug/sppo_rej_reward_in_loss": -2.9104418754577637, "debug/sppo_reject_loss": 2259.016845703125, "epoch": 2.282608695652174, "grad_norm": 73911.49274407962, "learning_rate": 7.485768500948766e-08, "logits/chosen": 1.3660329580307007, "logits/rejected": 1.4941619634628296, "logps/chosen": -284.219970703125, "logps/rejected": -262.6479187011719, "loss": 4716.8875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.012018906883895397, "rewards/margins": 0.01708551123738289, "rewards/rejected": -0.02910441718995571, "step": 630 }, { "debug/policy_chosen_logits": 1.2153542041778564, "debug/policy_chosen_logps": -235.53646850585938, "debug/policy_rejected_logits": 1.4520756006240845, "debug/policy_rejected_logps": -287.3448181152344, "debug/reference_chosen_logps": -236.39944458007812, "debug/reference_rejected_logps": -285.35406494140625, "debug/sppo_chosen_loss": 2424.7529296875, "debug/sppo_chosen_reward_in_loss": 0.862963080406189, "debug/sppo_rej_reward_in_loss": -1.9907958507537842, "debug/sppo_reject_loss": 2347.704833984375, "epoch": 2.300724637681159, "grad_norm": 100418.03686897733, "learning_rate": 7.462049335863377e-08, "logits/chosen": 1.2153542041778564, "logits/rejected": 1.4520756006240845, "logps/chosen": -235.53646850585938, "logps/rejected": -287.3448181152344, "loss": 4705.7055, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.00862963031977415, "rewards/margins": 0.02853759005665779, "rewards/rejected": -0.019907960668206215, "step": 635 }, { "debug/policy_chosen_logits": 1.0639725923538208, "debug/policy_chosen_logps": -259.42852783203125, "debug/policy_rejected_logits": 1.4107351303100586, "debug/policy_rejected_logps": -289.49462890625, "debug/reference_chosen_logps": -259.8706359863281, "debug/reference_rejected_logps": -284.48968505859375, "debug/sppo_chosen_loss": 2468.21728515625, "debug/sppo_chosen_reward_in_loss": 0.442091166973114, "debug/sppo_rej_reward_in_loss": -5.004956245422363, "debug/sppo_reject_loss": 2092.140869140625, "epoch": 2.318840579710145, "grad_norm": 80440.5271321946, "learning_rate": 7.438330170777988e-08, "logits/chosen": 1.0639725923538208, "logits/rejected": 1.4107351303100586, "logps/chosen": -259.42852783203125, "logps/rejected": -289.49462890625, "loss": 4639.1387, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.004420911427587271, "rewards/margins": 0.054470472037792206, "rewards/rejected": -0.0500495620071888, "step": 640 }, { "debug/policy_chosen_logits": 1.1986241340637207, "debug/policy_chosen_logps": -281.43402099609375, "debug/policy_rejected_logits": 1.1429756879806519, "debug/policy_rejected_logps": -254.86874389648438, "debug/reference_chosen_logps": -280.4898681640625, "debug/reference_rejected_logps": -251.0946807861328, "debug/sppo_chosen_loss": 2633.010498046875, "debug/sppo_chosen_reward_in_loss": -0.9441375732421875, "debug/sppo_rej_reward_in_loss": -3.7740962505340576, "debug/sppo_reject_loss": 2195.27099609375, "epoch": 2.3369565217391304, "grad_norm": 67796.52372914675, "learning_rate": 7.4146110056926e-08, "logits/chosen": 1.1986241340637207, "logits/rejected": 1.1429756879806519, "logps/chosen": -281.43402099609375, "logps/rejected": -254.86874389648438, "loss": 4743.4246, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.009441375732421875, "rewards/margins": 0.028299588710069656, "rewards/rejected": -0.03774096444249153, "step": 645 }, { "debug/policy_chosen_logits": 1.4595186710357666, "debug/policy_chosen_logps": -268.525634765625, "debug/policy_rejected_logits": 1.4934539794921875, "debug/policy_rejected_logps": -297.20147705078125, "debug/reference_chosen_logps": -268.76470947265625, "debug/reference_rejected_logps": -293.04833984375, "debug/sppo_chosen_loss": 2490.87646484375, "debug/sppo_chosen_reward_in_loss": 0.23905925452709198, "debug/sppo_rej_reward_in_loss": -4.153144359588623, "debug/sppo_reject_loss": 2166.237548828125, "epoch": 2.355072463768116, "grad_norm": 73012.19519115123, "learning_rate": 7.39089184060721e-08, "logits/chosen": 1.4595186710357666, "logits/rejected": 1.4934539794921875, "logps/chosen": -268.525634765625, "logps/rejected": -297.20147705078125, "loss": 4703.3844, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0023905921261757612, "rewards/margins": 0.04392203688621521, "rewards/rejected": -0.04153144359588623, "step": 650 }, { "debug/policy_chosen_logits": 1.3086068630218506, "debug/policy_chosen_logps": -256.6421813964844, "debug/policy_rejected_logits": 1.6096570491790771, "debug/policy_rejected_logps": -295.10699462890625, "debug/reference_chosen_logps": -257.20928955078125, "debug/reference_rejected_logps": -293.06396484375, "debug/sppo_chosen_loss": 2456.015625, "debug/sppo_chosen_reward_in_loss": 0.5670536160469055, "debug/sppo_rej_reward_in_loss": -2.043034791946411, "debug/sppo_reject_loss": 2325.662109375, "epoch": 2.3731884057971016, "grad_norm": 71799.0295071937, "learning_rate": 7.367172675521821e-08, "logits/chosen": 1.3086068630218506, "logits/rejected": 1.6096570491790771, "logps/chosen": -256.6421813964844, "logps/rejected": -295.10699462890625, "loss": 4654.0449, "rewards/accuracies": 0.625, "rewards/chosen": 0.005670536309480667, "rewards/margins": 0.0261008832603693, "rewards/rejected": -0.020430345088243484, "step": 655 }, { "debug/policy_chosen_logits": 1.0361802577972412, "debug/policy_chosen_logps": -265.9391784667969, "debug/policy_rejected_logits": 1.2743251323699951, "debug/policy_rejected_logps": -285.8634033203125, "debug/reference_chosen_logps": -265.6632080078125, "debug/reference_rejected_logps": -280.2948303222656, "debug/sppo_chosen_loss": 2574.37890625, "debug/sppo_chosen_reward_in_loss": -0.27597731351852417, "debug/sppo_rej_reward_in_loss": -5.568592548370361, "debug/sppo_reject_loss": 2039.767578125, "epoch": 2.391304347826087, "grad_norm": 62196.05196492181, "learning_rate": 7.343453510436432e-08, "logits/chosen": 1.0361802577972412, "logits/rejected": 1.2743251323699951, "logps/chosen": -265.9391784667969, "logps/rejected": -285.8634033203125, "loss": 4712.8367, "rewards/accuracies": 0.75, "rewards/chosen": -0.0027597725857049227, "rewards/margins": 0.05292615294456482, "rewards/rejected": -0.055685918778181076, "step": 660 }, { "debug/policy_chosen_logits": 1.2509933710098267, "debug/policy_chosen_logps": -246.08291625976562, "debug/policy_rejected_logits": 1.6323570013046265, "debug/policy_rejected_logps": -276.28857421875, "debug/reference_chosen_logps": -245.8137969970703, "debug/reference_rejected_logps": -272.16265869140625, "debug/sppo_chosen_loss": 2538.547607421875, "debug/sppo_chosen_reward_in_loss": -0.26909542083740234, "debug/sppo_rej_reward_in_loss": -4.12593412399292, "debug/sppo_reject_loss": 2178.302978515625, "epoch": 2.4094202898550723, "grad_norm": 86644.81668861516, "learning_rate": 7.319734345351043e-08, "logits/chosen": 1.2509933710098267, "logits/rejected": 1.6323570013046265, "logps/chosen": -246.08291625976562, "logps/rejected": -276.28857421875, "loss": 4647.5207, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.002690953901037574, "rewards/margins": 0.03856838867068291, "rewards/rejected": -0.041259340941905975, "step": 665 }, { "debug/policy_chosen_logits": 1.5910011529922485, "debug/policy_chosen_logps": -280.7278137207031, "debug/policy_rejected_logits": 1.8075335025787354, "debug/policy_rejected_logps": -306.9920959472656, "debug/reference_chosen_logps": -281.46917724609375, "debug/reference_rejected_logps": -302.9301452636719, "debug/sppo_chosen_loss": 2434.78662109375, "debug/sppo_chosen_reward_in_loss": 0.7413776516914368, "debug/sppo_rej_reward_in_loss": -4.061938285827637, "debug/sppo_reject_loss": 2171.08251953125, "epoch": 2.427536231884058, "grad_norm": 65689.30262884953, "learning_rate": 7.296015180265654e-08, "logits/chosen": 1.5910011529922485, "logits/rejected": 1.8075335025787354, "logps/chosen": -280.7278137207031, "logps/rejected": -306.9920959472656, "loss": 4708.7, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.007413775660097599, "rewards/margins": 0.04803316295146942, "rewards/rejected": -0.0406193807721138, "step": 670 }, { "debug/policy_chosen_logits": 1.1634485721588135, "debug/policy_chosen_logps": -235.93643188476562, "debug/policy_rejected_logits": 1.326532244682312, "debug/policy_rejected_logps": -274.7832336425781, "debug/reference_chosen_logps": -235.17245483398438, "debug/reference_rejected_logps": -270.46295166015625, "debug/sppo_chosen_loss": 2606.688232421875, "debug/sppo_chosen_reward_in_loss": -0.7639774084091187, "debug/sppo_rej_reward_in_loss": -4.320298194885254, "debug/sppo_reject_loss": 2142.40771484375, "epoch": 2.4456521739130435, "grad_norm": 64556.98933521553, "learning_rate": 7.272296015180265e-08, "logits/chosen": 1.1634485721588135, "logits/rejected": 1.326532244682312, "logps/chosen": -235.93643188476562, "logps/rejected": -274.7832336425781, "loss": 4685.4164, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.007639773190021515, "rewards/margins": 0.035563208162784576, "rewards/rejected": -0.04320298507809639, "step": 675 }, { "debug/policy_chosen_logits": 1.5609382390975952, "debug/policy_chosen_logps": -285.33807373046875, "debug/policy_rejected_logits": 1.542031168937683, "debug/policy_rejected_logps": -262.7705993652344, "debug/reference_chosen_logps": -283.28729248046875, "debug/reference_rejected_logps": -259.95697021484375, "debug/sppo_chosen_loss": 2762.1884765625, "debug/sppo_chosen_reward_in_loss": -2.050769329071045, "debug/sppo_rej_reward_in_loss": -2.8136463165283203, "debug/sppo_reject_loss": 2267.11962890625, "epoch": 2.463768115942029, "grad_norm": 64293.54486746297, "learning_rate": 7.248576850094877e-08, "logits/chosen": 1.5609382390975952, "logits/rejected": 1.542031168937683, "logps/chosen": -285.33807373046875, "logps/rejected": -262.7705993652344, "loss": 4730.3922, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0205076914280653, "rewards/margins": 0.007628771476447582, "rewards/rejected": -0.028136461973190308, "step": 680 }, { "debug/policy_chosen_logits": 1.2182581424713135, "debug/policy_chosen_logps": -252.9768829345703, "debug/policy_rejected_logits": 1.522133469581604, "debug/policy_rejected_logps": -283.367431640625, "debug/reference_chosen_logps": -253.1710662841797, "debug/reference_rejected_logps": -280.0072021484375, "debug/sppo_chosen_loss": 2503.131103515625, "debug/sppo_chosen_reward_in_loss": 0.19420281052589417, "debug/sppo_rej_reward_in_loss": -3.3602325916290283, "debug/sppo_reject_loss": 2222.051513671875, "epoch": 2.4818840579710146, "grad_norm": 74922.41760488835, "learning_rate": 7.224857685009488e-08, "logits/chosen": 1.2182581424713135, "logits/rejected": 1.522133469581604, "logps/chosen": -252.9768829345703, "logps/rejected": -283.367431640625, "loss": 4610.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.001942028058692813, "rewards/margins": 0.03554435074329376, "rewards/rejected": -0.03360232710838318, "step": 685 }, { "debug/policy_chosen_logits": 1.1817820072174072, "debug/policy_chosen_logps": -249.6227569580078, "debug/policy_rejected_logits": 1.4627150297164917, "debug/policy_rejected_logps": -286.05560302734375, "debug/reference_chosen_logps": -250.5298309326172, "debug/reference_rejected_logps": -281.73443603515625, "debug/sppo_chosen_loss": 2416.05126953125, "debug/sppo_chosen_reward_in_loss": 0.9070972204208374, "debug/sppo_rej_reward_in_loss": -4.321181297302246, "debug/sppo_reject_loss": 2163.96875, "epoch": 2.5, "grad_norm": 87246.9439235192, "learning_rate": 7.201138519924098e-08, "logits/chosen": 1.1817820072174072, "logits/rejected": 1.4627150297164917, "logps/chosen": -249.6227569580078, "logps/rejected": -286.05560302734375, "loss": 4661.9211, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.009070971980690956, "rewards/margins": 0.05228278040885925, "rewards/rejected": -0.043211810290813446, "step": 690 }, { "debug/policy_chosen_logits": 1.1838963031768799, "debug/policy_chosen_logps": -258.46527099609375, "debug/policy_rejected_logits": 1.2735018730163574, "debug/policy_rejected_logps": -269.1402282714844, "debug/reference_chosen_logps": -258.3566589355469, "debug/reference_rejected_logps": -263.8230285644531, "debug/sppo_chosen_loss": 2539.01513671875, "debug/sppo_chosen_reward_in_loss": -0.10861053317785263, "debug/sppo_rej_reward_in_loss": -5.317202568054199, "debug/sppo_reject_loss": 2053.17626953125, "epoch": 2.5181159420289854, "grad_norm": 60542.39159492325, "learning_rate": 7.177419354838709e-08, "logits/chosen": 1.1838963031768799, "logits/rejected": 1.2735018730163574, "logps/chosen": -258.46527099609375, "logps/rejected": -269.1402282714844, "loss": 4669.1945, "rewards/accuracies": 0.75, "rewards/chosen": -0.001086104311980307, "rewards/margins": 0.05208591744303703, "rewards/rejected": -0.053172022104263306, "step": 695 }, { "debug/policy_chosen_logits": 1.3196337223052979, "debug/policy_chosen_logps": -246.27572631835938, "debug/policy_rejected_logits": 1.5910546779632568, "debug/policy_rejected_logps": -282.46319580078125, "debug/reference_chosen_logps": -248.04214477539062, "debug/reference_rejected_logps": -281.76080322265625, "debug/sppo_chosen_loss": 2356.2646484375, "debug/sppo_chosen_reward_in_loss": 1.7664161920547485, "debug/sppo_rej_reward_in_loss": -0.7024328112602234, "debug/sppo_reject_loss": 2468.404052734375, "epoch": 2.536231884057971, "grad_norm": 139780.62136270115, "learning_rate": 7.15370018975332e-08, "logits/chosen": 1.3196337223052979, "logits/rejected": 1.5910546779632568, "logps/chosen": -246.27572631835938, "logps/rejected": -282.46319580078125, "loss": 4625.0359, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.017664160579442978, "rewards/margins": 0.024688487872481346, "rewards/rejected": -0.007024328224360943, "step": 700 }, { "epoch": 2.536231884057971, "eval_debug/policy_chosen_logits": 1.5097755193710327, "eval_debug/policy_chosen_logps": -244.15481567382812, "eval_debug/policy_rejected_logits": 1.5685186386108398, "eval_debug/policy_rejected_logps": -252.68405151367188, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 1714.2816162109375, "eval_debug/sppo_chosen_reward_in_loss": 8.763653755187988, "eval_debug/sppo_rej_reward_in_loss": 6.974597930908203, "eval_debug/sppo_reject_loss": 3259.76611328125, "eval_logits/chosen": 1.5097755193710327, "eval_logits/rejected": 1.5685186386108398, "eval_logps/chosen": -244.15481567382812, "eval_logps/rejected": -252.68405151367188, "eval_loss": 5035.46826171875, "eval_rewards/accuracies": 0.6447368264198303, "eval_rewards/chosen": 0.08763653039932251, "eval_rewards/margins": 0.017890559509396553, "eval_rewards/rejected": 0.0697459876537323, "eval_runtime": 28.5017, "eval_samples_per_second": 21.051, "eval_steps_per_second": 0.667, "step": 700 }, { "debug/policy_chosen_logits": 1.5419610738754272, "debug/policy_chosen_logps": -252.8527374267578, "debug/policy_rejected_logits": 1.7417796850204468, "debug/policy_rejected_logps": -281.376708984375, "debug/reference_chosen_logps": -263.1445007324219, "debug/reference_rejected_logps": -301.7698059082031, "debug/sppo_chosen_loss": 1648.705078125, "debug/sppo_chosen_reward_in_loss": 10.291768074035645, "debug/sppo_rej_reward_in_loss": 20.393077850341797, "debug/sppo_reject_loss": 5207.50634765625, "epoch": 2.5543478260869565, "grad_norm": 90806.208877395, "learning_rate": 7.129981024667931e-08, "logits/chosen": 1.5419610738754272, "logits/rejected": 1.7417796850204468, "logps/chosen": -252.8527374267578, "logps/rejected": -281.376708984375, "loss": 6354.2563, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.10291768610477448, "rewards/margins": -0.10101310163736343, "rewards/rejected": 0.2039307802915573, "step": 705 }, { "debug/policy_chosen_logits": 1.3171876668930054, "debug/policy_chosen_logps": -258.50311279296875, "debug/policy_rejected_logits": 1.5787550210952759, "debug/policy_rejected_logps": -319.204833984375, "debug/reference_chosen_logps": -267.941162109375, "debug/reference_rejected_logps": -326.2596740722656, "debug/sppo_chosen_loss": 1690.886962890625, "debug/sppo_chosen_reward_in_loss": 9.438051223754883, "debug/sppo_rej_reward_in_loss": 7.054859161376953, "debug/sppo_reject_loss": 3349.004638671875, "epoch": 2.572463768115942, "grad_norm": 73444.68247230946, "learning_rate": 7.106261859582542e-08, "logits/chosen": 1.3171876668930054, "logits/rejected": 1.5787550210952759, "logps/chosen": -258.50311279296875, "logps/rejected": -319.204833984375, "loss": 5072.0039, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.09438051283359528, "rewards/margins": 0.02383193001151085, "rewards/rejected": 0.07054858654737473, "step": 710 }, { "debug/policy_chosen_logits": 1.0042096376419067, "debug/policy_chosen_logps": -225.89022827148438, "debug/policy_rejected_logits": 1.2941092252731323, "debug/policy_rejected_logps": -277.9385681152344, "debug/reference_chosen_logps": -229.4971923828125, "debug/reference_rejected_logps": -279.83587646484375, "debug/sppo_chosen_loss": 2170.62451171875, "debug/sppo_chosen_reward_in_loss": 3.606924533843994, "debug/sppo_rej_reward_in_loss": 1.8972936868667603, "debug/sppo_reject_loss": 2726.21484375, "epoch": 2.5905797101449277, "grad_norm": 59159.47154174926, "learning_rate": 7.082542694497154e-08, "logits/chosen": 1.0042096376419067, "logits/rejected": 1.2941092252731323, "logps/chosen": -225.89022827148438, "logps/rejected": -277.9385681152344, "loss": 4791.243, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.036069247871637344, "rewards/margins": 0.01709630712866783, "rewards/rejected": 0.018972937017679214, "step": 715 }, { "debug/policy_chosen_logits": 1.0729498863220215, "debug/policy_chosen_logps": -242.7318878173828, "debug/policy_rejected_logits": 1.2859814167022705, "debug/policy_rejected_logps": -271.3218688964844, "debug/reference_chosen_logps": -246.52197265625, "debug/reference_rejected_logps": -272.2623596191406, "debug/sppo_chosen_loss": 2143.562255859375, "debug/sppo_chosen_reward_in_loss": 3.7900795936584473, "debug/sppo_rej_reward_in_loss": 0.9404786825180054, "debug/sppo_reject_loss": 2618.001953125, "epoch": 2.608695652173913, "grad_norm": 64495.00920762026, "learning_rate": 7.058823529411765e-08, "logits/chosen": 1.0729498863220215, "logits/rejected": 1.2859814167022705, "logps/chosen": -242.7318878173828, "logps/rejected": -271.3218688964844, "loss": 4601.9766, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03790079429745674, "rewards/margins": 0.02849600650370121, "rewards/rejected": 0.009404787793755531, "step": 720 }, { "debug/policy_chosen_logits": 1.2308111190795898, "debug/policy_chosen_logps": -229.8544158935547, "debug/policy_rejected_logits": 1.8873279094696045, "debug/policy_rejected_logps": -297.38922119140625, "debug/reference_chosen_logps": -232.2402801513672, "debug/reference_rejected_logps": -296.17901611328125, "debug/sppo_chosen_loss": 2275.4921875, "debug/sppo_chosen_reward_in_loss": 2.385882616043091, "debug/sppo_rej_reward_in_loss": -1.2101891040802002, "debug/sppo_reject_loss": 2429.00439453125, "epoch": 2.6268115942028984, "grad_norm": 58472.35877652262, "learning_rate": 7.035104364326376e-08, "logits/chosen": 1.2308111190795898, "logits/rejected": 1.8873279094696045, "logps/chosen": -229.8544158935547, "logps/rejected": -297.38922119140625, "loss": 4588.4539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.023858826607465744, "rewards/margins": 0.035960715264081955, "rewards/rejected": -0.01210189051926136, "step": 725 }, { "debug/policy_chosen_logits": 1.099022626876831, "debug/policy_chosen_logps": -243.7010955810547, "debug/policy_rejected_logits": 1.387231469154358, "debug/policy_rejected_logps": -280.25115966796875, "debug/reference_chosen_logps": -244.76611328125, "debug/reference_rejected_logps": -277.59765625, "debug/sppo_chosen_loss": 2416.036376953125, "debug/sppo_chosen_reward_in_loss": 1.0650326013565063, "debug/sppo_rej_reward_in_loss": -2.653526782989502, "debug/sppo_reject_loss": 2296.755615234375, "epoch": 2.644927536231884, "grad_norm": 58439.74489761965, "learning_rate": 7.011385199240986e-08, "logits/chosen": 1.099022626876831, "logits/rejected": 1.387231469154358, "logps/chosen": -243.7010955810547, "logps/rejected": -280.25115966796875, "loss": 4600.8906, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.010650325566530228, "rewards/margins": 0.03718559071421623, "rewards/rejected": -0.026535267010331154, "step": 730 }, { "debug/policy_chosen_logits": 1.208804726600647, "debug/policy_chosen_logps": -261.3055419921875, "debug/policy_rejected_logits": 1.6377735137939453, "debug/policy_rejected_logps": -299.8998107910156, "debug/reference_chosen_logps": -262.77691650390625, "debug/reference_rejected_logps": -295.9397277832031, "debug/sppo_chosen_loss": 2386.124755859375, "debug/sppo_chosen_reward_in_loss": 1.4713882207870483, "debug/sppo_rej_reward_in_loss": -3.9600555896759033, "debug/sppo_reject_loss": 2199.98583984375, "epoch": 2.6630434782608696, "grad_norm": 59982.21606612011, "learning_rate": 6.987666034155597e-08, "logits/chosen": 1.208804726600647, "logits/rejected": 1.6377735137939453, "logps/chosen": -261.3055419921875, "logps/rejected": -299.8998107910156, "loss": 4614.5715, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.014713883399963379, "rewards/margins": 0.05431443452835083, "rewards/rejected": -0.03960055857896805, "step": 735 }, { "debug/policy_chosen_logits": 1.3186663389205933, "debug/policy_chosen_logps": -275.8551940917969, "debug/policy_rejected_logits": 1.1153099536895752, "debug/policy_rejected_logps": -264.0783386230469, "debug/reference_chosen_logps": -274.85089111328125, "debug/reference_rejected_logps": -262.1892395019531, "debug/sppo_chosen_loss": 2635.03076171875, "debug/sppo_chosen_reward_in_loss": -1.0043277740478516, "debug/sppo_rej_reward_in_loss": -1.8891105651855469, "debug/sppo_reject_loss": 2340.14111328125, "epoch": 2.681159420289855, "grad_norm": 71934.30921166443, "learning_rate": 6.963946869070208e-08, "logits/chosen": 1.3186663389205933, "logits/rejected": 1.1153099536895752, "logps/chosen": -275.8551940917969, "logps/rejected": -264.0783386230469, "loss": 4680.6219, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.010043277405202389, "rewards/margins": 0.008847828023135662, "rewards/rejected": -0.0188911072909832, "step": 740 }, { "debug/policy_chosen_logits": 1.268510341644287, "debug/policy_chosen_logps": -250.04617309570312, "debug/policy_rejected_logits": 1.233331322669983, "debug/policy_rejected_logps": -265.0309143066406, "debug/reference_chosen_logps": -249.9891357421875, "debug/reference_rejected_logps": -260.71539306640625, "debug/sppo_chosen_loss": 2529.843994140625, "debug/sppo_chosen_reward_in_loss": -0.0570489875972271, "debug/sppo_rej_reward_in_loss": -4.315499305725098, "debug/sppo_reject_loss": 2142.05224609375, "epoch": 2.699275362318841, "grad_norm": 67016.99391355577, "learning_rate": 6.940227703984819e-08, "logits/chosen": 1.268510341644287, "logits/rejected": 1.233331322669983, "logps/chosen": -250.04617309570312, "logps/rejected": -265.0309143066406, "loss": 4645.777, "rewards/accuracies": 0.75, "rewards/chosen": -0.0005704900249838829, "rewards/margins": 0.042584508657455444, "rewards/rejected": -0.0431549958884716, "step": 745 }, { "debug/policy_chosen_logits": 1.069767951965332, "debug/policy_chosen_logps": -256.4144287109375, "debug/policy_rejected_logits": 1.1567275524139404, "debug/policy_rejected_logps": -262.8798828125, "debug/reference_chosen_logps": -257.0854797363281, "debug/reference_rejected_logps": -259.08953857421875, "debug/sppo_chosen_loss": 2442.606201171875, "debug/sppo_chosen_reward_in_loss": 0.6710414886474609, "debug/sppo_rej_reward_in_loss": -3.7903785705566406, "debug/sppo_reject_loss": 2203.1826171875, "epoch": 2.717391304347826, "grad_norm": 60925.5496150641, "learning_rate": 6.916508538899431e-08, "logits/chosen": 1.069767951965332, "logits/rejected": 1.1567275524139404, "logps/chosen": -256.4144287109375, "logps/rejected": -262.8798828125, "loss": 4602.1672, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0067104147747159, "rewards/margins": 0.04461420327425003, "rewards/rejected": -0.037903785705566406, "step": 750 }, { "debug/policy_chosen_logits": 1.1885673999786377, "debug/policy_chosen_logps": -276.02435302734375, "debug/policy_rejected_logits": 1.308869481086731, "debug/policy_rejected_logps": -287.28033447265625, "debug/reference_chosen_logps": -274.70001220703125, "debug/reference_rejected_logps": -281.486083984375, "debug/sppo_chosen_loss": 2660.95654296875, "debug/sppo_chosen_reward_in_loss": -1.3243558406829834, "debug/sppo_rej_reward_in_loss": -5.794241905212402, "debug/sppo_reject_loss": 2047.456298828125, "epoch": 2.7355072463768115, "grad_norm": 78780.10443130003, "learning_rate": 6.892789373814042e-08, "logits/chosen": 1.1885673999786377, "logits/rejected": 1.308869481086731, "logps/chosen": -276.02435302734375, "logps/rejected": -287.28033447265625, "loss": 4723.5691, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.013243558816611767, "rewards/margins": 0.04469885304570198, "rewards/rejected": -0.057942409068346024, "step": 755 }, { "debug/policy_chosen_logits": 1.2253570556640625, "debug/policy_chosen_logps": -259.22723388671875, "debug/policy_rejected_logits": 1.7106854915618896, "debug/policy_rejected_logps": -303.2245178222656, "debug/reference_chosen_logps": -258.6191711425781, "debug/reference_rejected_logps": -298.0517883300781, "debug/sppo_chosen_loss": 2578.87646484375, "debug/sppo_chosen_reward_in_loss": -0.6080566644668579, "debug/sppo_rej_reward_in_loss": -5.172718048095703, "debug/sppo_reject_loss": 2069.8896484375, "epoch": 2.753623188405797, "grad_norm": 103635.08930027694, "learning_rate": 6.869070208728653e-08, "logits/chosen": 1.2253570556640625, "logits/rejected": 1.7106854915618896, "logps/chosen": -259.22723388671875, "logps/rejected": -303.2245178222656, "loss": 4709.0586, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.006080566439777613, "rewards/margins": 0.04564661532640457, "rewards/rejected": -0.051727183163166046, "step": 760 }, { "debug/policy_chosen_logits": 1.3395227193832397, "debug/policy_chosen_logps": -263.3853454589844, "debug/policy_rejected_logits": 1.437042236328125, "debug/policy_rejected_logps": -281.8288879394531, "debug/reference_chosen_logps": -263.6646728515625, "debug/reference_rejected_logps": -276.64959716796875, "debug/sppo_chosen_loss": 2497.1796875, "debug/sppo_chosen_reward_in_loss": 0.2793167233467102, "debug/sppo_rej_reward_in_loss": -5.179283142089844, "debug/sppo_reject_loss": 2071.89599609375, "epoch": 2.7717391304347827, "grad_norm": 66122.53017757529, "learning_rate": 6.845351043643264e-08, "logits/chosen": 1.3395227193832397, "logits/rejected": 1.437042236328125, "logps/chosen": -263.3853454589844, "logps/rejected": -281.8288879394531, "loss": 4638.6133, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.00279316701926291, "rewards/margins": 0.05458599328994751, "rewards/rejected": -0.051792822778224945, "step": 765 }, { "debug/policy_chosen_logits": 1.4400737285614014, "debug/policy_chosen_logps": -246.9227294921875, "debug/policy_rejected_logits": 1.9569038152694702, "debug/policy_rejected_logps": -292.9217834472656, "debug/reference_chosen_logps": -248.1265869140625, "debug/reference_rejected_logps": -289.3108825683594, "debug/sppo_chosen_loss": 2389.163330078125, "debug/sppo_chosen_reward_in_loss": 1.2038536071777344, "debug/sppo_rej_reward_in_loss": -3.6108856201171875, "debug/sppo_reject_loss": 2188.544677734375, "epoch": 2.789855072463768, "grad_norm": 72185.04677464087, "learning_rate": 6.821631878557874e-08, "logits/chosen": 1.4400737285614014, "logits/rejected": 1.9569038152694702, "logps/chosen": -246.9227294921875, "logps/rejected": -292.9217834472656, "loss": 4643.4516, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.012038536369800568, "rewards/margins": 0.04814739152789116, "rewards/rejected": -0.03610885888338089, "step": 770 }, { "debug/policy_chosen_logits": 1.5654582977294922, "debug/policy_chosen_logps": -254.14987182617188, "debug/policy_rejected_logits": 1.8447706699371338, "debug/policy_rejected_logps": -306.61767578125, "debug/reference_chosen_logps": -254.22415161132812, "debug/reference_rejected_logps": -302.6522216796875, "debug/sppo_chosen_loss": 2511.81787109375, "debug/sppo_chosen_reward_in_loss": 0.0742717757821083, "debug/sppo_rej_reward_in_loss": -3.9654669761657715, "debug/sppo_reject_loss": 2203.16455078125, "epoch": 2.807971014492754, "grad_norm": 95658.0377960358, "learning_rate": 6.797912713472485e-08, "logits/chosen": 1.5654582977294922, "logits/rejected": 1.8447706699371338, "logps/chosen": -254.14987182617188, "logps/rejected": -306.61767578125, "loss": 4738.466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0007427178206853569, "rewards/margins": 0.04039738327264786, "rewards/rejected": -0.03965466842055321, "step": 775 }, { "debug/policy_chosen_logits": 1.1229254007339478, "debug/policy_chosen_logps": -243.0690460205078, "debug/policy_rejected_logits": 1.5946966409683228, "debug/policy_rejected_logps": -303.3990173339844, "debug/reference_chosen_logps": -242.7500762939453, "debug/reference_rejected_logps": -299.9582824707031, "debug/sppo_chosen_loss": 2574.80322265625, "debug/sppo_chosen_reward_in_loss": -0.3189578950405121, "debug/sppo_rej_reward_in_loss": -3.4407315254211426, "debug/sppo_reject_loss": 2246.947265625, "epoch": 2.8260869565217392, "grad_norm": 66229.98502880141, "learning_rate": 6.774193548387096e-08, "logits/chosen": 1.1229254007339478, "logits/rejected": 1.5946966409683228, "logps/chosen": -243.0690460205078, "logps/rejected": -303.3990173339844, "loss": 4678.091, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0031895791180431843, "rewards/margins": 0.031217733398079872, "rewards/rejected": -0.03440731018781662, "step": 780 }, { "debug/policy_chosen_logits": 1.1265347003936768, "debug/policy_chosen_logps": -248.6986846923828, "debug/policy_rejected_logits": 1.448249101638794, "debug/policy_rejected_logps": -278.3297119140625, "debug/reference_chosen_logps": -248.8385467529297, "debug/reference_rejected_logps": -271.8460693359375, "debug/sppo_chosen_loss": 2503.415771484375, "debug/sppo_chosen_reward_in_loss": 0.1398647278547287, "debug/sppo_rej_reward_in_loss": -6.483637809753418, "debug/sppo_reject_loss": 1989.5052490234375, "epoch": 2.8442028985507246, "grad_norm": 66422.49294663477, "learning_rate": 6.750474383301707e-08, "logits/chosen": 1.1265347003936768, "logits/rejected": 1.448249101638794, "logps/chosen": -248.6986846923828, "logps/rejected": -278.3297119140625, "loss": 4593.5887, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0013986475532874465, "rewards/margins": 0.0662350282073021, "rewards/rejected": -0.06483638286590576, "step": 785 }, { "debug/policy_chosen_logits": 1.3222026824951172, "debug/policy_chosen_logps": -246.96371459960938, "debug/policy_rejected_logits": 1.710313081741333, "debug/policy_rejected_logps": -302.73291015625, "debug/reference_chosen_logps": -246.872802734375, "debug/reference_rejected_logps": -299.05181884765625, "debug/sppo_chosen_loss": 2530.375, "debug/sppo_chosen_reward_in_loss": -0.09093017876148224, "debug/sppo_rej_reward_in_loss": -3.6811130046844482, "debug/sppo_reject_loss": 2187.28759765625, "epoch": 2.86231884057971, "grad_norm": 85878.27978072269, "learning_rate": 6.726755218216319e-08, "logits/chosen": 1.3222026824951172, "logits/rejected": 1.710313081741333, "logps/chosen": -246.96371459960938, "logps/rejected": -302.73291015625, "loss": 4641.7324, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0009093020344153047, "rewards/margins": 0.03590182960033417, "rewards/rejected": -0.03681113198399544, "step": 790 }, { "debug/policy_chosen_logits": 1.1963779926300049, "debug/policy_chosen_logps": -256.23699951171875, "debug/policy_rejected_logits": 1.4473451375961304, "debug/policy_rejected_logps": -293.70111083984375, "debug/reference_chosen_logps": -256.1133117675781, "debug/reference_rejected_logps": -289.4981994628906, "debug/sppo_chosen_loss": 2545.09521484375, "debug/sppo_chosen_reward_in_loss": -0.12368450313806534, "debug/sppo_rej_reward_in_loss": -4.202942848205566, "debug/sppo_reject_loss": 2168.801513671875, "epoch": 2.880434782608696, "grad_norm": 69769.03070479074, "learning_rate": 6.70303605313093e-08, "logits/chosen": 1.1963779926300049, "logits/rejected": 1.4473451375961304, "logps/chosen": -256.23699951171875, "logps/rejected": -293.70111083984375, "loss": 4601.3957, "rewards/accuracies": 0.625, "rewards/chosen": -0.0012368441093713045, "rewards/margins": 0.04079258441925049, "rewards/rejected": -0.04202943295240402, "step": 795 }, { "debug/policy_chosen_logits": 1.2974306344985962, "debug/policy_chosen_logps": -276.055908203125, "debug/policy_rejected_logits": 1.4969009160995483, "debug/policy_rejected_logps": -283.9123229980469, "debug/reference_chosen_logps": -275.1273193359375, "debug/reference_rejected_logps": -279.24957275390625, "debug/sppo_chosen_loss": 2638.364501953125, "debug/sppo_chosen_reward_in_loss": -0.9286226034164429, "debug/sppo_rej_reward_in_loss": -4.662759304046631, "debug/sppo_reject_loss": 2127.05419921875, "epoch": 2.898550724637681, "grad_norm": 72145.15954034042, "learning_rate": 6.679316888045541e-08, "logits/chosen": 1.2974306344985962, "logits/rejected": 1.4969009160995483, "logps/chosen": -276.055908203125, "logps/rejected": -283.9123229980469, "loss": 4637.3375, "rewards/accuracies": 0.625, "rewards/chosen": -0.009286226704716682, "rewards/margins": 0.03734136372804642, "rewards/rejected": -0.04662759602069855, "step": 800 }, { "epoch": 2.898550724637681, "eval_debug/policy_chosen_logits": 1.4772729873657227, "eval_debug/policy_chosen_logps": -253.23114013671875, "eval_debug/policy_rejected_logits": 1.5294151306152344, "eval_debug/policy_rejected_logps": -262.8460693359375, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2569.70458984375, "eval_debug/sppo_chosen_reward_in_loss": -0.31267160177230835, "eval_debug/sppo_rej_reward_in_loss": -3.187434673309326, "eval_debug/sppo_reject_loss": 2272.2060546875, "eval_logits/chosen": 1.4772729873657227, "eval_logits/rejected": 1.5294151306152344, "eval_logps/chosen": -253.23114013671875, "eval_logps/rejected": -262.8460693359375, "eval_loss": 4705.77490234375, "eval_rewards/accuracies": 0.5921052694320679, "eval_rewards/chosen": -0.0031267155427485704, "eval_rewards/margins": 0.02874763309955597, "eval_rewards/rejected": -0.03187434747815132, "eval_runtime": 28.4615, "eval_samples_per_second": 21.081, "eval_steps_per_second": 0.668, "step": 800 }, { "debug/policy_chosen_logits": 1.228362798690796, "debug/policy_chosen_logps": -252.15921020507812, "debug/policy_rejected_logits": 1.6530849933624268, "debug/policy_rejected_logps": -314.65679931640625, "debug/reference_chosen_logps": -253.0550079345703, "debug/reference_rejected_logps": -309.57537841796875, "debug/sppo_chosen_loss": 2421.93115234375, "debug/sppo_chosen_reward_in_loss": 0.8957826495170593, "debug/sppo_rej_reward_in_loss": -5.08138370513916, "debug/sppo_reject_loss": 2105.0166015625, "epoch": 2.9166666666666665, "grad_norm": 64770.41448004987, "learning_rate": 6.655597722960152e-08, "logits/chosen": 1.228362798690796, "logits/rejected": 1.6530849933624268, "logps/chosen": -252.15921020507812, "logps/rejected": -314.65679931640625, "loss": 4580.6055, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.008957825601100922, "rewards/margins": 0.059771664440631866, "rewards/rejected": -0.050813835114240646, "step": 805 }, { "debug/policy_chosen_logits": 1.2191035747528076, "debug/policy_chosen_logps": -248.7011260986328, "debug/policy_rejected_logits": 1.508216142654419, "debug/policy_rejected_logps": -316.2073669433594, "debug/reference_chosen_logps": -249.0221710205078, "debug/reference_rejected_logps": -309.80548095703125, "debug/sppo_chosen_loss": 2490.04443359375, "debug/sppo_chosen_reward_in_loss": 0.32104605436325073, "debug/sppo_rej_reward_in_loss": -6.40188455581665, "debug/sppo_reject_loss": 1978.004150390625, "epoch": 2.9347826086956523, "grad_norm": 66686.72099206004, "learning_rate": 6.631878557874762e-08, "logits/chosen": 1.2191035747528076, "logits/rejected": 1.508216142654419, "logps/chosen": -248.7011260986328, "logps/rejected": -316.2073669433594, "loss": 4663.8813, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.003210460301488638, "rewards/margins": 0.06722930818796158, "rewards/rejected": -0.0640188455581665, "step": 810 }, { "debug/policy_chosen_logits": 1.0232962369918823, "debug/policy_chosen_logps": -238.018798828125, "debug/policy_rejected_logits": 1.4117815494537354, "debug/policy_rejected_logps": -298.9278869628906, "debug/reference_chosen_logps": -238.4188232421875, "debug/reference_rejected_logps": -296.5367126464844, "debug/sppo_chosen_loss": 2493.54248046875, "debug/sppo_chosen_reward_in_loss": 0.40000516176223755, "debug/sppo_rej_reward_in_loss": -2.391197443008423, "debug/sppo_reject_loss": 2306.767578125, "epoch": 2.9528985507246377, "grad_norm": 65744.92400996923, "learning_rate": 6.608159392789373e-08, "logits/chosen": 1.0232962369918823, "logits/rejected": 1.4117815494537354, "logps/chosen": -238.018798828125, "logps/rejected": -298.9278869628906, "loss": 4652.3043, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.004000051412731409, "rewards/margins": 0.027912026271224022, "rewards/rejected": -0.02391197346150875, "step": 815 }, { "debug/policy_chosen_logits": 1.345824122428894, "debug/policy_chosen_logps": -250.0766143798828, "debug/policy_rejected_logits": 1.9338808059692383, "debug/policy_rejected_logps": -306.48687744140625, "debug/reference_chosen_logps": -251.37051391601562, "debug/reference_rejected_logps": -302.274658203125, "debug/sppo_chosen_loss": 2390.514404296875, "debug/sppo_chosen_reward_in_loss": 1.2939170598983765, "debug/sppo_rej_reward_in_loss": -4.212249755859375, "debug/sppo_reject_loss": 2170.71875, "epoch": 2.971014492753623, "grad_norm": 59398.92640310388, "learning_rate": 6.584440227703984e-08, "logits/chosen": 1.345824122428894, "logits/rejected": 1.9338808059692383, "logps/chosen": -250.0766143798828, "logps/rejected": -306.48687744140625, "loss": 4674.1766, "rewards/accuracies": 0.75, "rewards/chosen": 0.012939170002937317, "rewards/margins": 0.05506166070699692, "rewards/rejected": -0.0421224907040596, "step": 820 }, { "debug/policy_chosen_logits": 1.4544947147369385, "debug/policy_chosen_logps": -283.70843505859375, "debug/policy_rejected_logits": 1.7392966747283936, "debug/policy_rejected_logps": -301.4991149902344, "debug/reference_chosen_logps": -284.3904724121094, "debug/reference_rejected_logps": -295.550537109375, "debug/sppo_chosen_loss": 2450.37646484375, "debug/sppo_chosen_reward_in_loss": 0.682055652141571, "debug/sppo_rej_reward_in_loss": -5.948569297790527, "debug/sppo_reject_loss": 2043.864990234375, "epoch": 2.9891304347826084, "grad_norm": 68042.77538223789, "learning_rate": 6.560721062618596e-08, "logits/chosen": 1.4544947147369385, "logits/rejected": 1.7392966747283936, "logps/chosen": -283.70843505859375, "logps/rejected": -301.4991149902344, "loss": 4587.5, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.006820555776357651, "rewards/margins": 0.06630624830722809, "rewards/rejected": -0.05948569253087044, "step": 825 }, { "debug/policy_chosen_logits": 1.2692054510116577, "debug/policy_chosen_logps": -236.6494140625, "debug/policy_rejected_logits": 1.7388668060302734, "debug/policy_rejected_logps": -305.81744384765625, "debug/reference_chosen_logps": -238.1107177734375, "debug/reference_rejected_logps": -302.297607421875, "debug/sppo_chosen_loss": 2373.314453125, "debug/sppo_chosen_reward_in_loss": 1.4612903594970703, "debug/sppo_rej_reward_in_loss": -3.519826889038086, "debug/sppo_reject_loss": 2232.467529296875, "epoch": 3.0072463768115942, "grad_norm": 66495.56432506883, "learning_rate": 6.537001897533207e-08, "logits/chosen": 1.2692054510116577, "logits/rejected": 1.7388668060302734, "logps/chosen": -236.6494140625, "logps/rejected": -305.81744384765625, "loss": 4645.25, "rewards/accuracies": 0.75, "rewards/chosen": 0.014612903818488121, "rewards/margins": 0.049811169505119324, "rewards/rejected": -0.03519827127456665, "step": 830 }, { "debug/policy_chosen_logits": 1.3425302505493164, "debug/policy_chosen_logps": -260.2035217285156, "debug/policy_rejected_logits": 1.5310770273208618, "debug/policy_rejected_logps": -283.30157470703125, "debug/reference_chosen_logps": -261.3477478027344, "debug/reference_rejected_logps": -279.7190856933594, "debug/sppo_chosen_loss": 2409.96533203125, "debug/sppo_chosen_reward_in_loss": 1.1442630290985107, "debug/sppo_rej_reward_in_loss": -3.5824685096740723, "debug/sppo_reject_loss": 2221.889892578125, "epoch": 3.0253623188405796, "grad_norm": 72170.18331579148, "learning_rate": 6.513282732447818e-08, "logits/chosen": 1.3425302505493164, "logits/rejected": 1.5310770273208618, "logps/chosen": -260.2035217285156, "logps/rejected": -283.30157470703125, "loss": 4526.1039, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.011442631483078003, "rewards/margins": 0.04726731404662132, "rewards/rejected": -0.03582468628883362, "step": 835 }, { "debug/policy_chosen_logits": 1.5465151071548462, "debug/policy_chosen_logps": -274.3478088378906, "debug/policy_rejected_logits": 1.3263134956359863, "debug/policy_rejected_logps": -277.4148864746094, "debug/reference_chosen_logps": -274.346923828125, "debug/reference_rejected_logps": -274.3509216308594, "debug/sppo_chosen_loss": 2550.4365234375, "debug/sppo_chosen_reward_in_loss": -0.0008403778192587197, "debug/sppo_rej_reward_in_loss": -3.063944101333618, "debug/sppo_reject_loss": 2266.9111328125, "epoch": 3.0434782608695654, "grad_norm": 57444.648797434675, "learning_rate": 6.489563567362429e-08, "logits/chosen": 1.5465151071548462, "logits/rejected": 1.3263134956359863, "logps/chosen": -274.3478088378906, "logps/rejected": -277.4148864746094, "loss": 4603.0047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -8.404813343076967e-06, "rewards/margins": 0.030631035566329956, "rewards/rejected": -0.030639439821243286, "step": 840 }, { "debug/policy_chosen_logits": 1.380659818649292, "debug/policy_chosen_logps": -257.90167236328125, "debug/policy_rejected_logits": 1.518367052078247, "debug/policy_rejected_logps": -271.87799072265625, "debug/reference_chosen_logps": -258.35516357421875, "debug/reference_rejected_logps": -267.05230712890625, "debug/sppo_chosen_loss": 2481.970703125, "debug/sppo_chosen_reward_in_loss": 0.4534896910190582, "debug/sppo_rej_reward_in_loss": -4.825692653656006, "debug/sppo_reject_loss": 2137.390869140625, "epoch": 3.0615942028985508, "grad_norm": 100902.20830641112, "learning_rate": 6.46584440227704e-08, "logits/chosen": 1.380659818649292, "logits/rejected": 1.518367052078247, "logps/chosen": -257.90167236328125, "logps/rejected": -271.87799072265625, "loss": 4605.6617, "rewards/accuracies": 0.75, "rewards/chosen": 0.004534896928817034, "rewards/margins": 0.05279182642698288, "rewards/rejected": -0.048256926238536835, "step": 845 }, { "debug/policy_chosen_logits": 1.1443955898284912, "debug/policy_chosen_logps": -261.2698974609375, "debug/policy_rejected_logits": 1.5599932670593262, "debug/policy_rejected_logps": -289.61553955078125, "debug/reference_chosen_logps": -262.4872131347656, "debug/reference_rejected_logps": -282.98260498046875, "debug/sppo_chosen_loss": 2392.6455078125, "debug/sppo_chosen_reward_in_loss": 1.217309594154358, "debug/sppo_rej_reward_in_loss": -6.6329240798950195, "debug/sppo_reject_loss": 1987.162109375, "epoch": 3.079710144927536, "grad_norm": 72978.23793449264, "learning_rate": 6.44212523719165e-08, "logits/chosen": 1.1443955898284912, "logits/rejected": 1.5599932670593262, "logps/chosen": -261.2698974609375, "logps/rejected": -289.61553955078125, "loss": 4566.157, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.012173095718026161, "rewards/margins": 0.07850233465433121, "rewards/rejected": -0.0663292407989502, "step": 850 }, { "debug/policy_chosen_logits": 1.4892685413360596, "debug/policy_chosen_logps": -274.5538024902344, "debug/policy_rejected_logits": 1.5790163278579712, "debug/policy_rejected_logps": -318.97137451171875, "debug/reference_chosen_logps": -275.04705810546875, "debug/reference_rejected_logps": -311.6878356933594, "debug/sppo_chosen_loss": 2468.92724609375, "debug/sppo_chosen_reward_in_loss": 0.49327564239501953, "debug/sppo_rej_reward_in_loss": -7.283539772033691, "debug/sppo_reject_loss": 1930.694091796875, "epoch": 3.097826086956522, "grad_norm": 65157.7595652659, "learning_rate": 6.418406072106261e-08, "logits/chosen": 1.4892685413360596, "logits/rejected": 1.5790163278579712, "logps/chosen": -274.5538024902344, "logps/rejected": -318.97137451171875, "loss": 4617.0094, "rewards/accuracies": 0.75, "rewards/chosen": 0.004932756070047617, "rewards/margins": 0.07776814699172974, "rewards/rejected": -0.07283538579940796, "step": 855 }, { "debug/policy_chosen_logits": 1.2112557888031006, "debug/policy_chosen_logps": -259.48828125, "debug/policy_rejected_logits": 1.2626090049743652, "debug/policy_rejected_logps": -274.3577575683594, "debug/reference_chosen_logps": -260.32659912109375, "debug/reference_rejected_logps": -270.2201232910156, "debug/sppo_chosen_loss": 2433.81494140625, "debug/sppo_chosen_reward_in_loss": 0.8383318185806274, "debug/sppo_rej_reward_in_loss": -4.13765811920166, "debug/sppo_reject_loss": 2163.748046875, "epoch": 3.1159420289855073, "grad_norm": 62566.59623367877, "learning_rate": 6.394686907020873e-08, "logits/chosen": 1.2112557888031006, "logits/rejected": 1.2626090049743652, "logps/chosen": -259.48828125, "logps/rejected": -274.3577575683594, "loss": 4734.8844, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.008383318781852722, "rewards/margins": 0.049759894609451294, "rewards/rejected": -0.04137657582759857, "step": 860 }, { "debug/policy_chosen_logits": 1.3012179136276245, "debug/policy_chosen_logps": -259.6365661621094, "debug/policy_rejected_logits": 1.6415010690689087, "debug/policy_rejected_logps": -277.46539306640625, "debug/reference_chosen_logps": -260.6236877441406, "debug/reference_rejected_logps": -272.70477294921875, "debug/sppo_chosen_loss": 2429.795654296875, "debug/sppo_chosen_reward_in_loss": 0.987095057964325, "debug/sppo_rej_reward_in_loss": -4.760601997375488, "debug/sppo_reject_loss": 2135.42724609375, "epoch": 3.1340579710144927, "grad_norm": 80271.39542940201, "learning_rate": 6.370967741935484e-08, "logits/chosen": 1.3012179136276245, "logits/rejected": 1.6415010690689087, "logps/chosen": -259.6365661621094, "logps/rejected": -277.46539306640625, "loss": 4581.9988, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.009870951063930988, "rewards/margins": 0.05747697502374649, "rewards/rejected": -0.04760601744055748, "step": 865 }, { "debug/policy_chosen_logits": 1.5044926404953003, "debug/policy_chosen_logps": -271.87603759765625, "debug/policy_rejected_logits": 1.7668263912200928, "debug/policy_rejected_logps": -323.19561767578125, "debug/reference_chosen_logps": -273.129150390625, "debug/reference_rejected_logps": -319.109130859375, "debug/sppo_chosen_loss": 2388.428466796875, "debug/sppo_chosen_reward_in_loss": 1.2531499862670898, "debug/sppo_rej_reward_in_loss": -4.086410999298096, "debug/sppo_reject_loss": 2190.958740234375, "epoch": 3.1521739130434785, "grad_norm": 81132.97840621906, "learning_rate": 6.347248576850095e-08, "logits/chosen": 1.5044926404953003, "logits/rejected": 1.7668263912200928, "logps/chosen": -271.87603759765625, "logps/rejected": -323.19561767578125, "loss": 4593.793, "rewards/accuracies": 0.75, "rewards/chosen": 0.01253149937838316, "rewards/margins": 0.05339560657739639, "rewards/rejected": -0.04086410999298096, "step": 870 }, { "debug/policy_chosen_logits": 1.446988582611084, "debug/policy_chosen_logps": -273.71533203125, "debug/policy_rejected_logits": 1.4582245349884033, "debug/policy_rejected_logps": -269.6034240722656, "debug/reference_chosen_logps": -273.6427307128906, "debug/reference_rejected_logps": -265.6277160644531, "debug/sppo_chosen_loss": 2549.330810546875, "debug/sppo_chosen_reward_in_loss": -0.07260704040527344, "debug/sppo_rej_reward_in_loss": -3.9757189750671387, "debug/sppo_reject_loss": 2193.4951171875, "epoch": 3.170289855072464, "grad_norm": 65289.67735636841, "learning_rate": 6.323529411764706e-08, "logits/chosen": 1.446988582611084, "logits/rejected": 1.4582245349884033, "logps/chosen": -273.71533203125, "logps/rejected": -269.6034240722656, "loss": 4675.0609, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0007260694983415306, "rewards/margins": 0.03903111815452576, "rewards/rejected": -0.03975719213485718, "step": 875 }, { "debug/policy_chosen_logits": 1.326153039932251, "debug/policy_chosen_logps": -256.2907409667969, "debug/policy_rejected_logits": 1.6910384893417358, "debug/policy_rejected_logps": -342.2923889160156, "debug/reference_chosen_logps": -257.4051818847656, "debug/reference_rejected_logps": -340.0152587890625, "debug/sppo_chosen_loss": 2412.895751953125, "debug/sppo_chosen_reward_in_loss": 1.1144214868545532, "debug/sppo_rej_reward_in_loss": -2.2771120071411133, "debug/sppo_reject_loss": 2338.74609375, "epoch": 3.1884057971014492, "grad_norm": 104005.5690203319, "learning_rate": 6.299810246679317e-08, "logits/chosen": 1.326153039932251, "logits/rejected": 1.6910384893417358, "logps/chosen": -256.2907409667969, "logps/rejected": -342.2923889160156, "loss": 4502.0715, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.011144215241074562, "rewards/margins": 0.033915333449840546, "rewards/rejected": -0.022771120071411133, "step": 880 }, { "debug/policy_chosen_logits": 0.9587851762771606, "debug/policy_chosen_logps": -233.2637939453125, "debug/policy_rejected_logits": 1.2930221557617188, "debug/policy_rejected_logps": -285.134033203125, "debug/reference_chosen_logps": -232.969482421875, "debug/reference_rejected_logps": -279.6332702636719, "debug/sppo_chosen_loss": 2582.177001953125, "debug/sppo_chosen_reward_in_loss": -0.2943090498447418, "debug/sppo_rej_reward_in_loss": -5.500763416290283, "debug/sppo_reject_loss": 2091.379638671875, "epoch": 3.2065217391304346, "grad_norm": 117067.06036503153, "learning_rate": 6.276091081593927e-08, "logits/chosen": 0.9587851762771606, "logits/rejected": 1.2930221557617188, "logps/chosen": -233.2637939453125, "logps/rejected": -285.134033203125, "loss": 4743.2703, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.002943091094493866, "rewards/margins": 0.05206454545259476, "rewards/rejected": -0.05500763654708862, "step": 885 }, { "debug/policy_chosen_logits": 1.2469490766525269, "debug/policy_chosen_logps": -257.7119140625, "debug/policy_rejected_logits": 1.4921382665634155, "debug/policy_rejected_logps": -290.43084716796875, "debug/reference_chosen_logps": -256.938720703125, "debug/reference_rejected_logps": -284.2198486328125, "debug/sppo_chosen_loss": 2627.241455078125, "debug/sppo_chosen_reward_in_loss": -0.7731903195381165, "debug/sppo_rej_reward_in_loss": -6.2110276222229, "debug/sppo_reject_loss": 2015.809326171875, "epoch": 3.2246376811594204, "grad_norm": 80148.5004187906, "learning_rate": 6.252371916508538e-08, "logits/chosen": 1.2469490766525269, "logits/rejected": 1.4921382665634155, "logps/chosen": -257.7119140625, "logps/rejected": -290.43084716796875, "loss": 4545.6891, "rewards/accuracies": 0.625, "rewards/chosen": -0.007731902413070202, "rewards/margins": 0.05437837168574333, "rewards/rejected": -0.06211026757955551, "step": 890 }, { "debug/policy_chosen_logits": 1.369990348815918, "debug/policy_chosen_logps": -245.5441436767578, "debug/policy_rejected_logits": 1.5685436725616455, "debug/policy_rejected_logps": -262.864013671875, "debug/reference_chosen_logps": -245.36166381835938, "debug/reference_rejected_logps": -260.2655334472656, "debug/sppo_chosen_loss": 2548.572998046875, "debug/sppo_chosen_reward_in_loss": -0.18247947096824646, "debug/sppo_rej_reward_in_loss": -2.5984790325164795, "debug/sppo_reject_loss": 2304.638671875, "epoch": 3.2427536231884058, "grad_norm": 67597.45237524057, "learning_rate": 6.22865275142315e-08, "logits/chosen": 1.369990348815918, "logits/rejected": 1.5685436725616455, "logps/chosen": -245.5441436767578, "logps/rejected": -262.864013671875, "loss": 4743.8031, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0018247950356453657, "rewards/margins": 0.0241599939763546, "rewards/rejected": -0.025984788313508034, "step": 895 }, { "debug/policy_chosen_logits": 1.0764143466949463, "debug/policy_chosen_logps": -241.42822265625, "debug/policy_rejected_logits": 1.5565874576568604, "debug/policy_rejected_logps": -296.87884521484375, "debug/reference_chosen_logps": -242.0436553955078, "debug/reference_rejected_logps": -291.9529724121094, "debug/sppo_chosen_loss": 2461.28271484375, "debug/sppo_chosen_reward_in_loss": 0.6154451370239258, "debug/sppo_rej_reward_in_loss": -4.925865650177002, "debug/sppo_reject_loss": 2125.330078125, "epoch": 3.260869565217391, "grad_norm": 66577.15409151933, "learning_rate": 6.20493358633776e-08, "logits/chosen": 1.0764143466949463, "logits/rejected": 1.5565874576568604, "logps/chosen": -241.42822265625, "logps/rejected": -296.87884521484375, "loss": 4550.082, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.00615445151925087, "rewards/margins": 0.055413104593753815, "rewards/rejected": -0.04925865679979324, "step": 900 }, { "epoch": 3.260869565217391, "eval_debug/policy_chosen_logits": 1.4652026891708374, "eval_debug/policy_chosen_logps": -252.9286651611328, "eval_debug/policy_rejected_logits": 1.5159595012664795, "eval_debug/policy_rejected_logps": -262.8345031738281, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2544.358642578125, "eval_debug/sppo_chosen_reward_in_loss": -0.010185342282056808, "eval_debug/sppo_rej_reward_in_loss": -3.1758759021759033, "eval_debug/sppo_reject_loss": 2288.004150390625, "eval_logits/chosen": 1.4652026891708374, "eval_logits/rejected": 1.5159595012664795, "eval_logps/chosen": -252.9286651611328, "eval_logps/rejected": -262.8345031738281, "eval_loss": 4687.2900390625, "eval_rewards/accuracies": 0.5921052694320679, "eval_rewards/chosen": -0.00010185375140281394, "eval_rewards/margins": 0.0316569060087204, "eval_rewards/rejected": -0.031758759170770645, "eval_runtime": 28.4108, "eval_samples_per_second": 21.119, "eval_steps_per_second": 0.669, "step": 900 }, { "debug/policy_chosen_logits": 1.3335583209991455, "debug/policy_chosen_logps": -273.39251708984375, "debug/policy_rejected_logits": 1.6337007284164429, "debug/policy_rejected_logps": -296.7558898925781, "debug/reference_chosen_logps": -273.14617919921875, "debug/reference_rejected_logps": -291.0046691894531, "debug/sppo_chosen_loss": 2591.5634765625, "debug/sppo_chosen_reward_in_loss": -0.24632683396339417, "debug/sppo_rej_reward_in_loss": -5.751223564147949, "debug/sppo_reject_loss": 2061.815185546875, "epoch": 3.278985507246377, "grad_norm": 75136.66018716336, "learning_rate": 6.181214421252372e-08, "logits/chosen": 1.3335583209991455, "logits/rejected": 1.6337007284164429, "logps/chosen": -273.39251708984375, "logps/rejected": -296.7558898925781, "loss": 4663.7465, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0024632681161165237, "rewards/margins": 0.05504896491765976, "rewards/rejected": -0.057512231171131134, "step": 905 }, { "debug/policy_chosen_logits": 1.202430009841919, "debug/policy_chosen_logps": -247.93728637695312, "debug/policy_rejected_logits": 1.3058010339736938, "debug/policy_rejected_logps": -280.98590087890625, "debug/reference_chosen_logps": -248.80142211914062, "debug/reference_rejected_logps": -276.1498107910156, "debug/sppo_chosen_loss": 2460.62060546875, "debug/sppo_chosen_reward_in_loss": 0.864148736000061, "debug/sppo_rej_reward_in_loss": -4.836081504821777, "debug/sppo_reject_loss": 2126.707763671875, "epoch": 3.2971014492753623, "grad_norm": 82636.85976888405, "learning_rate": 6.157495256166983e-08, "logits/chosen": 1.202430009841919, "logits/rejected": 1.3058010339736938, "logps/chosen": -247.93728637695312, "logps/rejected": -280.98590087890625, "loss": 4722.6609, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.00864148698747158, "rewards/margins": 0.05700229853391647, "rewards/rejected": -0.04836081340909004, "step": 910 }, { "debug/policy_chosen_logits": 1.6011492013931274, "debug/policy_chosen_logps": -265.23236083984375, "debug/policy_rejected_logits": 1.703529953956604, "debug/policy_rejected_logps": -276.78863525390625, "debug/reference_chosen_logps": -264.2405700683594, "debug/reference_rejected_logps": -274.06915283203125, "debug/sppo_chosen_loss": 2660.63916015625, "debug/sppo_chosen_reward_in_loss": -0.991807758808136, "debug/sppo_rej_reward_in_loss": -2.719452381134033, "debug/sppo_reject_loss": 2290.874267578125, "epoch": 3.3152173913043477, "grad_norm": 73761.28975105118, "learning_rate": 6.133776091081594e-08, "logits/chosen": 1.6011492013931274, "logits/rejected": 1.703529953956604, "logps/chosen": -265.23236083984375, "logps/rejected": -276.78863525390625, "loss": 4663.3953, "rewards/accuracies": 0.625, "rewards/chosen": -0.009918076917529106, "rewards/margins": 0.017276445403695107, "rewards/rejected": -0.027194524183869362, "step": 915 }, { "debug/policy_chosen_logits": 1.2337417602539062, "debug/policy_chosen_logps": -264.36761474609375, "debug/policy_rejected_logits": 1.2337805032730103, "debug/policy_rejected_logps": -267.5479431152344, "debug/reference_chosen_logps": -265.7757263183594, "debug/reference_rejected_logps": -265.7680358886719, "debug/sppo_chosen_loss": 2376.919921875, "debug/sppo_chosen_reward_in_loss": 1.4081287384033203, "debug/sppo_rej_reward_in_loss": -1.7799211740493774, "debug/sppo_reject_loss": 2362.98681640625, "epoch": 3.3333333333333335, "grad_norm": 59254.45940454811, "learning_rate": 6.110056925996205e-08, "logits/chosen": 1.2337417602539062, "logits/rejected": 1.2337805032730103, "logps/chosen": -264.36761474609375, "logps/rejected": -267.5479431152344, "loss": 4644.1445, "rewards/accuracies": 0.625, "rewards/chosen": 0.014081287197768688, "rewards/margins": 0.03188049793243408, "rewards/rejected": -0.01779920980334282, "step": 920 }, { "debug/policy_chosen_logits": 1.177049994468689, "debug/policy_chosen_logps": -278.21624755859375, "debug/policy_rejected_logits": 1.2722675800323486, "debug/policy_rejected_logps": -310.18560791015625, "debug/reference_chosen_logps": -278.3129577636719, "debug/reference_rejected_logps": -303.284423828125, "debug/sppo_chosen_loss": 2534.33203125, "debug/sppo_chosen_reward_in_loss": 0.09672851860523224, "debug/sppo_rej_reward_in_loss": -6.901175498962402, "debug/sppo_reject_loss": 2022.893310546875, "epoch": 3.351449275362319, "grad_norm": 74160.81950505967, "learning_rate": 6.086337760910815e-08, "logits/chosen": 1.177049994468689, "logits/rejected": 1.2722675800323486, "logps/chosen": -278.21624755859375, "logps/rejected": -310.18560791015625, "loss": 4624.8008, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0009672854212112725, "rewards/margins": 0.06997904181480408, "rewards/rejected": -0.06901176273822784, "step": 925 }, { "debug/policy_chosen_logits": 1.0474590063095093, "debug/policy_chosen_logps": -243.6260223388672, "debug/policy_rejected_logits": 1.3523534536361694, "debug/policy_rejected_logps": -281.63018798828125, "debug/reference_chosen_logps": -244.71994018554688, "debug/reference_rejected_logps": -276.67999267578125, "debug/sppo_chosen_loss": 2403.66650390625, "debug/sppo_chosen_reward_in_loss": 1.093942642211914, "debug/sppo_rej_reward_in_loss": -4.9501800537109375, "debug/sppo_reject_loss": 2107.836669921875, "epoch": 3.369565217391304, "grad_norm": 62655.58999698105, "learning_rate": 6.062618595825426e-08, "logits/chosen": 1.0474590063095093, "logits/rejected": 1.3523534536361694, "logps/chosen": -243.6260223388672, "logps/rejected": -281.63018798828125, "loss": 4619.0133, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.010939424857497215, "rewards/margins": 0.06044122576713562, "rewards/rejected": -0.049501799046993256, "step": 930 }, { "debug/policy_chosen_logits": 1.0206243991851807, "debug/policy_chosen_logps": -230.58981323242188, "debug/policy_rejected_logits": 1.498462200164795, "debug/policy_rejected_logps": -262.7080993652344, "debug/reference_chosen_logps": -233.00662231445312, "debug/reference_rejected_logps": -258.6335144042969, "debug/sppo_chosen_loss": 2272.0126953125, "debug/sppo_chosen_reward_in_loss": 2.416801929473877, "debug/sppo_rej_reward_in_loss": -4.074548244476318, "debug/sppo_reject_loss": 2219.981689453125, "epoch": 3.38768115942029, "grad_norm": 64346.54905006239, "learning_rate": 6.038899430740037e-08, "logits/chosen": 1.0206243991851807, "logits/rejected": 1.498462200164795, "logps/chosen": -230.58981323242188, "logps/rejected": -262.7080993652344, "loss": 4586.4906, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.024168018251657486, "rewards/margins": 0.06491349637508392, "rewards/rejected": -0.04074548929929733, "step": 935 }, { "debug/policy_chosen_logits": 1.1483540534973145, "debug/policy_chosen_logps": -242.16879272460938, "debug/policy_rejected_logits": 1.2760623693466187, "debug/policy_rejected_logps": -264.7352294921875, "debug/reference_chosen_logps": -245.1829071044922, "debug/reference_rejected_logps": -262.12872314453125, "debug/sppo_chosen_loss": 2243.12060546875, "debug/sppo_chosen_reward_in_loss": 3.014132022857666, "debug/sppo_rej_reward_in_loss": -2.6064770221710205, "debug/sppo_reject_loss": 2314.98779296875, "epoch": 3.4057971014492754, "grad_norm": 68505.00444431895, "learning_rate": 6.015180265654649e-08, "logits/chosen": 1.1483540534973145, "logits/rejected": 1.2760623693466187, "logps/chosen": -242.16879272460938, "logps/rejected": -264.7352294921875, "loss": 4622.9918, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.030141320079565048, "rewards/margins": 0.05620608851313591, "rewards/rejected": -0.02606477215886116, "step": 940 }, { "debug/policy_chosen_logits": 1.137284517288208, "debug/policy_chosen_logps": -223.4163818359375, "debug/policy_rejected_logits": 1.494816780090332, "debug/policy_rejected_logps": -265.3683776855469, "debug/reference_chosen_logps": -227.4475555419922, "debug/reference_rejected_logps": -264.8956298828125, "debug/sppo_chosen_loss": 2143.1484375, "debug/sppo_chosen_reward_in_loss": 4.031164169311523, "debug/sppo_rej_reward_in_loss": -0.472764790058136, "debug/sppo_reject_loss": 2508.26123046875, "epoch": 3.4239130434782608, "grad_norm": 128040.1280306347, "learning_rate": 5.99146110056926e-08, "logits/chosen": 1.137284517288208, "logits/rejected": 1.494816780090332, "logps/chosen": -223.4163818359375, "logps/rejected": -265.3683776855469, "loss": 4593.6637, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04031164199113846, "rewards/margins": 0.04503928869962692, "rewards/rejected": -0.004727646708488464, "step": 945 }, { "debug/policy_chosen_logits": 0.8890711069107056, "debug/policy_chosen_logps": -251.12863159179688, "debug/policy_rejected_logits": 1.1951262950897217, "debug/policy_rejected_logps": -274.229736328125, "debug/reference_chosen_logps": -252.0839385986328, "debug/reference_rejected_logps": -270.8529052734375, "debug/sppo_chosen_loss": 2479.157470703125, "debug/sppo_chosen_reward_in_loss": 0.9552913904190063, "debug/sppo_rej_reward_in_loss": -3.376844882965088, "debug/sppo_reject_loss": 2244.093505859375, "epoch": 3.4420289855072466, "grad_norm": 66077.41258497938, "learning_rate": 5.967741935483871e-08, "logits/chosen": 0.8890711069107056, "logits/rejected": 1.1951262950897217, "logps/chosen": -251.12863159179688, "logps/rejected": -274.229736328125, "loss": 4569.2664, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.009552913717925549, "rewards/margins": 0.043321363627910614, "rewards/rejected": -0.03376844525337219, "step": 950 }, { "debug/policy_chosen_logits": 1.2130244970321655, "debug/policy_chosen_logps": -247.39053344726562, "debug/policy_rejected_logits": 1.4722299575805664, "debug/policy_rejected_logps": -253.9204864501953, "debug/reference_chosen_logps": -250.8568572998047, "debug/reference_rejected_logps": -252.77029418945312, "debug/sppo_chosen_loss": 2173.708984375, "debug/sppo_chosen_reward_in_loss": 3.466336727142334, "debug/sppo_rej_reward_in_loss": -1.1502119302749634, "debug/sppo_reject_loss": 2451.362060546875, "epoch": 3.460144927536232, "grad_norm": 75680.95608191506, "learning_rate": 5.944022770398481e-08, "logits/chosen": 1.2130244970321655, "logits/rejected": 1.4722299575805664, "logps/chosen": -247.39053344726562, "logps/rejected": -253.9204864501953, "loss": 4561.6039, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0346633680164814, "rewards/margins": 0.04616548866033554, "rewards/rejected": -0.011502119712531567, "step": 955 }, { "debug/policy_chosen_logits": 1.2317800521850586, "debug/policy_chosen_logps": -241.46444702148438, "debug/policy_rejected_logits": 1.4750645160675049, "debug/policy_rejected_logps": -279.91998291015625, "debug/reference_chosen_logps": -242.78359985351562, "debug/reference_rejected_logps": -274.2628479003906, "debug/sppo_chosen_loss": 2378.32177734375, "debug/sppo_chosen_reward_in_loss": 1.3191499710083008, "debug/sppo_rej_reward_in_loss": -5.657149314880371, "debug/sppo_reject_loss": 2103.384765625, "epoch": 3.4782608695652173, "grad_norm": 69154.63254747604, "learning_rate": 5.9203036053130925e-08, "logits/chosen": 1.2317800521850586, "logits/rejected": 1.4750645160675049, "logps/chosen": -241.46444702148438, "logps/rejected": -279.91998291015625, "loss": 4667.0453, "rewards/accuracies": 0.625, "rewards/chosen": 0.013191500678658485, "rewards/margins": 0.06976298987865448, "rewards/rejected": -0.05657149478793144, "step": 960 }, { "debug/policy_chosen_logits": 1.3727465867996216, "debug/policy_chosen_logps": -256.5999450683594, "debug/policy_rejected_logits": 1.7320778369903564, "debug/policy_rejected_logps": -308.158203125, "debug/reference_chosen_logps": -256.64630126953125, "debug/reference_rejected_logps": -303.5486755371094, "debug/sppo_chosen_loss": 2546.796630859375, "debug/sppo_chosen_reward_in_loss": 0.046335794031620026, "debug/sppo_rej_reward_in_loss": -4.609506130218506, "debug/sppo_reject_loss": 2160.57275390625, "epoch": 3.496376811594203, "grad_norm": 109173.32722584443, "learning_rate": 5.896584440227703e-08, "logits/chosen": 1.3727465867996216, "logits/rejected": 1.7320778369903564, "logps/chosen": -256.5999450683594, "logps/rejected": -308.158203125, "loss": 4516.9234, "rewards/accuracies": 0.625, "rewards/chosen": 0.0004633590579032898, "rewards/margins": 0.04655841737985611, "rewards/rejected": -0.04609506204724312, "step": 965 }, { "debug/policy_chosen_logits": 1.218552589416504, "debug/policy_chosen_logps": -264.9551696777344, "debug/policy_rejected_logits": 1.504345178604126, "debug/policy_rejected_logps": -296.6177673339844, "debug/reference_chosen_logps": -266.0826721191406, "debug/reference_rejected_logps": -290.98681640625, "debug/sppo_chosen_loss": 2408.279541015625, "debug/sppo_chosen_reward_in_loss": 1.1274702548980713, "debug/sppo_rej_reward_in_loss": -5.630954265594482, "debug/sppo_reject_loss": 2116.751220703125, "epoch": 3.5144927536231885, "grad_norm": 86943.8369596443, "learning_rate": 5.872865275142315e-08, "logits/chosen": 1.218552589416504, "logits/rejected": 1.504345178604126, "logps/chosen": -264.9551696777344, "logps/rejected": -296.6177673339844, "loss": 4743.4609, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.011274700984358788, "rewards/margins": 0.06758423894643784, "rewards/rejected": -0.056309543550014496, "step": 970 }, { "debug/policy_chosen_logits": 1.245846152305603, "debug/policy_chosen_logps": -274.6354064941406, "debug/policy_rejected_logits": 1.4912992715835571, "debug/policy_rejected_logps": -294.6507263183594, "debug/reference_chosen_logps": -276.3829040527344, "debug/reference_rejected_logps": -289.47418212890625, "debug/sppo_chosen_loss": 2358.584228515625, "debug/sppo_chosen_reward_in_loss": 1.747462511062622, "debug/sppo_rej_reward_in_loss": -5.176543712615967, "debug/sppo_reject_loss": 2100.91162109375, "epoch": 3.532608695652174, "grad_norm": 98388.59124253406, "learning_rate": 5.849146110056926e-08, "logits/chosen": 1.245846152305603, "logits/rejected": 1.4912992715835571, "logps/chosen": -274.6354064941406, "logps/rejected": -294.6507263183594, "loss": 4568.4254, "rewards/accuracies": 0.75, "rewards/chosen": 0.017474623396992683, "rewards/margins": 0.06924006342887878, "rewards/rejected": -0.05176543444395065, "step": 975 }, { "debug/policy_chosen_logits": 1.1825393438339233, "debug/policy_chosen_logps": -227.0812530517578, "debug/policy_rejected_logits": 1.719199538230896, "debug/policy_rejected_logps": -287.44586181640625, "debug/reference_chosen_logps": -229.1777801513672, "debug/reference_rejected_logps": -279.7626953125, "debug/sppo_chosen_loss": 2300.39697265625, "debug/sppo_chosen_reward_in_loss": 2.096514940261841, "debug/sppo_rej_reward_in_loss": -7.68317174911499, "debug/sppo_reject_loss": 1935.713623046875, "epoch": 3.550724637681159, "grad_norm": 62987.47017068283, "learning_rate": 5.8254269449715365e-08, "logits/chosen": 1.1825393438339233, "logits/rejected": 1.719199538230896, "logps/chosen": -227.0812530517578, "logps/rejected": -287.44586181640625, "loss": 4506.441, "rewards/accuracies": 0.75, "rewards/chosen": 0.020965149626135826, "rewards/margins": 0.09779687225818634, "rewards/rejected": -0.07683172076940536, "step": 980 }, { "debug/policy_chosen_logits": 1.2285364866256714, "debug/policy_chosen_logps": -247.73269653320312, "debug/policy_rejected_logits": 1.4140039682388306, "debug/policy_rejected_logps": -291.87518310546875, "debug/reference_chosen_logps": -249.59201049804688, "debug/reference_rejected_logps": -285.00433349609375, "debug/sppo_chosen_loss": 2335.48779296875, "debug/sppo_chosen_reward_in_loss": 1.8593175411224365, "debug/sppo_rej_reward_in_loss": -6.870872497558594, "debug/sppo_reject_loss": 1991.682373046875, "epoch": 3.568840579710145, "grad_norm": 68854.4220478932, "learning_rate": 5.801707779886148e-08, "logits/chosen": 1.2285364866256714, "logits/rejected": 1.4140039682388306, "logps/chosen": -247.73269653320312, "logps/rejected": -291.87518310546875, "loss": 4541.391, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.01859317533671856, "rewards/margins": 0.08730189502239227, "rewards/rejected": -0.06870871782302856, "step": 985 }, { "debug/policy_chosen_logits": 1.4090076684951782, "debug/policy_chosen_logps": -271.1468811035156, "debug/policy_rejected_logits": 1.3900585174560547, "debug/policy_rejected_logps": -292.45086669921875, "debug/reference_chosen_logps": -272.51019287109375, "debug/reference_rejected_logps": -290.306884765625, "debug/sppo_chosen_loss": 2396.93408203125, "debug/sppo_chosen_reward_in_loss": 1.3632854223251343, "debug/sppo_rej_reward_in_loss": -2.143993616104126, "debug/sppo_reject_loss": 2359.3134765625, "epoch": 3.5869565217391304, "grad_norm": 114750.31416426151, "learning_rate": 5.777988614800758e-08, "logits/chosen": 1.4090076684951782, "logits/rejected": 1.3900585174560547, "logps/chosen": -271.1468811035156, "logps/rejected": -292.45086669921875, "loss": 4656.8316, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.013632853515446186, "rewards/margins": 0.03507278859615326, "rewards/rejected": -0.021439936012029648, "step": 990 }, { "debug/policy_chosen_logits": 1.1044788360595703, "debug/policy_chosen_logps": -255.77841186523438, "debug/policy_rejected_logits": 1.6304916143417358, "debug/policy_rejected_logps": -312.1942443847656, "debug/reference_chosen_logps": -255.8466339111328, "debug/reference_rejected_logps": -307.6287841796875, "debug/sppo_chosen_loss": 2517.78466796875, "debug/sppo_chosen_reward_in_loss": 0.06819553673267365, "debug/sppo_rej_reward_in_loss": -4.565450668334961, "debug/sppo_reject_loss": 2154.78564453125, "epoch": 3.605072463768116, "grad_norm": 76124.34857275525, "learning_rate": 5.7542694497153696e-08, "logits/chosen": 1.1044788360595703, "logits/rejected": 1.6304916143417358, "logps/chosen": -255.77841186523438, "logps/rejected": -312.1942443847656, "loss": 4558.3266, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0006819546106271446, "rewards/margins": 0.04633646458387375, "rewards/rejected": -0.045654505491256714, "step": 995 }, { "debug/policy_chosen_logits": 1.4089725017547607, "debug/policy_chosen_logps": -263.33062744140625, "debug/policy_rejected_logits": 1.594951868057251, "debug/policy_rejected_logps": -289.5401916503906, "debug/reference_chosen_logps": -265.0359802246094, "debug/reference_rejected_logps": -285.32977294921875, "debug/sppo_chosen_loss": 2358.952392578125, "debug/sppo_chosen_reward_in_loss": 1.7053356170654297, "debug/sppo_rej_reward_in_loss": -4.210426330566406, "debug/sppo_reject_loss": 2213.955810546875, "epoch": 3.6231884057971016, "grad_norm": 68119.92230526533, "learning_rate": 5.7305502846299804e-08, "logits/chosen": 1.4089725017547607, "logits/rejected": 1.594951868057251, "logps/chosen": -263.33062744140625, "logps/rejected": -289.5401916503906, "loss": 4612.343, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.017053356394171715, "rewards/margins": 0.05915762111544609, "rewards/rejected": -0.04210425913333893, "step": 1000 }, { "epoch": 3.6231884057971016, "eval_debug/policy_chosen_logits": 1.456944465637207, "eval_debug/policy_chosen_logps": -252.86813354492188, "eval_debug/policy_rejected_logits": 1.5061044692993164, "eval_debug/policy_rejected_logps": -262.890625, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2546.73779296875, "eval_debug/sppo_chosen_reward_in_loss": 0.05035736784338951, "eval_debug/sppo_rej_reward_in_loss": -3.2319886684417725, "eval_debug/sppo_reject_loss": 2296.464111328125, "eval_logits/chosen": 1.456944465637207, "eval_logits/rejected": 1.5061044692993164, "eval_logps/chosen": -252.86813354492188, "eval_logps/rejected": -262.890625, "eval_loss": 4670.36669921875, "eval_rewards/accuracies": 0.5657894611358643, "eval_rewards/chosen": 0.0005035737412981689, "eval_rewards/margins": 0.032823458313941956, "eval_rewards/rejected": -0.03231988474726677, "eval_runtime": 28.4414, "eval_samples_per_second": 21.096, "eval_steps_per_second": 0.668, "step": 1000 }, { "debug/policy_chosen_logits": 1.0761668682098389, "debug/policy_chosen_logps": -243.5882110595703, "debug/policy_rejected_logits": 1.6801502704620361, "debug/policy_rejected_logps": -306.94989013671875, "debug/reference_chosen_logps": -243.37484741210938, "debug/reference_rejected_logps": -299.6080017089844, "debug/sppo_chosen_loss": 2573.04296875, "debug/sppo_chosen_reward_in_loss": -0.2133689820766449, "debug/sppo_rej_reward_in_loss": -7.341833591461182, "debug/sppo_reject_loss": 2005.8109130859375, "epoch": 3.641304347826087, "grad_norm": 75826.55865741297, "learning_rate": 5.706831119544592e-08, "logits/chosen": 1.0761668682098389, "logits/rejected": 1.6801502704620361, "logps/chosen": -243.5882110595703, "logps/rejected": -306.94989013671875, "loss": 4508.2871, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0021336902864277363, "rewards/margins": 0.07128464430570602, "rewards/rejected": -0.07341833412647247, "step": 1005 }, { "debug/policy_chosen_logits": 1.139141321182251, "debug/policy_chosen_logps": -241.03134155273438, "debug/policy_rejected_logits": 1.722966194152832, "debug/policy_rejected_logps": -307.21746826171875, "debug/reference_chosen_logps": -241.87393188476562, "debug/reference_rejected_logps": -300.93377685546875, "debug/sppo_chosen_loss": 2430.720703125, "debug/sppo_chosen_reward_in_loss": 0.8425678014755249, "debug/sppo_rej_reward_in_loss": -6.283709526062012, "debug/sppo_reject_loss": 2025.876708984375, "epoch": 3.6594202898550723, "grad_norm": 60325.67867167411, "learning_rate": 5.6831119544592034e-08, "logits/chosen": 1.139141321182251, "logits/rejected": 1.722966194152832, "logps/chosen": -241.03134155273438, "logps/rejected": -307.21746826171875, "loss": 4578.1984, "rewards/accuracies": 0.75, "rewards/chosen": 0.008425678126513958, "rewards/margins": 0.07126276195049286, "rewards/rejected": -0.06283708661794662, "step": 1010 }, { "debug/policy_chosen_logits": 1.4580762386322021, "debug/policy_chosen_logps": -259.34576416015625, "debug/policy_rejected_logits": 1.293593168258667, "debug/policy_rejected_logps": -272.5147399902344, "debug/reference_chosen_logps": -259.12908935546875, "debug/reference_rejected_logps": -265.5401611328125, "debug/sppo_chosen_loss": 2547.24267578125, "debug/sppo_chosen_reward_in_loss": -0.21666832268238068, "debug/sppo_rej_reward_in_loss": -6.974601745605469, "debug/sppo_reject_loss": 1969.7564697265625, "epoch": 3.677536231884058, "grad_norm": 77203.40109790607, "learning_rate": 5.6593927893738136e-08, "logits/chosen": 1.4580762386322021, "logits/rejected": 1.293593168258667, "logps/chosen": -259.34576416015625, "logps/rejected": -272.5147399902344, "loss": 4609.8387, "rewards/accuracies": 0.75, "rewards/chosen": -0.0021666833199560642, "rewards/margins": 0.06757933646440506, "rewards/rejected": -0.06974602490663528, "step": 1015 }, { "debug/policy_chosen_logits": 0.9162737131118774, "debug/policy_chosen_logps": -241.2355499267578, "debug/policy_rejected_logits": 1.3415310382843018, "debug/policy_rejected_logps": -297.12078857421875, "debug/reference_chosen_logps": -241.02230834960938, "debug/reference_rejected_logps": -289.7048034667969, "debug/sppo_chosen_loss": 2553.9853515625, "debug/sppo_chosen_reward_in_loss": -0.21321754157543182, "debug/sppo_rej_reward_in_loss": -7.415976047515869, "debug/sppo_reject_loss": 1941.183349609375, "epoch": 3.6956521739130435, "grad_norm": 71521.8701822076, "learning_rate": 5.635673624288425e-08, "logits/chosen": 0.9162737131118774, "logits/rejected": 1.3415310382843018, "logps/chosen": -241.2355499267578, "logps/rejected": -297.12078857421875, "loss": 4530.693, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0021321747917681932, "rewards/margins": 0.07202757894992828, "rewards/rejected": -0.07415975630283356, "step": 1020 }, { "debug/policy_chosen_logits": 1.4089264869689941, "debug/policy_chosen_logps": -254.9273681640625, "debug/policy_rejected_logits": 1.8630297183990479, "debug/policy_rejected_logps": -317.7763977050781, "debug/reference_chosen_logps": -256.17486572265625, "debug/reference_rejected_logps": -310.9232482910156, "debug/sppo_chosen_loss": 2390.381591796875, "debug/sppo_chosen_reward_in_loss": 1.2474699020385742, "debug/sppo_rej_reward_in_loss": -6.853158473968506, "debug/sppo_reject_loss": 1981.8870849609375, "epoch": 3.713768115942029, "grad_norm": 65824.47241917462, "learning_rate": 5.611954459203035e-08, "logits/chosen": 1.4089264869689941, "logits/rejected": 1.8630297183990479, "logps/chosen": -254.9273681640625, "logps/rejected": -317.7763977050781, "loss": 4540.2395, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.012474698945879936, "rewards/margins": 0.08100628852844238, "rewards/rejected": -0.0685315877199173, "step": 1025 }, { "debug/policy_chosen_logits": 1.2415307760238647, "debug/policy_chosen_logps": -277.1786193847656, "debug/policy_rejected_logits": 1.4892160892486572, "debug/policy_rejected_logps": -286.98895263671875, "debug/reference_chosen_logps": -277.847412109375, "debug/reference_rejected_logps": -283.011962890625, "debug/sppo_chosen_loss": 2463.440185546875, "debug/sppo_chosen_reward_in_loss": 0.668784499168396, "debug/sppo_rej_reward_in_loss": -3.9769866466522217, "debug/sppo_reject_loss": 2179.719482421875, "epoch": 3.7318840579710146, "grad_norm": 91508.97439440723, "learning_rate": 5.588235294117647e-08, "logits/chosen": 1.2415307760238647, "logits/rejected": 1.4892160892486572, "logps/chosen": -277.1786193847656, "logps/rejected": -286.98895263671875, "loss": 4709.6215, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.006687845103442669, "rewards/margins": 0.04645771533250809, "rewards/rejected": -0.039769865572452545, "step": 1030 }, { "debug/policy_chosen_logits": 1.0468541383743286, "debug/policy_chosen_logps": -270.3118896484375, "debug/policy_rejected_logits": 1.1386754512786865, "debug/policy_rejected_logps": -280.1852722167969, "debug/reference_chosen_logps": -271.19110107421875, "debug/reference_rejected_logps": -272.58306884765625, "debug/sppo_chosen_loss": 2455.1533203125, "debug/sppo_chosen_reward_in_loss": 0.8792282342910767, "debug/sppo_rej_reward_in_loss": -7.602209568023682, "debug/sppo_reject_loss": 1913.0250244140625, "epoch": 3.75, "grad_norm": 57982.12859352569, "learning_rate": 5.5645161290322576e-08, "logits/chosen": 1.0468541383743286, "logits/rejected": 1.1386754512786865, "logps/chosen": -270.3118896484375, "logps/rejected": -280.1852722167969, "loss": 4545.618, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.008792281150817871, "rewards/margins": 0.08481436967849731, "rewards/rejected": -0.07602208852767944, "step": 1035 }, { "debug/policy_chosen_logits": 1.7533938884735107, "debug/policy_chosen_logps": -261.36627197265625, "debug/policy_rejected_logits": 2.0570945739746094, "debug/policy_rejected_logps": -302.0135803222656, "debug/reference_chosen_logps": -260.8301696777344, "debug/reference_rejected_logps": -295.55316162109375, "debug/sppo_chosen_loss": 2627.633544921875, "debug/sppo_chosen_reward_in_loss": -0.5360885858535767, "debug/sppo_rej_reward_in_loss": -6.460418701171875, "debug/sppo_reject_loss": 2027.4420166015625, "epoch": 3.7681159420289854, "grad_norm": 61491.48698944422, "learning_rate": 5.540796963946869e-08, "logits/chosen": 1.7533938884735107, "logits/rejected": 2.0570945739746094, "logps/chosen": -261.36627197265625, "logps/rejected": -302.0135803222656, "loss": 4547.627, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.00536088552325964, "rewards/margins": 0.059243302792310715, "rewards/rejected": -0.06460419297218323, "step": 1040 }, { "debug/policy_chosen_logits": 1.5464121103286743, "debug/policy_chosen_logps": -245.330810546875, "debug/policy_rejected_logits": 1.785143256187439, "debug/policy_rejected_logps": -294.738037109375, "debug/reference_chosen_logps": -246.18161010742188, "debug/reference_rejected_logps": -288.33087158203125, "debug/sppo_chosen_loss": 2464.212890625, "debug/sppo_chosen_reward_in_loss": 0.8508247137069702, "debug/sppo_rej_reward_in_loss": -6.407144069671631, "debug/sppo_reject_loss": 1984.099365234375, "epoch": 3.786231884057971, "grad_norm": 69014.39754474655, "learning_rate": 5.5170777988614805e-08, "logits/chosen": 1.5464121103286743, "logits/rejected": 1.785143256187439, "logps/chosen": -245.330810546875, "logps/rejected": -294.738037109375, "loss": 4558.0805, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.008508248254656792, "rewards/margins": 0.07257968187332153, "rewards/rejected": -0.06407143175601959, "step": 1045 }, { "debug/policy_chosen_logits": 1.2609115839004517, "debug/policy_chosen_logps": -263.4371032714844, "debug/policy_rejected_logits": 1.679581642150879, "debug/policy_rejected_logps": -312.66168212890625, "debug/reference_chosen_logps": -265.32025146484375, "debug/reference_rejected_logps": -303.7322998046875, "debug/sppo_chosen_loss": 2327.70361328125, "debug/sppo_chosen_reward_in_loss": 1.88314950466156, "debug/sppo_rej_reward_in_loss": -8.92940616607666, "debug/sppo_reject_loss": 1826.491455078125, "epoch": 3.8043478260869565, "grad_norm": 130681.35754107402, "learning_rate": 5.493358633776091e-08, "logits/chosen": 1.2609115839004517, "logits/rejected": 1.679581642150879, "logps/chosen": -263.4371032714844, "logps/rejected": -312.66168212890625, "loss": 4518.9848, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.018831495195627213, "rewards/margins": 0.10812555253505707, "rewards/rejected": -0.08929406106472015, "step": 1050 }, { "debug/policy_chosen_logits": 1.428841471672058, "debug/policy_chosen_logps": -274.14447021484375, "debug/policy_rejected_logits": 1.7098848819732666, "debug/policy_rejected_logps": -292.1790771484375, "debug/reference_chosen_logps": -276.7886962890625, "debug/reference_rejected_logps": -286.98443603515625, "debug/sppo_chosen_loss": 2252.016845703125, "debug/sppo_chosen_reward_in_loss": 2.64421010017395, "debug/sppo_rej_reward_in_loss": -5.194632053375244, "debug/sppo_reject_loss": 2078.817626953125, "epoch": 3.822463768115942, "grad_norm": 113803.55507421952, "learning_rate": 5.469639468690702e-08, "logits/chosen": 1.428841471672058, "logits/rejected": 1.7098848819732666, "logps/chosen": -274.14447021484375, "logps/rejected": -292.1790771484375, "loss": 4508.4406, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02644210122525692, "rewards/margins": 0.07838841527700424, "rewards/rejected": -0.05194631963968277, "step": 1055 }, { "debug/policy_chosen_logits": 1.3418858051300049, "debug/policy_chosen_logps": -267.40069580078125, "debug/policy_rejected_logits": 1.5534236431121826, "debug/policy_rejected_logps": -287.94891357421875, "debug/reference_chosen_logps": -266.3821716308594, "debug/reference_rejected_logps": -285.1313171386719, "debug/sppo_chosen_loss": 2670.68017578125, "debug/sppo_chosen_reward_in_loss": -1.0185024738311768, "debug/sppo_rej_reward_in_loss": -2.8175864219665527, "debug/sppo_reject_loss": 2302.353515625, "epoch": 3.8405797101449277, "grad_norm": 96112.86619649334, "learning_rate": 5.4459203036053124e-08, "logits/chosen": 1.3418858051300049, "logits/rejected": 1.5534236431121826, "logps/chosen": -267.40069580078125, "logps/rejected": -287.94891357421875, "loss": 4713.9641, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01018502376973629, "rewards/margins": 0.017990842461586, "rewards/rejected": -0.02817586623132229, "step": 1060 }, { "debug/policy_chosen_logits": 1.4602513313293457, "debug/policy_chosen_logps": -266.38262939453125, "debug/policy_rejected_logits": 1.5078020095825195, "debug/policy_rejected_logps": -267.8259582519531, "debug/reference_chosen_logps": -266.792236328125, "debug/reference_rejected_logps": -261.2626953125, "debug/sppo_chosen_loss": 2507.094970703125, "debug/sppo_chosen_reward_in_loss": 0.40961894392967224, "debug/sppo_rej_reward_in_loss": -6.5632643699646, "debug/sppo_reject_loss": 2010.8968505859375, "epoch": 3.858695652173913, "grad_norm": 61080.04619521491, "learning_rate": 5.422201138519924e-08, "logits/chosen": 1.4602513313293457, "logits/rejected": 1.5078020095825195, "logps/chosen": -266.38262939453125, "logps/rejected": -267.8259582519531, "loss": 4572.7539, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004096188582479954, "rewards/margins": 0.06972883641719818, "rewards/rejected": -0.0656326487660408, "step": 1065 }, { "debug/policy_chosen_logits": 1.2502130270004272, "debug/policy_chosen_logps": -242.38699340820312, "debug/policy_rejected_logits": 1.4786583185195923, "debug/policy_rejected_logps": -260.73486328125, "debug/reference_chosen_logps": -243.15463256835938, "debug/reference_rejected_logps": -256.77166748046875, "debug/sppo_chosen_loss": 2466.73876953125, "debug/sppo_chosen_reward_in_loss": 0.767635703086853, "debug/sppo_rej_reward_in_loss": -3.9631810188293457, "debug/sppo_reject_loss": 2206.03369140625, "epoch": 3.8768115942028984, "grad_norm": 60371.23813088923, "learning_rate": 5.398481973434535e-08, "logits/chosen": 1.2502130270004272, "logits/rejected": 1.4786583185195923, "logps/chosen": -242.38699340820312, "logps/rejected": -260.73486328125, "loss": 4621.8828, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0076763564720749855, "rewards/margins": 0.047308165580034256, "rewards/rejected": -0.03963180631399155, "step": 1070 }, { "debug/policy_chosen_logits": 1.0678049325942993, "debug/policy_chosen_logps": -230.09619140625, "debug/policy_rejected_logits": 1.3625367879867554, "debug/policy_rejected_logps": -275.34063720703125, "debug/reference_chosen_logps": -231.5152130126953, "debug/reference_rejected_logps": -270.4071350097656, "debug/sppo_chosen_loss": 2386.3046875, "debug/sppo_chosen_reward_in_loss": 1.4190356731414795, "debug/sppo_rej_reward_in_loss": -4.933535575866699, "debug/sppo_reject_loss": 2169.053955078125, "epoch": 3.894927536231884, "grad_norm": 68551.62323347438, "learning_rate": 5.374762808349146e-08, "logits/chosen": 1.0678049325942993, "logits/rejected": 1.3625367879867554, "logps/chosen": -230.09619140625, "logps/rejected": -275.34063720703125, "loss": 4545.2449, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.014190358109772205, "rewards/margins": 0.0635257139801979, "rewards/rejected": -0.04933535307645798, "step": 1075 }, { "debug/policy_chosen_logits": 1.1775163412094116, "debug/policy_chosen_logps": -237.8444061279297, "debug/policy_rejected_logits": 1.61894953250885, "debug/policy_rejected_logps": -324.2195739746094, "debug/reference_chosen_logps": -238.9965362548828, "debug/reference_rejected_logps": -321.0993957519531, "debug/sppo_chosen_loss": 2429.08447265625, "debug/sppo_chosen_reward_in_loss": 1.1521333456039429, "debug/sppo_rej_reward_in_loss": -3.120173215866089, "debug/sppo_reject_loss": 2287.29541015625, "epoch": 3.9130434782608696, "grad_norm": 57966.11629522188, "learning_rate": 5.3510436432637577e-08, "logits/chosen": 1.1775163412094116, "logits/rejected": 1.61894953250885, "logps/chosen": -237.8444061279297, "logps/rejected": -324.2195739746094, "loss": 4654.8531, "rewards/accuracies": 0.75, "rewards/chosen": 0.011521334759891033, "rewards/margins": 0.04272306337952614, "rewards/rejected": -0.03120173141360283, "step": 1080 }, { "debug/policy_chosen_logits": 1.2178781032562256, "debug/policy_chosen_logps": -257.8562927246094, "debug/policy_rejected_logits": 1.6842644214630127, "debug/policy_rejected_logps": -294.3348693847656, "debug/reference_chosen_logps": -259.33831787109375, "debug/reference_rejected_logps": -290.1636962890625, "debug/sppo_chosen_loss": 2381.67919921875, "debug/sppo_chosen_reward_in_loss": 1.4820289611816406, "debug/sppo_rej_reward_in_loss": -4.171219825744629, "debug/sppo_reject_loss": 2194.783203125, "epoch": 3.931159420289855, "grad_norm": 63461.56980289861, "learning_rate": 5.327324478178368e-08, "logits/chosen": 1.2178781032562256, "logits/rejected": 1.6842644214630127, "logps/chosen": -257.8562927246094, "logps/rejected": -294.3348693847656, "loss": 4550.3727, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.014820289798080921, "rewards/margins": 0.05653248354792595, "rewards/rejected": -0.0417121946811676, "step": 1085 }, { "debug/policy_chosen_logits": 1.100486397743225, "debug/policy_chosen_logps": -230.83544921875, "debug/policy_rejected_logits": 1.5922205448150635, "debug/policy_rejected_logps": -323.64349365234375, "debug/reference_chosen_logps": -233.14584350585938, "debug/reference_rejected_logps": -319.9851989746094, "debug/sppo_chosen_loss": 2296.8564453125, "debug/sppo_chosen_reward_in_loss": 2.310420513153076, "debug/sppo_rej_reward_in_loss": -3.6582748889923096, "debug/sppo_reject_loss": 2238.854736328125, "epoch": 3.949275362318841, "grad_norm": 61879.32087097766, "learning_rate": 5.303605313092979e-08, "logits/chosen": 1.100486397743225, "logits/rejected": 1.5922205448150635, "logps/chosen": -230.83544921875, "logps/rejected": -323.64349365234375, "loss": 4668.55, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02310420386493206, "rewards/margins": 0.05968695878982544, "rewards/rejected": -0.03658274933695793, "step": 1090 }, { "debug/policy_chosen_logits": 0.9704431295394897, "debug/policy_chosen_logps": -229.97109985351562, "debug/policy_rejected_logits": 1.2151552438735962, "debug/policy_rejected_logps": -270.318115234375, "debug/reference_chosen_logps": -232.79541015625, "debug/reference_rejected_logps": -265.0921325683594, "debug/sppo_chosen_loss": 2244.123779296875, "debug/sppo_chosen_reward_in_loss": 2.824331521987915, "debug/sppo_rej_reward_in_loss": -5.225963592529297, "debug/sppo_reject_loss": 2129.97314453125, "epoch": 3.967391304347826, "grad_norm": 71200.30035546218, "learning_rate": 5.2798861480075895e-08, "logits/chosen": 0.9704431295394897, "logits/rejected": 1.2151552438735962, "logps/chosen": -229.97109985351562, "logps/rejected": -270.318115234375, "loss": 4715.6711, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02824331261217594, "rewards/margins": 0.080502949655056, "rewards/rejected": -0.05225963518023491, "step": 1095 }, { "debug/policy_chosen_logits": 1.3709768056869507, "debug/policy_chosen_logps": -270.6744384765625, "debug/policy_rejected_logits": 1.5750868320465088, "debug/policy_rejected_logps": -274.14801025390625, "debug/reference_chosen_logps": -273.50811767578125, "debug/reference_rejected_logps": -270.3443298339844, "debug/sppo_chosen_loss": 2248.73583984375, "debug/sppo_chosen_reward_in_loss": 2.833648681640625, "debug/sppo_rej_reward_in_loss": -3.803656816482544, "debug/sppo_reject_loss": 2198.38623046875, "epoch": 3.9855072463768115, "grad_norm": 62262.88513979131, "learning_rate": 5.256166982922201e-08, "logits/chosen": 1.3709768056869507, "logits/rejected": 1.5750868320465088, "logps/chosen": -270.6744384765625, "logps/rejected": -274.14801025390625, "loss": 4579.3098, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.028336485847830772, "rewards/margins": 0.0663730651140213, "rewards/rejected": -0.03803656995296478, "step": 1100 }, { "epoch": 3.9855072463768115, "eval_debug/policy_chosen_logits": 1.456533670425415, "eval_debug/policy_chosen_logps": -253.49632263183594, "eval_debug/policy_rejected_logits": 1.506239414215088, "eval_debug/policy_rejected_logps": -263.5656433105469, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2616.45263671875, "eval_debug/sppo_chosen_reward_in_loss": -0.5778376460075378, "eval_debug/sppo_rej_reward_in_loss": -3.907012462615967, "eval_debug/sppo_reject_loss": 2253.14208984375, "eval_logits/chosen": 1.456533670425415, "eval_logits/rejected": 1.506239414215088, "eval_logps/chosen": -253.49632263183594, "eval_logps/rejected": -263.5656433105469, "eval_loss": 4676.59033203125, "eval_rewards/accuracies": 0.5263158082962036, "eval_rewards/chosen": -0.005778376013040543, "eval_rewards/margins": 0.03329174593091011, "eval_rewards/rejected": -0.03907012194395065, "eval_runtime": 28.4845, "eval_samples_per_second": 21.064, "eval_steps_per_second": 0.667, "step": 1100 }, { "debug/policy_chosen_logits": 1.132196307182312, "debug/policy_chosen_logps": -257.04156494140625, "debug/policy_rejected_logits": 1.0593174695968628, "debug/policy_rejected_logps": -253.03121948242188, "debug/reference_chosen_logps": -258.0804748535156, "debug/reference_rejected_logps": -249.09701538085938, "debug/sppo_chosen_loss": 2430.91845703125, "debug/sppo_chosen_reward_in_loss": 1.0388778448104858, "debug/sppo_rej_reward_in_loss": -3.934187412261963, "debug/sppo_reject_loss": 2237.6044921875, "epoch": 4.003623188405797, "grad_norm": 83697.6761176109, "learning_rate": 5.232447817836811e-08, "logits/chosen": 1.132196307182312, "logits/rejected": 1.0593174695968628, "logps/chosen": -257.04156494140625, "logps/rejected": -253.03121948242188, "loss": 4633.5699, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.010388778522610664, "rewards/margins": 0.04973064735531807, "rewards/rejected": -0.039341870695352554, "step": 1105 }, { "debug/policy_chosen_logits": 1.1453125476837158, "debug/policy_chosen_logps": -229.22451782226562, "debug/policy_rejected_logits": 1.6569292545318604, "debug/policy_rejected_logps": -315.9703674316406, "debug/reference_chosen_logps": -230.3061981201172, "debug/reference_rejected_logps": -310.6712341308594, "debug/sppo_chosen_loss": 2433.438232421875, "debug/sppo_chosen_reward_in_loss": 1.0817075967788696, "debug/sppo_rej_reward_in_loss": -5.2991108894348145, "debug/sppo_reject_loss": 2125.366455078125, "epoch": 4.021739130434782, "grad_norm": 98568.01839645841, "learning_rate": 5.2087286527514226e-08, "logits/chosen": 1.1453125476837158, "logits/rejected": 1.6569292545318604, "logps/chosen": -229.22451782226562, "logps/rejected": -315.9703674316406, "loss": 4584.3953, "rewards/accuracies": 0.75, "rewards/chosen": 0.01081707514822483, "rewards/margins": 0.06380818039178848, "rewards/rejected": -0.0529911033809185, "step": 1110 }, { "debug/policy_chosen_logits": 1.241201639175415, "debug/policy_chosen_logps": -246.2945098876953, "debug/policy_rejected_logits": 1.370812177658081, "debug/policy_rejected_logps": -276.74261474609375, "debug/reference_chosen_logps": -246.9245147705078, "debug/reference_rejected_logps": -274.3244323730469, "debug/sppo_chosen_loss": 2489.19921875, "debug/sppo_chosen_reward_in_loss": 0.6299924850463867, "debug/sppo_rej_reward_in_loss": -2.41817045211792, "debug/sppo_reject_loss": 2336.05712890625, "epoch": 4.0398550724637685, "grad_norm": 59089.21551370471, "learning_rate": 5.185009487666034e-08, "logits/chosen": 1.241201639175415, "logits/rejected": 1.370812177658081, "logps/chosen": -246.2945098876953, "logps/rejected": -276.74261474609375, "loss": 4654.7855, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.006299925502389669, "rewards/margins": 0.030481627210974693, "rewards/rejected": -0.024181704968214035, "step": 1115 }, { "debug/policy_chosen_logits": 1.2381556034088135, "debug/policy_chosen_logps": -257.24005126953125, "debug/policy_rejected_logits": 1.3313062191009521, "debug/policy_rejected_logps": -286.19012451171875, "debug/reference_chosen_logps": -259.2249450683594, "debug/reference_rejected_logps": -282.4788818359375, "debug/sppo_chosen_loss": 2332.8056640625, "debug/sppo_chosen_reward_in_loss": 1.984928846359253, "debug/sppo_rej_reward_in_loss": -3.7112789154052734, "debug/sppo_reject_loss": 2214.718505859375, "epoch": 4.057971014492754, "grad_norm": 114455.89556818095, "learning_rate": 5.161290322580645e-08, "logits/chosen": 1.2381556034088135, "logits/rejected": 1.3313062191009521, "logps/chosen": -257.24005126953125, "logps/rejected": -286.19012451171875, "loss": 4522.3562, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.01984928920865059, "rewards/margins": 0.05696208029985428, "rewards/rejected": -0.03711279109120369, "step": 1120 }, { "debug/policy_chosen_logits": 1.209842562675476, "debug/policy_chosen_logps": -252.7437286376953, "debug/policy_rejected_logits": 1.5112091302871704, "debug/policy_rejected_logps": -294.62066650390625, "debug/reference_chosen_logps": -255.17752075195312, "debug/reference_rejected_logps": -289.46148681640625, "debug/sppo_chosen_loss": 2272.56689453125, "debug/sppo_chosen_reward_in_loss": 2.433781385421753, "debug/sppo_rej_reward_in_loss": -5.15915060043335, "debug/sppo_reject_loss": 2126.252685546875, "epoch": 4.076086956521739, "grad_norm": 78827.79285113512, "learning_rate": 5.1375711574952564e-08, "logits/chosen": 1.209842562675476, "logits/rejected": 1.5112091302871704, "logps/chosen": -252.7437286376953, "logps/rejected": -294.62066650390625, "loss": 4532.3141, "rewards/accuracies": 0.75, "rewards/chosen": 0.02433781325817108, "rewards/margins": 0.07592931389808655, "rewards/rejected": -0.051591504365205765, "step": 1125 }, { "debug/policy_chosen_logits": 1.3331242799758911, "debug/policy_chosen_logps": -276.8101501464844, "debug/policy_rejected_logits": 1.541341781616211, "debug/policy_rejected_logps": -294.5791015625, "debug/reference_chosen_logps": -276.34832763671875, "debug/reference_rejected_logps": -289.8849182128906, "debug/sppo_chosen_loss": 2596.18212890625, "debug/sppo_chosen_reward_in_loss": -0.4618555009365082, "debug/sppo_rej_reward_in_loss": -4.694179534912109, "debug/sppo_reject_loss": 2139.68798828125, "epoch": 4.094202898550725, "grad_norm": 101850.88439109617, "learning_rate": 5.1138519924098666e-08, "logits/chosen": 1.3331242799758911, "logits/rejected": 1.541341781616211, "logps/chosen": -276.8101501464844, "logps/rejected": -294.5791015625, "loss": 4563.6527, "rewards/accuracies": 0.625, "rewards/chosen": -0.004618555307388306, "rewards/margins": 0.042323239147663116, "rewards/rejected": -0.04694179445505142, "step": 1130 }, { "debug/policy_chosen_logits": 0.9701658487319946, "debug/policy_chosen_logps": -241.0208282470703, "debug/policy_rejected_logits": 1.249610185623169, "debug/policy_rejected_logps": -278.6710205078125, "debug/reference_chosen_logps": -242.5782012939453, "debug/reference_rejected_logps": -273.7024841308594, "debug/sppo_chosen_loss": 2375.3642578125, "debug/sppo_chosen_reward_in_loss": 1.5573704242706299, "debug/sppo_rej_reward_in_loss": -4.968528747558594, "debug/sppo_reject_loss": 2126.25537109375, "epoch": 4.11231884057971, "grad_norm": 68155.27284047905, "learning_rate": 5.090132827324478e-08, "logits/chosen": 0.9701658487319946, "logits/rejected": 1.249610185623169, "logps/chosen": -241.0208282470703, "logps/rejected": -278.6710205078125, "loss": 4614.9297, "rewards/accuracies": 0.75, "rewards/chosen": 0.01557370275259018, "rewards/margins": 0.06525899469852448, "rewards/rejected": -0.049685288220644, "step": 1135 }, { "debug/policy_chosen_logits": 0.934956431388855, "debug/policy_chosen_logps": -257.32757568359375, "debug/policy_rejected_logits": 1.447296142578125, "debug/policy_rejected_logps": -305.84912109375, "debug/reference_chosen_logps": -257.38555908203125, "debug/reference_rejected_logps": -298.5243225097656, "debug/sppo_chosen_loss": 2557.6728515625, "debug/sppo_chosen_reward_in_loss": 0.057987213134765625, "debug/sppo_rej_reward_in_loss": -7.324774742126465, "debug/sppo_reject_loss": 1964.286376953125, "epoch": 4.130434782608695, "grad_norm": 112972.04597035103, "learning_rate": 5.066413662239088e-08, "logits/chosen": 0.934956431388855, "logits/rejected": 1.447296142578125, "logps/chosen": -257.32757568359375, "logps/rejected": -305.84912109375, "loss": 4592.5555, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0005798719939775765, "rewards/margins": 0.07382762432098389, "rewards/rejected": -0.0732477456331253, "step": 1140 }, { "debug/policy_chosen_logits": 1.137599229812622, "debug/policy_chosen_logps": -247.9462432861328, "debug/policy_rejected_logits": 1.3648731708526611, "debug/policy_rejected_logps": -297.7298889160156, "debug/reference_chosen_logps": -248.70059204101562, "debug/reference_rejected_logps": -292.8433532714844, "debug/sppo_chosen_loss": 2463.04541015625, "debug/sppo_chosen_reward_in_loss": 0.7543275952339172, "debug/sppo_rej_reward_in_loss": -4.886526584625244, "debug/sppo_reject_loss": 2146.55224609375, "epoch": 4.148550724637682, "grad_norm": 56790.99397168494, "learning_rate": 5.0426944971537e-08, "logits/chosen": 1.137599229812622, "logits/rejected": 1.3648731708526611, "logps/chosen": -247.9462432861328, "logps/rejected": -297.7298889160156, "loss": 4581.6867, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.007543275598436594, "rewards/margins": 0.056408535689115524, "rewards/rejected": -0.04886526241898537, "step": 1145 }, { "debug/policy_chosen_logits": 1.1987998485565186, "debug/policy_chosen_logps": -249.37814331054688, "debug/policy_rejected_logits": 1.5439293384552002, "debug/policy_rejected_logps": -294.209716796875, "debug/reference_chosen_logps": -251.27197265625, "debug/reference_rejected_logps": -290.90399169921875, "debug/sppo_chosen_loss": 2345.739501953125, "debug/sppo_chosen_reward_in_loss": 1.893791913986206, "debug/sppo_rej_reward_in_loss": -3.3057198524475098, "debug/sppo_reject_loss": 2277.840576171875, "epoch": 4.166666666666667, "grad_norm": 63141.83995575706, "learning_rate": 5.018975332068311e-08, "logits/chosen": 1.1987998485565186, "logits/rejected": 1.5439293384552002, "logps/chosen": -249.37814331054688, "logps/rejected": -294.209716796875, "loss": 4624.3367, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.018937919288873672, "rewards/margins": 0.05199511721730232, "rewards/rejected": -0.03305719792842865, "step": 1150 }, { "debug/policy_chosen_logits": 1.2984672784805298, "debug/policy_chosen_logps": -267.91876220703125, "debug/policy_rejected_logits": 1.6929426193237305, "debug/policy_rejected_logps": -268.45672607421875, "debug/reference_chosen_logps": -268.6834716796875, "debug/reference_rejected_logps": -264.5723571777344, "debug/sppo_chosen_loss": 2474.086181640625, "debug/sppo_chosen_reward_in_loss": 0.7647321820259094, "debug/sppo_rej_reward_in_loss": -3.8843586444854736, "debug/sppo_reject_loss": 2243.4658203125, "epoch": 4.184782608695652, "grad_norm": 60455.18221015738, "learning_rate": 4.995256166982922e-08, "logits/chosen": 1.2984672784805298, "logits/rejected": 1.6929426193237305, "logps/chosen": -267.91876220703125, "logps/rejected": -268.45672607421875, "loss": 4561.9293, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.007647321559488773, "rewards/margins": 0.04649090766906738, "rewards/rejected": -0.038843583315610886, "step": 1155 }, { "debug/policy_chosen_logits": 1.1441524028778076, "debug/policy_chosen_logps": -228.0914306640625, "debug/policy_rejected_logits": 1.7030442953109741, "debug/policy_rejected_logps": -300.6089782714844, "debug/reference_chosen_logps": -228.95333862304688, "debug/reference_rejected_logps": -298.1351623535156, "debug/sppo_chosen_loss": 2439.880859375, "debug/sppo_chosen_reward_in_loss": 0.8619012832641602, "debug/sppo_rej_reward_in_loss": -2.473867893218994, "debug/sppo_reject_loss": 2360.13037109375, "epoch": 4.202898550724638, "grad_norm": 62393.309838484005, "learning_rate": 4.971537001897533e-08, "logits/chosen": 1.1441524028778076, "logits/rejected": 1.7030442953109741, "logps/chosen": -228.0914306640625, "logps/rejected": -300.6089782714844, "loss": 4680.834, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.00861901231110096, "rewards/margins": 0.03335769101977348, "rewards/rejected": -0.024738678708672523, "step": 1160 }, { "debug/policy_chosen_logits": 1.061384916305542, "debug/policy_chosen_logps": -238.6508026123047, "debug/policy_rejected_logits": 1.4896352291107178, "debug/policy_rejected_logps": -275.42657470703125, "debug/reference_chosen_logps": -240.12442016601562, "debug/reference_rejected_logps": -271.47845458984375, "debug/sppo_chosen_loss": 2385.75390625, "debug/sppo_chosen_reward_in_loss": 1.4736206531524658, "debug/sppo_rej_reward_in_loss": -3.948108196258545, "debug/sppo_reject_loss": 2201.4970703125, "epoch": 4.221014492753623, "grad_norm": 60013.99653716095, "learning_rate": 4.9478178368121444e-08, "logits/chosen": 1.061384916305542, "logits/rejected": 1.4896352291107178, "logps/chosen": -238.6508026123047, "logps/rejected": -275.42657470703125, "loss": 4589.784, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.014736207202076912, "rewards/margins": 0.05421728640794754, "rewards/rejected": -0.03948108106851578, "step": 1165 }, { "debug/policy_chosen_logits": 1.2733427286148071, "debug/policy_chosen_logps": -272.32177734375, "debug/policy_rejected_logits": 1.2476797103881836, "debug/policy_rejected_logps": -271.35662841796875, "debug/reference_chosen_logps": -274.59075927734375, "debug/reference_rejected_logps": -266.5238037109375, "debug/sppo_chosen_loss": 2289.10888671875, "debug/sppo_chosen_reward_in_loss": 2.2690114974975586, "debug/sppo_rej_reward_in_loss": -4.83282995223999, "debug/sppo_reject_loss": 2137.542724609375, "epoch": 4.239130434782608, "grad_norm": 62760.28247496744, "learning_rate": 4.924098671726755e-08, "logits/chosen": 1.2733427286148071, "logits/rejected": 1.2476797103881836, "logps/chosen": -272.32177734375, "logps/rejected": -271.35662841796875, "loss": 4579.4902, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02269011363387108, "rewards/margins": 0.07101841270923615, "rewards/rejected": -0.048328299075365067, "step": 1170 }, { "debug/policy_chosen_logits": 1.1383342742919922, "debug/policy_chosen_logps": -260.6459045410156, "debug/policy_rejected_logits": 1.1656829118728638, "debug/policy_rejected_logps": -282.4175109863281, "debug/reference_chosen_logps": -261.7978515625, "debug/reference_rejected_logps": -278.51116943359375, "debug/sppo_chosen_loss": 2423.7265625, "debug/sppo_chosen_reward_in_loss": 1.1519769430160522, "debug/sppo_rej_reward_in_loss": -3.906313419342041, "debug/sppo_reject_loss": 2209.96142578125, "epoch": 4.257246376811594, "grad_norm": 66690.75709796374, "learning_rate": 4.900379506641366e-08, "logits/chosen": 1.1383342742919922, "logits/rejected": 1.1656829118728638, "logps/chosen": -260.6459045410156, "logps/rejected": -282.4175109863281, "loss": 4560.2367, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.01151976827532053, "rewards/margins": 0.05058290436863899, "rewards/rejected": -0.03906313329935074, "step": 1175 }, { "debug/policy_chosen_logits": 1.291983723640442, "debug/policy_chosen_logps": -275.66510009765625, "debug/policy_rejected_logits": 1.4948934316635132, "debug/policy_rejected_logps": -307.16046142578125, "debug/reference_chosen_logps": -273.1227722167969, "debug/reference_rejected_logps": -302.58367919921875, "debug/sppo_chosen_loss": 2862.58154296875, "debug/sppo_chosen_reward_in_loss": -2.5422942638397217, "debug/sppo_rej_reward_in_loss": -4.576807498931885, "debug/sppo_reject_loss": 2187.82861328125, "epoch": 4.27536231884058, "grad_norm": 85315.2863686186, "learning_rate": 4.876660341555977e-08, "logits/chosen": 1.291983723640442, "logits/rejected": 1.4948934316635132, "logps/chosen": -275.66510009765625, "logps/rejected": -307.16046142578125, "loss": 4622.927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.025422941893339157, "rewards/margins": 0.020345130935311317, "rewards/rejected": -0.045768074691295624, "step": 1180 }, { "debug/policy_chosen_logits": 0.8780574798583984, "debug/policy_chosen_logps": -242.35122680664062, "debug/policy_rejected_logits": 1.2848012447357178, "debug/policy_rejected_logps": -308.982421875, "debug/reference_chosen_logps": -243.1239013671875, "debug/reference_rejected_logps": -302.6136169433594, "debug/sppo_chosen_loss": 2473.795166015625, "debug/sppo_chosen_reward_in_loss": 0.7726734280586243, "debug/sppo_rej_reward_in_loss": -6.368790626525879, "debug/sppo_reject_loss": 2001.2486572265625, "epoch": 4.293478260869565, "grad_norm": 63407.96589074536, "learning_rate": 4.852941176470588e-08, "logits/chosen": 0.8780574798583984, "logits/rejected": 1.2848012447357178, "logps/chosen": -242.35122680664062, "logps/rejected": -308.982421875, "loss": 4514.4953, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.007726734038442373, "rewards/margins": 0.07141464948654175, "rewards/rejected": -0.06368790566921234, "step": 1185 }, { "debug/policy_chosen_logits": 1.2959657907485962, "debug/policy_chosen_logps": -250.234619140625, "debug/policy_rejected_logits": 1.4259939193725586, "debug/policy_rejected_logps": -259.39508056640625, "debug/reference_chosen_logps": -252.074951171875, "debug/reference_rejected_logps": -256.25079345703125, "debug/sppo_chosen_loss": 2330.18212890625, "debug/sppo_chosen_reward_in_loss": 1.8403308391571045, "debug/sppo_rej_reward_in_loss": -3.144282817840576, "debug/sppo_reject_loss": 2303.275390625, "epoch": 4.311594202898551, "grad_norm": 74185.34876051976, "learning_rate": 4.829222011385199e-08, "logits/chosen": 1.2959657907485962, "logits/rejected": 1.4259939193725586, "logps/chosen": -250.234619140625, "logps/rejected": -259.39508056640625, "loss": 4534.2867, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0184033066034317, "rewards/margins": 0.04984613507986069, "rewards/rejected": -0.03144282475113869, "step": 1190 }, { "debug/policy_chosen_logits": 1.063323736190796, "debug/policy_chosen_logps": -260.62713623046875, "debug/policy_rejected_logits": 1.3779569864273071, "debug/policy_rejected_logps": -304.7257385253906, "debug/reference_chosen_logps": -261.36907958984375, "debug/reference_rejected_logps": -301.53497314453125, "debug/sppo_chosen_loss": 2459.88232421875, "debug/sppo_chosen_reward_in_loss": 0.7419716119766235, "debug/sppo_rej_reward_in_loss": -3.1907191276550293, "debug/sppo_reject_loss": 2275.12841796875, "epoch": 4.329710144927536, "grad_norm": 60634.36127221304, "learning_rate": 4.80550284629981e-08, "logits/chosen": 1.063323736190796, "logits/rejected": 1.3779569864273071, "logps/chosen": -260.62713623046875, "logps/rejected": -304.7257385253906, "loss": 4707.4836, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0074197156354784966, "rewards/margins": 0.039326902478933334, "rewards/rejected": -0.03190718963742256, "step": 1195 }, { "debug/policy_chosen_logits": 1.6620442867279053, "debug/policy_chosen_logps": -251.6832733154297, "debug/policy_rejected_logits": 2.0438661575317383, "debug/policy_rejected_logps": -308.67962646484375, "debug/reference_chosen_logps": -252.7598114013672, "debug/reference_rejected_logps": -300.9691467285156, "debug/sppo_chosen_loss": 2414.723876953125, "debug/sppo_chosen_reward_in_loss": 1.0765705108642578, "debug/sppo_rej_reward_in_loss": -7.71053409576416, "debug/sppo_reject_loss": 1928.1484375, "epoch": 4.3478260869565215, "grad_norm": 62629.69501037734, "learning_rate": 4.781783681214421e-08, "logits/chosen": 1.6620442867279053, "logits/rejected": 2.0438661575317383, "logps/chosen": -251.6832733154297, "logps/rejected": -308.67962646484375, "loss": 4461.193, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.01076570525765419, "rewards/margins": 0.08787104487419128, "rewards/rejected": -0.0771053358912468, "step": 1200 }, { "epoch": 4.3478260869565215, "eval_debug/policy_chosen_logits": 1.4448680877685547, "eval_debug/policy_chosen_logps": -252.53868103027344, "eval_debug/policy_rejected_logits": 1.4919214248657227, "eval_debug/policy_rejected_logps": -263.04656982421875, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2517.66552734375, "eval_debug/sppo_chosen_reward_in_loss": 0.3797861933708191, "eval_debug/sppo_rej_reward_in_loss": -3.3879170417785645, "eval_debug/sppo_reject_loss": 2292.259033203125, "eval_logits/chosen": 1.4448680877685547, "eval_logits/rejected": 1.4919214248657227, "eval_logps/chosen": -252.53868103027344, "eval_logps/rejected": -263.04656982421875, "eval_loss": 4657.2646484375, "eval_rewards/accuracies": 0.6052631735801697, "eval_rewards/chosen": 0.0037978619802743196, "eval_rewards/margins": 0.037677038460969925, "eval_rewards/rejected": -0.033879172056913376, "eval_runtime": 28.33, "eval_samples_per_second": 21.179, "eval_steps_per_second": 0.671, "step": 1200 }, { "debug/policy_chosen_logits": 1.067408800125122, "debug/policy_chosen_logps": -250.3118896484375, "debug/policy_rejected_logits": 1.3286750316619873, "debug/policy_rejected_logps": -297.41204833984375, "debug/reference_chosen_logps": -251.0874481201172, "debug/reference_rejected_logps": -295.08001708984375, "debug/sppo_chosen_loss": 2468.00439453125, "debug/sppo_chosen_reward_in_loss": 0.7755517959594727, "debug/sppo_rej_reward_in_loss": -2.33207631111145, "debug/sppo_reject_loss": 2334.721923828125, "epoch": 4.365942028985507, "grad_norm": 85587.71909780419, "learning_rate": 4.7580645161290323e-08, "logits/chosen": 1.067408800125122, "logits/rejected": 1.3286750316619873, "logps/chosen": -250.3118896484375, "logps/rejected": -297.41204833984375, "loss": 4580.6469, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.007755516562610865, "rewards/margins": 0.031076278537511826, "rewards/rejected": -0.023320764303207397, "step": 1205 }, { "debug/policy_chosen_logits": 1.3632538318634033, "debug/policy_chosen_logps": -232.198974609375, "debug/policy_rejected_logits": 1.7290977239608765, "debug/policy_rejected_logps": -266.6654357910156, "debug/reference_chosen_logps": -235.09268188476562, "debug/reference_rejected_logps": -262.50762939453125, "debug/sppo_chosen_loss": 2229.469970703125, "debug/sppo_chosen_reward_in_loss": 2.893690347671509, "debug/sppo_rej_reward_in_loss": -4.1578145027160645, "debug/sppo_reject_loss": 2217.698486328125, "epoch": 4.384057971014493, "grad_norm": 125771.48157969775, "learning_rate": 4.734345351043643e-08, "logits/chosen": 1.3632538318634033, "logits/rejected": 1.7290977239608765, "logps/chosen": -232.198974609375, "logps/rejected": -266.6654357910156, "loss": 4492.3219, "rewards/accuracies": 0.75, "rewards/chosen": 0.028936902061104774, "rewards/margins": 0.07051504403352737, "rewards/rejected": -0.04157814383506775, "step": 1210 }, { "debug/policy_chosen_logits": 1.1103360652923584, "debug/policy_chosen_logps": -273.6535339355469, "debug/policy_rejected_logits": 1.1567230224609375, "debug/policy_rejected_logps": -274.80780029296875, "debug/reference_chosen_logps": -274.5265197753906, "debug/reference_rejected_logps": -268.24322509765625, "debug/sppo_chosen_loss": 2472.103515625, "debug/sppo_chosen_reward_in_loss": 0.8729490041732788, "debug/sppo_rej_reward_in_loss": -6.564622402191162, "debug/sppo_reject_loss": 1992.9027099609375, "epoch": 4.4021739130434785, "grad_norm": 60334.67741453669, "learning_rate": 4.710626185958254e-08, "logits/chosen": 1.1103360652923584, "logits/rejected": 1.1567230224609375, "logps/chosen": -273.6535339355469, "logps/rejected": -274.80780029296875, "loss": 4563.4563, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.008729489520192146, "rewards/margins": 0.07437571883201599, "rewards/rejected": -0.0656462237238884, "step": 1215 }, { "debug/policy_chosen_logits": 0.8364113569259644, "debug/policy_chosen_logps": -228.9216766357422, "debug/policy_rejected_logits": 1.1603446006774902, "debug/policy_rejected_logps": -279.7967529296875, "debug/reference_chosen_logps": -235.14913940429688, "debug/reference_rejected_logps": -283.31939697265625, "debug/sppo_chosen_loss": 1952.086669921875, "debug/sppo_chosen_reward_in_loss": 6.227484226226807, "debug/sppo_rej_reward_in_loss": 3.522630214691162, "debug/sppo_reject_loss": 2916.21923828125, "epoch": 4.420289855072464, "grad_norm": 121161.27347688074, "learning_rate": 4.686907020872865e-08, "logits/chosen": 0.8364113569259644, "logits/rejected": 1.1603446006774902, "logps/chosen": -228.9216766357422, "logps/rejected": -279.7967529296875, "loss": 4926.9902, "rewards/accuracies": 0.625, "rewards/chosen": 0.062274836003780365, "rewards/margins": 0.027048539370298386, "rewards/rejected": 0.035226304084062576, "step": 1220 }, { "debug/policy_chosen_logits": 1.3064606189727783, "debug/policy_chosen_logps": -246.05990600585938, "debug/policy_rejected_logits": 1.4045573472976685, "debug/policy_rejected_logps": -253.72244262695312, "debug/reference_chosen_logps": -254.294189453125, "debug/reference_rejected_logps": -261.4356689453125, "debug/sppo_chosen_loss": 1813.9495849609375, "debug/sppo_chosen_reward_in_loss": 8.2342529296875, "debug/sppo_rej_reward_in_loss": 7.7132568359375, "debug/sppo_reject_loss": 3369.33544921875, "epoch": 4.438405797101449, "grad_norm": 116422.87599193433, "learning_rate": 4.6631878557874757e-08, "logits/chosen": 1.3064606189727783, "logits/rejected": 1.4045573472976685, "logps/chosen": -246.05990600585938, "logps/rejected": -253.72244262695312, "loss": 5229.232, "rewards/accuracies": 0.625, "rewards/chosen": 0.08234253525733948, "rewards/margins": 0.00520996144041419, "rewards/rejected": 0.07713256776332855, "step": 1225 }, { "debug/policy_chosen_logits": 1.191929578781128, "debug/policy_chosen_logps": -256.81378173828125, "debug/policy_rejected_logits": 1.5418320894241333, "debug/policy_rejected_logps": -293.60113525390625, "debug/reference_chosen_logps": -262.35906982421875, "debug/reference_rejected_logps": -297.26397705078125, "debug/sppo_chosen_loss": 2027.944580078125, "debug/sppo_chosen_reward_in_loss": 5.54526424407959, "debug/sppo_rej_reward_in_loss": 3.662853717803955, "debug/sppo_reject_loss": 2952.05615234375, "epoch": 4.456521739130435, "grad_norm": 82205.84252702523, "learning_rate": 4.639468690702087e-08, "logits/chosen": 1.191929578781128, "logits/rejected": 1.5418320894241333, "logps/chosen": -256.81378173828125, "logps/rejected": -293.60113525390625, "loss": 4801.8328, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.05545263737440109, "rewards/margins": 0.018824102357029915, "rewards/rejected": 0.03662853688001633, "step": 1230 }, { "debug/policy_chosen_logits": 1.183424472808838, "debug/policy_chosen_logps": -240.068603515625, "debug/policy_rejected_logits": 1.5198001861572266, "debug/policy_rejected_logps": -285.1687927246094, "debug/reference_chosen_logps": -245.56253051757812, "debug/reference_rejected_logps": -283.6595153808594, "debug/sppo_chosen_loss": 2000.331787109375, "debug/sppo_chosen_reward_in_loss": 5.493918418884277, "debug/sppo_rej_reward_in_loss": -1.5092995166778564, "debug/sppo_reject_loss": 2439.949951171875, "epoch": 4.47463768115942, "grad_norm": 98229.55903472017, "learning_rate": 4.615749525616698e-08, "logits/chosen": 1.183424472808838, "logits/rejected": 1.5198001861572266, "logps/chosen": -240.068603515625, "logps/rejected": -285.1687927246094, "loss": 4613.166, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.054939188063144684, "rewards/margins": 0.07003218680620193, "rewards/rejected": -0.015092995949089527, "step": 1235 }, { "debug/policy_chosen_logits": 0.9283466339111328, "debug/policy_chosen_logps": -250.263427734375, "debug/policy_rejected_logits": 1.5370986461639404, "debug/policy_rejected_logps": -301.86749267578125, "debug/reference_chosen_logps": -252.189697265625, "debug/reference_rejected_logps": -299.5684814453125, "debug/sppo_chosen_loss": 2362.83544921875, "debug/sppo_chosen_reward_in_loss": 1.9262689352035522, "debug/sppo_rej_reward_in_loss": -2.2989847660064697, "debug/sppo_reject_loss": 2399.856689453125, "epoch": 4.492753623188406, "grad_norm": 133429.71123640475, "learning_rate": 4.5920303605313095e-08, "logits/chosen": 0.9283466339111328, "logits/rejected": 1.5370986461639404, "logps/chosen": -250.263427734375, "logps/rejected": -301.86749267578125, "loss": 4648.6891, "rewards/accuracies": 0.625, "rewards/chosen": 0.019262690097093582, "rewards/margins": 0.04225253686308861, "rewards/rejected": -0.022989843040704727, "step": 1240 }, { "debug/policy_chosen_logits": 1.2463074922561646, "debug/policy_chosen_logps": -245.46047973632812, "debug/policy_rejected_logits": 1.4078823328018188, "debug/policy_rejected_logps": -263.62945556640625, "debug/reference_chosen_logps": -249.0845184326172, "debug/reference_rejected_logps": -261.7829895019531, "debug/sppo_chosen_loss": 2180.987548828125, "debug/sppo_chosen_reward_in_loss": 3.624049425125122, "debug/sppo_rej_reward_in_loss": -1.8464775085449219, "debug/sppo_reject_loss": 2428.087158203125, "epoch": 4.510869565217392, "grad_norm": 118091.0269969167, "learning_rate": 4.56831119544592e-08, "logits/chosen": 1.2463074922561646, "logits/rejected": 1.4078823328018188, "logps/chosen": -245.46047973632812, "logps/rejected": -263.62945556640625, "loss": 4594.9641, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03624049574136734, "rewards/margins": 0.05470527336001396, "rewards/rejected": -0.018464773893356323, "step": 1245 }, { "debug/policy_chosen_logits": 0.8258849382400513, "debug/policy_chosen_logps": -250.5426788330078, "debug/policy_rejected_logits": 1.2838810682296753, "debug/policy_rejected_logps": -309.7364196777344, "debug/reference_chosen_logps": -252.380859375, "debug/reference_rejected_logps": -305.73828125, "debug/sppo_chosen_loss": 2360.91943359375, "debug/sppo_chosen_reward_in_loss": 1.8381904363632202, "debug/sppo_rej_reward_in_loss": -3.998156785964966, "debug/sppo_reject_loss": 2221.0361328125, "epoch": 4.528985507246377, "grad_norm": 63485.26937294712, "learning_rate": 4.544592030360531e-08, "logits/chosen": 0.8258849382400513, "logits/rejected": 1.2838810682296753, "logps/chosen": -250.5426788330078, "logps/rejected": -309.7364196777344, "loss": 4567.9727, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.01838190294802189, "rewards/margins": 0.05836346745491028, "rewards/rejected": -0.03998156636953354, "step": 1250 }, { "debug/policy_chosen_logits": 1.1112148761749268, "debug/policy_chosen_logps": -236.4386749267578, "debug/policy_rejected_logits": 1.2074832916259766, "debug/policy_rejected_logps": -281.12060546875, "debug/reference_chosen_logps": -238.1943359375, "debug/reference_rejected_logps": -277.8025817871094, "debug/sppo_chosen_loss": 2374.26806640625, "debug/sppo_chosen_reward_in_loss": 1.755666971206665, "debug/sppo_rej_reward_in_loss": -3.318004608154297, "debug/sppo_reject_loss": 2285.48095703125, "epoch": 4.547101449275362, "grad_norm": 61035.411788180565, "learning_rate": 4.520872865275142e-08, "logits/chosen": 1.1112148761749268, "logits/rejected": 1.2074832916259766, "logps/chosen": -236.4386749267578, "logps/rejected": -281.12060546875, "loss": 4497.4, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.01755666732788086, "rewards/margins": 0.050736717879772186, "rewards/rejected": -0.03318004682660103, "step": 1255 }, { "debug/policy_chosen_logits": 1.1837530136108398, "debug/policy_chosen_logps": -230.02969360351562, "debug/policy_rejected_logits": 1.231287956237793, "debug/policy_rejected_logps": -251.0710906982422, "debug/reference_chosen_logps": -231.77627563476562, "debug/reference_rejected_logps": -250.15414428710938, "debug/sppo_chosen_loss": 2365.934326171875, "debug/sppo_chosen_reward_in_loss": 1.7465994358062744, "debug/sppo_rej_reward_in_loss": -0.9169605374336243, "debug/sppo_reject_loss": 2498.203857421875, "epoch": 4.565217391304348, "grad_norm": 64800.6387946322, "learning_rate": 4.497153700189753e-08, "logits/chosen": 1.1837530136108398, "logits/rejected": 1.231287956237793, "logps/chosen": -230.02969360351562, "logps/rejected": -251.0710906982422, "loss": 4462.65, "rewards/accuracies": 0.625, "rewards/chosen": 0.017465993762016296, "rewards/margins": 0.02663559839129448, "rewards/rejected": -0.009169605560600758, "step": 1260 }, { "debug/policy_chosen_logits": 1.3405652046203613, "debug/policy_chosen_logps": -251.4224853515625, "debug/policy_rejected_logits": 1.5229171514511108, "debug/policy_rejected_logps": -292.28814697265625, "debug/reference_chosen_logps": -252.33349609375, "debug/reference_rejected_logps": -288.603515625, "debug/sppo_chosen_loss": 2467.6259765625, "debug/sppo_chosen_reward_in_loss": 0.9110046625137329, "debug/sppo_rej_reward_in_loss": -3.6846516132354736, "debug/sppo_reject_loss": 2307.93896484375, "epoch": 4.583333333333333, "grad_norm": 60594.901757938394, "learning_rate": 4.473434535104364e-08, "logits/chosen": 1.3405652046203613, "logits/rejected": 1.5229171514511108, "logps/chosen": -251.4224853515625, "logps/rejected": -292.28814697265625, "loss": 4589.5129, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.009110046550631523, "rewards/margins": 0.0459565594792366, "rewards/rejected": -0.03684651479125023, "step": 1265 }, { "debug/policy_chosen_logits": 1.3859663009643555, "debug/policy_chosen_logps": -279.2012634277344, "debug/policy_rejected_logits": 1.4668617248535156, "debug/policy_rejected_logps": -312.72613525390625, "debug/reference_chosen_logps": -279.77496337890625, "debug/reference_rejected_logps": -307.9085693359375, "debug/sppo_chosen_loss": 2518.072265625, "debug/sppo_chosen_reward_in_loss": 0.5736767053604126, "debug/sppo_rej_reward_in_loss": -4.817553997039795, "debug/sppo_reject_loss": 2152.855712890625, "epoch": 4.601449275362318, "grad_norm": 80435.35232282539, "learning_rate": 4.449715370018975e-08, "logits/chosen": 1.3859663009643555, "logits/rejected": 1.4668617248535156, "logps/chosen": -279.2012634277344, "logps/rejected": -312.72613525390625, "loss": 4579.2359, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.0057367668487131596, "rewards/margins": 0.05391230434179306, "rewards/rejected": -0.04817553982138634, "step": 1270 }, { "debug/policy_chosen_logits": 1.1441730260849, "debug/policy_chosen_logps": -258.90484619140625, "debug/policy_rejected_logits": 1.4566986560821533, "debug/policy_rejected_logps": -294.3694763183594, "debug/reference_chosen_logps": -259.3207092285156, "debug/reference_rejected_logps": -291.1746520996094, "debug/sppo_chosen_loss": 2506.06201171875, "debug/sppo_chosen_reward_in_loss": 0.41588249802589417, "debug/sppo_rej_reward_in_loss": -3.1948275566101074, "debug/sppo_reject_loss": 2271.953369140625, "epoch": 4.619565217391305, "grad_norm": 84049.06878952609, "learning_rate": 4.4259962049335866e-08, "logits/chosen": 1.1441730260849, "logits/rejected": 1.4566986560821533, "logps/chosen": -258.90484619140625, "logps/rejected": -294.3694763183594, "loss": 4581.1617, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.004158825613558292, "rewards/margins": 0.036107100546360016, "rewards/rejected": -0.0319482758641243, "step": 1275 }, { "debug/policy_chosen_logits": 1.4204843044281006, "debug/policy_chosen_logps": -270.0739440917969, "debug/policy_rejected_logits": 1.2980986833572388, "debug/policy_rejected_logps": -270.5966491699219, "debug/reference_chosen_logps": -270.97503662109375, "debug/reference_rejected_logps": -266.56317138671875, "debug/sppo_chosen_loss": 2444.6591796875, "debug/sppo_chosen_reward_in_loss": 0.901079535484314, "debug/sppo_rej_reward_in_loss": -4.03351354598999, "debug/sppo_reject_loss": 2220.99365234375, "epoch": 4.63768115942029, "grad_norm": 114762.52328443929, "learning_rate": 4.4022770398481974e-08, "logits/chosen": 1.4204843044281006, "logits/rejected": 1.2980986833572388, "logps/chosen": -270.0739440917969, "logps/rejected": -270.5966491699219, "loss": 4652.9906, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.009010795503854752, "rewards/margins": 0.04934592917561531, "rewards/rejected": -0.04033513739705086, "step": 1280 }, { "debug/policy_chosen_logits": 1.1395741701126099, "debug/policy_chosen_logps": -217.71621704101562, "debug/policy_rejected_logits": 1.5526567697525024, "debug/policy_rejected_logps": -268.71380615234375, "debug/reference_chosen_logps": -220.8345947265625, "debug/reference_rejected_logps": -263.44635009765625, "debug/sppo_chosen_loss": 2206.55517578125, "debug/sppo_chosen_reward_in_loss": 3.118389844894409, "debug/sppo_rej_reward_in_loss": -5.26749849319458, "debug/sppo_reject_loss": 2182.8447265625, "epoch": 4.655797101449275, "grad_norm": 66276.94820416739, "learning_rate": 4.378557874762808e-08, "logits/chosen": 1.1395741701126099, "logits/rejected": 1.5526567697525024, "logps/chosen": -217.71621704101562, "logps/rejected": -268.71380615234375, "loss": 4592.6031, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.031183898448944092, "rewards/margins": 0.08385888487100601, "rewards/rejected": -0.05267498642206192, "step": 1285 }, { "debug/policy_chosen_logits": 1.063277006149292, "debug/policy_chosen_logps": -240.34963989257812, "debug/policy_rejected_logits": 1.464519739151001, "debug/policy_rejected_logps": -289.8151550292969, "debug/reference_chosen_logps": -240.63711547851562, "debug/reference_rejected_logps": -285.86376953125, "debug/sppo_chosen_loss": 2540.53857421875, "debug/sppo_chosen_reward_in_loss": 0.2874833941459656, "debug/sppo_rej_reward_in_loss": -3.951389789581299, "debug/sppo_reject_loss": 2214.654296875, "epoch": 4.673913043478261, "grad_norm": 66719.470123868, "learning_rate": 4.354838709677419e-08, "logits/chosen": 1.063277006149292, "logits/rejected": 1.464519739151001, "logps/chosen": -240.34963989257812, "logps/rejected": -289.8151550292969, "loss": 4571.2547, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0028748351614922285, "rewards/margins": 0.04238872975111008, "rewards/rejected": -0.03951389342546463, "step": 1290 }, { "debug/policy_chosen_logits": 1.188763976097107, "debug/policy_chosen_logps": -250.81570434570312, "debug/policy_rejected_logits": 1.6729280948638916, "debug/policy_rejected_logps": -293.1302490234375, "debug/reference_chosen_logps": -250.30029296875, "debug/reference_rejected_logps": -286.7928771972656, "debug/sppo_chosen_loss": 2647.440185546875, "debug/sppo_chosen_reward_in_loss": -0.5154016613960266, "debug/sppo_rej_reward_in_loss": -6.337368965148926, "debug/sppo_reject_loss": 2078.28466796875, "epoch": 4.692028985507246, "grad_norm": 67109.84903424514, "learning_rate": 4.33111954459203e-08, "logits/chosen": 1.188763976097107, "logits/rejected": 1.6729280948638916, "logps/chosen": -250.81570434570312, "logps/rejected": -293.1302490234375, "loss": 4461.2637, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.005154015962034464, "rewards/margins": 0.05821967124938965, "rewards/rejected": -0.06337368488311768, "step": 1295 }, { "debug/policy_chosen_logits": 1.2608754634857178, "debug/policy_chosen_logps": -280.30352783203125, "debug/policy_rejected_logits": 1.3286454677581787, "debug/policy_rejected_logps": -276.49896240234375, "debug/reference_chosen_logps": -281.0545959472656, "debug/reference_rejected_logps": -272.4754333496094, "debug/sppo_chosen_loss": 2482.15673828125, "debug/sppo_chosen_reward_in_loss": 0.7510309219360352, "debug/sppo_rej_reward_in_loss": -4.023531913757324, "debug/sppo_reject_loss": 2223.169921875, "epoch": 4.710144927536232, "grad_norm": 75721.34598090543, "learning_rate": 4.307400379506641e-08, "logits/chosen": 1.2608754634857178, "logits/rejected": 1.3286454677581787, "logps/chosen": -280.30352783203125, "logps/rejected": -276.49896240234375, "loss": 4688.9563, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.007510309107601643, "rewards/margins": 0.04774563014507294, "rewards/rejected": -0.04023531824350357, "step": 1300 }, { "epoch": 4.710144927536232, "eval_debug/policy_chosen_logits": 1.4244481325149536, "eval_debug/policy_chosen_logps": -252.9359588623047, "eval_debug/policy_rejected_logits": 1.4724959135055542, "eval_debug/policy_rejected_logps": -263.3884582519531, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2567.22900390625, "eval_debug/sppo_chosen_reward_in_loss": -0.01748933270573616, "eval_debug/sppo_rej_reward_in_loss": -3.729843854904175, "eval_debug/sppo_reject_loss": 2285.481201171875, "eval_logits/chosen": 1.4244481325149536, "eval_logits/rejected": 1.4724959135055542, "eval_logps/chosen": -252.9359588623047, "eval_logps/rejected": -263.3884582519531, "eval_loss": 4654.3955078125, "eval_rewards/accuracies": 0.5657894611358643, "eval_rewards/chosen": -0.00017489335732534528, "eval_rewards/margins": 0.03712354227900505, "eval_rewards/rejected": -0.03729843348264694, "eval_runtime": 28.3246, "eval_samples_per_second": 21.183, "eval_steps_per_second": 0.671, "step": 1300 }, { "debug/policy_chosen_logits": 1.0196096897125244, "debug/policy_chosen_logps": -242.3280029296875, "debug/policy_rejected_logits": 1.3811008930206299, "debug/policy_rejected_logps": -283.93609619140625, "debug/reference_chosen_logps": -243.1513214111328, "debug/reference_rejected_logps": -277.0658264160156, "debug/sppo_chosen_loss": 2451.96533203125, "debug/sppo_chosen_reward_in_loss": 0.8233364224433899, "debug/sppo_rej_reward_in_loss": -6.870279788970947, "debug/sppo_reject_loss": 1987.565673828125, "epoch": 4.728260869565218, "grad_norm": 70239.10027300968, "learning_rate": 4.283681214421252e-08, "logits/chosen": 1.0196096897125244, "logits/rejected": 1.3811008930206299, "logps/chosen": -242.3280029296875, "logps/rejected": -283.93609619140625, "loss": 4542.775, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.00823336374014616, "rewards/margins": 0.07693615555763245, "rewards/rejected": -0.06870279461145401, "step": 1305 }, { "debug/policy_chosen_logits": 1.2350099086761475, "debug/policy_chosen_logps": -255.0806427001953, "debug/policy_rejected_logits": 1.450093388557434, "debug/policy_rejected_logps": -293.8785095214844, "debug/reference_chosen_logps": -256.298828125, "debug/reference_rejected_logps": -287.5867614746094, "debug/sppo_chosen_loss": 2414.80712890625, "debug/sppo_chosen_reward_in_loss": 1.2181905508041382, "debug/sppo_rej_reward_in_loss": -6.291740417480469, "debug/sppo_reject_loss": 2032.808349609375, "epoch": 4.746376811594203, "grad_norm": 73576.70147156833, "learning_rate": 4.259962049335864e-08, "logits/chosen": 1.2350099086761475, "logits/rejected": 1.450093388557434, "logps/chosen": -255.0806427001953, "logps/rejected": -293.8785095214844, "loss": 4513.3492, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.012181905098259449, "rewards/margins": 0.07509930431842804, "rewards/rejected": -0.06291739642620087, "step": 1310 }, { "debug/policy_chosen_logits": 1.4703645706176758, "debug/policy_chosen_logps": -258.75982666015625, "debug/policy_rejected_logits": 1.772477149963379, "debug/policy_rejected_logps": -283.7424011230469, "debug/reference_chosen_logps": -258.59027099609375, "debug/reference_rejected_logps": -279.9950256347656, "debug/sppo_chosen_loss": 2588.653076171875, "debug/sppo_chosen_reward_in_loss": -0.1695549041032791, "debug/sppo_rej_reward_in_loss": -3.74739408493042, "debug/sppo_reject_loss": 2241.117919921875, "epoch": 4.7644927536231885, "grad_norm": 70953.52895055551, "learning_rate": 4.2362428842504745e-08, "logits/chosen": 1.4703645706176758, "logits/rejected": 1.772477149963379, "logps/chosen": -258.75982666015625, "logps/rejected": -283.7424011230469, "loss": 4583.2578, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0016955494647845626, "rewards/margins": 0.03577839583158493, "rewards/rejected": -0.03747394308447838, "step": 1315 }, { "debug/policy_chosen_logits": 1.0896484851837158, "debug/policy_chosen_logps": -244.1891632080078, "debug/policy_rejected_logits": 1.4297540187835693, "debug/policy_rejected_logps": -287.01385498046875, "debug/reference_chosen_logps": -244.8174591064453, "debug/reference_rejected_logps": -283.5927429199219, "debug/sppo_chosen_loss": 2458.44287109375, "debug/sppo_chosen_reward_in_loss": 0.628299355506897, "debug/sppo_rej_reward_in_loss": -3.4211173057556152, "debug/sppo_reject_loss": 2274.162109375, "epoch": 4.782608695652174, "grad_norm": 62799.10745582101, "learning_rate": 4.2125237191650854e-08, "logits/chosen": 1.0896484851837158, "logits/rejected": 1.4297540187835693, "logps/chosen": -244.1891632080078, "logps/rejected": -287.01385498046875, "loss": 4661.557, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.006282993592321873, "rewards/margins": 0.0404941663146019, "rewards/rejected": -0.0342111699283123, "step": 1320 }, { "debug/policy_chosen_logits": 1.1600326299667358, "debug/policy_chosen_logps": -245.1553955078125, "debug/policy_rejected_logits": 1.3645192384719849, "debug/policy_rejected_logps": -301.7093200683594, "debug/reference_chosen_logps": -247.78457641601562, "debug/reference_rejected_logps": -296.8087463378906, "debug/sppo_chosen_loss": 2257.982666015625, "debug/sppo_chosen_reward_in_loss": 2.6291584968566895, "debug/sppo_rej_reward_in_loss": -4.900609016418457, "debug/sppo_reject_loss": 2124.41552734375, "epoch": 4.800724637681159, "grad_norm": 61293.21736585257, "learning_rate": 4.188804554079696e-08, "logits/chosen": 1.1600326299667358, "logits/rejected": 1.3645192384719849, "logps/chosen": -245.1553955078125, "logps/rejected": -301.7093200683594, "loss": 4410.2211, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.026291584596037865, "rewards/margins": 0.07529766857624054, "rewards/rejected": -0.049006082117557526, "step": 1325 }, { "debug/policy_chosen_logits": 1.029860258102417, "debug/policy_chosen_logps": -258.18927001953125, "debug/policy_rejected_logits": 1.3573116064071655, "debug/policy_rejected_logps": -332.4870910644531, "debug/reference_chosen_logps": -260.9222717285156, "debug/reference_rejected_logps": -326.95977783203125, "debug/sppo_chosen_loss": 2252.29736328125, "debug/sppo_chosen_reward_in_loss": 2.7329752445220947, "debug/sppo_rej_reward_in_loss": -5.527298927307129, "debug/sppo_reject_loss": 2118.50537109375, "epoch": 4.818840579710145, "grad_norm": 73642.55652860063, "learning_rate": 4.165085388994307e-08, "logits/chosen": 1.029860258102417, "logits/rejected": 1.3573116064071655, "logps/chosen": -258.18927001953125, "logps/rejected": -332.4870910644531, "loss": 4517.1781, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.02732974849641323, "rewards/margins": 0.08260273933410645, "rewards/rejected": -0.05527298524975777, "step": 1330 }, { "debug/policy_chosen_logits": 1.2658741474151611, "debug/policy_chosen_logps": -243.64553833007812, "debug/policy_rejected_logits": 1.6032794713974, "debug/policy_rejected_logps": -297.21173095703125, "debug/reference_chosen_logps": -245.0940399169922, "debug/reference_rejected_logps": -291.6605529785156, "debug/sppo_chosen_loss": 2407.762939453125, "debug/sppo_chosen_reward_in_loss": 1.4485273361206055, "debug/sppo_rej_reward_in_loss": -5.551199436187744, "debug/sppo_reject_loss": 2149.141357421875, "epoch": 4.836956521739131, "grad_norm": 88725.9836289771, "learning_rate": 4.141366223908918e-08, "logits/chosen": 1.2658741474151611, "logits/rejected": 1.6032794713974, "logps/chosen": -243.64553833007812, "logps/rejected": -297.21173095703125, "loss": 4569.8363, "rewards/accuracies": 0.75, "rewards/chosen": 0.014485272578895092, "rewards/margins": 0.06999726593494415, "rewards/rejected": -0.05551199987530708, "step": 1335 }, { "debug/policy_chosen_logits": 1.0831992626190186, "debug/policy_chosen_logps": -223.33621215820312, "debug/policy_rejected_logits": 1.5287272930145264, "debug/policy_rejected_logps": -284.8612060546875, "debug/reference_chosen_logps": -223.690185546875, "debug/reference_rejected_logps": -279.407958984375, "debug/sppo_chosen_loss": 2546.77490234375, "debug/sppo_chosen_reward_in_loss": 0.35397204756736755, "debug/sppo_rej_reward_in_loss": -5.453265190124512, "debug/sppo_reject_loss": 2130.91162109375, "epoch": 4.855072463768116, "grad_norm": 73562.88686829025, "learning_rate": 4.1176470588235293e-08, "logits/chosen": 1.0831992626190186, "logits/rejected": 1.5287272930145264, "logps/chosen": -223.33621215820312, "logps/rejected": -284.8612060546875, "loss": 4676.3711, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.003539721015840769, "rewards/margins": 0.058072369545698166, "rewards/rejected": -0.054532647132873535, "step": 1340 }, { "debug/policy_chosen_logits": 1.31937837600708, "debug/policy_chosen_logps": -266.54534912109375, "debug/policy_rejected_logits": 1.643914818763733, "debug/policy_rejected_logps": -268.93243408203125, "debug/reference_chosen_logps": -267.3473815917969, "debug/reference_rejected_logps": -263.4280700683594, "debug/sppo_chosen_loss": 2462.66455078125, "debug/sppo_chosen_reward_in_loss": 0.802037239074707, "debug/sppo_rej_reward_in_loss": -5.50433874130249, "debug/sppo_reject_loss": 2119.84619140625, "epoch": 4.8731884057971016, "grad_norm": 97143.69585325856, "learning_rate": 4.09392789373814e-08, "logits/chosen": 1.31937837600708, "logits/rejected": 1.643914818763733, "logps/chosen": -266.54534912109375, "logps/rejected": -268.93243408203125, "loss": 4642.3746, "rewards/accuracies": 0.625, "rewards/chosen": 0.008020373061299324, "rewards/margins": 0.06306375563144684, "rewards/rejected": -0.05504338815808296, "step": 1345 }, { "debug/policy_chosen_logits": 1.1010644435882568, "debug/policy_chosen_logps": -234.9674530029297, "debug/policy_rejected_logits": 1.4601472616195679, "debug/policy_rejected_logps": -277.87548828125, "debug/reference_chosen_logps": -235.99813842773438, "debug/reference_rejected_logps": -273.8133239746094, "debug/sppo_chosen_loss": 2455.3671875, "debug/sppo_chosen_reward_in_loss": 1.0306648015975952, "debug/sppo_rej_reward_in_loss": -4.062138080596924, "debug/sppo_reject_loss": 2207.40185546875, "epoch": 4.891304347826087, "grad_norm": 63081.6235879241, "learning_rate": 4.0702087286527517e-08, "logits/chosen": 1.1010644435882568, "logits/rejected": 1.4601472616195679, "logps/chosen": -234.9674530029297, "logps/rejected": -277.87548828125, "loss": 4530.6977, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.010306647047400475, "rewards/margins": 0.050928033888339996, "rewards/rejected": -0.040621377527713776, "step": 1350 }, { "debug/policy_chosen_logits": 1.2216109037399292, "debug/policy_chosen_logps": -256.34625244140625, "debug/policy_rejected_logits": 1.522156000137329, "debug/policy_rejected_logps": -273.95599365234375, "debug/reference_chosen_logps": -257.6533203125, "debug/reference_rejected_logps": -271.18536376953125, "debug/sppo_chosen_loss": 2415.692626953125, "debug/sppo_chosen_reward_in_loss": 1.307077169418335, "debug/sppo_rej_reward_in_loss": -2.7706494331359863, "debug/sppo_reject_loss": 2311.42578125, "epoch": 4.909420289855072, "grad_norm": 95990.32562794119, "learning_rate": 4.0464895635673625e-08, "logits/chosen": 1.2216109037399292, "logits/rejected": 1.522156000137329, "logps/chosen": -256.34625244140625, "logps/rejected": -273.95599365234375, "loss": 4593.0359, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.013070771470665932, "rewards/margins": 0.040777262300252914, "rewards/rejected": -0.027706492692232132, "step": 1355 }, { "debug/policy_chosen_logits": 1.0214922428131104, "debug/policy_chosen_logps": -245.90286254882812, "debug/policy_rejected_logits": 1.5517314672470093, "debug/policy_rejected_logps": -306.38360595703125, "debug/reference_chosen_logps": -247.7864227294922, "debug/reference_rejected_logps": -300.8656921386719, "debug/sppo_chosen_loss": 2353.67333984375, "debug/sppo_chosen_reward_in_loss": 1.8835837841033936, "debug/sppo_rej_reward_in_loss": -5.517895221710205, "debug/sppo_reject_loss": 2175.05419921875, "epoch": 4.927536231884058, "grad_norm": 68842.68376456067, "learning_rate": 4.022770398481973e-08, "logits/chosen": 1.0214922428131104, "logits/rejected": 1.5517314672470093, "logps/chosen": -245.90286254882812, "logps/rejected": -306.38360595703125, "loss": 4555.4187, "rewards/accuracies": 0.75, "rewards/chosen": 0.01883583888411522, "rewards/margins": 0.07401479035615921, "rewards/rejected": -0.05517895147204399, "step": 1360 }, { "debug/policy_chosen_logits": 1.2349607944488525, "debug/policy_chosen_logps": -255.0648956298828, "debug/policy_rejected_logits": 1.411770224571228, "debug/policy_rejected_logps": -283.4617614746094, "debug/reference_chosen_logps": -257.22283935546875, "debug/reference_rejected_logps": -277.9129943847656, "debug/sppo_chosen_loss": 2306.45361328125, "debug/sppo_chosen_reward_in_loss": 2.1579513549804688, "debug/sppo_rej_reward_in_loss": -5.548783302307129, "debug/sppo_reject_loss": 2074.717529296875, "epoch": 4.945652173913043, "grad_norm": 73017.45116909397, "learning_rate": 3.999051233396584e-08, "logits/chosen": 1.2349607944488525, "logits/rejected": 1.411770224571228, "logps/chosen": -255.0648956298828, "logps/rejected": -283.4617614746094, "loss": 4556.3383, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.02157951332628727, "rewards/margins": 0.07706734538078308, "rewards/rejected": -0.05548783019185066, "step": 1365 }, { "debug/policy_chosen_logits": 1.1194708347320557, "debug/policy_chosen_logps": -254.6831512451172, "debug/policy_rejected_logits": 1.5399668216705322, "debug/policy_rejected_logps": -323.66375732421875, "debug/reference_chosen_logps": -254.72119140625, "debug/reference_rejected_logps": -315.86517333984375, "debug/sppo_chosen_loss": 2591.28759765625, "debug/sppo_chosen_reward_in_loss": 0.03805046156048775, "debug/sppo_rej_reward_in_loss": -7.798565864562988, "debug/sppo_reject_loss": 1968.991455078125, "epoch": 4.963768115942029, "grad_norm": 137112.4779173004, "learning_rate": 3.975332068311195e-08, "logits/chosen": 1.1194708347320557, "logits/rejected": 1.5399668216705322, "logps/chosen": -254.6831512451172, "logps/rejected": -323.66375732421875, "loss": 4402.7445, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.00038050487637519836, "rewards/margins": 0.07836616039276123, "rewards/rejected": -0.07798565924167633, "step": 1370 }, { "debug/policy_chosen_logits": 1.4444690942764282, "debug/policy_chosen_logps": -259.66302490234375, "debug/policy_rejected_logits": 1.6535580158233643, "debug/policy_rejected_logps": -302.068359375, "debug/reference_chosen_logps": -260.85552978515625, "debug/reference_rejected_logps": -294.53472900390625, "debug/sppo_chosen_loss": 2425.268798828125, "debug/sppo_chosen_reward_in_loss": 1.1925013065338135, "debug/sppo_rej_reward_in_loss": -7.533595085144043, "debug/sppo_reject_loss": 1990.464111328125, "epoch": 4.981884057971015, "grad_norm": 72364.69086372273, "learning_rate": 3.951612903225806e-08, "logits/chosen": 1.4444690942764282, "logits/rejected": 1.6535580158233643, "logps/chosen": -259.66302490234375, "logps/rejected": -302.068359375, "loss": 4575.1687, "rewards/accuracies": 0.75, "rewards/chosen": 0.01192501187324524, "rewards/margins": 0.08726096898317337, "rewards/rejected": -0.07533595710992813, "step": 1375 }, { "debug/policy_chosen_logits": 1.4036178588867188, "debug/policy_chosen_logps": -272.2148132324219, "debug/policy_rejected_logits": 1.5798277854919434, "debug/policy_rejected_logps": -295.23297119140625, "debug/reference_chosen_logps": -273.52972412109375, "debug/reference_rejected_logps": -288.4667053222656, "debug/sppo_chosen_loss": 2391.544189453125, "debug/sppo_chosen_reward_in_loss": 1.314896821975708, "debug/sppo_rej_reward_in_loss": -6.766263008117676, "debug/sppo_reject_loss": 2000.7828369140625, "epoch": 5.0, "grad_norm": 66107.80312436711, "learning_rate": 3.927893738140417e-08, "logits/chosen": 1.4036178588867188, "logits/rejected": 1.5798277854919434, "logps/chosen": -272.2148132324219, "logps/rejected": -295.23297119140625, "loss": 4461.8727, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.013148968107998371, "rewards/margins": 0.08081159740686417, "rewards/rejected": -0.06766261905431747, "step": 1380 }, { "debug/policy_chosen_logits": 1.064527988433838, "debug/policy_chosen_logps": -262.5690002441406, "debug/policy_rejected_logits": 1.3464521169662476, "debug/policy_rejected_logps": -312.77923583984375, "debug/reference_chosen_logps": -262.3939514160156, "debug/reference_rejected_logps": -307.7593078613281, "debug/sppo_chosen_loss": 2593.99267578125, "debug/sppo_chosen_reward_in_loss": -0.17507171630859375, "debug/sppo_rej_reward_in_loss": -5.019980430603027, "debug/sppo_reject_loss": 2117.771728515625, "epoch": 5.018115942028985, "grad_norm": 60421.88552944837, "learning_rate": 3.904174573055029e-08, "logits/chosen": 1.064527988433838, "logits/rejected": 1.3464521169662476, "logps/chosen": -262.5690002441406, "logps/rejected": -312.77923583984375, "loss": 4663.3609, "rewards/accuracies": 0.75, "rewards/chosen": -0.001750717288814485, "rewards/margins": 0.0484490841627121, "rewards/rejected": -0.05019979923963547, "step": 1385 }, { "debug/policy_chosen_logits": 1.2746237516403198, "debug/policy_chosen_logps": -259.770751953125, "debug/policy_rejected_logits": 1.693315863609314, "debug/policy_rejected_logps": -294.4795837402344, "debug/reference_chosen_logps": -261.4613342285156, "debug/reference_rejected_logps": -290.87646484375, "debug/sppo_chosen_loss": 2360.163330078125, "debug/sppo_chosen_reward_in_loss": 1.6905481815338135, "debug/sppo_rej_reward_in_loss": -3.60308837890625, "debug/sppo_reject_loss": 2244.31298828125, "epoch": 5.036231884057971, "grad_norm": 66547.23966266622, "learning_rate": 3.8804554079696396e-08, "logits/chosen": 1.2746237516403198, "logits/rejected": 1.693315863609314, "logps/chosen": -259.770751953125, "logps/rejected": -294.4795837402344, "loss": 4668.377, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.01690548099577427, "rewards/margins": 0.0529363639652729, "rewards/rejected": -0.036030884832143784, "step": 1390 }, { "debug/policy_chosen_logits": 1.394603967666626, "debug/policy_chosen_logps": -271.45330810546875, "debug/policy_rejected_logits": 1.8004786968231201, "debug/policy_rejected_logps": -321.419189453125, "debug/reference_chosen_logps": -273.29937744140625, "debug/reference_rejected_logps": -315.52386474609375, "debug/sppo_chosen_loss": 2344.6923828125, "debug/sppo_chosen_reward_in_loss": 1.8460609912872314, "debug/sppo_rej_reward_in_loss": -5.895323276519775, "debug/sppo_reject_loss": 2063.442626953125, "epoch": 5.054347826086956, "grad_norm": 58559.782235879655, "learning_rate": 3.8567362428842504e-08, "logits/chosen": 1.394603967666626, "logits/rejected": 1.8004786968231201, "logps/chosen": -271.45330810546875, "logps/rejected": -321.419189453125, "loss": 4493.4766, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.018460609018802643, "rewards/margins": 0.07741384208202362, "rewards/rejected": -0.05895323306322098, "step": 1395 }, { "debug/policy_chosen_logits": 0.7532809972763062, "debug/policy_chosen_logps": -236.7574462890625, "debug/policy_rejected_logits": 1.2644236087799072, "debug/policy_rejected_logps": -309.5359802246094, "debug/reference_chosen_logps": -236.7089080810547, "debug/reference_rejected_logps": -302.9029235839844, "debug/sppo_chosen_loss": 2557.786376953125, "debug/sppo_chosen_reward_in_loss": -0.048520468175411224, "debug/sppo_rej_reward_in_loss": -6.633038520812988, "debug/sppo_reject_loss": 2042.6431884765625, "epoch": 5.072463768115942, "grad_norm": 70112.82165426727, "learning_rate": 3.833017077798861e-08, "logits/chosen": 0.7532809972763062, "logits/rejected": 1.2644236087799072, "logps/chosen": -236.7574462890625, "logps/rejected": -309.5359802246094, "loss": 4572.3969, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00048520491691306233, "rewards/margins": 0.06584517657756805, "rewards/rejected": -0.06633038818836212, "step": 1400 }, { "epoch": 5.072463768115942, "eval_debug/policy_chosen_logits": 1.4205509424209595, "eval_debug/policy_chosen_logps": -253.06065368652344, "eval_debug/policy_rejected_logits": 1.4662528038024902, "eval_debug/policy_rejected_logps": -263.6362609863281, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2580.254150390625, "eval_debug/sppo_chosen_reward_in_loss": -0.14217215776443481, "eval_debug/sppo_rej_reward_in_loss": -3.977602481842041, "eval_debug/sppo_reject_loss": 2263.763671875, "eval_logits/chosen": 1.4205509424209595, "eval_logits/rejected": 1.4662528038024902, "eval_logps/chosen": -253.06065368652344, "eval_logps/rejected": -263.6362609863281, "eval_loss": 4650.53515625, "eval_rewards/accuracies": 0.5789473652839661, "eval_rewards/chosen": -0.001421720371581614, "eval_rewards/margins": 0.0383542999625206, "eval_rewards/rejected": -0.0397760234773159, "eval_runtime": 28.5712, "eval_samples_per_second": 21.0, "eval_steps_per_second": 0.665, "step": 1400 }, { "debug/policy_chosen_logits": 1.0779249668121338, "debug/policy_chosen_logps": -253.6057891845703, "debug/policy_rejected_logits": 1.1504356861114502, "debug/policy_rejected_logps": -265.81036376953125, "debug/reference_chosen_logps": -255.2992401123047, "debug/reference_rejected_logps": -260.98126220703125, "debug/sppo_chosen_loss": 2361.61865234375, "debug/sppo_chosen_reward_in_loss": 1.6934499740600586, "debug/sppo_rej_reward_in_loss": -4.829095840454102, "debug/sppo_reject_loss": 2182.90478515625, "epoch": 5.090579710144928, "grad_norm": 61446.4033177843, "learning_rate": 3.809297912713472e-08, "logits/chosen": 1.0779249668121338, "logits/rejected": 1.1504356861114502, "logps/chosen": -253.6057891845703, "logps/rejected": -265.81036376953125, "loss": 4452.7141, "rewards/accuracies": 0.625, "rewards/chosen": 0.016934499144554138, "rewards/margins": 0.06522545963525772, "rewards/rejected": -0.048290956765413284, "step": 1405 }, { "debug/policy_chosen_logits": 1.2195689678192139, "debug/policy_chosen_logps": -260.621337890625, "debug/policy_rejected_logits": 1.4920828342437744, "debug/policy_rejected_logps": -291.0221252441406, "debug/reference_chosen_logps": -260.37481689453125, "debug/reference_rejected_logps": -285.4285888671875, "debug/sppo_chosen_loss": 2593.103515625, "debug/sppo_chosen_reward_in_loss": -0.2464984953403473, "debug/sppo_rej_reward_in_loss": -5.593526363372803, "debug/sppo_reject_loss": 2117.669921875, "epoch": 5.108695652173913, "grad_norm": 65114.364606223855, "learning_rate": 3.785578747628083e-08, "logits/chosen": 1.2195689678192139, "logits/rejected": 1.4920828342437744, "logps/chosen": -260.621337890625, "logps/rejected": -291.0221252441406, "loss": 4644.5016, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00246498454362154, "rewards/margins": 0.05347027629613876, "rewards/rejected": -0.05593526363372803, "step": 1410 }, { "debug/policy_chosen_logits": 1.313755989074707, "debug/policy_chosen_logps": -271.34710693359375, "debug/policy_rejected_logits": 1.3349696397781372, "debug/policy_rejected_logps": -280.6801452636719, "debug/reference_chosen_logps": -272.76300048828125, "debug/reference_rejected_logps": -275.5526428222656, "debug/sppo_chosen_loss": 2393.33984375, "debug/sppo_chosen_reward_in_loss": 1.4158799648284912, "debug/sppo_rej_reward_in_loss": -5.127492427825928, "debug/sppo_reject_loss": 2140.75048828125, "epoch": 5.1268115942028984, "grad_norm": 64214.973795608035, "learning_rate": 3.7618595825426944e-08, "logits/chosen": 1.313755989074707, "logits/rejected": 1.3349696397781372, "logps/chosen": -271.34710693359375, "logps/rejected": -280.6801452636719, "loss": 4589.2063, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.01415880024433136, "rewards/margins": 0.06543372571468353, "rewards/rejected": -0.05127492547035217, "step": 1415 }, { "debug/policy_chosen_logits": 1.3006422519683838, "debug/policy_chosen_logps": -257.43109130859375, "debug/policy_rejected_logits": 1.551833152770996, "debug/policy_rejected_logps": -303.74664306640625, "debug/reference_chosen_logps": -260.3376159667969, "debug/reference_rejected_logps": -298.91290283203125, "debug/sppo_chosen_loss": 2242.43212890625, "debug/sppo_chosen_reward_in_loss": 2.9064974784851074, "debug/sppo_rej_reward_in_loss": -4.833803176879883, "debug/sppo_reject_loss": 2155.903564453125, "epoch": 5.144927536231884, "grad_norm": 80996.1993470657, "learning_rate": 3.738140417457305e-08, "logits/chosen": 1.3006422519683838, "logits/rejected": 1.551833152770996, "logps/chosen": -257.43109130859375, "logps/rejected": -303.74664306640625, "loss": 4512.9945, "rewards/accuracies": 0.75, "rewards/chosen": 0.029064977541565895, "rewards/margins": 0.0774030089378357, "rewards/rejected": -0.04833803325891495, "step": 1420 }, { "debug/policy_chosen_logits": 1.0275158882141113, "debug/policy_chosen_logps": -281.12945556640625, "debug/policy_rejected_logits": 1.1934149265289307, "debug/policy_rejected_logps": -283.98614501953125, "debug/reference_chosen_logps": -281.7276306152344, "debug/reference_rejected_logps": -278.9505615234375, "debug/sppo_chosen_loss": 2505.123046875, "debug/sppo_chosen_reward_in_loss": 0.5981689691543579, "debug/sppo_rej_reward_in_loss": -5.035550117492676, "debug/sppo_reject_loss": 2070.24609375, "epoch": 5.163043478260869, "grad_norm": 84894.2129125988, "learning_rate": 3.714421252371917e-08, "logits/chosen": 1.0275158882141113, "logits/rejected": 1.1934149265289307, "logps/chosen": -281.12945556640625, "logps/rejected": -283.98614501953125, "loss": 4495.2781, "rewards/accuracies": 0.75, "rewards/chosen": 0.005981688387691975, "rewards/margins": 0.05633718892931938, "rewards/rejected": -0.05035550147294998, "step": 1425 }, { "debug/policy_chosen_logits": 1.023339033126831, "debug/policy_chosen_logps": -226.9719696044922, "debug/policy_rejected_logits": 1.429206132888794, "debug/policy_rejected_logps": -333.075439453125, "debug/reference_chosen_logps": -230.421142578125, "debug/reference_rejected_logps": -324.8152160644531, "debug/sppo_chosen_loss": 2208.994140625, "debug/sppo_chosen_reward_in_loss": 3.449148654937744, "debug/sppo_rej_reward_in_loss": -8.260204315185547, "debug/sppo_reject_loss": 1903.4114990234375, "epoch": 5.181159420289855, "grad_norm": 62782.57426780251, "learning_rate": 3.6907020872865276e-08, "logits/chosen": 1.023339033126831, "logits/rejected": 1.429206132888794, "logps/chosen": -226.9719696044922, "logps/rejected": -333.075439453125, "loss": 4471.5984, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.034491486847400665, "rewards/margins": 0.1170935407280922, "rewards/rejected": -0.08260203897953033, "step": 1430 }, { "debug/policy_chosen_logits": 0.6919659376144409, "debug/policy_chosen_logps": -236.5424346923828, "debug/policy_rejected_logits": 1.1365792751312256, "debug/policy_rejected_logps": -268.3335266113281, "debug/reference_chosen_logps": -237.2069549560547, "debug/reference_rejected_logps": -264.7642517089844, "debug/sppo_chosen_loss": 2506.296875, "debug/sppo_chosen_reward_in_loss": 0.6645118594169617, "debug/sppo_rej_reward_in_loss": -3.5692667961120605, "debug/sppo_reject_loss": 2261.9521484375, "epoch": 5.199275362318841, "grad_norm": 69268.64954253453, "learning_rate": 3.6669829222011384e-08, "logits/chosen": 0.6919659376144409, "logits/rejected": 1.1365792751312256, "logps/chosen": -236.5424346923828, "logps/rejected": -268.3335266113281, "loss": 4525.9953, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.006645118352025747, "rewards/margins": 0.04233778640627861, "rewards/rejected": -0.035692669451236725, "step": 1435 }, { "debug/policy_chosen_logits": 1.183068037033081, "debug/policy_chosen_logps": -280.33074951171875, "debug/policy_rejected_logits": 1.2893073558807373, "debug/policy_rejected_logps": -302.2462463378906, "debug/reference_chosen_logps": -281.6507873535156, "debug/reference_rejected_logps": -296.9029235839844, "debug/sppo_chosen_loss": 2441.598876953125, "debug/sppo_chosen_reward_in_loss": 1.3200454711914062, "debug/sppo_rej_reward_in_loss": -5.3433122634887695, "debug/sppo_reject_loss": 2155.077880859375, "epoch": 5.217391304347826, "grad_norm": 68204.69204754713, "learning_rate": 3.643263757115749e-08, "logits/chosen": 1.183068037033081, "logits/rejected": 1.2893073558807373, "logps/chosen": -280.33074951171875, "logps/rejected": -302.2462463378906, "loss": 4606.7289, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.013200454413890839, "rewards/margins": 0.06663356721401215, "rewards/rejected": -0.053433120250701904, "step": 1440 }, { "debug/policy_chosen_logits": 0.9523294568061829, "debug/policy_chosen_logps": -228.82699584960938, "debug/policy_rejected_logits": 1.6573665142059326, "debug/policy_rejected_logps": -296.42803955078125, "debug/reference_chosen_logps": -230.6846466064453, "debug/reference_rejected_logps": -292.0801086425781, "debug/sppo_chosen_loss": 2390.593017578125, "debug/sppo_chosen_reward_in_loss": 1.8576314449310303, "debug/sppo_rej_reward_in_loss": -4.347911357879639, "debug/sppo_reject_loss": 2183.09716796875, "epoch": 5.2355072463768115, "grad_norm": 76693.3445892092, "learning_rate": 3.61954459203036e-08, "logits/chosen": 0.9523294568061829, "logits/rejected": 1.6573665142059326, "logps/chosen": -228.82699584960938, "logps/rejected": -296.42803955078125, "loss": 4477.4961, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.01857631467282772, "rewards/margins": 0.062055427581071854, "rewards/rejected": -0.04347911477088928, "step": 1445 }, { "debug/policy_chosen_logits": 1.2921512126922607, "debug/policy_chosen_logps": -253.98818969726562, "debug/policy_rejected_logits": 1.6245660781860352, "debug/policy_rejected_logps": -318.24261474609375, "debug/reference_chosen_logps": -254.13671875, "debug/reference_rejected_logps": -314.6860656738281, "debug/sppo_chosen_loss": 2518.2509765625, "debug/sppo_chosen_reward_in_loss": 0.1485462188720703, "debug/sppo_rej_reward_in_loss": -3.556605577468872, "debug/sppo_reject_loss": 2258.7685546875, "epoch": 5.253623188405797, "grad_norm": 59761.15258853687, "learning_rate": 3.595825426944971e-08, "logits/chosen": 1.2921512126922607, "logits/rejected": 1.6245660781860352, "logps/chosen": -253.98818969726562, "logps/rejected": -318.24261474609375, "loss": 4485.2957, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0014854620676487684, "rewards/margins": 0.03705151751637459, "rewards/rejected": -0.0355660542845726, "step": 1450 }, { "debug/policy_chosen_logits": 0.9015262722969055, "debug/policy_chosen_logps": -232.46408081054688, "debug/policy_rejected_logits": 1.237021803855896, "debug/policy_rejected_logps": -298.94036865234375, "debug/reference_chosen_logps": -233.8374481201172, "debug/reference_rejected_logps": -290.72314453125, "debug/sppo_chosen_loss": 2405.655029296875, "debug/sppo_chosen_reward_in_loss": 1.373361587524414, "debug/sppo_rej_reward_in_loss": -8.217247009277344, "debug/sppo_reject_loss": 1994.3980712890625, "epoch": 5.271739130434782, "grad_norm": 64123.068032963005, "learning_rate": 3.5721062618595824e-08, "logits/chosen": 0.9015262722969055, "logits/rejected": 1.237021803855896, "logps/chosen": -232.46408081054688, "logps/rejected": -298.94036865234375, "loss": 4584.1836, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.013733616098761559, "rewards/margins": 0.0959060862660408, "rewards/rejected": -0.0821724683046341, "step": 1455 }, { "debug/policy_chosen_logits": 0.9645652770996094, "debug/policy_chosen_logps": -230.34765625, "debug/policy_rejected_logits": 1.2264922857284546, "debug/policy_rejected_logps": -285.845947265625, "debug/reference_chosen_logps": -232.3076629638672, "debug/reference_rejected_logps": -277.7240295410156, "debug/sppo_chosen_loss": 2347.331787109375, "debug/sppo_chosen_reward_in_loss": 1.9600273370742798, "debug/sppo_rej_reward_in_loss": -8.121912956237793, "debug/sppo_reject_loss": 1933.440673828125, "epoch": 5.2898550724637685, "grad_norm": 90452.37359089749, "learning_rate": 3.548387096774194e-08, "logits/chosen": 0.9645652770996094, "logits/rejected": 1.2264922857284546, "logps/chosen": -230.34765625, "logps/rejected": -285.845947265625, "loss": 4591.3922, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.019600268453359604, "rewards/margins": 0.1008194088935852, "rewards/rejected": -0.0812191367149353, "step": 1460 }, { "debug/policy_chosen_logits": 1.2732036113739014, "debug/policy_chosen_logps": -247.07958984375, "debug/policy_rejected_logits": 1.504052758216858, "debug/policy_rejected_logps": -271.5877380371094, "debug/reference_chosen_logps": -248.71542358398438, "debug/reference_rejected_logps": -269.37152099609375, "debug/sppo_chosen_loss": 2384.237060546875, "debug/sppo_chosen_reward_in_loss": 1.6358293294906616, "debug/sppo_rej_reward_in_loss": -2.2162370681762695, "debug/sppo_reject_loss": 2356.390869140625, "epoch": 5.307971014492754, "grad_norm": 80059.37828800437, "learning_rate": 3.524667931688805e-08, "logits/chosen": 1.2732036113739014, "logits/rejected": 1.504052758216858, "logps/chosen": -247.07958984375, "logps/rejected": -271.5877380371094, "loss": 4428.6633, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.01635829731822014, "rewards/margins": 0.03852067142724991, "rewards/rejected": -0.022162368521094322, "step": 1465 }, { "debug/policy_chosen_logits": 1.0236085653305054, "debug/policy_chosen_logps": -225.3211212158203, "debug/policy_rejected_logits": 1.88125479221344, "debug/policy_rejected_logps": -328.1840515136719, "debug/reference_chosen_logps": -227.2796173095703, "debug/reference_rejected_logps": -324.4100036621094, "debug/sppo_chosen_loss": 2318.153564453125, "debug/sppo_chosen_reward_in_loss": 1.958490014076233, "debug/sppo_rej_reward_in_loss": -3.774024248123169, "debug/sppo_reject_loss": 2217.359619140625, "epoch": 5.326086956521739, "grad_norm": 72004.87683281253, "learning_rate": 3.5009487666034155e-08, "logits/chosen": 1.0236085653305054, "logits/rejected": 1.88125479221344, "logps/chosen": -225.3211212158203, "logps/rejected": -328.1840515136719, "loss": 4527.5234, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0195848997682333, "rewards/margins": 0.05732514336705208, "rewards/rejected": -0.03774024173617363, "step": 1470 }, { "debug/policy_chosen_logits": 1.1755411624908447, "debug/policy_chosen_logps": -264.7723083496094, "debug/policy_rejected_logits": 1.4361246824264526, "debug/policy_rejected_logps": -295.58746337890625, "debug/reference_chosen_logps": -265.66778564453125, "debug/reference_rejected_logps": -290.55096435546875, "debug/sppo_chosen_loss": 2459.650634765625, "debug/sppo_chosen_reward_in_loss": 0.8954532742500305, "debug/sppo_rej_reward_in_loss": -5.036515712738037, "debug/sppo_reject_loss": 2155.29833984375, "epoch": 5.344202898550725, "grad_norm": 103707.62640298028, "learning_rate": 3.4772296015180263e-08, "logits/chosen": 1.1755411624908447, "logits/rejected": 1.4361246824264526, "logps/chosen": -264.7723083496094, "logps/rejected": -295.58746337890625, "loss": 4539.6918, "rewards/accuracies": 0.75, "rewards/chosen": 0.008954532444477081, "rewards/margins": 0.059319693595170975, "rewards/rejected": -0.05036516115069389, "step": 1475 }, { "debug/policy_chosen_logits": 1.3199971914291382, "debug/policy_chosen_logps": -283.9255065917969, "debug/policy_rejected_logits": 1.6441415548324585, "debug/policy_rejected_logps": -291.3877258300781, "debug/reference_chosen_logps": -286.08929443359375, "debug/reference_rejected_logps": -290.58465576171875, "debug/sppo_chosen_loss": 2313.395751953125, "debug/sppo_chosen_reward_in_loss": 2.163784980773926, "debug/sppo_rej_reward_in_loss": -0.8030645251274109, "debug/sppo_reject_loss": 2491.78076171875, "epoch": 5.36231884057971, "grad_norm": 82141.00437657817, "learning_rate": 3.453510436432637e-08, "logits/chosen": 1.3199971914291382, "logits/rejected": 1.6441415548324585, "logps/chosen": -283.9255065917969, "logps/rejected": -291.3877258300781, "loss": 4640.9637, "rewards/accuracies": 0.625, "rewards/chosen": 0.021637849509716034, "rewards/margins": 0.029668491333723068, "rewards/rejected": -0.008030645549297333, "step": 1480 }, { "debug/policy_chosen_logits": 1.0382921695709229, "debug/policy_chosen_logps": -255.6750946044922, "debug/policy_rejected_logits": 1.195730447769165, "debug/policy_rejected_logps": -276.95892333984375, "debug/reference_chosen_logps": -258.20574951171875, "debug/reference_rejected_logps": -270.19232177734375, "debug/sppo_chosen_loss": 2288.58837890625, "debug/sppo_chosen_reward_in_loss": 2.5306568145751953, "debug/sppo_rej_reward_in_loss": -6.766583442687988, "debug/sppo_reject_loss": 1966.270751953125, "epoch": 5.380434782608695, "grad_norm": 66828.05121181099, "learning_rate": 3.429791271347248e-08, "logits/chosen": 1.0382921695709229, "logits/rejected": 1.195730447769165, "logps/chosen": -255.6750946044922, "logps/rejected": -276.95892333984375, "loss": 4453.7992, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.025306567549705505, "rewards/margins": 0.09297239780426025, "rewards/rejected": -0.06766583025455475, "step": 1485 }, { "debug/policy_chosen_logits": 1.2640597820281982, "debug/policy_chosen_logps": -243.9297332763672, "debug/policy_rejected_logits": 1.5011266469955444, "debug/policy_rejected_logps": -260.5918884277344, "debug/reference_chosen_logps": -246.10781860351562, "debug/reference_rejected_logps": -257.28558349609375, "debug/sppo_chosen_loss": 2309.550048828125, "debug/sppo_chosen_reward_in_loss": 2.178067207336426, "debug/sppo_rej_reward_in_loss": -3.3062965869903564, "debug/sppo_reject_loss": 2279.09521484375, "epoch": 5.398550724637682, "grad_norm": 77440.11613650915, "learning_rate": 3.4060721062618595e-08, "logits/chosen": 1.2640597820281982, "logits/rejected": 1.5011266469955444, "logps/chosen": -243.9297332763672, "logps/rejected": -260.5918884277344, "loss": 4595.6547, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.021780669689178467, "rewards/margins": 0.054843634366989136, "rewards/rejected": -0.03306296467781067, "step": 1490 }, { "debug/policy_chosen_logits": 1.251159429550171, "debug/policy_chosen_logps": -255.2563934326172, "debug/policy_rejected_logits": 1.3697818517684937, "debug/policy_rejected_logps": -277.0352478027344, "debug/reference_chosen_logps": -257.09320068359375, "debug/reference_rejected_logps": -271.80712890625, "debug/sppo_chosen_loss": 2354.29638671875, "debug/sppo_chosen_reward_in_loss": 1.8368046283721924, "debug/sppo_rej_reward_in_loss": -5.228133201599121, "debug/sppo_reject_loss": 2134.17431640625, "epoch": 5.416666666666667, "grad_norm": 113744.2324091931, "learning_rate": 3.38235294117647e-08, "logits/chosen": 1.251159429550171, "logits/rejected": 1.3697818517684937, "logps/chosen": -255.2563934326172, "logps/rejected": -277.0352478027344, "loss": 4580.2445, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.01836804673075676, "rewards/margins": 0.07064937800168991, "rewards/rejected": -0.05228133127093315, "step": 1495 }, { "debug/policy_chosen_logits": 1.0046989917755127, "debug/policy_chosen_logps": -229.63711547851562, "debug/policy_rejected_logits": 1.2636783123016357, "debug/policy_rejected_logps": -248.46475219726562, "debug/reference_chosen_logps": -232.8604278564453, "debug/reference_rejected_logps": -243.4485321044922, "debug/sppo_chosen_loss": 2209.71875, "debug/sppo_chosen_reward_in_loss": 3.2233192920684814, "debug/sppo_rej_reward_in_loss": -5.016202926635742, "debug/sppo_reject_loss": 2163.51416015625, "epoch": 5.434782608695652, "grad_norm": 54754.59178959268, "learning_rate": 3.358633776091082e-08, "logits/chosen": 1.0046989917755127, "logits/rejected": 1.2636783123016357, "logps/chosen": -229.63711547851562, "logps/rejected": -248.46475219726562, "loss": 4497.8313, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.032233189791440964, "rewards/margins": 0.08239521831274033, "rewards/rejected": -0.05016202852129936, "step": 1500 }, { "epoch": 5.434782608695652, "eval_debug/policy_chosen_logits": 1.4118324518203735, "eval_debug/policy_chosen_logps": -252.53128051757812, "eval_debug/policy_rejected_logits": 1.4565733671188354, "eval_debug/policy_rejected_logps": -263.3676452636719, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2528.23388671875, "eval_debug/sppo_chosen_reward_in_loss": 0.38719919323921204, "eval_debug/sppo_rej_reward_in_loss": -3.7090137004852295, "eval_debug/sppo_reject_loss": 2293.697998046875, "eval_logits/chosen": 1.4118324518203735, "eval_logits/rejected": 1.4565733671188354, "eval_logps/chosen": -252.53128051757812, "eval_logps/rejected": -263.3676452636719, "eval_loss": 4637.40771484375, "eval_rewards/accuracies": 0.5657894611358643, "eval_rewards/chosen": 0.0038719926960766315, "eval_rewards/margins": 0.04096212983131409, "eval_rewards/rejected": -0.03709014132618904, "eval_runtime": 28.6144, "eval_samples_per_second": 20.968, "eval_steps_per_second": 0.664, "step": 1500 }, { "debug/policy_chosen_logits": 1.1415979862213135, "debug/policy_chosen_logps": -252.59231567382812, "debug/policy_rejected_logits": 1.3891350030899048, "debug/policy_rejected_logps": -293.26861572265625, "debug/reference_chosen_logps": -252.951171875, "debug/reference_rejected_logps": -290.30047607421875, "debug/sppo_chosen_loss": 2540.763671875, "debug/sppo_chosen_reward_in_loss": 0.3588527739048004, "debug/sppo_rej_reward_in_loss": -2.9680838584899902, "debug/sppo_reject_loss": 2303.95654296875, "epoch": 5.452898550724638, "grad_norm": 63253.1537241967, "learning_rate": 3.3349146110056926e-08, "logits/chosen": 1.1415979862213135, "logits/rejected": 1.3891350030899048, "logps/chosen": -252.59231567382812, "logps/rejected": -293.26861572265625, "loss": 4542.4336, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0035885286051779985, "rewards/margins": 0.03326936811208725, "rewards/rejected": -0.02968083880841732, "step": 1505 }, { "debug/policy_chosen_logits": 1.0947173833847046, "debug/policy_chosen_logps": -249.10098266601562, "debug/policy_rejected_logits": 1.523273229598999, "debug/policy_rejected_logps": -279.7234802246094, "debug/reference_chosen_logps": -251.04904174804688, "debug/reference_rejected_logps": -274.56640625, "debug/sppo_chosen_loss": 2343.63330078125, "debug/sppo_chosen_reward_in_loss": 1.9480949640274048, "debug/sppo_rej_reward_in_loss": -5.157083034515381, "debug/sppo_reject_loss": 2129.125244140625, "epoch": 5.471014492753623, "grad_norm": 63674.961009985454, "learning_rate": 3.3111954459203035e-08, "logits/chosen": 1.0947173833847046, "logits/rejected": 1.523273229598999, "logps/chosen": -249.10098266601562, "logps/rejected": -279.7234802246094, "loss": 4457.6238, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.019480949267745018, "rewards/margins": 0.07105177640914917, "rewards/rejected": -0.051570825278759, "step": 1510 }, { "debug/policy_chosen_logits": 1.0274615287780762, "debug/policy_chosen_logps": -220.0609130859375, "debug/policy_rejected_logits": 1.4445350170135498, "debug/policy_rejected_logps": -283.0194091796875, "debug/reference_chosen_logps": -222.33889770507812, "debug/reference_rejected_logps": -275.36151123046875, "debug/sppo_chosen_loss": 2329.09130859375, "debug/sppo_chosen_reward_in_loss": 2.2779853343963623, "debug/sppo_rej_reward_in_loss": -7.6579179763793945, "debug/sppo_reject_loss": 1987.4312744140625, "epoch": 5.489130434782608, "grad_norm": 80103.10323563726, "learning_rate": 3.287476280834914e-08, "logits/chosen": 1.0274615287780762, "logits/rejected": 1.4445350170135498, "logps/chosen": -220.0609130859375, "logps/rejected": -283.0194091796875, "loss": 4472.7824, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.022779855877161026, "rewards/margins": 0.09935902804136276, "rewards/rejected": -0.07657917588949203, "step": 1515 }, { "debug/policy_chosen_logits": 1.5334327220916748, "debug/policy_chosen_logps": -279.19549560546875, "debug/policy_rejected_logits": 1.8267767429351807, "debug/policy_rejected_logps": -344.4455261230469, "debug/reference_chosen_logps": -279.59576416015625, "debug/reference_rejected_logps": -340.62744140625, "debug/sppo_chosen_loss": 2509.5625, "debug/sppo_chosen_reward_in_loss": 0.40025749802589417, "debug/sppo_rej_reward_in_loss": -3.8180630207061768, "debug/sppo_reject_loss": 2238.951904296875, "epoch": 5.507246376811594, "grad_norm": 87974.38402737744, "learning_rate": 3.263757115749525e-08, "logits/chosen": 1.5334327220916748, "logits/rejected": 1.8267767429351807, "logps/chosen": -279.19549560546875, "logps/rejected": -344.4455261230469, "loss": 4539.9926, "rewards/accuracies": 0.625, "rewards/chosen": 0.004002575762569904, "rewards/margins": 0.04218320548534393, "rewards/rejected": -0.0381806306540966, "step": 1520 }, { "debug/policy_chosen_logits": 1.2351617813110352, "debug/policy_chosen_logps": -253.60586547851562, "debug/policy_rejected_logits": 1.4738223552703857, "debug/policy_rejected_logps": -292.04669189453125, "debug/reference_chosen_logps": -255.423095703125, "debug/reference_rejected_logps": -287.0794372558594, "debug/sppo_chosen_loss": 2365.962158203125, "debug/sppo_chosen_reward_in_loss": 1.8172292709350586, "debug/sppo_rej_reward_in_loss": -4.967227935791016, "debug/sppo_reject_loss": 2143.7919921875, "epoch": 5.52536231884058, "grad_norm": 78300.72177135076, "learning_rate": 3.240037950664136e-08, "logits/chosen": 1.2351617813110352, "logits/rejected": 1.4738223552703857, "logps/chosen": -253.60586547851562, "logps/rejected": -292.04669189453125, "loss": 4569.6508, "rewards/accuracies": 0.75, "rewards/chosen": 0.01817229390144348, "rewards/margins": 0.06784456968307495, "rewards/rejected": -0.04967227950692177, "step": 1525 }, { "debug/policy_chosen_logits": 0.7970689535140991, "debug/policy_chosen_logps": -268.4679260253906, "debug/policy_rejected_logits": 0.8917680978775024, "debug/policy_rejected_logps": -276.13671875, "debug/reference_chosen_logps": -268.8172302246094, "debug/reference_rejected_logps": -273.11907958984375, "debug/sppo_chosen_loss": 2531.54638671875, "debug/sppo_chosen_reward_in_loss": 0.34928417205810547, "debug/sppo_rej_reward_in_loss": -3.0176525115966797, "debug/sppo_reject_loss": 2310.03173828125, "epoch": 5.543478260869565, "grad_norm": 67191.15613428685, "learning_rate": 3.2163187855787474e-08, "logits/chosen": 0.7970689535140991, "logits/rejected": 0.8917680978775024, "logps/chosen": -268.4679260253906, "logps/rejected": -276.13671875, "loss": 4598.5609, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.003492841962724924, "rewards/margins": 0.033669363707304, "rewards/rejected": -0.030176525935530663, "step": 1530 }, { "debug/policy_chosen_logits": 1.0033587217330933, "debug/policy_chosen_logps": -251.0211944580078, "debug/policy_rejected_logits": 1.3567267656326294, "debug/policy_rejected_logps": -282.0811462402344, "debug/reference_chosen_logps": -253.4926300048828, "debug/reference_rejected_logps": -276.5571594238281, "debug/sppo_chosen_loss": 2308.2666015625, "debug/sppo_chosen_reward_in_loss": 2.471463680267334, "debug/sppo_rej_reward_in_loss": -5.524018287658691, "debug/sppo_reject_loss": 2109.814453125, "epoch": 5.561594202898551, "grad_norm": 99469.97465882145, "learning_rate": 3.192599620493359e-08, "logits/chosen": 1.0033587217330933, "logits/rejected": 1.3567267656326294, "logps/chosen": -251.0211944580078, "logps/rejected": -282.0811462402344, "loss": 4628.9617, "rewards/accuracies": 0.75, "rewards/chosen": 0.0247146375477314, "rewards/margins": 0.07995481789112091, "rewards/rejected": -0.05524018406867981, "step": 1535 }, { "debug/policy_chosen_logits": 1.0096734762191772, "debug/policy_chosen_logps": -228.7184295654297, "debug/policy_rejected_logits": 1.5451607704162598, "debug/policy_rejected_logps": -296.56536865234375, "debug/reference_chosen_logps": -230.16903686523438, "debug/reference_rejected_logps": -288.3876037597656, "debug/sppo_chosen_loss": 2379.11083984375, "debug/sppo_chosen_reward_in_loss": 1.4505977630615234, "debug/sppo_rej_reward_in_loss": -8.177800178527832, "debug/sppo_reject_loss": 1932.372314453125, "epoch": 5.579710144927536, "grad_norm": 65962.47005913446, "learning_rate": 3.16888045540797e-08, "logits/chosen": 1.0096734762191772, "logits/rejected": 1.5451607704162598, "logps/chosen": -228.7184295654297, "logps/rejected": -296.56536865234375, "loss": 4536.3977, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.014505976811051369, "rewards/margins": 0.09628397226333618, "rewards/rejected": -0.08177798986434937, "step": 1540 }, { "debug/policy_chosen_logits": 1.05217707157135, "debug/policy_chosen_logps": -240.3533477783203, "debug/policy_rejected_logits": 1.316529631614685, "debug/policy_rejected_logps": -264.3358154296875, "debug/reference_chosen_logps": -241.847900390625, "debug/reference_rejected_logps": -261.27923583984375, "debug/sppo_chosen_loss": 2398.14794921875, "debug/sppo_chosen_reward_in_loss": 1.4945652484893799, "debug/sppo_rej_reward_in_loss": -3.0565733909606934, "debug/sppo_reject_loss": 2308.44384765625, "epoch": 5.5978260869565215, "grad_norm": 61273.20336216504, "learning_rate": 3.1451612903225806e-08, "logits/chosen": 1.05217707157135, "logits/rejected": 1.316529631614685, "logps/chosen": -240.3533477783203, "logps/rejected": -264.3358154296875, "loss": 4457.0793, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.014945653267204762, "rewards/margins": 0.045511387288570404, "rewards/rejected": -0.030565734952688217, "step": 1545 }, { "debug/policy_chosen_logits": 1.1872689723968506, "debug/policy_chosen_logps": -249.18923950195312, "debug/policy_rejected_logits": 1.794293999671936, "debug/policy_rejected_logps": -309.87347412109375, "debug/reference_chosen_logps": -249.4769744873047, "debug/reference_rejected_logps": -302.5712890625, "debug/sppo_chosen_loss": 2513.45703125, "debug/sppo_chosen_reward_in_loss": 0.2877656817436218, "debug/sppo_rej_reward_in_loss": -7.302210807800293, "debug/sppo_reject_loss": 2019.406982421875, "epoch": 5.615942028985507, "grad_norm": 97268.77595423882, "learning_rate": 3.1214421252371914e-08, "logits/chosen": 1.1872689723968506, "logits/rejected": 1.794293999671936, "logps/chosen": -249.18923950195312, "logps/rejected": -309.87347412109375, "loss": 4445.6668, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.002877655904740095, "rewards/margins": 0.07589976489543915, "rewards/rejected": -0.07302211225032806, "step": 1550 }, { "debug/policy_chosen_logits": 1.1443283557891846, "debug/policy_chosen_logps": -253.46200561523438, "debug/policy_rejected_logits": 1.5179729461669922, "debug/policy_rejected_logps": -308.83819580078125, "debug/reference_chosen_logps": -255.1627960205078, "debug/reference_rejected_logps": -301.81512451171875, "debug/sppo_chosen_loss": 2394.67578125, "debug/sppo_chosen_reward_in_loss": 1.7007999420166016, "debug/sppo_rej_reward_in_loss": -7.023089408874512, "debug/sppo_reject_loss": 1991.684814453125, "epoch": 5.634057971014493, "grad_norm": 76163.05412810206, "learning_rate": 3.097722960151802e-08, "logits/chosen": 1.1443283557891846, "logits/rejected": 1.5179729461669922, "logps/chosen": -253.46200561523438, "logps/rejected": -308.83819580078125, "loss": 4576.2734, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.01700800098478794, "rewards/margins": 0.08723888546228409, "rewards/rejected": -0.0702308863401413, "step": 1555 }, { "debug/policy_chosen_logits": 1.2496639490127563, "debug/policy_chosen_logps": -257.2691650390625, "debug/policy_rejected_logits": 1.609519600868225, "debug/policy_rejected_logps": -279.15179443359375, "debug/reference_chosen_logps": -259.53619384765625, "debug/reference_rejected_logps": -274.4745178222656, "debug/sppo_chosen_loss": 2335.58251953125, "debug/sppo_chosen_reward_in_loss": 2.2670352458953857, "debug/sppo_rej_reward_in_loss": -4.677281379699707, "debug/sppo_reject_loss": 2169.84619140625, "epoch": 5.6521739130434785, "grad_norm": 126967.3540927056, "learning_rate": 3.074003795066413e-08, "logits/chosen": 1.2496639490127563, "logits/rejected": 1.609519600868225, "logps/chosen": -257.2691650390625, "logps/rejected": -279.15179443359375, "loss": 4505.3664, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.022670350968837738, "rewards/margins": 0.06944316625595093, "rewards/rejected": -0.04677281156182289, "step": 1560 }, { "debug/policy_chosen_logits": 0.7655173540115356, "debug/policy_chosen_logps": -238.21554565429688, "debug/policy_rejected_logits": 1.021499752998352, "debug/policy_rejected_logps": -310.1416931152344, "debug/reference_chosen_logps": -238.13815307617188, "debug/reference_rejected_logps": -302.5880126953125, "debug/sppo_chosen_loss": 2593.22607421875, "debug/sppo_chosen_reward_in_loss": -0.07740745693445206, "debug/sppo_rej_reward_in_loss": -7.553706169128418, "debug/sppo_reject_loss": 2026.0198974609375, "epoch": 5.670289855072464, "grad_norm": 71919.12902474291, "learning_rate": 3.0502846299810246e-08, "logits/chosen": 0.7655173540115356, "logits/rejected": 1.021499752998352, "logps/chosen": -238.21554565429688, "logps/rejected": -310.1416931152344, "loss": 4559.8012, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0007740751025266945, "rewards/margins": 0.0747629851102829, "rewards/rejected": -0.0755370706319809, "step": 1565 }, { "debug/policy_chosen_logits": 1.2906935214996338, "debug/policy_chosen_logps": -263.89093017578125, "debug/policy_rejected_logits": 1.586464524269104, "debug/policy_rejected_logps": -303.8671875, "debug/reference_chosen_logps": -265.8478088378906, "debug/reference_rejected_logps": -297.4637145996094, "debug/sppo_chosen_loss": 2341.831298828125, "debug/sppo_chosen_reward_in_loss": 1.9568710327148438, "debug/sppo_rej_reward_in_loss": -6.403465270996094, "debug/sppo_reject_loss": 2063.42724609375, "epoch": 5.688405797101449, "grad_norm": 86300.15615530143, "learning_rate": 3.0265654648956354e-08, "logits/chosen": 1.2906935214996338, "logits/rejected": 1.586464524269104, "logps/chosen": -263.89093017578125, "logps/rejected": -303.8671875, "loss": 4513.7398, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.019568709656596184, "rewards/margins": 0.08360335975885391, "rewards/rejected": -0.06403464823961258, "step": 1570 }, { "debug/policy_chosen_logits": 1.4633818864822388, "debug/policy_chosen_logps": -264.929931640625, "debug/policy_rejected_logits": 1.6219351291656494, "debug/policy_rejected_logps": -291.8060302734375, "debug/reference_chosen_logps": -266.3921813964844, "debug/reference_rejected_logps": -285.4745178222656, "debug/sppo_chosen_loss": 2445.612548828125, "debug/sppo_chosen_reward_in_loss": 1.4622478485107422, "debug/sppo_rej_reward_in_loss": -6.33150577545166, "debug/sppo_reject_loss": 2044.7314453125, "epoch": 5.706521739130435, "grad_norm": 79355.07742967925, "learning_rate": 3.002846299810247e-08, "logits/chosen": 1.4633818864822388, "logits/rejected": 1.6219351291656494, "logps/chosen": -264.929931640625, "logps/rejected": -291.8060302734375, "loss": 4456.0758, "rewards/accuracies": 0.75, "rewards/chosen": 0.014622477814555168, "rewards/margins": 0.0779375359416008, "rewards/rejected": -0.06331505626440048, "step": 1575 }, { "debug/policy_chosen_logits": 1.3200656175613403, "debug/policy_chosen_logps": -266.7071838378906, "debug/policy_rejected_logits": 1.624782919883728, "debug/policy_rejected_logps": -285.70147705078125, "debug/reference_chosen_logps": -268.0072937011719, "debug/reference_rejected_logps": -280.94744873046875, "debug/sppo_chosen_loss": 2405.87451171875, "debug/sppo_chosen_reward_in_loss": 1.300122857093811, "debug/sppo_rej_reward_in_loss": -4.754025459289551, "debug/sppo_reject_loss": 2137.201416015625, "epoch": 5.72463768115942, "grad_norm": 89366.65117931982, "learning_rate": 2.9791271347248577e-08, "logits/chosen": 1.3200656175613403, "logits/rejected": 1.624782919883728, "logps/chosen": -266.7071838378906, "logps/rejected": -285.70147705078125, "loss": 4585.1406, "rewards/accuracies": 0.625, "rewards/chosen": 0.013001227751374245, "rewards/margins": 0.06054148077964783, "rewards/rejected": -0.04754025489091873, "step": 1580 }, { "debug/policy_chosen_logits": 0.9718109965324402, "debug/policy_chosen_logps": -245.7661895751953, "debug/policy_rejected_logits": 1.4039283990859985, "debug/policy_rejected_logps": -291.78131103515625, "debug/reference_chosen_logps": -246.39022827148438, "debug/reference_rejected_logps": -284.890625, "debug/sppo_chosen_loss": 2519.76611328125, "debug/sppo_chosen_reward_in_loss": 0.6240444183349609, "debug/sppo_rej_reward_in_loss": -6.890650272369385, "debug/sppo_reject_loss": 2023.6435546875, "epoch": 5.742753623188406, "grad_norm": 81775.38185613377, "learning_rate": 2.9554079696394685e-08, "logits/chosen": 0.9718109965324402, "logits/rejected": 1.4039283990859985, "logps/chosen": -245.7661895751953, "logps/rejected": -291.78131103515625, "loss": 4562.3156, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00624044518917799, "rewards/margins": 0.07514695078134537, "rewards/rejected": -0.0689065009355545, "step": 1585 }, { "debug/policy_chosen_logits": 0.9735002517700195, "debug/policy_chosen_logps": -254.20217895507812, "debug/policy_rejected_logits": 1.5613526105880737, "debug/policy_rejected_logps": -303.11114501953125, "debug/reference_chosen_logps": -256.5666198730469, "debug/reference_rejected_logps": -295.99114990234375, "debug/sppo_chosen_loss": 2305.04248046875, "debug/sppo_chosen_reward_in_loss": 2.364449977874756, "debug/sppo_rej_reward_in_loss": -7.119999885559082, "debug/sppo_reject_loss": 2006.219482421875, "epoch": 5.760869565217392, "grad_norm": 77368.47622696582, "learning_rate": 2.9316888045540794e-08, "logits/chosen": 0.9735002517700195, "logits/rejected": 1.5613526105880737, "logps/chosen": -254.20217895507812, "logps/rejected": -303.11114501953125, "loss": 4396.9453, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.023644497618079185, "rewards/margins": 0.09484449774026871, "rewards/rejected": -0.07119999825954437, "step": 1590 }, { "debug/policy_chosen_logits": 1.0080276727676392, "debug/policy_chosen_logps": -244.85537719726562, "debug/policy_rejected_logits": 1.1557388305664062, "debug/policy_rejected_logps": -250.934814453125, "debug/reference_chosen_logps": -245.8179168701172, "debug/reference_rejected_logps": -247.0068817138672, "debug/sppo_chosen_loss": 2463.442626953125, "debug/sppo_chosen_reward_in_loss": 0.9625364542007446, "debug/sppo_rej_reward_in_loss": -3.9279186725616455, "debug/sppo_reject_loss": 2273.934814453125, "epoch": 5.778985507246377, "grad_norm": 75620.50123832183, "learning_rate": 2.9079696394686902e-08, "logits/chosen": 1.0080276727676392, "logits/rejected": 1.1557388305664062, "logps/chosen": -244.85537719726562, "logps/rejected": -250.934814453125, "loss": 4583.4543, "rewards/accuracies": 0.625, "rewards/chosen": 0.009625363163650036, "rewards/margins": 0.04890454560518265, "rewards/rejected": -0.039279185235500336, "step": 1595 }, { "debug/policy_chosen_logits": 0.983859658241272, "debug/policy_chosen_logps": -255.19509887695312, "debug/policy_rejected_logits": 1.412980318069458, "debug/policy_rejected_logps": -321.30352783203125, "debug/reference_chosen_logps": -257.37481689453125, "debug/reference_rejected_logps": -316.53289794921875, "debug/sppo_chosen_loss": 2352.030029296875, "debug/sppo_chosen_reward_in_loss": 2.1797327995300293, "debug/sppo_rej_reward_in_loss": -4.770643711090088, "debug/sppo_reject_loss": 2186.142578125, "epoch": 5.797101449275362, "grad_norm": 80536.04040900512, "learning_rate": 2.8842504743833017e-08, "logits/chosen": 0.983859658241272, "logits/rejected": 1.412980318069458, "logps/chosen": -255.19509887695312, "logps/rejected": -321.30352783203125, "loss": 4573.9879, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02179732732474804, "rewards/margins": 0.06950376182794571, "rewards/rejected": -0.04770643264055252, "step": 1600 }, { "epoch": 5.797101449275362, "eval_debug/policy_chosen_logits": 1.4099074602127075, "eval_debug/policy_chosen_logps": -252.22674560546875, "eval_debug/policy_rejected_logits": 1.455780029296875, "eval_debug/policy_rejected_logps": -262.9847412109375, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2501.195556640625, "eval_debug/sppo_chosen_reward_in_loss": 0.6917134523391724, "eval_debug/sppo_rej_reward_in_loss": -3.326115369796753, "eval_debug/sppo_reject_loss": 2325.065673828125, "eval_logits/chosen": 1.4099074602127075, "eval_logits/rejected": 1.455780029296875, "eval_logps/chosen": -252.22674560546875, "eval_logps/rejected": -262.9847412109375, "eval_loss": 4628.5751953125, "eval_rewards/accuracies": 0.5921052694320679, "eval_rewards/chosen": 0.006917132996022701, "eval_rewards/margins": 0.04017828404903412, "eval_rewards/rejected": -0.03326115012168884, "eval_runtime": 28.4591, "eval_samples_per_second": 21.083, "eval_steps_per_second": 0.668, "step": 1600 }, { "debug/policy_chosen_logits": 1.4133713245391846, "debug/policy_chosen_logps": -261.71783447265625, "debug/policy_rejected_logits": 1.4047696590423584, "debug/policy_rejected_logps": -283.48260498046875, "debug/reference_chosen_logps": -262.82550048828125, "debug/reference_rejected_logps": -277.431396484375, "debug/sppo_chosen_loss": 2453.00439453125, "debug/sppo_chosen_reward_in_loss": 1.1076514720916748, "debug/sppo_rej_reward_in_loss": -6.051219463348389, "debug/sppo_reject_loss": 2102.118408203125, "epoch": 5.815217391304348, "grad_norm": 89876.35180220763, "learning_rate": 2.860531309297913e-08, "logits/chosen": 1.4133713245391846, "logits/rejected": 1.4047696590423584, "logps/chosen": -261.71783447265625, "logps/rejected": -283.48260498046875, "loss": 4500.5273, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.011076515540480614, "rewards/margins": 0.07158870995044708, "rewards/rejected": -0.06051219627261162, "step": 1605 }, { "debug/policy_chosen_logits": 1.2928855419158936, "debug/policy_chosen_logps": -254.7082061767578, "debug/policy_rejected_logits": 1.4431750774383545, "debug/policy_rejected_logps": -278.58734130859375, "debug/reference_chosen_logps": -256.94012451171875, "debug/reference_rejected_logps": -272.6163024902344, "debug/sppo_chosen_loss": 2308.73291015625, "debug/sppo_chosen_reward_in_loss": 2.2319469451904297, "debug/sppo_rej_reward_in_loss": -5.971031665802002, "debug/sppo_reject_loss": 2061.353271484375, "epoch": 5.833333333333333, "grad_norm": 64581.09278714349, "learning_rate": 2.8368121442125237e-08, "logits/chosen": 1.2928855419158936, "logits/rejected": 1.4431750774383545, "logps/chosen": -254.7082061767578, "logps/rejected": -278.58734130859375, "loss": 4501.5859, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02231946960091591, "rewards/margins": 0.0820297822356224, "rewards/rejected": -0.0597103126347065, "step": 1610 }, { "debug/policy_chosen_logits": 1.098814606666565, "debug/policy_chosen_logps": -261.6798400878906, "debug/policy_rejected_logits": 1.2141131162643433, "debug/policy_rejected_logps": -284.25885009765625, "debug/reference_chosen_logps": -263.23260498046875, "debug/reference_rejected_logps": -279.31488037109375, "debug/sppo_chosen_loss": 2371.888916015625, "debug/sppo_chosen_reward_in_loss": 1.5527737140655518, "debug/sppo_rej_reward_in_loss": -4.943967819213867, "debug/sppo_reject_loss": 2163.468994140625, "epoch": 5.851449275362318, "grad_norm": 87668.84561987229, "learning_rate": 2.8130929791271345e-08, "logits/chosen": 1.098814606666565, "logits/rejected": 1.2141131162643433, "logps/chosen": -261.6798400878906, "logps/rejected": -284.25885009765625, "loss": 4485.2906, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.015527735464274883, "rewards/margins": 0.06496741622686386, "rewards/rejected": -0.049439676105976105, "step": 1615 }, { "debug/policy_chosen_logits": 0.9928579330444336, "debug/policy_chosen_logps": -252.87435913085938, "debug/policy_rejected_logits": 1.2248326539993286, "debug/policy_rejected_logps": -288.9918212890625, "debug/reference_chosen_logps": -253.2355194091797, "debug/reference_rejected_logps": -283.28363037109375, "debug/sppo_chosen_loss": 2537.11279296875, "debug/sppo_chosen_reward_in_loss": 0.36113911867141724, "debug/sppo_rej_reward_in_loss": -5.708200931549072, "debug/sppo_reject_loss": 2108.445068359375, "epoch": 5.869565217391305, "grad_norm": 58655.2885378233, "learning_rate": 2.7893738140417457e-08, "logits/chosen": 0.9928579330444336, "logits/rejected": 1.2248326539993286, "logps/chosen": -252.87435913085938, "logps/rejected": -288.9918212890625, "loss": 4583.1562, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.003611389547586441, "rewards/margins": 0.060693394392728806, "rewards/rejected": -0.05708200857043266, "step": 1620 }, { "debug/policy_chosen_logits": 1.2882401943206787, "debug/policy_chosen_logps": -246.6195068359375, "debug/policy_rejected_logits": 1.5632935762405396, "debug/policy_rejected_logps": -283.4783630371094, "debug/reference_chosen_logps": -248.82217407226562, "debug/reference_rejected_logps": -276.4673767089844, "debug/sppo_chosen_loss": 2339.946533203125, "debug/sppo_chosen_reward_in_loss": 2.2026565074920654, "debug/sppo_rej_reward_in_loss": -7.010974884033203, "debug/sppo_reject_loss": 2072.89892578125, "epoch": 5.88768115942029, "grad_norm": 81903.83381055195, "learning_rate": 2.7656546489563565e-08, "logits/chosen": 1.2882401943206787, "logits/rejected": 1.5632935762405396, "logps/chosen": -246.6195068359375, "logps/rejected": -283.4783630371094, "loss": 4466.0813, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.02202656678855419, "rewards/margins": 0.09213630855083466, "rewards/rejected": -0.07010974735021591, "step": 1625 }, { "debug/policy_chosen_logits": 0.97998046875, "debug/policy_chosen_logps": -230.58871459960938, "debug/policy_rejected_logits": 1.6162458658218384, "debug/policy_rejected_logps": -329.0350341796875, "debug/reference_chosen_logps": -232.83535766601562, "debug/reference_rejected_logps": -322.9118957519531, "debug/sppo_chosen_loss": 2328.44189453125, "debug/sppo_chosen_reward_in_loss": 2.2466397285461426, "debug/sppo_rej_reward_in_loss": -6.123122692108154, "debug/sppo_reject_loss": 2100.07958984375, "epoch": 5.905797101449275, "grad_norm": 152319.76540711249, "learning_rate": 2.7419354838709673e-08, "logits/chosen": 0.97998046875, "logits/rejected": 1.6162458658218384, "logps/chosen": -230.58871459960938, "logps/rejected": -329.0350341796875, "loss": 4597.4172, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.022466396912932396, "rewards/margins": 0.0836976170539856, "rewards/rejected": -0.06123122572898865, "step": 1630 }, { "debug/policy_chosen_logits": 1.0076630115509033, "debug/policy_chosen_logps": -233.4595489501953, "debug/policy_rejected_logits": 1.4618885517120361, "debug/policy_rejected_logps": -322.53997802734375, "debug/reference_chosen_logps": -237.11782836914062, "debug/reference_rejected_logps": -316.41522216796875, "debug/sppo_chosen_loss": 2172.8671875, "debug/sppo_chosen_reward_in_loss": 3.658297061920166, "debug/sppo_rej_reward_in_loss": -6.124730587005615, "debug/sppo_reject_loss": 2089.35791015625, "epoch": 5.923913043478261, "grad_norm": 68826.30289848878, "learning_rate": 2.7182163187855788e-08, "logits/chosen": 1.0076630115509033, "logits/rejected": 1.4618885517120361, "logps/chosen": -233.4595489501953, "logps/rejected": -322.53997802734375, "loss": 4465.4828, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.03658296540379524, "rewards/margins": 0.09783027321100235, "rewards/rejected": -0.06124730780720711, "step": 1635 }, { "debug/policy_chosen_logits": 0.8839296102523804, "debug/policy_chosen_logps": -245.0952606201172, "debug/policy_rejected_logits": 0.9679676294326782, "debug/policy_rejected_logps": -260.85888671875, "debug/reference_chosen_logps": -246.76345825195312, "debug/reference_rejected_logps": -256.6071472167969, "debug/sppo_chosen_loss": 2424.223388671875, "debug/sppo_chosen_reward_in_loss": 1.6681476831436157, "debug/sppo_rej_reward_in_loss": -4.251766204833984, "debug/sppo_reject_loss": 2210.83642578125, "epoch": 5.942028985507246, "grad_norm": 75992.92058643445, "learning_rate": 2.69449715370019e-08, "logits/chosen": 0.8839296102523804, "logits/rejected": 0.9679676294326782, "logps/chosen": -245.0952606201172, "logps/rejected": -260.85888671875, "loss": 4495.332, "rewards/accuracies": 0.75, "rewards/chosen": 0.016681477427482605, "rewards/margins": 0.05919914320111275, "rewards/rejected": -0.042517662048339844, "step": 1640 }, { "debug/policy_chosen_logits": 1.2831439971923828, "debug/policy_chosen_logps": -269.35162353515625, "debug/policy_rejected_logits": 1.743985891342163, "debug/policy_rejected_logps": -290.3727111816406, "debug/reference_chosen_logps": -273.44683837890625, "debug/reference_rejected_logps": -289.3643493652344, "debug/sppo_chosen_loss": 2139.2412109375, "debug/sppo_chosen_reward_in_loss": 4.09523868560791, "debug/sppo_rej_reward_in_loss": -1.008319616317749, "debug/sppo_reject_loss": 2479.122314453125, "epoch": 5.960144927536232, "grad_norm": 78043.65992033327, "learning_rate": 2.6707779886148008e-08, "logits/chosen": 1.2831439971923828, "logits/rejected": 1.743985891342163, "logps/chosen": -269.35162353515625, "logps/rejected": -290.3727111816406, "loss": 4698.8797, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.04095238819718361, "rewards/margins": 0.05103558301925659, "rewards/rejected": -0.010083195753395557, "step": 1645 }, { "debug/policy_chosen_logits": 1.1628918647766113, "debug/policy_chosen_logps": -256.0224914550781, "debug/policy_rejected_logits": 1.5377750396728516, "debug/policy_rejected_logps": -305.90521240234375, "debug/reference_chosen_logps": -259.07080078125, "debug/reference_rejected_logps": -302.33856201171875, "debug/sppo_chosen_loss": 2251.554931640625, "debug/sppo_chosen_reward_in_loss": 3.0483267307281494, "debug/sppo_rej_reward_in_loss": -3.5666823387145996, "debug/sppo_reject_loss": 2264.40185546875, "epoch": 5.978260869565218, "grad_norm": 67593.92221712113, "learning_rate": 2.6470588235294116e-08, "logits/chosen": 1.1628918647766113, "logits/rejected": 1.5377750396728516, "logps/chosen": -256.0224914550781, "logps/rejected": -305.90521240234375, "loss": 4532.7336, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.030483264476060867, "rewards/margins": 0.06615009158849716, "rewards/rejected": -0.035666827112436295, "step": 1650 }, { "debug/policy_chosen_logits": 1.0168049335479736, "debug/policy_chosen_logps": -265.20196533203125, "debug/policy_rejected_logits": 1.147486925125122, "debug/policy_rejected_logps": -267.65728759765625, "debug/reference_chosen_logps": -269.12786865234375, "debug/reference_rejected_logps": -266.747314453125, "debug/sppo_chosen_loss": 2146.16259765625, "debug/sppo_chosen_reward_in_loss": 3.9259142875671387, "debug/sppo_rej_reward_in_loss": -0.9099933505058289, "debug/sppo_reject_loss": 2501.74560546875, "epoch": 5.996376811594203, "grad_norm": 74167.82047555185, "learning_rate": 2.6233396584440225e-08, "logits/chosen": 1.0168049335479736, "logits/rejected": 1.147486925125122, "logps/chosen": -265.20196533203125, "logps/rejected": -267.65728759765625, "loss": 4510.0063, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03925914317369461, "rewards/margins": 0.04835907369852066, "rewards/rejected": -0.009099933318793774, "step": 1655 }, { "debug/policy_chosen_logits": 1.3281481266021729, "debug/policy_chosen_logps": -248.3011932373047, "debug/policy_rejected_logits": 1.4104559421539307, "debug/policy_rejected_logps": -261.9140319824219, "debug/reference_chosen_logps": -252.8326416015625, "debug/reference_rejected_logps": -259.8531494140625, "debug/sppo_chosen_loss": 2089.50830078125, "debug/sppo_chosen_reward_in_loss": 4.531442165374756, "debug/sppo_rej_reward_in_loss": -2.0609192848205566, "debug/sppo_reject_loss": 2387.59326171875, "epoch": 6.0144927536231885, "grad_norm": 190090.92272092507, "learning_rate": 2.5996204933586336e-08, "logits/chosen": 1.3281481266021729, "logits/rejected": 1.4104559421539307, "logps/chosen": -248.3011932373047, "logps/rejected": -261.9140319824219, "loss": 4373.9281, "rewards/accuracies": 0.75, "rewards/chosen": 0.045314423739910126, "rewards/margins": 0.06592361629009247, "rewards/rejected": -0.020609190687537193, "step": 1660 }, { "debug/policy_chosen_logits": 1.1314435005187988, "debug/policy_chosen_logps": -253.882568359375, "debug/policy_rejected_logits": 1.2066543102264404, "debug/policy_rejected_logps": -282.6380615234375, "debug/reference_chosen_logps": -256.0284118652344, "debug/reference_rejected_logps": -279.94073486328125, "debug/sppo_chosen_loss": 2356.90283203125, "debug/sppo_chosen_reward_in_loss": 2.1458301544189453, "debug/sppo_rej_reward_in_loss": -2.6973178386688232, "debug/sppo_reject_loss": 2402.356201171875, "epoch": 6.032608695652174, "grad_norm": 75649.55873801625, "learning_rate": 2.5759013282732444e-08, "logits/chosen": 1.1314435005187988, "logits/rejected": 1.2066543102264404, "logps/chosen": -253.882568359375, "logps/rejected": -282.6380615234375, "loss": 4477.3898, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.021458299830555916, "rewards/margins": 0.048431478440761566, "rewards/rejected": -0.0269731767475605, "step": 1665 }, { "debug/policy_chosen_logits": 0.9810155630111694, "debug/policy_chosen_logps": -259.0936279296875, "debug/policy_rejected_logits": 1.3452823162078857, "debug/policy_rejected_logps": -272.289306640625, "debug/reference_chosen_logps": -260.8038330078125, "debug/reference_rejected_logps": -269.924072265625, "debug/sppo_chosen_loss": 2412.857421875, "debug/sppo_chosen_reward_in_loss": 1.710209608078003, "debug/sppo_rej_reward_in_loss": -2.3652427196502686, "debug/sppo_reject_loss": 2380.012939453125, "epoch": 6.050724637681159, "grad_norm": 70476.47404438716, "learning_rate": 2.5521821631878553e-08, "logits/chosen": 0.9810155630111694, "logits/rejected": 1.3452823162078857, "logps/chosen": -259.0936279296875, "logps/rejected": -272.289306640625, "loss": 4622.7539, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.01710209622979164, "rewards/margins": 0.0407545268535614, "rewards/rejected": -0.02365242876112461, "step": 1670 }, { "debug/policy_chosen_logits": 1.0480362176895142, "debug/policy_chosen_logps": -235.658935546875, "debug/policy_rejected_logits": 1.3363116979599, "debug/policy_rejected_logps": -270.40911865234375, "debug/reference_chosen_logps": -237.66552734375, "debug/reference_rejected_logps": -266.9305725097656, "debug/sppo_chosen_loss": 2375.833984375, "debug/sppo_chosen_reward_in_loss": 2.0065994262695312, "debug/sppo_rej_reward_in_loss": -3.478525161743164, "debug/sppo_reject_loss": 2295.56689453125, "epoch": 6.068840579710145, "grad_norm": 121915.79162545329, "learning_rate": 2.5284629981024668e-08, "logits/chosen": 1.0480362176895142, "logits/rejected": 1.3363116979599, "logps/chosen": -235.658935546875, "logps/rejected": -270.40911865234375, "loss": 4504.3027, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.020065993070602417, "rewards/margins": 0.05485125258564949, "rewards/rejected": -0.03478524833917618, "step": 1675 }, { "debug/policy_chosen_logits": 0.9424604177474976, "debug/policy_chosen_logps": -244.01724243164062, "debug/policy_rejected_logits": 1.160203218460083, "debug/policy_rejected_logps": -281.0983581542969, "debug/reference_chosen_logps": -246.68087768554688, "debug/reference_rejected_logps": -274.8426208496094, "debug/sppo_chosen_loss": 2290.985595703125, "debug/sppo_chosen_reward_in_loss": 2.6636290550231934, "debug/sppo_rej_reward_in_loss": -6.255739212036133, "debug/sppo_reject_loss": 2084.802978515625, "epoch": 6.086956521739131, "grad_norm": 64445.05477988002, "learning_rate": 2.504743833017078e-08, "logits/chosen": 0.9424604177474976, "logits/rejected": 1.160203218460083, "logps/chosen": -244.01724243164062, "logps/rejected": -281.0983581542969, "loss": 4460.7867, "rewards/accuracies": 0.75, "rewards/chosen": 0.026636291295289993, "rewards/margins": 0.0891936793923378, "rewards/rejected": -0.06255738437175751, "step": 1680 }, { "debug/policy_chosen_logits": 1.0586298704147339, "debug/policy_chosen_logps": -251.181396484375, "debug/policy_rejected_logits": 1.5213289260864258, "debug/policy_rejected_logps": -321.1294860839844, "debug/reference_chosen_logps": -255.57870483398438, "debug/reference_rejected_logps": -316.02362060546875, "debug/sppo_chosen_loss": 2097.501953125, "debug/sppo_chosen_reward_in_loss": 4.397290229797363, "debug/sppo_rej_reward_in_loss": -5.105895042419434, "debug/sppo_reject_loss": 2166.73193359375, "epoch": 6.105072463768116, "grad_norm": 102848.29809951434, "learning_rate": 2.4810246679316887e-08, "logits/chosen": 1.0586298704147339, "logits/rejected": 1.5213289260864258, "logps/chosen": -251.181396484375, "logps/rejected": -321.1294860839844, "loss": 4329.082, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.04397290199995041, "rewards/margins": 0.09503184258937836, "rewards/rejected": -0.051058948040008545, "step": 1685 }, { "debug/policy_chosen_logits": 1.180855631828308, "debug/policy_chosen_logps": -239.8685302734375, "debug/policy_rejected_logits": 1.3631532192230225, "debug/policy_rejected_logps": -288.0668640136719, "debug/reference_chosen_logps": -242.98355102539062, "debug/reference_rejected_logps": -285.4348449707031, "debug/sppo_chosen_loss": 2223.875732421875, "debug/sppo_chosen_reward_in_loss": 3.1150150299072266, "debug/sppo_rej_reward_in_loss": -2.6320197582244873, "debug/sppo_reject_loss": 2329.66455078125, "epoch": 6.1231884057971016, "grad_norm": 75342.10597554644, "learning_rate": 2.4573055028462996e-08, "logits/chosen": 1.180855631828308, "logits/rejected": 1.3631532192230225, "logps/chosen": -239.8685302734375, "logps/rejected": -288.0668640136719, "loss": 4493.1914, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.031150151044130325, "rewards/margins": 0.05747034400701523, "rewards/rejected": -0.02632019855082035, "step": 1690 }, { "debug/policy_chosen_logits": 1.1230027675628662, "debug/policy_chosen_logps": -266.40081787109375, "debug/policy_rejected_logits": 1.543755292892456, "debug/policy_rejected_logps": -307.3990478515625, "debug/reference_chosen_logps": -269.02978515625, "debug/reference_rejected_logps": -303.439208984375, "debug/sppo_chosen_loss": 2277.095703125, "debug/sppo_chosen_reward_in_loss": 2.62892484664917, "debug/sppo_rej_reward_in_loss": -3.9598388671875, "debug/sppo_reject_loss": 2233.00634765625, "epoch": 6.141304347826087, "grad_norm": 66619.70023799865, "learning_rate": 2.4335863377609107e-08, "logits/chosen": 1.1230027675628662, "logits/rejected": 1.543755292892456, "logps/chosen": -266.40081787109375, "logps/rejected": -307.3990478515625, "loss": 4510.4379, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02628924883902073, "rewards/margins": 0.06588763743638992, "rewards/rejected": -0.039598386734724045, "step": 1695 }, { "debug/policy_chosen_logits": 1.0333296060562134, "debug/policy_chosen_logps": -258.3095397949219, "debug/policy_rejected_logits": 1.1790844202041626, "debug/policy_rejected_logps": -277.36505126953125, "debug/reference_chosen_logps": -261.2309875488281, "debug/reference_rejected_logps": -270.38702392578125, "debug/sppo_chosen_loss": 2243.58203125, "debug/sppo_chosen_reward_in_loss": 2.921450138092041, "debug/sppo_rej_reward_in_loss": -6.978041648864746, "debug/sppo_reject_loss": 1991.030517578125, "epoch": 6.159420289855072, "grad_norm": 60675.80588841475, "learning_rate": 2.409867172675522e-08, "logits/chosen": 1.0333296060562134, "logits/rejected": 1.1790844202041626, "logps/chosen": -258.3095397949219, "logps/rejected": -277.36505126953125, "loss": 4493.7113, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.02921450138092041, "rewards/margins": 0.09899492561817169, "rewards/rejected": -0.06978040933609009, "step": 1700 }, { "epoch": 6.159420289855072, "eval_debug/policy_chosen_logits": 1.4028468132019043, "eval_debug/policy_chosen_logps": -251.8597412109375, "eval_debug/policy_rejected_logits": 1.4487698078155518, "eval_debug/policy_rejected_logps": -262.9095458984375, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2467.51708984375, "eval_debug/sppo_chosen_reward_in_loss": 1.058749794960022, "eval_debug/sppo_rej_reward_in_loss": -3.25089955329895, "eval_debug/sppo_reject_loss": 2344.796142578125, "eval_logits/chosen": 1.4028468132019043, "eval_logits/rejected": 1.4487698078155518, "eval_logps/chosen": -251.8597412109375, "eval_logps/rejected": -262.9095458984375, "eval_loss": 4615.8251953125, "eval_rewards/accuracies": 0.5921052694320679, "eval_rewards/chosen": 0.010587499476969242, "eval_rewards/margins": 0.043096497654914856, "eval_rewards/rejected": -0.03250899538397789, "eval_runtime": 28.4957, "eval_samples_per_second": 21.056, "eval_steps_per_second": 0.667, "step": 1700 }, { "debug/policy_chosen_logits": 1.1369378566741943, "debug/policy_chosen_logps": -249.6217803955078, "debug/policy_rejected_logits": 1.5532922744750977, "debug/policy_rejected_logps": -312.9566955566406, "debug/reference_chosen_logps": -252.29525756835938, "debug/reference_rejected_logps": -306.924072265625, "debug/sppo_chosen_loss": 2271.484619140625, "debug/sppo_chosen_reward_in_loss": 2.673490047454834, "debug/sppo_rej_reward_in_loss": -6.0326433181762695, "debug/sppo_reject_loss": 2112.97705078125, "epoch": 6.177536231884058, "grad_norm": 68120.09822530487, "learning_rate": 2.3861480075901327e-08, "logits/chosen": 1.1369378566741943, "logits/rejected": 1.5532922744750977, "logps/chosen": -249.6217803955078, "logps/rejected": -312.9566955566406, "loss": 4419.7914, "rewards/accuracies": 0.75, "rewards/chosen": 0.02673489786684513, "rewards/margins": 0.0870613306760788, "rewards/rejected": -0.06032641977071762, "step": 1705 }, { "debug/policy_chosen_logits": 1.0618484020233154, "debug/policy_chosen_logps": -254.3784942626953, "debug/policy_rejected_logits": 1.5129501819610596, "debug/policy_rejected_logps": -304.7856140136719, "debug/reference_chosen_logps": -257.25634765625, "debug/reference_rejected_logps": -300.0366516113281, "debug/sppo_chosen_loss": 2262.7529296875, "debug/sppo_chosen_reward_in_loss": 2.877861261367798, "debug/sppo_rej_reward_in_loss": -4.748948574066162, "debug/sppo_reject_loss": 2152.86669921875, "epoch": 6.195652173913044, "grad_norm": 136626.54162799715, "learning_rate": 2.3624288425047436e-08, "logits/chosen": 1.0618484020233154, "logits/rejected": 1.5129501819610596, "logps/chosen": -254.3784942626953, "logps/rejected": -304.7856140136719, "loss": 4455.3164, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.02877860888838768, "rewards/margins": 0.07626809179782867, "rewards/rejected": -0.04748949036002159, "step": 1710 }, { "debug/policy_chosen_logits": 0.8706594705581665, "debug/policy_chosen_logps": -234.66995239257812, "debug/policy_rejected_logits": 1.3307873010635376, "debug/policy_rejected_logps": -298.90673828125, "debug/reference_chosen_logps": -234.290283203125, "debug/reference_rejected_logps": -290.80792236328125, "debug/sppo_chosen_loss": 2670.64013671875, "debug/sppo_chosen_reward_in_loss": -0.37968406081199646, "debug/sppo_rej_reward_in_loss": -8.098812103271484, "debug/sppo_reject_loss": 1990.855224609375, "epoch": 6.213768115942029, "grad_norm": 81496.02461881285, "learning_rate": 2.3387096774193547e-08, "logits/chosen": 0.8706594705581665, "logits/rejected": 1.3307873010635376, "logps/chosen": -234.66995239257812, "logps/rejected": -298.90673828125, "loss": 4545.2516, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.003796841949224472, "rewards/margins": 0.07719127833843231, "rewards/rejected": -0.08098812401294708, "step": 1715 }, { "debug/policy_chosen_logits": 1.243774652481079, "debug/policy_chosen_logps": -231.81765747070312, "debug/policy_rejected_logits": 1.332818865776062, "debug/policy_rejected_logps": -259.4305725097656, "debug/reference_chosen_logps": -235.42819213867188, "debug/reference_rejected_logps": -255.14407348632812, "debug/sppo_chosen_loss": 2162.8759765625, "debug/sppo_chosen_reward_in_loss": 3.6105475425720215, "debug/sppo_rej_reward_in_loss": -4.286492347717285, "debug/sppo_reject_loss": 2236.702392578125, "epoch": 6.231884057971015, "grad_norm": 172915.2832910033, "learning_rate": 2.314990512333966e-08, "logits/chosen": 1.243774652481079, "logits/rejected": 1.332818865776062, "logps/chosen": -231.81765747070312, "logps/rejected": -259.4305725097656, "loss": 4454.4715, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.036105476319789886, "rewards/margins": 0.07897039502859116, "rewards/rejected": -0.04286491870880127, "step": 1720 }, { "debug/policy_chosen_logits": 1.2093563079833984, "debug/policy_chosen_logps": -261.21099853515625, "debug/policy_rejected_logits": 1.4681745767593384, "debug/policy_rejected_logps": -295.8939208984375, "debug/reference_chosen_logps": -262.1924743652344, "debug/reference_rejected_logps": -292.49652099609375, "debug/sppo_chosen_loss": 2493.75634765625, "debug/sppo_chosen_reward_in_loss": 0.9814649820327759, "debug/sppo_rej_reward_in_loss": -3.397385835647583, "debug/sppo_reject_loss": 2276.71240234375, "epoch": 6.25, "grad_norm": 75376.03147049477, "learning_rate": 2.2912713472485767e-08, "logits/chosen": 1.2093563079833984, "logits/rejected": 1.4681745767593384, "logps/chosen": -261.21099853515625, "logps/rejected": -295.8939208984375, "loss": 4476.1195, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.009814648889005184, "rewards/margins": 0.04378850385546684, "rewards/rejected": -0.03397385776042938, "step": 1725 }, { "debug/policy_chosen_logits": 1.1747931241989136, "debug/policy_chosen_logps": -239.6561279296875, "debug/policy_rejected_logits": 1.347029447555542, "debug/policy_rejected_logps": -270.828369140625, "debug/reference_chosen_logps": -239.88345336914062, "debug/reference_rejected_logps": -264.8446350097656, "debug/sppo_chosen_loss": 2559.77197265625, "debug/sppo_chosen_reward_in_loss": 0.2272968292236328, "debug/sppo_rej_reward_in_loss": -5.983729362487793, "debug/sppo_reject_loss": 2130.99755859375, "epoch": 6.268115942028985, "grad_norm": 85882.55197825772, "learning_rate": 2.2675521821631875e-08, "logits/chosen": 1.1747931241989136, "logits/rejected": 1.347029447555542, "logps/chosen": -239.6561279296875, "logps/rejected": -270.828369140625, "loss": 4520.0727, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0022729686461389065, "rewards/margins": 0.062110256403684616, "rewards/rejected": -0.05983729287981987, "step": 1730 }, { "debug/policy_chosen_logits": 0.9882405996322632, "debug/policy_chosen_logps": -246.3030242919922, "debug/policy_rejected_logits": 1.3169834613800049, "debug/policy_rejected_logps": -287.1503601074219, "debug/reference_chosen_logps": -248.7151641845703, "debug/reference_rejected_logps": -286.00408935546875, "debug/sppo_chosen_loss": 2303.157958984375, "debug/sppo_chosen_reward_in_loss": 2.412151336669922, "debug/sppo_rej_reward_in_loss": -1.1463024616241455, "debug/sppo_reject_loss": 2430.67578125, "epoch": 6.286231884057971, "grad_norm": 64942.11796986963, "learning_rate": 2.243833017077799e-08, "logits/chosen": 0.9882405996322632, "logits/rejected": 1.3169834613800049, "logps/chosen": -246.3030242919922, "logps/rejected": -287.1503601074219, "loss": 4583.1187, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.024121513590216637, "rewards/margins": 0.03558453544974327, "rewards/rejected": -0.011463024653494358, "step": 1735 }, { "debug/policy_chosen_logits": 1.2585456371307373, "debug/policy_chosen_logps": -272.79949951171875, "debug/policy_rejected_logits": 1.610257863998413, "debug/policy_rejected_logps": -330.37957763671875, "debug/reference_chosen_logps": -275.38238525390625, "debug/reference_rejected_logps": -324.34478759765625, "debug/sppo_chosen_loss": 2267.946533203125, "debug/sppo_chosen_reward_in_loss": 2.5828967094421387, "debug/sppo_rej_reward_in_loss": -6.034759998321533, "debug/sppo_reject_loss": 2094.81396484375, "epoch": 6.304347826086957, "grad_norm": 70932.4763290442, "learning_rate": 2.22011385199241e-08, "logits/chosen": 1.2585456371307373, "logits/rejected": 1.610257863998413, "logps/chosen": -272.79949951171875, "logps/rejected": -330.37957763671875, "loss": 4588.693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02582896687090397, "rewards/margins": 0.0861765667796135, "rewards/rejected": -0.06034759804606438, "step": 1740 }, { "debug/policy_chosen_logits": 0.8744049072265625, "debug/policy_chosen_logps": -261.36968994140625, "debug/policy_rejected_logits": 1.0528736114501953, "debug/policy_rejected_logps": -305.7917175292969, "debug/reference_chosen_logps": -263.1070861816406, "debug/reference_rejected_logps": -298.8080139160156, "debug/sppo_chosen_loss": 2376.523193359375, "debug/sppo_chosen_reward_in_loss": 1.7373943328857422, "debug/sppo_rej_reward_in_loss": -6.983738899230957, "debug/sppo_reject_loss": 2048.61865234375, "epoch": 6.322463768115942, "grad_norm": 67991.64233708223, "learning_rate": 2.1963946869070207e-08, "logits/chosen": 0.8744049072265625, "logits/rejected": 1.0528736114501953, "logps/chosen": -261.36968994140625, "logps/rejected": -305.7917175292969, "loss": 4560.8809, "rewards/accuracies": 0.75, "rewards/chosen": 0.01737394370138645, "rewards/margins": 0.08721132576465607, "rewards/rejected": -0.06983737647533417, "step": 1745 }, { "debug/policy_chosen_logits": 1.2370020151138306, "debug/policy_chosen_logps": -244.7306365966797, "debug/policy_rejected_logits": 1.4110075235366821, "debug/policy_rejected_logps": -275.5528869628906, "debug/reference_chosen_logps": -247.2993927001953, "debug/reference_rejected_logps": -272.1546325683594, "debug/sppo_chosen_loss": 2269.852783203125, "debug/sppo_chosen_reward_in_loss": 2.568727970123291, "debug/sppo_rej_reward_in_loss": -3.398226261138916, "debug/sppo_reject_loss": 2311.23681640625, "epoch": 6.340579710144928, "grad_norm": 94295.41210465899, "learning_rate": 2.1726755218216315e-08, "logits/chosen": 1.2370020151138306, "logits/rejected": 1.4110075235366821, "logps/chosen": -244.7306365966797, "logps/rejected": -275.5528869628906, "loss": 4401.8625, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02568727731704712, "rewards/margins": 0.05966954305768013, "rewards/rejected": -0.033982258290052414, "step": 1750 }, { "debug/policy_chosen_logits": 0.9745082855224609, "debug/policy_chosen_logps": -265.63018798828125, "debug/policy_rejected_logits": 1.226767897605896, "debug/policy_rejected_logps": -309.2068176269531, "debug/reference_chosen_logps": -267.568359375, "debug/reference_rejected_logps": -307.5111389160156, "debug/sppo_chosen_loss": 2368.237060546875, "debug/sppo_chosen_reward_in_loss": 1.938194990158081, "debug/sppo_rej_reward_in_loss": -1.6956819295883179, "debug/sppo_reject_loss": 2399.1015625, "epoch": 6.358695652173913, "grad_norm": 71523.56251723005, "learning_rate": 2.148956356736243e-08, "logits/chosen": 0.9745082855224609, "logits/rejected": 1.226767897605896, "logps/chosen": -265.63018798828125, "logps/rejected": -309.2068176269531, "loss": 4526.6758, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.019381949678063393, "rewards/margins": 0.036338768899440765, "rewards/rejected": -0.016956817358732224, "step": 1755 }, { "debug/policy_chosen_logits": 1.1224478483200073, "debug/policy_chosen_logps": -262.31341552734375, "debug/policy_rejected_logits": 1.3794440031051636, "debug/policy_rejected_logps": -301.6752624511719, "debug/reference_chosen_logps": -265.3297424316406, "debug/reference_rejected_logps": -296.4901123046875, "debug/sppo_chosen_loss": 2239.707763671875, "debug/sppo_chosen_reward_in_loss": 3.01631236076355, "debug/sppo_rej_reward_in_loss": -5.185140609741211, "debug/sppo_reject_loss": 2133.2783203125, "epoch": 6.3768115942028984, "grad_norm": 95183.85635221565, "learning_rate": 2.1252371916508538e-08, "logits/chosen": 1.1224478483200073, "logits/rejected": 1.3794440031051636, "logps/chosen": -262.31341552734375, "logps/rejected": -301.6752624511719, "loss": 4592.4672, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.030163124203681946, "rewards/margins": 0.0820145234465599, "rewards/rejected": -0.051851410418748856, "step": 1760 }, { "debug/policy_chosen_logits": 1.3068724870681763, "debug/policy_chosen_logps": -274.2635192871094, "debug/policy_rejected_logits": 1.3568847179412842, "debug/policy_rejected_logps": -291.92388916015625, "debug/reference_chosen_logps": -274.2155456542969, "debug/reference_rejected_logps": -287.33245849609375, "debug/sppo_chosen_loss": 2572.94970703125, "debug/sppo_chosen_reward_in_loss": -0.048030853271484375, "debug/sppo_rej_reward_in_loss": -4.591431140899658, "debug/sppo_reject_loss": 2212.39697265625, "epoch": 6.394927536231884, "grad_norm": 83387.08784942274, "learning_rate": 2.1015180265654647e-08, "logits/chosen": 1.3068724870681763, "logits/rejected": 1.3568847179412842, "logps/chosen": -274.2635192871094, "logps/rejected": -291.92388916015625, "loss": 4541.618, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00048030837206169963, "rewards/margins": 0.045433998107910156, "rewards/rejected": -0.045914310961961746, "step": 1765 }, { "debug/policy_chosen_logits": 1.0534372329711914, "debug/policy_chosen_logps": -238.79867553710938, "debug/policy_rejected_logits": 1.4849567413330078, "debug/policy_rejected_logps": -283.2656555175781, "debug/reference_chosen_logps": -239.0229034423828, "debug/reference_rejected_logps": -279.0099182128906, "debug/sppo_chosen_loss": 2554.58447265625, "debug/sppo_chosen_reward_in_loss": 0.2242279052734375, "debug/sppo_rej_reward_in_loss": -4.2557291984558105, "debug/sppo_reject_loss": 2225.049560546875, "epoch": 6.413043478260869, "grad_norm": 83301.53560221496, "learning_rate": 2.0777988614800758e-08, "logits/chosen": 1.0534372329711914, "logits/rejected": 1.4849567413330078, "logps/chosen": -238.79867553710938, "logps/rejected": -283.2656555175781, "loss": 4651.1922, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0022422783076763153, "rewards/margins": 0.0447995662689209, "rewards/rejected": -0.04255729168653488, "step": 1770 }, { "debug/policy_chosen_logits": 1.2274070978164673, "debug/policy_chosen_logps": -279.4800109863281, "debug/policy_rejected_logits": 1.2371889352798462, "debug/policy_rejected_logps": -279.3857727050781, "debug/reference_chosen_logps": -280.98284912109375, "debug/reference_rejected_logps": -277.61669921875, "debug/sppo_chosen_loss": 2392.0419921875, "debug/sppo_chosen_reward_in_loss": 1.5028270483016968, "debug/sppo_rej_reward_in_loss": -1.7690660953521729, "debug/sppo_reject_loss": 2391.68603515625, "epoch": 6.431159420289855, "grad_norm": 87966.1320731594, "learning_rate": 2.054079696394687e-08, "logits/chosen": 1.2274070978164673, "logits/rejected": 1.2371889352798462, "logps/chosen": -279.4800109863281, "logps/rejected": -279.3857727050781, "loss": 4524.3898, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0150282708927989, "rewards/margins": 0.03271893039345741, "rewards/rejected": -0.017690658569335938, "step": 1775 }, { "debug/policy_chosen_logits": 0.8839910626411438, "debug/policy_chosen_logps": -271.580322265625, "debug/policy_rejected_logits": 1.2145618200302124, "debug/policy_rejected_logps": -312.14141845703125, "debug/reference_chosen_logps": -274.2127685546875, "debug/reference_rejected_logps": -305.23321533203125, "debug/sppo_chosen_loss": 2267.0732421875, "debug/sppo_chosen_reward_in_loss": 2.6324517726898193, "debug/sppo_rej_reward_in_loss": -6.908241271972656, "debug/sppo_reject_loss": 2013.29296875, "epoch": 6.449275362318841, "grad_norm": 62875.134823532266, "learning_rate": 2.0303605313092978e-08, "logits/chosen": 0.8839910626411438, "logits/rejected": 1.2145618200302124, "logps/chosen": -271.580322265625, "logps/rejected": -312.14141845703125, "loss": 4485.7164, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.026324516162276268, "rewards/margins": 0.0954069271683693, "rewards/rejected": -0.06908240169286728, "step": 1780 }, { "debug/policy_chosen_logits": 1.4505618810653687, "debug/policy_chosen_logps": -270.20306396484375, "debug/policy_rejected_logits": 1.5077623128890991, "debug/policy_rejected_logps": -285.6567077636719, "debug/reference_chosen_logps": -270.66192626953125, "debug/reference_rejected_logps": -283.3195495605469, "debug/sppo_chosen_loss": 2566.14306640625, "debug/sppo_chosen_reward_in_loss": 0.45892295241355896, "debug/sppo_rej_reward_in_loss": -2.3371639251708984, "debug/sppo_reject_loss": 2397.436767578125, "epoch": 6.467391304347826, "grad_norm": 59242.66318967145, "learning_rate": 2.0066413662239086e-08, "logits/chosen": 1.4505618810653687, "logits/rejected": 1.5077623128890991, "logps/chosen": -270.20306396484375, "logps/rejected": -285.6567077636719, "loss": 4451.6555, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.004589228890836239, "rewards/margins": 0.027960866689682007, "rewards/rejected": -0.023371640592813492, "step": 1785 }, { "debug/policy_chosen_logits": 0.8478299379348755, "debug/policy_chosen_logps": -243.46347045898438, "debug/policy_rejected_logits": 1.2035138607025146, "debug/policy_rejected_logps": -287.11334228515625, "debug/reference_chosen_logps": -244.35690307617188, "debug/reference_rejected_logps": -282.8691711425781, "debug/sppo_chosen_loss": 2545.322021484375, "debug/sppo_chosen_reward_in_loss": 0.8934265375137329, "debug/sppo_rej_reward_in_loss": -4.244162559509277, "debug/sppo_reject_loss": 2227.722412109375, "epoch": 6.4855072463768115, "grad_norm": 93237.46039705593, "learning_rate": 1.9829222011385198e-08, "logits/chosen": 0.8478299379348755, "logits/rejected": 1.2035138607025146, "logps/chosen": -243.46347045898438, "logps/rejected": -287.11334228515625, "loss": 4571.032, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0089342650026083, "rewards/margins": 0.05137588828802109, "rewards/rejected": -0.042441628873348236, "step": 1790 }, { "debug/policy_chosen_logits": 1.1431405544281006, "debug/policy_chosen_logps": -226.17550659179688, "debug/policy_rejected_logits": 1.3040499687194824, "debug/policy_rejected_logps": -250.34335327148438, "debug/reference_chosen_logps": -227.589599609375, "debug/reference_rejected_logps": -245.1594696044922, "debug/sppo_chosen_loss": 2397.38720703125, "debug/sppo_chosen_reward_in_loss": 1.4140945672988892, "debug/sppo_rej_reward_in_loss": -5.183899402618408, "debug/sppo_reject_loss": 2178.306640625, "epoch": 6.503623188405797, "grad_norm": 101634.06370262135, "learning_rate": 1.959203036053131e-08, "logits/chosen": 1.1431405544281006, "logits/rejected": 1.3040499687194824, "logps/chosen": -226.17550659179688, "logps/rejected": -250.34335327148438, "loss": 4552.0852, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.014140945859253407, "rewards/margins": 0.06597994267940521, "rewards/rejected": -0.05183899402618408, "step": 1795 }, { "debug/policy_chosen_logits": 1.1058228015899658, "debug/policy_chosen_logps": -233.66165161132812, "debug/policy_rejected_logits": 1.4377822875976562, "debug/policy_rejected_logps": -271.5790710449219, "debug/reference_chosen_logps": -234.68594360351562, "debug/reference_rejected_logps": -270.9393005371094, "debug/sppo_chosen_loss": 2457.52978515625, "debug/sppo_chosen_reward_in_loss": 1.0243265628814697, "debug/sppo_rej_reward_in_loss": -0.6397857666015625, "debug/sppo_reject_loss": 2491.752197265625, "epoch": 6.521739130434782, "grad_norm": 96361.88212187681, "learning_rate": 1.9354838709677418e-08, "logits/chosen": 1.1058228015899658, "logits/rejected": 1.4377822875976562, "logps/chosen": -233.66165161132812, "logps/rejected": -271.5790710449219, "loss": 4579.916, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.010243264026939869, "rewards/margins": 0.016641123220324516, "rewards/rejected": -0.006397857330739498, "step": 1800 }, { "epoch": 6.521739130434782, "eval_debug/policy_chosen_logits": 1.4013402462005615, "eval_debug/policy_chosen_logps": -252.32699584960938, "eval_debug/policy_rejected_logits": 1.4454870223999023, "eval_debug/policy_rejected_logps": -263.42730712890625, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2516.505859375, "eval_debug/sppo_chosen_reward_in_loss": 0.5914798378944397, "eval_debug/sppo_rej_reward_in_loss": -3.768653154373169, "eval_debug/sppo_reject_loss": 2301.599853515625, "eval_logits/chosen": 1.4013402462005615, "eval_logits/rejected": 1.4454870223999023, "eval_logps/chosen": -252.32699584960938, "eval_logps/rejected": -263.42730712890625, "eval_loss": 4618.2861328125, "eval_rewards/accuracies": 0.5789473652839661, "eval_rewards/chosen": 0.0059147984720766544, "eval_rewards/margins": 0.043601326644420624, "eval_rewards/rejected": -0.037686530500650406, "eval_runtime": 28.4923, "eval_samples_per_second": 21.058, "eval_steps_per_second": 0.667, "step": 1800 }, { "debug/policy_chosen_logits": 1.0678592920303345, "debug/policy_chosen_logps": -241.2152862548828, "debug/policy_rejected_logits": 1.1222550868988037, "debug/policy_rejected_logps": -266.5633850097656, "debug/reference_chosen_logps": -242.4794464111328, "debug/reference_rejected_logps": -261.7477111816406, "debug/sppo_chosen_loss": 2414.31689453125, "debug/sppo_chosen_reward_in_loss": 1.264171838760376, "debug/sppo_rej_reward_in_loss": -4.815686225891113, "debug/sppo_reject_loss": 2181.051513671875, "epoch": 6.539855072463768, "grad_norm": 63354.98807018528, "learning_rate": 1.9117647058823526e-08, "logits/chosen": 1.0678592920303345, "logits/rejected": 1.1222550868988037, "logps/chosen": -241.2152862548828, "logps/rejected": -266.5633850097656, "loss": 4489.6062, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.012641718611121178, "rewards/margins": 0.06079857423901558, "rewards/rejected": -0.04815686121582985, "step": 1805 }, { "debug/policy_chosen_logits": 1.096115231513977, "debug/policy_chosen_logps": -241.89071655273438, "debug/policy_rejected_logits": 1.6116344928741455, "debug/policy_rejected_logps": -313.1919860839844, "debug/reference_chosen_logps": -243.9291534423828, "debug/reference_rejected_logps": -306.9833984375, "debug/sppo_chosen_loss": 2343.572509765625, "debug/sppo_chosen_reward_in_loss": 2.038416624069214, "debug/sppo_rej_reward_in_loss": -6.208601951599121, "debug/sppo_reject_loss": 2059.91943359375, "epoch": 6.557971014492754, "grad_norm": 78679.8391695401, "learning_rate": 1.888045540796964e-08, "logits/chosen": 1.096115231513977, "logits/rejected": 1.6116344928741455, "logps/chosen": -241.89071655273438, "logps/rejected": -313.1919860839844, "loss": 4593.9781, "rewards/accuracies": 0.75, "rewards/chosen": 0.02038416638970375, "rewards/margins": 0.08247018605470657, "rewards/rejected": -0.06208602339029312, "step": 1810 }, { "debug/policy_chosen_logits": 1.493251085281372, "debug/policy_chosen_logps": -271.6695861816406, "debug/policy_rejected_logits": 1.6988967657089233, "debug/policy_rejected_logps": -284.50970458984375, "debug/reference_chosen_logps": -273.122802734375, "debug/reference_rejected_logps": -281.25311279296875, "debug/sppo_chosen_loss": 2389.869873046875, "debug/sppo_chosen_reward_in_loss": 1.4532390832901, "debug/sppo_rej_reward_in_loss": -3.2565548419952393, "debug/sppo_reject_loss": 2328.518310546875, "epoch": 6.576086956521739, "grad_norm": 111397.18127759268, "learning_rate": 1.864326375711575e-08, "logits/chosen": 1.493251085281372, "logits/rejected": 1.6988967657089233, "logps/chosen": -271.6695861816406, "logps/rejected": -284.50970458984375, "loss": 4530.6062, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.014532390050590038, "rewards/margins": 0.047097936272621155, "rewards/rejected": -0.03256554901599884, "step": 1815 }, { "debug/policy_chosen_logits": 1.004615306854248, "debug/policy_chosen_logps": -232.61917114257812, "debug/policy_rejected_logits": 1.4028778076171875, "debug/policy_rejected_logps": -286.76715087890625, "debug/reference_chosen_logps": -235.10836791992188, "debug/reference_rejected_logps": -282.6846008300781, "debug/sppo_chosen_loss": 2277.584228515625, "debug/sppo_chosen_reward_in_loss": 2.4891979694366455, "debug/sppo_rej_reward_in_loss": -4.08255672454834, "debug/sppo_reject_loss": 2253.26611328125, "epoch": 6.594202898550725, "grad_norm": 63669.11422006237, "learning_rate": 1.8406072106261857e-08, "logits/chosen": 1.004615306854248, "logits/rejected": 1.4028778076171875, "logps/chosen": -232.61917114257812, "logps/rejected": -286.76715087890625, "loss": 4521.5066, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02489197999238968, "rewards/margins": 0.06571754068136215, "rewards/rejected": -0.04082556813955307, "step": 1820 }, { "debug/policy_chosen_logits": 0.9203485250473022, "debug/policy_chosen_logps": -237.0836181640625, "debug/policy_rejected_logits": 1.336921215057373, "debug/policy_rejected_logps": -287.36083984375, "debug/reference_chosen_logps": -238.054443359375, "debug/reference_rejected_logps": -277.24664306640625, "debug/sppo_chosen_loss": 2440.59326171875, "debug/sppo_chosen_reward_in_loss": 0.9708188772201538, "debug/sppo_rej_reward_in_loss": -10.11417007446289, "debug/sppo_reject_loss": 1855.061767578125, "epoch": 6.61231884057971, "grad_norm": 63425.059317606545, "learning_rate": 1.816888045540797e-08, "logits/chosen": 0.9203485250473022, "logits/rejected": 1.336921215057373, "logps/chosen": -237.0836181640625, "logps/rejected": -287.36083984375, "loss": 4403.1539, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.009708188474178314, "rewards/margins": 0.11084987968206406, "rewards/rejected": -0.10114170610904694, "step": 1825 }, { "debug/policy_chosen_logits": 1.229878544807434, "debug/policy_chosen_logps": -242.9008331298828, "debug/policy_rejected_logits": 1.5579349994659424, "debug/policy_rejected_logps": -280.9997863769531, "debug/reference_chosen_logps": -245.7642822265625, "debug/reference_rejected_logps": -274.960693359375, "debug/sppo_chosen_loss": 2241.7275390625, "debug/sppo_chosen_reward_in_loss": 2.86346435546875, "debug/sppo_rej_reward_in_loss": -6.039079189300537, "debug/sppo_reject_loss": 2090.055419921875, "epoch": 6.630434782608695, "grad_norm": 56630.38293720027, "learning_rate": 1.793168880455408e-08, "logits/chosen": 1.229878544807434, "logits/rejected": 1.5579349994659424, "logps/chosen": -242.9008331298828, "logps/rejected": -280.9997863769531, "loss": 4410.1887, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02863464318215847, "rewards/margins": 0.08902543038129807, "rewards/rejected": -0.06039078161120415, "step": 1830 }, { "debug/policy_chosen_logits": 1.4164469242095947, "debug/policy_chosen_logps": -276.7262268066406, "debug/policy_rejected_logits": 1.859042763710022, "debug/policy_rejected_logps": -322.5560302734375, "debug/reference_chosen_logps": -277.9795837402344, "debug/reference_rejected_logps": -316.0045471191406, "debug/sppo_chosen_loss": 2471.96728515625, "debug/sppo_chosen_reward_in_loss": 1.2533445358276367, "debug/sppo_rej_reward_in_loss": -6.55150842666626, "debug/sppo_reject_loss": 2044.7171630859375, "epoch": 6.648550724637682, "grad_norm": 141913.62950136012, "learning_rate": 1.769449715370019e-08, "logits/chosen": 1.4164469242095947, "logits/rejected": 1.859042763710022, "logps/chosen": -276.7262268066406, "logps/rejected": -322.5560302734375, "loss": 4624.5031, "rewards/accuracies": 0.75, "rewards/chosen": 0.012533443979918957, "rewards/margins": 0.07804852724075317, "rewards/rejected": -0.06551508605480194, "step": 1835 }, { "debug/policy_chosen_logits": 1.3056552410125732, "debug/policy_chosen_logps": -254.89169311523438, "debug/policy_rejected_logits": 1.4644079208374023, "debug/policy_rejected_logps": -289.231201171875, "debug/reference_chosen_logps": -258.4832458496094, "debug/reference_rejected_logps": -281.6604309082031, "debug/sppo_chosen_loss": 2166.486083984375, "debug/sppo_chosen_reward_in_loss": 3.5915920734405518, "debug/sppo_rej_reward_in_loss": -7.570761680603027, "debug/sppo_reject_loss": 1988.0650634765625, "epoch": 6.666666666666667, "grad_norm": 69786.59380173436, "learning_rate": 1.7457305502846297e-08, "logits/chosen": 1.3056552410125732, "logits/rejected": 1.4644079208374023, "logps/chosen": -254.89169311523438, "logps/rejected": -289.231201171875, "loss": 4565.5895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03591591864824295, "rewards/margins": 0.11162354052066803, "rewards/rejected": -0.07570761442184448, "step": 1840 }, { "debug/policy_chosen_logits": 1.1246999502182007, "debug/policy_chosen_logps": -264.46368408203125, "debug/policy_rejected_logits": 1.1066020727157593, "debug/policy_rejected_logps": -254.493408203125, "debug/reference_chosen_logps": -264.2433166503906, "debug/reference_rejected_logps": -252.0456085205078, "debug/sppo_chosen_loss": 2650.852783203125, "debug/sppo_chosen_reward_in_loss": -0.22037239372730255, "debug/sppo_rej_reward_in_loss": -2.447835922241211, "debug/sppo_reject_loss": 2332.664794921875, "epoch": 6.684782608695652, "grad_norm": 59264.24862578685, "learning_rate": 1.722011385199241e-08, "logits/chosen": 1.1246999502182007, "logits/rejected": 1.1066020727157593, "logps/chosen": -264.46368408203125, "logps/rejected": -254.493408203125, "loss": 4443.7168, "rewards/accuracies": 0.625, "rewards/chosen": -0.002203723881393671, "rewards/margins": 0.02227463386952877, "rewards/rejected": -0.024478357285261154, "step": 1845 }, { "debug/policy_chosen_logits": 1.3174164295196533, "debug/policy_chosen_logps": -277.3408508300781, "debug/policy_rejected_logits": 1.4734827280044556, "debug/policy_rejected_logps": -327.6285400390625, "debug/reference_chosen_logps": -276.53717041015625, "debug/reference_rejected_logps": -322.140380859375, "debug/sppo_chosen_loss": 2671.393310546875, "debug/sppo_chosen_reward_in_loss": -0.8037059903144836, "debug/sppo_rej_reward_in_loss": -5.48813533782959, "debug/sppo_reject_loss": 2128.63818359375, "epoch": 6.702898550724638, "grad_norm": 91955.3155479067, "learning_rate": 1.698292220113852e-08, "logits/chosen": 1.3174164295196533, "logits/rejected": 1.4734827280044556, "logps/chosen": -277.3408508300781, "logps/rejected": -327.6285400390625, "loss": 4533.8645, "rewards/accuracies": 0.625, "rewards/chosen": -0.008037058636546135, "rewards/margins": 0.04684429243206978, "rewards/rejected": -0.05488135293126106, "step": 1850 }, { "debug/policy_chosen_logits": 1.0340816974639893, "debug/policy_chosen_logps": -243.3152618408203, "debug/policy_rejected_logits": 1.0824315547943115, "debug/policy_rejected_logps": -290.51580810546875, "debug/reference_chosen_logps": -245.24673461914062, "debug/reference_rejected_logps": -287.10443115234375, "debug/sppo_chosen_loss": 2369.966796875, "debug/sppo_chosen_reward_in_loss": 1.931471824645996, "debug/sppo_rej_reward_in_loss": -3.411371946334839, "debug/sppo_reject_loss": 2307.01513671875, "epoch": 6.721014492753623, "grad_norm": 68843.3352012986, "learning_rate": 1.674573055028463e-08, "logits/chosen": 1.0340816974639893, "logits/rejected": 1.0824315547943115, "logps/chosen": -243.3152618408203, "logps/rejected": -290.51580810546875, "loss": 4642.7312, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0193147175014019, "rewards/margins": 0.05342843383550644, "rewards/rejected": -0.03411371633410454, "step": 1855 }, { "debug/policy_chosen_logits": 1.0709569454193115, "debug/policy_chosen_logps": -246.58236694335938, "debug/policy_rejected_logits": 1.2553744316101074, "debug/policy_rejected_logps": -260.4317321777344, "debug/reference_chosen_logps": -247.5150146484375, "debug/reference_rejected_logps": -256.20477294921875, "debug/sppo_chosen_loss": 2482.680419921875, "debug/sppo_chosen_reward_in_loss": 0.9326246380805969, "debug/sppo_rej_reward_in_loss": -4.226986408233643, "debug/sppo_reject_loss": 2250.196533203125, "epoch": 6.739130434782608, "grad_norm": 57911.3843312196, "learning_rate": 1.650853889943074e-08, "logits/chosen": 1.0709569454193115, "logits/rejected": 1.2553744316101074, "logps/chosen": -246.58236694335938, "logps/rejected": -260.4317321777344, "loss": 4505.6258, "rewards/accuracies": 0.625, "rewards/chosen": 0.00932624563574791, "rewards/margins": 0.051596105098724365, "rewards/rejected": -0.042269863188266754, "step": 1860 }, { "debug/policy_chosen_logits": 1.2661768198013306, "debug/policy_chosen_logps": -268.0488586425781, "debug/policy_rejected_logits": 1.4225587844848633, "debug/policy_rejected_logps": -294.9800720214844, "debug/reference_chosen_logps": -270.0035705566406, "debug/reference_rejected_logps": -285.4292907714844, "debug/sppo_chosen_loss": 2359.51318359375, "debug/sppo_chosen_reward_in_loss": 1.9546918869018555, "debug/sppo_rej_reward_in_loss": -9.550777435302734, "debug/sppo_reject_loss": 1889.544189453125, "epoch": 6.757246376811594, "grad_norm": 104542.69189331429, "learning_rate": 1.627134724857685e-08, "logits/chosen": 1.2661768198013306, "logits/rejected": 1.4225587844848633, "logps/chosen": -268.0488586425781, "logps/rejected": -294.9800720214844, "loss": 4394.8395, "rewards/accuracies": 0.75, "rewards/chosen": 0.01954692043364048, "rewards/margins": 0.11505468189716339, "rewards/rejected": -0.09550776332616806, "step": 1865 }, { "debug/policy_chosen_logits": 1.2627099752426147, "debug/policy_chosen_logps": -256.3724670410156, "debug/policy_rejected_logits": 1.7017396688461304, "debug/policy_rejected_logps": -319.2103576660156, "debug/reference_chosen_logps": -257.7935485839844, "debug/reference_rejected_logps": -313.9791564941406, "debug/sppo_chosen_loss": 2396.61181640625, "debug/sppo_chosen_reward_in_loss": 1.4210996627807617, "debug/sppo_rej_reward_in_loss": -5.231224536895752, "debug/sppo_reject_loss": 2155.52685546875, "epoch": 6.77536231884058, "grad_norm": 90080.53087097123, "learning_rate": 1.603415559772296e-08, "logits/chosen": 1.2627099752426147, "logits/rejected": 1.7017396688461304, "logps/chosen": -256.3724670410156, "logps/rejected": -319.2103576660156, "loss": 4516.1672, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.014210996218025684, "rewards/margins": 0.0665232390165329, "rewards/rejected": -0.05231224372982979, "step": 1870 }, { "debug/policy_chosen_logits": 1.1207185983657837, "debug/policy_chosen_logps": -267.49774169921875, "debug/policy_rejected_logits": 1.3464891910552979, "debug/policy_rejected_logps": -280.2220764160156, "debug/reference_chosen_logps": -268.8659362792969, "debug/reference_rejected_logps": -275.41595458984375, "debug/sppo_chosen_loss": 2395.053466796875, "debug/sppo_chosen_reward_in_loss": 1.3682245016098022, "debug/sppo_rej_reward_in_loss": -4.806126594543457, "debug/sppo_reject_loss": 2154.291259765625, "epoch": 6.793478260869565, "grad_norm": 134803.0260830987, "learning_rate": 1.579696394686907e-08, "logits/chosen": 1.1207185983657837, "logits/rejected": 1.3464891910552979, "logps/chosen": -267.49774169921875, "logps/rejected": -280.2220764160156, "loss": 4569.518, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.013682246208190918, "rewards/margins": 0.061743516474962234, "rewards/rejected": -0.04806126654148102, "step": 1875 }, { "debug/policy_chosen_logits": 1.3032572269439697, "debug/policy_chosen_logps": -272.27587890625, "debug/policy_rejected_logits": 1.492292881011963, "debug/policy_rejected_logps": -324.453125, "debug/reference_chosen_logps": -273.9796142578125, "debug/reference_rejected_logps": -318.5680847167969, "debug/sppo_chosen_loss": 2376.34912109375, "debug/sppo_chosen_reward_in_loss": 1.703736662864685, "debug/sppo_rej_reward_in_loss": -5.885059356689453, "debug/sppo_reject_loss": 2089.869140625, "epoch": 6.811594202898551, "grad_norm": 61094.87479848938, "learning_rate": 1.555977229601518e-08, "logits/chosen": 1.3032572269439697, "logits/rejected": 1.492292881011963, "logps/chosen": -272.27587890625, "logps/rejected": -324.453125, "loss": 4527.8016, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.017037367448210716, "rewards/margins": 0.07588796317577362, "rewards/rejected": -0.058850597590208054, "step": 1880 }, { "debug/policy_chosen_logits": 1.207608938217163, "debug/policy_chosen_logps": -247.23226928710938, "debug/policy_rejected_logits": 1.3167990446090698, "debug/policy_rejected_logps": -273.7459411621094, "debug/reference_chosen_logps": -251.1371307373047, "debug/reference_rejected_logps": -267.4615478515625, "debug/sppo_chosen_loss": 2143.943603515625, "debug/sppo_chosen_reward_in_loss": 3.904855251312256, "debug/sppo_rej_reward_in_loss": -6.284393787384033, "debug/sppo_reject_loss": 2066.11572265625, "epoch": 6.829710144927536, "grad_norm": 85530.3105377737, "learning_rate": 1.532258064516129e-08, "logits/chosen": 1.207608938217163, "logits/rejected": 1.3167990446090698, "logps/chosen": -247.23226928710938, "logps/rejected": -273.7459411621094, "loss": 4417.398, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.03904855623841286, "rewards/margins": 0.10189249366521835, "rewards/rejected": -0.0628439337015152, "step": 1885 }, { "debug/policy_chosen_logits": 1.2641112804412842, "debug/policy_chosen_logps": -237.453369140625, "debug/policy_rejected_logits": 1.475335955619812, "debug/policy_rejected_logps": -279.74322509765625, "debug/reference_chosen_logps": -239.3144073486328, "debug/reference_rejected_logps": -273.20574951171875, "debug/sppo_chosen_loss": 2340.440673828125, "debug/sppo_chosen_reward_in_loss": 1.861051321029663, "debug/sppo_rej_reward_in_loss": -6.537452697753906, "debug/sppo_reject_loss": 2074.825927734375, "epoch": 6.8478260869565215, "grad_norm": 76221.26643629448, "learning_rate": 1.50853889943074e-08, "logits/chosen": 1.2641112804412842, "logits/rejected": 1.475335955619812, "logps/chosen": -237.453369140625, "logps/rejected": -279.74322509765625, "loss": 4366.6031, "rewards/accuracies": 0.75, "rewards/chosen": 0.0186105128377676, "rewards/margins": 0.08398503810167313, "rewards/rejected": -0.06537453085184097, "step": 1890 }, { "debug/policy_chosen_logits": 1.3160619735717773, "debug/policy_chosen_logps": -238.70358276367188, "debug/policy_rejected_logits": 1.5374112129211426, "debug/policy_rejected_logps": -300.89801025390625, "debug/reference_chosen_logps": -241.06076049804688, "debug/reference_rejected_logps": -292.98626708984375, "debug/sppo_chosen_loss": 2313.62646484375, "debug/sppo_chosen_reward_in_loss": 2.357179880142212, "debug/sppo_rej_reward_in_loss": -7.9117431640625, "debug/sppo_reject_loss": 1983.601806640625, "epoch": 6.865942028985507, "grad_norm": 58209.19935389577, "learning_rate": 1.4848197343453508e-08, "logits/chosen": 1.3160619735717773, "logits/rejected": 1.5374112129211426, "logps/chosen": -238.70358276367188, "logps/rejected": -300.89801025390625, "loss": 4363.2121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0235717985779047, "rewards/margins": 0.102689228951931, "rewards/rejected": -0.07911743223667145, "step": 1895 }, { "debug/policy_chosen_logits": 1.3453922271728516, "debug/policy_chosen_logps": -250.07955932617188, "debug/policy_rejected_logits": 1.6009600162506104, "debug/policy_rejected_logps": -282.43792724609375, "debug/reference_chosen_logps": -251.6648712158203, "debug/reference_rejected_logps": -276.7724609375, "debug/sppo_chosen_loss": 2390.849609375, "debug/sppo_chosen_reward_in_loss": 1.5852972269058228, "debug/sppo_rej_reward_in_loss": -5.665466785430908, "debug/sppo_reject_loss": 2122.5244140625, "epoch": 6.884057971014493, "grad_norm": 65350.91826894357, "learning_rate": 1.461100569259962e-08, "logits/chosen": 1.3453922271728516, "logits/rejected": 1.6009600162506104, "logps/chosen": -250.07955932617188, "logps/rejected": -282.43792724609375, "loss": 4682.2398, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.015852969139814377, "rewards/margins": 0.07250763475894928, "rewards/rejected": -0.0566546693444252, "step": 1900 }, { "epoch": 6.884057971014493, "eval_debug/policy_chosen_logits": 1.399086594581604, "eval_debug/policy_chosen_logps": -252.3165283203125, "eval_debug/policy_rejected_logits": 1.4429121017456055, "eval_debug/policy_rejected_logps": -263.5052185058594, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2513.978515625, "eval_debug/sppo_chosen_reward_in_loss": 0.6019408106803894, "eval_debug/sppo_rej_reward_in_loss": -3.8465723991394043, "eval_debug/sppo_reject_loss": 2293.43798828125, "eval_logits/chosen": 1.399086594581604, "eval_logits/rejected": 1.4429121017456055, "eval_logps/chosen": -252.3165283203125, "eval_logps/rejected": -263.5052185058594, "eval_loss": 4613.93017578125, "eval_rewards/accuracies": 0.6184210777282715, "eval_rewards/chosen": 0.006019407417625189, "eval_rewards/margins": 0.04448513686656952, "eval_rewards/rejected": -0.038465727120637894, "eval_runtime": 28.414, "eval_samples_per_second": 21.116, "eval_steps_per_second": 0.669, "step": 1900 }, { "debug/policy_chosen_logits": 1.0614590644836426, "debug/policy_chosen_logps": -217.14266967773438, "debug/policy_rejected_logits": 1.4198445081710815, "debug/policy_rejected_logps": -282.9961853027344, "debug/reference_chosen_logps": -220.3137969970703, "debug/reference_rejected_logps": -278.0849914550781, "debug/sppo_chosen_loss": 2213.91748046875, "debug/sppo_chosen_reward_in_loss": 3.171109199523926, "debug/sppo_rej_reward_in_loss": -4.911197185516357, "debug/sppo_reject_loss": 2177.46435546875, "epoch": 6.9021739130434785, "grad_norm": 63272.659685661776, "learning_rate": 1.437381404174573e-08, "logits/chosen": 1.0614590644836426, "logits/rejected": 1.4198445081710815, "logps/chosen": -217.14266967773438, "logps/rejected": -282.9961853027344, "loss": 4506.2602, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.031711094081401825, "rewards/margins": 0.08082306385040283, "rewards/rejected": -0.049111973494291306, "step": 1905 }, { "debug/policy_chosen_logits": 1.2487655878067017, "debug/policy_chosen_logps": -254.3880615234375, "debug/policy_rejected_logits": 1.5873732566833496, "debug/policy_rejected_logps": -295.77734375, "debug/reference_chosen_logps": -255.3196258544922, "debug/reference_rejected_logps": -290.5936279296875, "debug/sppo_chosen_loss": 2461.73046875, "debug/sppo_chosen_reward_in_loss": 0.931576132774353, "debug/sppo_rej_reward_in_loss": -5.183750629425049, "debug/sppo_reject_loss": 2128.001953125, "epoch": 6.920289855072464, "grad_norm": 70976.1161677343, "learning_rate": 1.413662239089184e-08, "logits/chosen": 1.2487655878067017, "logits/rejected": 1.5873732566833496, "logps/chosen": -254.3880615234375, "logps/rejected": -295.77734375, "loss": 4638.5223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.009315760806202888, "rewards/margins": 0.061153262853622437, "rewards/rejected": -0.0518374964594841, "step": 1910 }, { "debug/policy_chosen_logits": 0.9719457626342773, "debug/policy_chosen_logps": -234.44839477539062, "debug/policy_rejected_logits": 1.222617268562317, "debug/policy_rejected_logps": -279.7183532714844, "debug/reference_chosen_logps": -237.55307006835938, "debug/reference_rejected_logps": -274.2191467285156, "debug/sppo_chosen_loss": 2226.938720703125, "debug/sppo_chosen_reward_in_loss": 3.1046600341796875, "debug/sppo_rej_reward_in_loss": -5.499192237854004, "debug/sppo_reject_loss": 2128.958984375, "epoch": 6.938405797101449, "grad_norm": 61786.22107313845, "learning_rate": 1.3899430740037951e-08, "logits/chosen": 0.9719457626342773, "logits/rejected": 1.222617268562317, "logps/chosen": -234.44839477539062, "logps/rejected": -279.7183532714844, "loss": 4588.0164, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.03104659914970398, "rewards/margins": 0.0860385149717331, "rewards/rejected": -0.05499192327260971, "step": 1915 }, { "debug/policy_chosen_logits": 1.2095363140106201, "debug/policy_chosen_logps": -288.4798889160156, "debug/policy_rejected_logits": 1.1649754047393799, "debug/policy_rejected_logps": -266.6294860839844, "debug/reference_chosen_logps": -291.00299072265625, "debug/reference_rejected_logps": -263.9441223144531, "debug/sppo_chosen_loss": 2273.04931640625, "debug/sppo_chosen_reward_in_loss": 2.5230984687805176, "debug/sppo_rej_reward_in_loss": -2.685391664505005, "debug/sppo_reject_loss": 2289.666748046875, "epoch": 6.956521739130435, "grad_norm": 94037.89180045393, "learning_rate": 1.3662239089184061e-08, "logits/chosen": 1.2095363140106201, "logits/rejected": 1.1649754047393799, "logps/chosen": -288.4798889160156, "logps/rejected": -266.6294860839844, "loss": 4524.9098, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02523098513484001, "rewards/margins": 0.05208490043878555, "rewards/rejected": -0.02685391530394554, "step": 1920 }, { "debug/policy_chosen_logits": 1.39731764793396, "debug/policy_chosen_logps": -265.44482421875, "debug/policy_rejected_logits": 1.7464818954467773, "debug/policy_rejected_logps": -323.67926025390625, "debug/reference_chosen_logps": -266.50811767578125, "debug/reference_rejected_logps": -318.15435791015625, "debug/sppo_chosen_loss": 2453.25732421875, "debug/sppo_chosen_reward_in_loss": 1.06332266330719, "debug/sppo_rej_reward_in_loss": -5.524898529052734, "debug/sppo_reject_loss": 2112.041015625, "epoch": 6.97463768115942, "grad_norm": 74249.17604476992, "learning_rate": 1.342504743833017e-08, "logits/chosen": 1.39731764793396, "logits/rejected": 1.7464818954467773, "logps/chosen": -265.44482421875, "logps/rejected": -323.67926025390625, "loss": 4491.5437, "rewards/accuracies": 0.75, "rewards/chosen": 0.010633226484060287, "rewards/margins": 0.06588221341371536, "rewards/rejected": -0.055248986929655075, "step": 1925 }, { "debug/policy_chosen_logits": 1.30342698097229, "debug/policy_chosen_logps": -249.09030151367188, "debug/policy_rejected_logits": 1.7570003271102905, "debug/policy_rejected_logps": -300.46270751953125, "debug/reference_chosen_logps": -250.64013671875, "debug/reference_rejected_logps": -294.30718994140625, "debug/sppo_chosen_loss": 2373.09375, "debug/sppo_chosen_reward_in_loss": 1.549825668334961, "debug/sppo_rej_reward_in_loss": -6.155531406402588, "debug/sppo_reject_loss": 2092.76904296875, "epoch": 6.992753623188406, "grad_norm": 65596.96462390781, "learning_rate": 1.318785578747628e-08, "logits/chosen": 1.30342698097229, "logits/rejected": 1.7570003271102905, "logps/chosen": -249.09030151367188, "logps/rejected": -300.46270751953125, "loss": 4529.3219, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.015498255379498005, "rewards/margins": 0.07705356925725937, "rewards/rejected": -0.06155531480908394, "step": 1930 }, { "debug/policy_chosen_logits": 0.9820472002029419, "debug/policy_chosen_logps": -247.9277801513672, "debug/policy_rejected_logits": 1.17872953414917, "debug/policy_rejected_logps": -261.2237854003906, "debug/reference_chosen_logps": -250.2155303955078, "debug/reference_rejected_logps": -256.85382080078125, "debug/sppo_chosen_loss": 2313.646240234375, "debug/sppo_chosen_reward_in_loss": 2.28773832321167, "debug/sppo_rej_reward_in_loss": -4.369953155517578, "debug/sppo_reject_loss": 2210.85400390625, "epoch": 7.010869565217392, "grad_norm": 137436.28459313605, "learning_rate": 1.2950664136622391e-08, "logits/chosen": 0.9820472002029419, "logits/rejected": 1.17872953414917, "logps/chosen": -247.9277801513672, "logps/rejected": -261.2237854003906, "loss": 4446.5414, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02287738025188446, "rewards/margins": 0.06657691299915314, "rewards/rejected": -0.04369953274726868, "step": 1935 }, { "debug/policy_chosen_logits": 1.338914394378662, "debug/policy_chosen_logps": -268.7223815917969, "debug/policy_rejected_logits": 1.4380134344100952, "debug/policy_rejected_logps": -289.16998291015625, "debug/reference_chosen_logps": -270.40582275390625, "debug/reference_rejected_logps": -283.4798278808594, "debug/sppo_chosen_loss": 2399.596923828125, "debug/sppo_chosen_reward_in_loss": 1.6834776401519775, "debug/sppo_rej_reward_in_loss": -5.690112590789795, "debug/sppo_reject_loss": 2140.331787109375, "epoch": 7.028985507246377, "grad_norm": 64837.47272682902, "learning_rate": 1.2713472485768501e-08, "logits/chosen": 1.338914394378662, "logits/rejected": 1.4380134344100952, "logps/chosen": -268.7223815917969, "logps/rejected": -289.16998291015625, "loss": 4526.8016, "rewards/accuracies": 0.75, "rewards/chosen": 0.016834774985909462, "rewards/margins": 0.07373590022325516, "rewards/rejected": -0.056901127099990845, "step": 1940 }, { "debug/policy_chosen_logits": 1.1111907958984375, "debug/policy_chosen_logps": -287.67041015625, "debug/policy_rejected_logits": 1.5086851119995117, "debug/policy_rejected_logps": -317.4016418457031, "debug/reference_chosen_logps": -288.9532165527344, "debug/reference_rejected_logps": -312.1133117675781, "debug/sppo_chosen_loss": 2424.06689453125, "debug/sppo_chosen_reward_in_loss": 1.282811164855957, "debug/sppo_rej_reward_in_loss": -5.288330554962158, "debug/sppo_reject_loss": 2143.939453125, "epoch": 7.047101449275362, "grad_norm": 96792.12371735008, "learning_rate": 1.247628083491461e-08, "logits/chosen": 1.1111907958984375, "logits/rejected": 1.5086851119995117, "logps/chosen": -287.67041015625, "logps/rejected": -317.4016418457031, "loss": 4441.6687, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.01282811164855957, "rewards/margins": 0.06571141630411148, "rewards/rejected": -0.05288330465555191, "step": 1945 }, { "debug/policy_chosen_logits": 1.0796027183532715, "debug/policy_chosen_logps": -237.2846221923828, "debug/policy_rejected_logits": 1.4237951040267944, "debug/policy_rejected_logps": -279.466796875, "debug/reference_chosen_logps": -237.2868194580078, "debug/reference_rejected_logps": -273.8456726074219, "debug/sppo_chosen_loss": 2598.83251953125, "debug/sppo_chosen_reward_in_loss": 0.002201080322265625, "debug/sppo_rej_reward_in_loss": -5.621129035949707, "debug/sppo_reject_loss": 2131.841796875, "epoch": 7.065217391304348, "grad_norm": 140040.80038712657, "learning_rate": 1.2239089184060721e-08, "logits/chosen": 1.0796027183532715, "logits/rejected": 1.4237951040267944, "logps/chosen": -237.2846221923828, "logps/rejected": -279.466796875, "loss": 4454.6789, "rewards/accuracies": 0.625, "rewards/chosen": 2.2009760868968442e-05, "rewards/margins": 0.05623329430818558, "rewards/rejected": -0.056211285293102264, "step": 1950 }, { "debug/policy_chosen_logits": 1.3378798961639404, "debug/policy_chosen_logps": -261.5946350097656, "debug/policy_rejected_logits": 1.3130693435668945, "debug/policy_rejected_logps": -269.3407897949219, "debug/reference_chosen_logps": -263.124755859375, "debug/reference_rejected_logps": -263.43121337890625, "debug/sppo_chosen_loss": 2408.43115234375, "debug/sppo_chosen_reward_in_loss": 1.5301036834716797, "debug/sppo_rej_reward_in_loss": -5.909576416015625, "debug/sppo_reject_loss": 2153.42919921875, "epoch": 7.083333333333333, "grad_norm": 64027.71001000307, "learning_rate": 1.200189753320683e-08, "logits/chosen": 1.3378798961639404, "logits/rejected": 1.3130693435668945, "logps/chosen": -261.5946350097656, "logps/rejected": -269.3407897949219, "loss": 4581.9156, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.015301036648452282, "rewards/margins": 0.07439680397510529, "rewards/rejected": -0.05909576267004013, "step": 1955 }, { "debug/policy_chosen_logits": 1.3207533359527588, "debug/policy_chosen_logps": -270.1539306640625, "debug/policy_rejected_logits": 1.3596980571746826, "debug/policy_rejected_logps": -276.98675537109375, "debug/reference_chosen_logps": -272.287353515625, "debug/reference_rejected_logps": -269.27130126953125, "debug/sppo_chosen_loss": 2337.30517578125, "debug/sppo_chosen_reward_in_loss": 2.133460283279419, "debug/sppo_rej_reward_in_loss": -7.715435981750488, "debug/sppo_reject_loss": 1986.58984375, "epoch": 7.101449275362318, "grad_norm": 85306.66751106597, "learning_rate": 1.176470588235294e-08, "logits/chosen": 1.3207533359527588, "logits/rejected": 1.3596980571746826, "logps/chosen": -270.1539306640625, "logps/rejected": -276.98675537109375, "loss": 4498.3664, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.02133459970355034, "rewards/margins": 0.09848896414041519, "rewards/rejected": -0.07715436071157455, "step": 1960 }, { "debug/policy_chosen_logits": 1.2678791284561157, "debug/policy_chosen_logps": -278.25360107421875, "debug/policy_rejected_logits": 1.6893541812896729, "debug/policy_rejected_logps": -295.64044189453125, "debug/reference_chosen_logps": -280.9836730957031, "debug/reference_rejected_logps": -293.024658203125, "debug/sppo_chosen_loss": 2289.826904296875, "debug/sppo_chosen_reward_in_loss": 2.7300407886505127, "debug/sppo_rej_reward_in_loss": -2.615797281265259, "debug/sppo_reject_loss": 2354.09423828125, "epoch": 7.119565217391305, "grad_norm": 78088.39691908502, "learning_rate": 1.152751423149905e-08, "logits/chosen": 1.2678791284561157, "logits/rejected": 1.6893541812896729, "logps/chosen": -278.25360107421875, "logps/rejected": -295.64044189453125, "loss": 4552.8594, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.027300406247377396, "rewards/margins": 0.053458381444215775, "rewards/rejected": -0.02615796960890293, "step": 1965 }, { "debug/policy_chosen_logits": 0.8477060198783875, "debug/policy_chosen_logps": -239.5587921142578, "debug/policy_rejected_logits": 1.348474144935608, "debug/policy_rejected_logps": -299.83453369140625, "debug/reference_chosen_logps": -239.8334503173828, "debug/reference_rejected_logps": -297.77490234375, "debug/sppo_chosen_loss": 2563.475830078125, "debug/sppo_chosen_reward_in_loss": 0.2746526598930359, "debug/sppo_rej_reward_in_loss": -2.0596446990966797, "debug/sppo_reject_loss": 2365.90185546875, "epoch": 7.13768115942029, "grad_norm": 79963.69244326359, "learning_rate": 1.129032258064516e-08, "logits/chosen": 0.8477060198783875, "logits/rejected": 1.348474144935608, "logps/chosen": -239.5587921142578, "logps/rejected": -299.83453369140625, "loss": 4554.0625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.002746526151895523, "rewards/margins": 0.023342972621321678, "rewards/rejected": -0.020596444606781006, "step": 1970 }, { "debug/policy_chosen_logits": 1.4603519439697266, "debug/policy_chosen_logps": -259.04827880859375, "debug/policy_rejected_logits": 1.4742807149887085, "debug/policy_rejected_logps": -273.266357421875, "debug/reference_chosen_logps": -260.7829284667969, "debug/reference_rejected_logps": -267.289794921875, "debug/sppo_chosen_loss": 2367.550048828125, "debug/sppo_chosen_reward_in_loss": 1.7346599102020264, "debug/sppo_rej_reward_in_loss": -5.97654914855957, "debug/sppo_reject_loss": 2116.225830078125, "epoch": 7.155797101449275, "grad_norm": 87264.67899007488, "learning_rate": 1.105313092979127e-08, "logits/chosen": 1.4603519439697266, "logits/rejected": 1.4742807149887085, "logps/chosen": -259.04827880859375, "logps/rejected": -273.266357421875, "loss": 4621.1074, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.01734660007059574, "rewards/margins": 0.0771120935678482, "rewards/rejected": -0.05976548790931702, "step": 1975 }, { "debug/policy_chosen_logits": 1.136889100074768, "debug/policy_chosen_logps": -252.96078491210938, "debug/policy_rejected_logits": 1.465384840965271, "debug/policy_rejected_logps": -280.21942138671875, "debug/reference_chosen_logps": -254.67086791992188, "debug/reference_rejected_logps": -273.9142150878906, "debug/sppo_chosen_loss": 2382.818359375, "debug/sppo_chosen_reward_in_loss": 1.7100918292999268, "debug/sppo_rej_reward_in_loss": -6.3052496910095215, "debug/sppo_reject_loss": 2053.34228515625, "epoch": 7.173913043478261, "grad_norm": 63191.437355262286, "learning_rate": 1.081593927893738e-08, "logits/chosen": 1.136889100074768, "logits/rejected": 1.465384840965271, "logps/chosen": -252.96078491210938, "logps/rejected": -280.21942138671875, "loss": 4568.748, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.017100917175412178, "rewards/margins": 0.08015342056751251, "rewards/rejected": -0.06305249780416489, "step": 1980 }, { "debug/policy_chosen_logits": 1.368929386138916, "debug/policy_chosen_logps": -257.9754333496094, "debug/policy_rejected_logits": 1.3944368362426758, "debug/policy_rejected_logps": -273.3079528808594, "debug/reference_chosen_logps": -258.584716796875, "debug/reference_rejected_logps": -268.5025634765625, "debug/sppo_chosen_loss": 2509.17236328125, "debug/sppo_chosen_reward_in_loss": 0.609286904335022, "debug/sppo_rej_reward_in_loss": -4.805359840393066, "debug/sppo_reject_loss": 2177.56396484375, "epoch": 7.192028985507246, "grad_norm": 63051.0597032887, "learning_rate": 1.057874762808349e-08, "logits/chosen": 1.368929386138916, "logits/rejected": 1.3944368362426758, "logps/chosen": -257.9754333496094, "logps/rejected": -273.3079528808594, "loss": 4464.6734, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.006092868745326996, "rewards/margins": 0.05414646863937378, "rewards/rejected": -0.04805359989404678, "step": 1985 }, { "debug/policy_chosen_logits": 0.9088782072067261, "debug/policy_chosen_logps": -222.2588348388672, "debug/policy_rejected_logits": 1.467335820198059, "debug/policy_rejected_logps": -317.38079833984375, "debug/reference_chosen_logps": -224.1483154296875, "debug/reference_rejected_logps": -308.30426025390625, "debug/sppo_chosen_loss": 2355.121826171875, "debug/sppo_chosen_reward_in_loss": 1.8894500732421875, "debug/sppo_rej_reward_in_loss": -9.076593399047852, "debug/sppo_reject_loss": 1859.735107421875, "epoch": 7.2101449275362315, "grad_norm": 64541.92823800524, "learning_rate": 1.03415559772296e-08, "logits/chosen": 0.9088782072067261, "logits/rejected": 1.467335820198059, "logps/chosen": -222.2588348388672, "logps/rejected": -317.38079833984375, "loss": 4342.1246, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.01889449916779995, "rewards/margins": 0.10966042429208755, "rewards/rejected": -0.09076593071222305, "step": 1990 }, { "debug/policy_chosen_logits": 1.3323959112167358, "debug/policy_chosen_logps": -279.4658203125, "debug/policy_rejected_logits": 1.4809930324554443, "debug/policy_rejected_logps": -297.109130859375, "debug/reference_chosen_logps": -281.09307861328125, "debug/reference_rejected_logps": -290.8413391113281, "debug/sppo_chosen_loss": 2382.39697265625, "debug/sppo_chosen_reward_in_loss": 1.6272386312484741, "debug/sppo_rej_reward_in_loss": -6.26776123046875, "debug/sppo_reject_loss": 2039.6396484375, "epoch": 7.228260869565218, "grad_norm": 64277.43927866372, "learning_rate": 1.010436432637571e-08, "logits/chosen": 1.3323959112167358, "logits/rejected": 1.4809930324554443, "logps/chosen": -279.4658203125, "logps/rejected": -297.109130859375, "loss": 4366.957, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.01627238467335701, "rewards/margins": 0.07895000278949738, "rewards/rejected": -0.06267760694026947, "step": 1995 }, { "debug/policy_chosen_logits": 1.1518235206604004, "debug/policy_chosen_logps": -257.5375061035156, "debug/policy_rejected_logits": 1.389533281326294, "debug/policy_rejected_logps": -300.15771484375, "debug/reference_chosen_logps": -260.25921630859375, "debug/reference_rejected_logps": -291.9169921875, "debug/sppo_chosen_loss": 2255.522705078125, "debug/sppo_chosen_reward_in_loss": 2.7216956615448, "debug/sppo_rej_reward_in_loss": -8.240727424621582, "debug/sppo_reject_loss": 1902.570556640625, "epoch": 7.246376811594203, "grad_norm": 89854.31112322766, "learning_rate": 9.867172675521822e-09, "logits/chosen": 1.1518235206604004, "logits/rejected": 1.389533281326294, "logps/chosen": -257.5375061035156, "logps/rejected": -300.15771484375, "loss": 4497.943, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.02721695974469185, "rewards/margins": 0.1096242293715477, "rewards/rejected": -0.08240728080272675, "step": 2000 }, { "epoch": 7.246376811594203, "eval_debug/policy_chosen_logits": 1.3966245651245117, "eval_debug/policy_chosen_logps": -252.42852783203125, "eval_debug/policy_rejected_logits": 1.4408994913101196, "eval_debug/policy_rejected_logps": -263.333740234375, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2527.139892578125, "eval_debug/sppo_chosen_reward_in_loss": 0.48996397852897644, "eval_debug/sppo_rej_reward_in_loss": -3.675116777420044, "eval_debug/sppo_reject_loss": 2309.410400390625, "eval_logits/chosen": 1.3966245651245117, "eval_logits/rejected": 1.4408994913101196, "eval_logps/chosen": -252.42852783203125, "eval_logps/rejected": -263.333740234375, "eval_loss": 4617.740234375, "eval_rewards/accuracies": 0.6052631735801697, "eval_rewards/chosen": 0.004899639170616865, "eval_rewards/margins": 0.04165080934762955, "eval_rewards/rejected": -0.0367511622607708, "eval_runtime": 28.2793, "eval_samples_per_second": 21.217, "eval_steps_per_second": 0.672, "step": 2000 }, { "debug/policy_chosen_logits": 1.0749475955963135, "debug/policy_chosen_logps": -249.10543823242188, "debug/policy_rejected_logits": 1.4100449085235596, "debug/policy_rejected_logps": -281.3289489746094, "debug/reference_chosen_logps": -250.95486450195312, "debug/reference_rejected_logps": -275.25604248046875, "debug/sppo_chosen_loss": 2397.544189453125, "debug/sppo_chosen_reward_in_loss": 1.849412202835083, "debug/sppo_rej_reward_in_loss": -6.0729169845581055, "debug/sppo_reject_loss": 2126.12109375, "epoch": 7.2644927536231885, "grad_norm": 72766.05054739717, "learning_rate": 9.629981024667932e-09, "logits/chosen": 1.0749475955963135, "logits/rejected": 1.4100449085235596, "logps/chosen": -249.10543823242188, "logps/rejected": -281.3289489746094, "loss": 4405.5813, "rewards/accuracies": 0.75, "rewards/chosen": 0.018494119867682457, "rewards/margins": 0.07922328263521194, "rewards/rejected": -0.06072915717959404, "step": 2005 }, { "debug/policy_chosen_logits": 1.0792028903961182, "debug/policy_chosen_logps": -255.4202117919922, "debug/policy_rejected_logits": 1.410204291343689, "debug/policy_rejected_logps": -269.435546875, "debug/reference_chosen_logps": -256.257080078125, "debug/reference_rejected_logps": -267.22894287109375, "debug/sppo_chosen_loss": 2455.336181640625, "debug/sppo_chosen_reward_in_loss": 0.8368694186210632, "debug/sppo_rej_reward_in_loss": -2.2065939903259277, "debug/sppo_reject_loss": 2362.638671875, "epoch": 7.282608695652174, "grad_norm": 64339.762165804634, "learning_rate": 9.392789373814042e-09, "logits/chosen": 1.0792028903961182, "logits/rejected": 1.410204291343689, "logps/chosen": -255.4202117919922, "logps/rejected": -269.435546875, "loss": 4526.9836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00836869515478611, "rewards/margins": 0.030434634536504745, "rewards/rejected": -0.022065939381718636, "step": 2010 }, { "debug/policy_chosen_logits": 1.0717934370040894, "debug/policy_chosen_logps": -256.6336364746094, "debug/policy_rejected_logits": 1.329010248184204, "debug/policy_rejected_logps": -307.4125061035156, "debug/reference_chosen_logps": -256.99139404296875, "debug/reference_rejected_logps": -301.73614501953125, "debug/sppo_chosen_loss": 2556.468505859375, "debug/sppo_chosen_reward_in_loss": 0.3577736020088196, "debug/sppo_rej_reward_in_loss": -5.676340579986572, "debug/sppo_reject_loss": 2136.99658203125, "epoch": 7.300724637681159, "grad_norm": 82341.3930317484, "learning_rate": 9.155597722960152e-09, "logits/chosen": 1.0717934370040894, "logits/rejected": 1.329010248184204, "logps/chosen": -256.6336364746094, "logps/rejected": -307.4125061035156, "loss": 4498.8621, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.003577735275030136, "rewards/margins": 0.06034114211797714, "rewards/rejected": -0.056763410568237305, "step": 2015 }, { "debug/policy_chosen_logits": 1.3894195556640625, "debug/policy_chosen_logps": -259.94146728515625, "debug/policy_rejected_logits": 1.6163192987442017, "debug/policy_rejected_logps": -283.60491943359375, "debug/reference_chosen_logps": -261.0834045410156, "debug/reference_rejected_logps": -275.83831787109375, "debug/sppo_chosen_loss": 2467.23779296875, "debug/sppo_chosen_reward_in_loss": 1.141929268836975, "debug/sppo_rej_reward_in_loss": -7.766571044921875, "debug/sppo_reject_loss": 1964.348876953125, "epoch": 7.318840579710145, "grad_norm": 105733.44873037531, "learning_rate": 8.918406072106262e-09, "logits/chosen": 1.3894195556640625, "logits/rejected": 1.6163192987442017, "logps/chosen": -259.94146728515625, "logps/rejected": -283.60491943359375, "loss": 4656.2937, "rewards/accuracies": 0.75, "rewards/chosen": 0.011419291608035564, "rewards/margins": 0.08908500522375107, "rewards/rejected": -0.07766570895910263, "step": 2020 }, { "debug/policy_chosen_logits": 1.0844013690948486, "debug/policy_chosen_logps": -250.5321807861328, "debug/policy_rejected_logits": 1.305530309677124, "debug/policy_rejected_logps": -289.7793273925781, "debug/reference_chosen_logps": -253.705322265625, "debug/reference_rejected_logps": -283.3659973144531, "debug/sppo_chosen_loss": 2201.788330078125, "debug/sppo_chosen_reward_in_loss": 3.1731765270233154, "debug/sppo_rej_reward_in_loss": -6.413327217102051, "debug/sppo_reject_loss": 2008.0198974609375, "epoch": 7.336956521739131, "grad_norm": 76404.67083688965, "learning_rate": 8.681214421252372e-09, "logits/chosen": 1.0844013690948486, "logits/rejected": 1.305530309677124, "logps/chosen": -250.5321807861328, "logps/rejected": -289.7793273925781, "loss": 4425.3211, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.03173176571726799, "rewards/margins": 0.09586503356695175, "rewards/rejected": -0.06413327157497406, "step": 2025 }, { "debug/policy_chosen_logits": 1.3615065813064575, "debug/policy_chosen_logps": -261.59857177734375, "debug/policy_rejected_logits": 1.3156540393829346, "debug/policy_rejected_logps": -268.8084411621094, "debug/reference_chosen_logps": -263.184814453125, "debug/reference_rejected_logps": -266.7096252441406, "debug/sppo_chosen_loss": 2404.5478515625, "debug/sppo_chosen_reward_in_loss": 1.5862499475479126, "debug/sppo_rej_reward_in_loss": -2.098817825317383, "debug/sppo_reject_loss": 2409.286865234375, "epoch": 7.355072463768116, "grad_norm": 63340.425698076106, "learning_rate": 8.444022770398482e-09, "logits/chosen": 1.3615065813064575, "logits/rejected": 1.3156540393829346, "logps/chosen": -261.59857177734375, "logps/rejected": -268.8084411621094, "loss": 4626.3344, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.015862498432397842, "rewards/margins": 0.03685067594051361, "rewards/rejected": -0.02098817750811577, "step": 2030 }, { "debug/policy_chosen_logits": 1.0255852937698364, "debug/policy_chosen_logps": -248.276611328125, "debug/policy_rejected_logits": 1.5939472913742065, "debug/policy_rejected_logps": -293.2090148925781, "debug/reference_chosen_logps": -248.68417358398438, "debug/reference_rejected_logps": -287.56488037109375, "debug/sppo_chosen_loss": 2516.6484375, "debug/sppo_chosen_reward_in_loss": 0.4075614809989929, "debug/sppo_rej_reward_in_loss": -5.644109725952148, "debug/sppo_reject_loss": 2150.8486328125, "epoch": 7.3731884057971016, "grad_norm": 87049.99635233042, "learning_rate": 8.206831119544591e-09, "logits/chosen": 1.0255852937698364, "logits/rejected": 1.5939472913742065, "logps/chosen": -248.276611328125, "logps/rejected": -293.2090148925781, "loss": 4494.3172, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0040756165981292725, "rewards/margins": 0.06051671504974365, "rewards/rejected": -0.05644109100103378, "step": 2035 }, { "debug/policy_chosen_logits": 0.9478632807731628, "debug/policy_chosen_logps": -268.27734375, "debug/policy_rejected_logits": 1.4020640850067139, "debug/policy_rejected_logps": -315.4629821777344, "debug/reference_chosen_logps": -269.544921875, "debug/reference_rejected_logps": -309.2588806152344, "debug/sppo_chosen_loss": 2467.445556640625, "debug/sppo_chosen_reward_in_loss": 1.2675750255584717, "debug/sppo_rej_reward_in_loss": -6.204113960266113, "debug/sppo_reject_loss": 2100.890625, "epoch": 7.391304347826087, "grad_norm": 90885.00948660169, "learning_rate": 7.969639468690701e-09, "logits/chosen": 0.9478632807731628, "logits/rejected": 1.4020640850067139, "logps/chosen": -268.27734375, "logps/rejected": -315.4629821777344, "loss": 4475.4148, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.012675751931965351, "rewards/margins": 0.07471688836812973, "rewards/rejected": -0.06204112619161606, "step": 2040 }, { "debug/policy_chosen_logits": 1.4638521671295166, "debug/policy_chosen_logps": -274.8016052246094, "debug/policy_rejected_logits": 1.5273889303207397, "debug/policy_rejected_logps": -305.61187744140625, "debug/reference_chosen_logps": -276.53521728515625, "debug/reference_rejected_logps": -299.4794921875, "debug/sppo_chosen_loss": 2374.765625, "debug/sppo_chosen_reward_in_loss": 1.7335855960845947, "debug/sppo_rej_reward_in_loss": -6.132406711578369, "debug/sppo_reject_loss": 2106.52197265625, "epoch": 7.409420289855072, "grad_norm": 63488.13057303716, "learning_rate": 7.732447817836813e-09, "logits/chosen": 1.4638521671295166, "logits/rejected": 1.5273889303207397, "logps/chosen": -274.8016052246094, "logps/rejected": -305.61187744140625, "loss": 4393.7363, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.017335856333374977, "rewards/margins": 0.07865992188453674, "rewards/rejected": -0.061324067413806915, "step": 2045 }, { "debug/policy_chosen_logits": 1.335143804550171, "debug/policy_chosen_logps": -257.5989074707031, "debug/policy_rejected_logits": 1.4291226863861084, "debug/policy_rejected_logps": -296.6236572265625, "debug/reference_chosen_logps": -261.3354797363281, "debug/reference_rejected_logps": -290.55609130859375, "debug/sppo_chosen_loss": 2164.67724609375, "debug/sppo_chosen_reward_in_loss": 3.7365944385528564, "debug/sppo_rej_reward_in_loss": -6.0675740242004395, "debug/sppo_reject_loss": 2060.4326171875, "epoch": 7.427536231884058, "grad_norm": 79648.0962837916, "learning_rate": 7.495256166982921e-09, "logits/chosen": 1.335143804550171, "logits/rejected": 1.4291226863861084, "logps/chosen": -257.5989074707031, "logps/rejected": -296.6236572265625, "loss": 4358.9086, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03736594319343567, "rewards/margins": 0.09804168343544006, "rewards/rejected": -0.060675740242004395, "step": 2050 }, { "debug/policy_chosen_logits": 1.1475436687469482, "debug/policy_chosen_logps": -266.94866943359375, "debug/policy_rejected_logits": 1.1737476587295532, "debug/policy_rejected_logps": -286.8493347167969, "debug/reference_chosen_logps": -266.78277587890625, "debug/reference_rejected_logps": -282.5542297363281, "debug/sppo_chosen_loss": 2621.053466796875, "debug/sppo_chosen_reward_in_loss": -0.16587868332862854, "debug/sppo_rej_reward_in_loss": -4.295111656188965, "debug/sppo_reject_loss": 2163.965576171875, "epoch": 7.445652173913043, "grad_norm": 74597.15653963955, "learning_rate": 7.258064516129032e-09, "logits/chosen": 1.1475436687469482, "logits/rejected": 1.1737476587295532, "logps/chosen": -266.94866943359375, "logps/rejected": -286.8493347167969, "loss": 4526.5758, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0016587876016274095, "rewards/margins": 0.041292328387498856, "rewards/rejected": -0.04295111820101738, "step": 2055 }, { "debug/policy_chosen_logits": 1.432689905166626, "debug/policy_chosen_logps": -268.48883056640625, "debug/policy_rejected_logits": 1.7156181335449219, "debug/policy_rejected_logps": -310.301513671875, "debug/reference_chosen_logps": -269.88397216796875, "debug/reference_rejected_logps": -305.17596435546875, "debug/sppo_chosen_loss": 2442.27685546875, "debug/sppo_chosen_reward_in_loss": 1.3951797485351562, "debug/sppo_rej_reward_in_loss": -5.125535011291504, "debug/sppo_reject_loss": 2135.548828125, "epoch": 7.463768115942029, "grad_norm": 65899.09678136902, "learning_rate": 7.020872865275142e-09, "logits/chosen": 1.432689905166626, "logits/rejected": 1.7156181335449219, "logps/chosen": -268.48883056640625, "logps/rejected": -310.301513671875, "loss": 4601.1227, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.013951798900961876, "rewards/margins": 0.06520714610815048, "rewards/rejected": -0.05125534534454346, "step": 2060 }, { "debug/policy_chosen_logits": 1.3086445331573486, "debug/policy_chosen_logps": -258.2004089355469, "debug/policy_rejected_logits": 1.5987188816070557, "debug/policy_rejected_logps": -293.90411376953125, "debug/reference_chosen_logps": -261.1592102050781, "debug/reference_rejected_logps": -289.24749755859375, "debug/sppo_chosen_loss": 2238.3525390625, "debug/sppo_chosen_reward_in_loss": 2.958772659301758, "debug/sppo_rej_reward_in_loss": -4.656580924987793, "debug/sppo_reject_loss": 2168.284912109375, "epoch": 7.481884057971015, "grad_norm": 75699.77778337392, "learning_rate": 6.783681214421253e-09, "logits/chosen": 1.3086445331573486, "logits/rejected": 1.5987188816070557, "logps/chosen": -258.2004089355469, "logps/rejected": -293.90411376953125, "loss": 4531.5414, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.029587727040052414, "rewards/margins": 0.07615353167057037, "rewards/rejected": -0.046565812081098557, "step": 2065 }, { "debug/policy_chosen_logits": 1.2340729236602783, "debug/policy_chosen_logps": -253.41781616210938, "debug/policy_rejected_logits": 1.6521332263946533, "debug/policy_rejected_logps": -311.3460998535156, "debug/reference_chosen_logps": -255.8767852783203, "debug/reference_rejected_logps": -305.1612243652344, "debug/sppo_chosen_loss": 2279.5478515625, "debug/sppo_chosen_reward_in_loss": 2.4589591026306152, "debug/sppo_rej_reward_in_loss": -6.184895038604736, "debug/sppo_reject_loss": 2093.01025390625, "epoch": 7.5, "grad_norm": 64848.3657188948, "learning_rate": 6.546489563567362e-09, "logits/chosen": 1.2340729236602783, "logits/rejected": 1.6521332263946533, "logps/chosen": -253.41781616210938, "logps/rejected": -311.3460998535156, "loss": 4523.8285, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02458958886563778, "rewards/margins": 0.08643853664398193, "rewards/rejected": -0.061848945915699005, "step": 2070 }, { "debug/policy_chosen_logits": 1.3297024965286255, "debug/policy_chosen_logps": -271.7605285644531, "debug/policy_rejected_logits": 1.6037626266479492, "debug/policy_rejected_logps": -312.74176025390625, "debug/reference_chosen_logps": -274.61627197265625, "debug/reference_rejected_logps": -309.5181579589844, "debug/sppo_chosen_loss": 2286.35302734375, "debug/sppo_chosen_reward_in_loss": 2.8557345867156982, "debug/sppo_rej_reward_in_loss": -3.2235770225524902, "debug/sppo_reject_loss": 2268.72802734375, "epoch": 7.518115942028985, "grad_norm": 105889.03415785848, "learning_rate": 6.309297912713473e-09, "logits/chosen": 1.3297024965286255, "logits/rejected": 1.6037626266479492, "logps/chosen": -271.7605285644531, "logps/rejected": -312.74176025390625, "loss": 4434.8969, "rewards/accuracies": 0.75, "rewards/chosen": 0.028557348996400833, "rewards/margins": 0.060793112963438034, "rewards/rejected": -0.0322357714176178, "step": 2075 }, { "debug/policy_chosen_logits": 1.1554157733917236, "debug/policy_chosen_logps": -275.0699768066406, "debug/policy_rejected_logits": 1.0854908227920532, "debug/policy_rejected_logps": -284.55914306640625, "debug/reference_chosen_logps": -277.41717529296875, "debug/reference_rejected_logps": -277.25054931640625, "debug/sppo_chosen_loss": 2303.4892578125, "debug/sppo_chosen_reward_in_loss": 2.3471920490264893, "debug/sppo_rej_reward_in_loss": -7.308547019958496, "debug/sppo_reject_loss": 2021.475830078125, "epoch": 7.536231884057971, "grad_norm": 61735.54943235406, "learning_rate": 6.0721062618595826e-09, "logits/chosen": 1.1554157733917236, "logits/rejected": 1.0854908227920532, "logps/chosen": -275.0699768066406, "logps/rejected": -284.55914306640625, "loss": 4594.459, "rewards/accuracies": 0.75, "rewards/chosen": 0.02347191795706749, "rewards/margins": 0.09655739367008209, "rewards/rejected": -0.0730854719877243, "step": 2080 }, { "debug/policy_chosen_logits": 0.9209179878234863, "debug/policy_chosen_logps": -248.91104125976562, "debug/policy_rejected_logits": 1.4678943157196045, "debug/policy_rejected_logps": -300.1827697753906, "debug/reference_chosen_logps": -247.97744750976562, "debug/reference_rejected_logps": -295.57763671875, "debug/sppo_chosen_loss": 2683.385986328125, "debug/sppo_chosen_reward_in_loss": -0.9335857629776001, "debug/sppo_rej_reward_in_loss": -4.605147838592529, "debug/sppo_reject_loss": 2200.61083984375, "epoch": 7.554347826086957, "grad_norm": 62275.842318862065, "learning_rate": 5.8349146110056925e-09, "logits/chosen": 0.9209179878234863, "logits/rejected": 1.4678943157196045, "logps/chosen": -248.91104125976562, "logps/rejected": -300.1827697753906, "loss": 4575.0953, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00933585874736309, "rewards/margins": 0.03671561926603317, "rewards/rejected": -0.04605147987604141, "step": 2085 }, { "debug/policy_chosen_logits": 0.9393343925476074, "debug/policy_chosen_logps": -230.3099822998047, "debug/policy_rejected_logits": 1.4766675233840942, "debug/policy_rejected_logps": -288.49560546875, "debug/reference_chosen_logps": -233.8635711669922, "debug/reference_rejected_logps": -281.48101806640625, "debug/sppo_chosen_loss": 2180.860107421875, "debug/sppo_chosen_reward_in_loss": 3.5536065101623535, "debug/sppo_rej_reward_in_loss": -7.0145416259765625, "debug/sppo_reject_loss": 2004.818115234375, "epoch": 7.572463768115942, "grad_norm": 72369.83918600001, "learning_rate": 5.5977229601518025e-09, "logits/chosen": 0.9393343925476074, "logits/rejected": 1.4766675233840942, "logps/chosen": -230.3099822998047, "logps/rejected": -288.49560546875, "loss": 4511.5223, "rewards/accuracies": 0.75, "rewards/chosen": 0.03553606942296028, "rewards/margins": 0.10568146407604218, "rewards/rejected": -0.07014540582895279, "step": 2090 }, { "debug/policy_chosen_logits": 0.8067053556442261, "debug/policy_chosen_logps": -253.1298065185547, "debug/policy_rejected_logits": 1.2526203393936157, "debug/policy_rejected_logps": -313.2865295410156, "debug/reference_chosen_logps": -256.2008972167969, "debug/reference_rejected_logps": -307.97308349609375, "debug/sppo_chosen_loss": 2236.402099609375, "debug/sppo_chosen_reward_in_loss": 3.0711255073547363, "debug/sppo_rej_reward_in_loss": -5.31342077255249, "debug/sppo_reject_loss": 2159.132080078125, "epoch": 7.590579710144928, "grad_norm": 80901.5373464256, "learning_rate": 5.360531309297912e-09, "logits/chosen": 0.8067053556442261, "logits/rejected": 1.2526203393936157, "logps/chosen": -253.1298065185547, "logps/rejected": -313.2865295410156, "loss": 4472.0723, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.030711254104971886, "rewards/margins": 0.08384546637535095, "rewards/rejected": -0.05313421040773392, "step": 2095 }, { "debug/policy_chosen_logits": 1.2958418130874634, "debug/policy_chosen_logps": -273.1445617675781, "debug/policy_rejected_logits": 1.3935325145721436, "debug/policy_rejected_logps": -277.93450927734375, "debug/reference_chosen_logps": -273.92059326171875, "debug/reference_rejected_logps": -270.35479736328125, "debug/sppo_chosen_loss": 2492.843017578125, "debug/sppo_chosen_reward_in_loss": 0.7760518789291382, "debug/sppo_rej_reward_in_loss": -7.579698085784912, "debug/sppo_reject_loss": 1998.138427734375, "epoch": 7.608695652173913, "grad_norm": 76121.71525082327, "learning_rate": 5.123339658444022e-09, "logits/chosen": 1.2958418130874634, "logits/rejected": 1.3935325145721436, "logps/chosen": -273.1445617675781, "logps/rejected": -277.93450927734375, "loss": 4470.4805, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.007760518696159124, "rewards/margins": 0.08355750143527985, "rewards/rejected": -0.07579698413610458, "step": 2100 }, { "epoch": 7.608695652173913, "eval_debug/policy_chosen_logits": 1.3983112573623657, "eval_debug/policy_chosen_logps": -252.08984375, "eval_debug/policy_rejected_logits": 1.4419258832931519, "eval_debug/policy_rejected_logps": -263.379150390625, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2493.60986328125, "eval_debug/sppo_chosen_reward_in_loss": 0.8286213278770447, "eval_debug/sppo_rej_reward_in_loss": -3.720534563064575, "eval_debug/sppo_reject_loss": 2304.22412109375, "eval_logits/chosen": 1.3983112573623657, "eval_logits/rejected": 1.4419258832931519, "eval_logps/chosen": -252.08984375, "eval_logps/rejected": -263.379150390625, "eval_loss": 4616.267578125, "eval_rewards/accuracies": 0.6052631735801697, "eval_rewards/chosen": 0.008286213502287865, "eval_rewards/margins": 0.04549156129360199, "eval_rewards/rejected": -0.037205345928668976, "eval_runtime": 28.3508, "eval_samples_per_second": 21.163, "eval_steps_per_second": 0.67, "step": 2100 }, { "debug/policy_chosen_logits": 0.6783396601676941, "debug/policy_chosen_logps": -230.4306640625, "debug/policy_rejected_logits": 0.9214555025100708, "debug/policy_rejected_logps": -246.3451690673828, "debug/reference_chosen_logps": -233.5577392578125, "debug/reference_rejected_logps": -242.1542510986328, "debug/sppo_chosen_loss": 2210.0966796875, "debug/sppo_chosen_reward_in_loss": 3.1270785331726074, "debug/sppo_rej_reward_in_loss": -4.190907955169678, "debug/sppo_reject_loss": 2214.10107421875, "epoch": 7.6268115942028984, "grad_norm": 70183.0935335667, "learning_rate": 4.886148007590132e-09, "logits/chosen": 0.6783396601676941, "logits/rejected": 0.9214555025100708, "logps/chosen": -230.4306640625, "logps/rejected": -246.3451690673828, "loss": 4342.3008, "rewards/accuracies": 0.75, "rewards/chosen": 0.03127078339457512, "rewards/margins": 0.07317986339330673, "rewards/rejected": -0.041909076273441315, "step": 2105 }, { "debug/policy_chosen_logits": 1.228139877319336, "debug/policy_chosen_logps": -256.35308837890625, "debug/policy_rejected_logits": 1.3803236484527588, "debug/policy_rejected_logps": -275.30694580078125, "debug/reference_chosen_logps": -259.5997619628906, "debug/reference_rejected_logps": -270.292724609375, "debug/sppo_chosen_loss": 2197.40576171875, "debug/sppo_chosen_reward_in_loss": 3.2466819286346436, "debug/sppo_rej_reward_in_loss": -5.014226913452148, "debug/sppo_reject_loss": 2176.42333984375, "epoch": 7.644927536231884, "grad_norm": 71937.36342126975, "learning_rate": 4.648956356736242e-09, "logits/chosen": 1.228139877319336, "logits/rejected": 1.3803236484527588, "logps/chosen": -256.35308837890625, "logps/rejected": -275.30694580078125, "loss": 4495.1066, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.032466821372509, "rewards/margins": 0.08260907977819443, "rewards/rejected": -0.05014226958155632, "step": 2110 }, { "debug/policy_chosen_logits": 1.0710529088974, "debug/policy_chosen_logps": -243.12246704101562, "debug/policy_rejected_logits": 1.17303466796875, "debug/policy_rejected_logps": -274.3731994628906, "debug/reference_chosen_logps": -243.8218231201172, "debug/reference_rejected_logps": -267.530517578125, "debug/sppo_chosen_loss": 2519.287841796875, "debug/sppo_chosen_reward_in_loss": 0.6993608474731445, "debug/sppo_rej_reward_in_loss": -6.842679023742676, "debug/sppo_reject_loss": 2054.475341796875, "epoch": 7.663043478260869, "grad_norm": 117518.94344049974, "learning_rate": 4.411764705882353e-09, "logits/chosen": 1.0710529088974, "logits/rejected": 1.17303466796875, "logps/chosen": -243.12246704101562, "logps/rejected": -274.3731994628906, "loss": 4516.1504, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.006993608083575964, "rewards/margins": 0.07542039453983307, "rewards/rejected": -0.06842678785324097, "step": 2115 }, { "debug/policy_chosen_logits": 0.9998262524604797, "debug/policy_chosen_logps": -227.8847198486328, "debug/policy_rejected_logits": 1.3254389762878418, "debug/policy_rejected_logps": -302.9129943847656, "debug/reference_chosen_logps": -229.51358032226562, "debug/reference_rejected_logps": -298.8193359375, "debug/sppo_chosen_loss": 2396.65576171875, "debug/sppo_chosen_reward_in_loss": 1.6288474798202515, "debug/sppo_rej_reward_in_loss": -4.093641757965088, "debug/sppo_reject_loss": 2208.817138671875, "epoch": 7.681159420289855, "grad_norm": 83363.54446780756, "learning_rate": 4.174573055028463e-09, "logits/chosen": 0.9998262524604797, "logits/rejected": 1.3254389762878418, "logps/chosen": -227.8847198486328, "logps/rejected": -302.9129943847656, "loss": 4601.225, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.016288474202156067, "rewards/margins": 0.05722489207983017, "rewards/rejected": -0.0409364178776741, "step": 2120 }, { "debug/policy_chosen_logits": 0.9560735821723938, "debug/policy_chosen_logps": -225.97933959960938, "debug/policy_rejected_logits": 1.352203130722046, "debug/policy_rejected_logps": -290.9984436035156, "debug/reference_chosen_logps": -229.05801391601562, "debug/reference_rejected_logps": -284.1490478515625, "debug/sppo_chosen_loss": 2221.994384765625, "debug/sppo_chosen_reward_in_loss": 3.078660488128662, "debug/sppo_rej_reward_in_loss": -6.849385738372803, "debug/sppo_reject_loss": 2039.010986328125, "epoch": 7.699275362318841, "grad_norm": 112451.22973502718, "learning_rate": 3.937381404174573e-09, "logits/chosen": 0.9560735821723938, "logits/rejected": 1.352203130722046, "logps/chosen": -225.97933959960938, "logps/rejected": -290.9984436035156, "loss": 4477.0047, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.030786603689193726, "rewards/margins": 0.0992804542183876, "rewards/rejected": -0.06849385797977448, "step": 2125 }, { "debug/policy_chosen_logits": 1.3055236339569092, "debug/policy_chosen_logps": -252.66708374023438, "debug/policy_rejected_logits": 1.6392425298690796, "debug/policy_rejected_logps": -284.4870300292969, "debug/reference_chosen_logps": -253.86953735351562, "debug/reference_rejected_logps": -282.350830078125, "debug/sppo_chosen_loss": 2434.80908203125, "debug/sppo_chosen_reward_in_loss": 1.2024524211883545, "debug/sppo_rej_reward_in_loss": -2.1362316608428955, "debug/sppo_reject_loss": 2353.478515625, "epoch": 7.717391304347826, "grad_norm": 67123.82022122883, "learning_rate": 3.700189753320683e-09, "logits/chosen": 1.3055236339569092, "logits/rejected": 1.6392425298690796, "logps/chosen": -252.66708374023438, "logps/rejected": -284.4870300292969, "loss": 4591.7977, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.012024523690342903, "rewards/margins": 0.03338683769106865, "rewards/rejected": -0.021362315863370895, "step": 2130 }, { "debug/policy_chosen_logits": 1.1301448345184326, "debug/policy_chosen_logps": -258.1528625488281, "debug/policy_rejected_logits": 1.5620170831680298, "debug/policy_rejected_logps": -300.11236572265625, "debug/reference_chosen_logps": -259.0867004394531, "debug/reference_rejected_logps": -294.621826171875, "debug/sppo_chosen_loss": 2466.033203125, "debug/sppo_chosen_reward_in_loss": 0.9338119626045227, "debug/sppo_rej_reward_in_loss": -5.490514755249023, "debug/sppo_reject_loss": 2135.0546875, "epoch": 7.7355072463768115, "grad_norm": 63073.51713892027, "learning_rate": 3.462998102466793e-09, "logits/chosen": 1.1301448345184326, "logits/rejected": 1.5620170831680298, "logps/chosen": -258.1528625488281, "logps/rejected": -300.11236572265625, "loss": 4540.4039, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.009338117204606533, "rewards/margins": 0.06424326449632645, "rewards/rejected": -0.054905153810977936, "step": 2135 }, { "debug/policy_chosen_logits": 1.2218091487884521, "debug/policy_chosen_logps": -267.868896484375, "debug/policy_rejected_logits": 1.6044843196868896, "debug/policy_rejected_logps": -285.9657287597656, "debug/reference_chosen_logps": -269.52984619140625, "debug/reference_rejected_logps": -281.32818603515625, "debug/sppo_chosen_loss": 2392.14404296875, "debug/sppo_chosen_reward_in_loss": 1.6609121561050415, "debug/sppo_rej_reward_in_loss": -4.6375508308410645, "debug/sppo_reject_loss": 2162.18115234375, "epoch": 7.753623188405797, "grad_norm": 136230.97863793193, "learning_rate": 3.225806451612903e-09, "logits/chosen": 1.2218091487884521, "logits/rejected": 1.6044843196868896, "logps/chosen": -267.868896484375, "logps/rejected": -285.9657287597656, "loss": 4630.2898, "rewards/accuracies": 0.75, "rewards/chosen": 0.01660912111401558, "rewards/margins": 0.06298463046550751, "rewards/rejected": -0.04637550190091133, "step": 2140 }, { "debug/policy_chosen_logits": 1.3167120218276978, "debug/policy_chosen_logps": -262.1035461425781, "debug/policy_rejected_logits": 1.337059497833252, "debug/policy_rejected_logps": -283.91351318359375, "debug/reference_chosen_logps": -263.3422546386719, "debug/reference_rejected_logps": -277.1472473144531, "debug/sppo_chosen_loss": 2458.8310546875, "debug/sppo_chosen_reward_in_loss": 1.2387104034423828, "debug/sppo_rej_reward_in_loss": -6.766273498535156, "debug/sppo_reject_loss": 2023.1923828125, "epoch": 7.771739130434782, "grad_norm": 62984.067536547205, "learning_rate": 2.988614800759013e-09, "logits/chosen": 1.3167120218276978, "logits/rejected": 1.337059497833252, "logps/chosen": -262.1035461425781, "logps/rejected": -283.91351318359375, "loss": 4559.8297, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.012387105263769627, "rewards/margins": 0.08004983514547348, "rewards/rejected": -0.06766273826360703, "step": 2145 }, { "debug/policy_chosen_logits": 1.004941701889038, "debug/policy_chosen_logps": -222.6114044189453, "debug/policy_rejected_logits": 1.1150052547454834, "debug/policy_rejected_logps": -247.7772674560547, "debug/reference_chosen_logps": -225.07754516601562, "debug/reference_rejected_logps": -243.9304962158203, "debug/sppo_chosen_loss": 2317.02294921875, "debug/sppo_chosen_reward_in_loss": 2.466127872467041, "debug/sppo_rej_reward_in_loss": -3.8467841148376465, "debug/sppo_reject_loss": 2263.35498046875, "epoch": 7.789855072463768, "grad_norm": 77701.0969837637, "learning_rate": 2.7514231499051234e-09, "logits/chosen": 1.004941701889038, "logits/rejected": 1.1150052547454834, "logps/chosen": -222.6114044189453, "logps/rejected": -247.7772674560547, "loss": 4422.4164, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02466127835214138, "rewards/margins": 0.06312911212444305, "rewards/rejected": -0.03846783563494682, "step": 2150 }, { "debug/policy_chosen_logits": 1.0830397605895996, "debug/policy_chosen_logps": -241.35513305664062, "debug/policy_rejected_logits": 1.2067186832427979, "debug/policy_rejected_logps": -294.6799011230469, "debug/reference_chosen_logps": -242.5572967529297, "debug/reference_rejected_logps": -286.7196960449219, "debug/sppo_chosen_loss": 2459.772216796875, "debug/sppo_chosen_reward_in_loss": 1.2021602392196655, "debug/sppo_rej_reward_in_loss": -7.9601945877075195, "debug/sppo_reject_loss": 1935.1702880859375, "epoch": 7.807971014492754, "grad_norm": 60498.0079605754, "learning_rate": 2.5142314990512333e-09, "logits/chosen": 1.0830397605895996, "logits/rejected": 1.2067186832427979, "logps/chosen": -241.35513305664062, "logps/rejected": -294.6799011230469, "loss": 4598.3992, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.012021603062748909, "rewards/margins": 0.09162354469299316, "rewards/rejected": -0.0796019434928894, "step": 2155 }, { "debug/policy_chosen_logits": 1.1517736911773682, "debug/policy_chosen_logps": -230.2449493408203, "debug/policy_rejected_logits": 1.9268848896026611, "debug/policy_rejected_logps": -306.0497131347656, "debug/reference_chosen_logps": -232.04678344726562, "debug/reference_rejected_logps": -300.45745849609375, "debug/sppo_chosen_loss": 2403.190673828125, "debug/sppo_chosen_reward_in_loss": 1.8018343448638916, "debug/sppo_rej_reward_in_loss": -5.592249870300293, "debug/sppo_reject_loss": 2113.3349609375, "epoch": 7.826086956521739, "grad_norm": 61428.367827119604, "learning_rate": 2.2770398481973433e-09, "logits/chosen": 1.1517736911773682, "logits/rejected": 1.9268848896026611, "logps/chosen": -230.2449493408203, "logps/rejected": -306.0497131347656, "loss": 4600.55, "rewards/accuracies": 0.75, "rewards/chosen": 0.018018342554569244, "rewards/margins": 0.07394083589315414, "rewards/rejected": -0.0559224970638752, "step": 2160 }, { "debug/policy_chosen_logits": 1.241207480430603, "debug/policy_chosen_logps": -268.712890625, "debug/policy_rejected_logits": 1.204115867614746, "debug/policy_rejected_logps": -266.22076416015625, "debug/reference_chosen_logps": -269.70306396484375, "debug/reference_rejected_logps": -261.1277160644531, "debug/sppo_chosen_loss": 2464.787841796875, "debug/sppo_chosen_reward_in_loss": 0.9901620745658875, "debug/sppo_rej_reward_in_loss": -5.093061447143555, "debug/sppo_reject_loss": 2130.121337890625, "epoch": 7.844202898550725, "grad_norm": 93437.66916199154, "learning_rate": 2.039848197343453e-09, "logits/chosen": 1.241207480430603, "logits/rejected": 1.204115867614746, "logps/chosen": -268.712890625, "logps/rejected": -266.22076416015625, "loss": 4472.4664, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.009901619516313076, "rewards/margins": 0.06083223968744278, "rewards/rejected": -0.05093061178922653, "step": 2165 }, { "debug/policy_chosen_logits": 1.0723090171813965, "debug/policy_chosen_logps": -255.5542449951172, "debug/policy_rejected_logits": 1.266048789024353, "debug/policy_rejected_logps": -285.4629211425781, "debug/reference_chosen_logps": -258.68817138671875, "debug/reference_rejected_logps": -281.90374755859375, "debug/sppo_chosen_loss": 2211.93017578125, "debug/sppo_chosen_reward_in_loss": 3.133908748626709, "debug/sppo_rej_reward_in_loss": -3.559196949005127, "debug/sppo_reject_loss": 2285.771484375, "epoch": 7.86231884057971, "grad_norm": 66053.93095648492, "learning_rate": 1.8026565464895636e-09, "logits/chosen": 1.0723090171813965, "logits/rejected": 1.266048789024353, "logps/chosen": -255.5542449951172, "logps/rejected": -285.4629211425781, "loss": 4550.2055, "rewards/accuracies": 0.75, "rewards/chosen": 0.03133908659219742, "rewards/margins": 0.06693105399608612, "rewards/rejected": -0.0355919674038887, "step": 2170 }, { "debug/policy_chosen_logits": 1.362290620803833, "debug/policy_chosen_logps": -259.56341552734375, "debug/policy_rejected_logits": 1.4862273931503296, "debug/policy_rejected_logps": -282.407958984375, "debug/reference_chosen_logps": -259.996826171875, "debug/reference_rejected_logps": -276.11199951171875, "debug/sppo_chosen_loss": 2526.16845703125, "debug/sppo_chosen_reward_in_loss": 0.43338316679000854, "debug/sppo_rej_reward_in_loss": -6.295929908752441, "debug/sppo_reject_loss": 2039.265380859375, "epoch": 7.880434782608695, "grad_norm": 86089.74622798496, "learning_rate": 1.5654648956356737e-09, "logits/chosen": 1.362290620803833, "logits/rejected": 1.4862273931503296, "logps/chosen": -259.56341552734375, "logps/rejected": -282.407958984375, "loss": 4501.4195, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.004333832301199436, "rewards/margins": 0.06729312241077423, "rewards/rejected": -0.06295929849147797, "step": 2175 }, { "debug/policy_chosen_logits": 1.0118058919906616, "debug/policy_chosen_logps": -261.07781982421875, "debug/policy_rejected_logits": 1.2698683738708496, "debug/policy_rejected_logps": -291.1365661621094, "debug/reference_chosen_logps": -263.2150573730469, "debug/reference_rejected_logps": -288.1213684082031, "debug/sppo_chosen_loss": 2312.064697265625, "debug/sppo_chosen_reward_in_loss": 2.1372222900390625, "debug/sppo_rej_reward_in_loss": -3.0152218341827393, "debug/sppo_reject_loss": 2299.5263671875, "epoch": 7.898550724637682, "grad_norm": 78164.01586528082, "learning_rate": 1.3282732447817836e-09, "logits/chosen": 1.0118058919906616, "logits/rejected": 1.2698683738708496, "logps/chosen": -261.07781982421875, "logps/rejected": -291.1365661621094, "loss": 4526.3301, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.021372223272919655, "rewards/margins": 0.051524437963962555, "rewards/rejected": -0.03015221655368805, "step": 2180 }, { "debug/policy_chosen_logits": 1.3515137434005737, "debug/policy_chosen_logps": -265.62005615234375, "debug/policy_rejected_logits": 1.540601372718811, "debug/policy_rejected_logps": -304.4212951660156, "debug/reference_chosen_logps": -267.88934326171875, "debug/reference_rejected_logps": -299.74700927734375, "debug/sppo_chosen_loss": 2309.009521484375, "debug/sppo_chosen_reward_in_loss": 2.269253730773926, "debug/sppo_rej_reward_in_loss": -4.674244403839111, "debug/sppo_reject_loss": 2174.17138671875, "epoch": 7.916666666666667, "grad_norm": 74488.94388393678, "learning_rate": 1.0910815939278936e-09, "logits/chosen": 1.3515137434005737, "logits/rejected": 1.540601372718811, "logps/chosen": -265.62005615234375, "logps/rejected": -304.4212951660156, "loss": 4428.6488, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.022692536935210228, "rewards/margins": 0.06943497806787491, "rewards/rejected": -0.04674243927001953, "step": 2185 }, { "debug/policy_chosen_logits": 0.7716978192329407, "debug/policy_chosen_logps": -227.5405731201172, "debug/policy_rejected_logits": 1.2276690006256104, "debug/policy_rejected_logps": -317.48175048828125, "debug/reference_chosen_logps": -231.3727569580078, "debug/reference_rejected_logps": -307.95062255859375, "debug/sppo_chosen_loss": 2142.927490234375, "debug/sppo_chosen_reward_in_loss": 3.832199811935425, "debug/sppo_rej_reward_in_loss": -9.531160354614258, "debug/sppo_reject_loss": 1802.24609375, "epoch": 7.934782608695652, "grad_norm": 108852.12127678565, "learning_rate": 8.538899430740038e-10, "logits/chosen": 0.7716978192329407, "logits/rejected": 1.2276690006256104, "logps/chosen": -227.5405731201172, "logps/rejected": -317.48175048828125, "loss": 4501.9016, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.038321997970342636, "rewards/margins": 0.1336335986852646, "rewards/rejected": -0.09531159698963165, "step": 2190 }, { "debug/policy_chosen_logits": 1.2999484539031982, "debug/policy_chosen_logps": -283.1374816894531, "debug/policy_rejected_logits": 1.4268121719360352, "debug/policy_rejected_logps": -286.1319274902344, "debug/reference_chosen_logps": -283.5812072753906, "debug/reference_rejected_logps": -284.0365905761719, "debug/sppo_chosen_loss": 2520.8671875, "debug/sppo_chosen_reward_in_loss": 0.4437088072299957, "debug/sppo_rej_reward_in_loss": -2.0953316688537598, "debug/sppo_reject_loss": 2361.266357421875, "epoch": 7.952898550724638, "grad_norm": 105873.89855345472, "learning_rate": 6.166982922201139e-10, "logits/chosen": 1.2999484539031982, "logits/rejected": 1.4268121719360352, "logps/chosen": -283.1374816894531, "logps/rejected": -286.1319274902344, "loss": 4567.9129, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.004437088035047054, "rewards/margins": 0.02539040520787239, "rewards/rejected": -0.020953314378857613, "step": 2195 }, { "debug/policy_chosen_logits": 1.147592544555664, "debug/policy_chosen_logps": -263.82501220703125, "debug/policy_rejected_logits": 1.516588807106018, "debug/policy_rejected_logps": -314.99664306640625, "debug/reference_chosen_logps": -266.16912841796875, "debug/reference_rejected_logps": -312.854248046875, "debug/sppo_chosen_loss": 2294.52685546875, "debug/sppo_chosen_reward_in_loss": 2.3441309928894043, "debug/sppo_rej_reward_in_loss": -2.142390489578247, "debug/sppo_reject_loss": 2377.422607421875, "epoch": 7.971014492753623, "grad_norm": 109606.21092770516, "learning_rate": 3.795066413662239e-10, "logits/chosen": 1.147592544555664, "logits/rejected": 1.516588807106018, "logps/chosen": -263.82501220703125, "logps/rejected": -314.99664306640625, "loss": 4514.8016, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.023441310971975327, "rewards/margins": 0.044865213334560394, "rewards/rejected": -0.021423906087875366, "step": 2200 }, { "epoch": 7.971014492753623, "eval_debug/policy_chosen_logits": 1.3958569765090942, "eval_debug/policy_chosen_logps": -252.4095916748047, "eval_debug/policy_rejected_logits": 1.4404065608978271, "eval_debug/policy_rejected_logps": -263.3607482910156, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2526.56201171875, "eval_debug/sppo_chosen_reward_in_loss": 0.5088815689086914, "eval_debug/sppo_rej_reward_in_loss": -3.702131509780884, "eval_debug/sppo_reject_loss": 2309.32421875, "eval_logits/chosen": 1.3958569765090942, "eval_logits/rejected": 1.4404065608978271, "eval_logps/chosen": -252.4095916748047, "eval_logps/rejected": -263.3607482910156, "eval_loss": 4624.10107421875, "eval_rewards/accuracies": 0.5789473652839661, "eval_rewards/chosen": 0.0050888159312307835, "eval_rewards/margins": 0.0421101339161396, "eval_rewards/rejected": -0.03702131658792496, "eval_runtime": 28.3108, "eval_samples_per_second": 21.193, "eval_steps_per_second": 0.671, "step": 2200 }, { "debug/policy_chosen_logits": 1.3122127056121826, "debug/policy_chosen_logps": -250.834228515625, "debug/policy_rejected_logits": 1.5941417217254639, "debug/policy_rejected_logps": -304.87213134765625, "debug/reference_chosen_logps": -252.6299285888672, "debug/reference_rejected_logps": -301.18548583984375, "debug/sppo_chosen_loss": 2375.86279296875, "debug/sppo_chosen_reward_in_loss": 1.7957038879394531, "debug/sppo_rej_reward_in_loss": -3.6865832805633545, "debug/sppo_reject_loss": 2240.017333984375, "epoch": 7.989130434782608, "grad_norm": 68461.86580867655, "learning_rate": 1.4231499051233395e-10, "logits/chosen": 1.3122127056121826, "logits/rejected": 1.5941417217254639, "logps/chosen": -250.834228515625, "logps/rejected": -304.87213134765625, "loss": 4433.9867, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.017957037314772606, "rewards/margins": 0.05482287332415581, "rewards/rejected": -0.03686583787202835, "step": 2205 }, { "epoch": 8.0, "step": 2208, "total_flos": 0.0, "train_loss": 4648.314686650815, "train_runtime": 15546.5095, "train_samples_per_second": 9.07, "train_steps_per_second": 0.142 } ], "logging_steps": 5, "max_steps": 2208, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }