diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,11728 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0283877878950187, - "eval_steps": 64, - "global_step": 480, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0021424745581146223, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 12.50527462559239, - "learning_rate": 0.0, - "logits/chosen": 1.0446698665618896, - "logits/rejected": 0.9781918525695801, - "logps/accuracies": 0.25, - "logps/chosen": -270.4280700683594, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -270.4280700683594, - "logps/ref_rejected": -259.14373779296875, - "logps/rejected": -259.14373779296875, - "loss": 0.6931, - "rewards/accuracies": 0.0, - "rewards/chosen": 0.0, - "rewards/grad_term": 0.02500000037252903, - "rewards/margins": 0.0, - "rewards/rejected": 0.0, - "step": 1 - }, - { - "epoch": 0.004284949116229245, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 12.856352015281924, - "learning_rate": 1.0679540942081149e-07, - "logits/chosen": 0.4414063096046448, - "logits/rejected": 0.32948625087738037, - "logps/accuracies": 0.5, - "logps/chosen": -318.21771240234375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -317.62615966796875, - "logps/ref_rejected": -221.48974609375, - "logps/rejected": -221.3629608154297, - "loss": 0.69, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.02957625314593315, - "rewards/grad_term": 0.0254487507045269, - "rewards/margins": -0.03591585159301758, - "rewards/rejected": 0.006339598447084427, - "step": 2 - }, - { - "epoch": 0.006427423674343867, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 13.280048310716381, - "learning_rate": 1.6926671918114913e-07, - "logits/chosen": 1.0256762504577637, - "logits/rejected": 0.8810745477676392, - "logps/accuracies": 0.5, - "logps/chosen": -355.5387268066406, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -354.57049560546875, - "logps/ref_rejected": -364.0948486328125, - "logps/rejected": -363.83416748046875, - "loss": 0.6892, - "rewards/accuracies": 0.0, - "rewards/chosen": -0.0484108030796051, - "rewards/grad_term": 0.0257670097053051, - "rewards/margins": -0.061444856226444244, - "rewards/rejected": 0.013034057803452015, - "step": 3 - }, - { - "epoch": 0.00856989823245849, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 12.710494813385088, - "learning_rate": 2.1359081884162297e-07, - "logits/chosen": 1.1384323835372925, - "logits/rejected": 1.0655404329299927, - "logps/accuracies": 0.5, - "logps/chosen": -442.36578369140625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -442.594970703125, - "logps/ref_rejected": -345.18572998046875, - "logps/rejected": -344.8496398925781, - "loss": 0.6921, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.0114593505859375, - "rewards/grad_term": 0.0250665545463562, - "rewards/margins": -0.005344009958207607, - "rewards/rejected": 0.016803361475467682, - "step": 4 - }, - { - "epoch": 0.010712372790573112, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 14.937463503106938, - "learning_rate": 2.479712615391807e-07, - "logits/chosen": 0.5404180288314819, - "logits/rejected": 0.45622000098228455, - "logps/accuracies": 0.0, - "logps/chosen": -413.39813232421875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -413.3487854003906, - "logps/ref_rejected": -304.2044982910156, - "logps/rejected": -304.66510009765625, - "loss": 0.691, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.00246772775426507, - "rewards/grad_term": 0.024743197485804558, - "rewards/margins": 0.020562361925840378, - "rewards/rejected": -0.02303009107708931, - "step": 5 - }, - { - "epoch": 0.012854847348687734, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 12.517831245113905, - "learning_rate": 2.7606212860196063e-07, - "logits/chosen": 1.1012723445892334, - "logits/rejected": 0.7194727659225464, - "logps/accuracies": 0.5, - "logps/chosen": -258.80072021484375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -258.87176513671875, - "logps/ref_rejected": -247.64498901367188, - "logps/rejected": -247.63116455078125, - "loss": 0.6903, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.0035531027242541313, - "rewards/grad_term": 0.024964267387986183, - "rewards/margins": 0.002861691638827324, - "rewards/rejected": 0.0006914124824106693, - "step": 6 - }, - { - "epoch": 0.014997321906802356, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 12.204086479352032, - "learning_rate": 2.9981261829067217e-07, - "logits/chosen": 1.0940355062484741, - "logits/rejected": 0.9565569162368774, - "logps/accuracies": 0.25, - "logps/chosen": -237.1595916748047, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -236.5460662841797, - "logps/ref_rejected": -215.79209899902344, - "logps/rejected": -215.83944702148438, - "loss": 0.6913, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.030676748603582382, - "rewards/grad_term": 0.025353606790304184, - "rewards/margins": -0.02830987237393856, - "rewards/rejected": -0.0023668762296438217, - "step": 7 - }, - { - "epoch": 0.01713979646491698, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 12.475151458421857, - "learning_rate": 3.2038622826243447e-07, - "logits/chosen": 0.967343270778656, - "logits/rejected": 0.9412274360656738, - "logps/accuracies": 0.5, - "logps/chosen": -279.015380859375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -279.0576171875, - "logps/ref_rejected": -273.6654968261719, - "logps/rejected": -273.8743591308594, - "loss": 0.6941, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.0021114349365234375, - "rewards/grad_term": 0.02484307810664177, - "rewards/margins": 0.01255502738058567, - "rewards/rejected": -0.010443592444062233, - "step": 8 - }, - { - "epoch": 0.0192822710230316, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 12.580237634234056, - "learning_rate": 3.3853343836229826e-07, - "logits/chosen": 0.9568102359771729, - "logits/rejected": 0.9983876943588257, - "logps/accuracies": 0.5, - "logps/chosen": -331.88922119140625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -332.0152587890625, - "logps/ref_rejected": -332.083251953125, - "logps/rejected": -331.71844482421875, - "loss": 0.6903, - "rewards/accuracies": 0.25, - "rewards/chosen": 0.00630226219072938, - "rewards/grad_term": 0.02514958195388317, - "rewards/margins": -0.011937713250517845, - "rewards/rejected": 0.018239974975585938, - "step": 9 - }, - { - "epoch": 0.021424745581146223, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 13.151098903300593, - "learning_rate": 3.5476667095999224e-07, - "logits/chosen": 1.000882863998413, - "logits/rejected": 0.9467081427574158, - "logps/accuracies": 0.5, - "logps/chosen": -320.8128662109375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -320.7702941894531, - "logps/ref_rejected": -298.6964111328125, - "logps/rejected": -298.3901672363281, - "loss": 0.6846, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.0021286013070493937, - "rewards/grad_term": 0.025217382237315178, - "rewards/margins": -0.017440060153603554, - "rewards/rejected": 0.01531145628541708, - "step": 10 - }, - { - "epoch": 0.023567220139260846, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 12.233476785972572, - "learning_rate": 3.6945141607567076e-07, - "logits/chosen": 1.1362941265106201, - "logits/rejected": 1.0800057649612427, - "logps/accuracies": 0.5, - "logps/chosen": -398.43902587890625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -399.17889404296875, - "logps/ref_rejected": -400.2832946777344, - "logps/rejected": -400.0302429199219, - "loss": 0.6858, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.036993030458688736, - "rewards/grad_term": 0.024695834144949913, - "rewards/margins": 0.024341586977243423, - "rewards/rejected": 0.012651442550122738, - "step": 11 - }, - { - "epoch": 0.025709694697375468, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 12.317733670973258, - "learning_rate": 3.8285753802277215e-07, - "logits/chosen": 1.1415050029754639, - "logits/rejected": 0.6014249920845032, - "logps/accuracies": 0.0, - "logps/chosen": -187.37680053710938, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -187.9676513671875, - "logps/ref_rejected": -126.48046875, - "logps/rejected": -126.51386260986328, - "loss": 0.685, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.0295425895601511, - "rewards/grad_term": 0.02461005374789238, - "rewards/margins": 0.03121213987469673, - "rewards/rejected": -0.0016695503145456314, - "step": 12 - }, - { - "epoch": 0.02785216925549009, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 13.848164526581762, - "learning_rate": 3.9518997473591026e-07, - "logits/chosen": 1.0744524002075195, - "logits/rejected": 0.8894139528274536, - "logps/accuracies": 0.0, - "logps/chosen": -277.2958984375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -278.2149658203125, - "logps/ref_rejected": -189.09963989257812, - "logps/rejected": -189.11630249023438, - "loss": 0.688, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.04595203697681427, - "rewards/grad_term": 0.02441561222076416, - "rewards/margins": 0.04678481072187424, - "rewards/rejected": -0.0008327784016728401, - "step": 13 - }, - { - "epoch": 0.029994643813604713, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 12.272371420817562, - "learning_rate": 4.066080277114836e-07, - "logits/chosen": 1.0568749904632568, - "logits/rejected": 0.8716171979904175, - "logps/accuracies": 0.0, - "logps/chosen": -219.07720947265625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -219.25765991210938, - "logps/ref_rejected": -129.22840881347656, - "logps/rejected": -128.9102020263672, - "loss": 0.686, - "rewards/accuracies": 0.25, - "rewards/chosen": 0.009022902697324753, - "rewards/grad_term": 0.025086142122745514, - "rewards/margins": -0.006888152565807104, - "rewards/rejected": 0.015911055728793144, - "step": 14 - }, - { - "epoch": 0.032137118371719335, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 12.428694983873564, - "learning_rate": 4.1723798072032976e-07, - "logits/chosen": 1.037872314453125, - "logits/rejected": 0.9692336320877075, - "logps/accuracies": 0.25, - "logps/chosen": -381.4244384765625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -382.68658447265625, - "logps/ref_rejected": -322.65679931640625, - "logps/rejected": -321.3078308105469, - "loss": 0.6845, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.06310494244098663, - "rewards/grad_term": 0.02505427412688732, - "rewards/margins": -0.004344702698290348, - "rewards/rejected": 0.06744963675737381, - "step": 15 - }, - { - "epoch": 0.03427959292983396, - "flips/correct->correct": 1.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 12.620364972338749, - "learning_rate": 4.2718163768324594e-07, - "logits/chosen": 0.9468654990196228, - "logits/rejected": 1.170510172843933, - "logps/accuracies": 1.0, - "logps/chosen": -173.85870361328125, - "logps/ref_accuracies": 1.0, - "logps/ref_chosen": -174.64218139648438, - "logps/ref_rejected": -222.02438354492188, - "logps/rejected": -222.00958251953125, - "loss": 0.6788, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.03917388990521431, - "rewards/grad_term": 0.024519937112927437, - "rewards/margins": 0.03843364864587784, - "rewards/rejected": 0.0007402412593364716, - "step": 16 - }, - { - "epoch": 0.03642206748794858, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 11.697678503172199, - "learning_rate": 4.3652226762368345e-07, - "logits/chosen": 1.0040562152862549, - "logits/rejected": 0.9608211517333984, - "logps/accuracies": 0.5, - "logps/chosen": -253.23019409179688, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -254.5650177001953, - "logps/ref_rejected": -273.43707275390625, - "logps/rejected": -273.1368713378906, - "loss": 0.6744, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.06674174964427948, - "rewards/grad_term": 0.024354225024580956, - "rewards/margins": 0.051732055842876434, - "rewards/rejected": 0.015009691938757896, - "step": 17 - }, - { - "epoch": 0.0385645420460632, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 11.78637513369074, - "learning_rate": 4.4532884778310973e-07, - "logits/chosen": 1.0503087043762207, - "logits/rejected": 0.9166826009750366, - "logps/accuracies": 0.5, - "logps/chosen": -244.81471252441406, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -244.83123779296875, - "logps/ref_rejected": -240.30084228515625, - "logps/rejected": -239.78488159179688, - "loss": 0.6772, - "rewards/accuracies": 0.25, - "rewards/chosen": 0.0008262638584710658, - "rewards/grad_term": 0.0253120306879282, - "rewards/margins": -0.02497131936252117, - "rewards/rejected": 0.025797583162784576, - "step": 18 - }, - { - "epoch": 0.040707016604177824, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 11.706003949713145, - "learning_rate": 4.536591579881374e-07, - "logits/chosen": 1.015830159187317, - "logits/rejected": 0.9545015692710876, - "logps/accuracies": 0.25, - "logps/chosen": -264.29638671875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -266.071044921875, - "logps/ref_rejected": -236.97666931152344, - "logps/rejected": -236.77438354492188, - "loss": 0.6778, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.08873386681079865, - "rewards/grad_term": 0.02401968464255333, - "rewards/margins": 0.07861967384815216, - "rewards/rejected": 0.01011419203132391, - "step": 19 - }, - { - "epoch": 0.04284949116229245, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 11.815048094751976, - "learning_rate": 4.615620803808037e-07, - "logits/chosen": 0.896416187286377, - "logits/rejected": 1.0262924432754517, - "logps/accuracies": 0.5, - "logps/chosen": -294.67327880859375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -296.34197998046875, - "logps/ref_rejected": -321.8549499511719, - "logps/rejected": -321.5665283203125, - "loss": 0.6743, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.08343505859375, - "rewards/grad_term": 0.02414114400744438, - "rewards/margins": 0.06901436299085617, - "rewards/rejected": 0.014420699328184128, - "step": 20 - }, - { - "epoch": 0.04499196572040707, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 11.637316673579596, - "learning_rate": 4.6907933747182127e-07, - "logits/chosen": 0.8530393242835999, - "logits/rejected": 0.6534068584442139, - "logps/accuracies": 0.25, - "logps/chosen": -276.7626647949219, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -277.6459655761719, - "logps/ref_rejected": -269.14227294921875, - "logps/rejected": -270.58624267578125, - "loss": 0.6723, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.04416485130786896, - "rewards/grad_term": 0.02355196699500084, - "rewards/margins": 0.11636309325695038, - "rewards/rejected": -0.07219824939966202, - "step": 21 - }, - { - "epoch": 0.04713444027852169, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 11.876590663836538, - "learning_rate": 4.762468254964823e-07, - "logits/chosen": 1.009377121925354, - "logits/rejected": 0.9447416067123413, - "logps/accuracies": 0.5, - "logps/chosen": -317.7878723144531, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -319.1816711425781, - "logps/ref_rejected": -311.2941589355469, - "logps/rejected": -311.8822021484375, - "loss": 0.671, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.06969013810157776, - "rewards/grad_term": 0.023765018209815025, - "rewards/margins": 0.09909267723560333, - "rewards/rejected": -0.029402542859315872, - "step": 22 - }, - { - "epoch": 0.049276914836636314, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 12.586762655265133, - "learning_rate": 4.830956511375156e-07, - "logits/chosen": 1.203920841217041, - "logits/rejected": 1.1563231945037842, - "logps/accuracies": 0.0, - "logps/chosen": -436.044921875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -438.2389831542969, - "logps/ref_rejected": -385.3761901855469, - "logps/rejected": -385.66180419921875, - "loss": 0.6674, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.10970421135425568, - "rewards/grad_term": 0.0234676580876112, - "rewards/margins": 0.12398529052734375, - "rewards/rejected": -0.014281081967055798, - "step": 23 - }, - { - "epoch": 0.051419389394750936, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 12.26080518238754, - "learning_rate": 4.896529474435837e-07, - "logits/chosen": 0.9966791868209839, - "logits/rejected": 0.9108967185020447, - "logps/accuracies": 0.25, - "logps/chosen": -307.80572509765625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -310.31866455078125, - "logps/ref_rejected": -252.79197692871094, - "logps/rejected": -252.8419189453125, - "loss": 0.6669, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.12564696371555328, - "rewards/grad_term": 0.023410532623529434, - "rewards/margins": 0.12814360857009888, - "rewards/rejected": -0.002496624831110239, - "step": 24 - }, - { - "epoch": 0.05356186395286556, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 11.366243366820711, - "learning_rate": 4.959425230783614e-07, - "logits/chosen": 0.9740027785301208, - "logits/rejected": 0.8190696239471436, - "logps/accuracies": 0.5, - "logps/chosen": -320.2606506347656, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -321.4267272949219, - "logps/ref_rejected": -328.9568176269531, - "logps/rejected": -328.89703369140625, - "loss": 0.6599, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.05830345302820206, - "rewards/grad_term": 0.024309232831001282, - "rewards/margins": 0.055313680320978165, - "rewards/rejected": 0.0029897689819335938, - "step": 25 - }, - { - "epoch": 0.05570433851098018, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 11.08055129360116, - "learning_rate": 5.019853841567218e-07, - "logits/chosen": 1.1111705303192139, - "logits/rejected": 0.6438971161842346, - "logps/accuracies": 0.25, - "logps/chosen": -187.502197265625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -188.0975341796875, - "logps/ref_rejected": -102.05082702636719, - "logps/rejected": -102.74217987060547, - "loss": 0.6499, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.029765892773866653, - "rewards/grad_term": 0.024199258536100388, - "rewards/margins": 0.06433363258838654, - "rewards/rejected": -0.034567736089229584, - "step": 26 - }, - { - "epoch": 0.0578468130690948, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 11.88218195606169, - "learning_rate": 5.078001575434473e-07, - "logits/chosen": 0.791816771030426, - "logits/rejected": 0.884813666343689, - "logps/accuracies": 0.5, - "logps/chosen": -189.6773223876953, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -190.02127075195312, - "logps/ref_rejected": -203.477783203125, - "logps/rejected": -203.25155639648438, - "loss": 0.6377, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.017197083681821823, - "rewards/grad_term": 0.024926653131842613, - "rewards/margins": 0.00588593352586031, - "rewards/rejected": 0.011311152018606663, - "step": 27 - }, - { - "epoch": 0.059989287627209426, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 11.239511800333705, - "learning_rate": 5.134034371322951e-07, - "logits/chosen": 1.134675145149231, - "logits/rejected": 0.9631079435348511, - "logps/accuracies": 0.25, - "logps/chosen": -266.0494079589844, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -267.859619140625, - "logps/ref_rejected": -242.5032958984375, - "logps/rejected": -243.47943115234375, - "loss": 0.636, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.09051056951284409, - "rewards/grad_term": 0.023272007703781128, - "rewards/margins": 0.13931767642498016, - "rewards/rejected": -0.04880712181329727, - "step": 28 - }, - { - "epoch": 0.06213176218532405, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 11.25145831338842, - "learning_rate": 5.188100693331704e-07, - "logits/chosen": 1.1358017921447754, - "logits/rejected": 0.5541727542877197, - "logps/accuracies": 0.0, - "logps/chosen": -365.2521057128906, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -368.4560546875, - "logps/ref_rejected": -241.02734375, - "logps/rejected": -240.83399963378906, - "loss": 0.639, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.1601976454257965, - "rewards/grad_term": 0.0231227595359087, - "rewards/margins": 0.1505315899848938, - "rewards/rejected": 0.009666061028838158, - "step": 29 - }, - { - "epoch": 0.06427423674343867, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 10.7696981919874, - "learning_rate": 5.240333901411414e-07, - "logits/chosen": 1.0822885036468506, - "logits/rejected": 0.7618290185928345, - "logps/accuracies": 0.0, - "logps/chosen": -302.0892028808594, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -303.5124816894531, - "logps/ref_rejected": -231.44943237304688, - "logps/rejected": -231.66343688964844, - "loss": 0.6391, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.07116356492042542, - "rewards/grad_term": 0.023980939760804176, - "rewards/margins": 0.08186331391334534, - "rewards/rejected": -0.010699748061597347, - "step": 30 - }, - { - "epoch": 0.0664167113015533, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 10.976869632876706, - "learning_rate": 5.2908542331884e-07, - "logits/chosen": 1.2087453603744507, - "logits/rejected": 0.821852445602417, - "logps/accuracies": 0.0, - "logps/chosen": -408.1747741699219, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -410.11907958984375, - "logps/ref_rejected": -321.87982177734375, - "logps/rejected": -322.6016540527344, - "loss": 0.6316, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.09721412509679794, - "rewards/grad_term": 0.023339958861470222, - "rewards/margins": 0.1333070695400238, - "rewards/rejected": -0.03609294816851616, - "step": 31 - }, - { - "epoch": 0.06855918585966791, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 10.55127291702564, - "learning_rate": 5.339770471040575e-07, - "logits/chosen": 1.0062335729599, - "logits/rejected": 0.8178822994232178, - "logps/accuracies": 0.25, - "logps/chosen": -251.82504272460938, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -256.1924743652344, - "logps/ref_rejected": -205.9676971435547, - "logps/rejected": -208.8406524658203, - "loss": 0.6257, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.21837130188941956, - "rewards/grad_term": 0.020633019506931305, - "rewards/margins": 0.36201906204223633, - "rewards/rejected": -0.14364777505397797, - "step": 32 - }, - { - "epoch": 0.07070166041778254, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 10.480165494817408, - "learning_rate": 5.387181352568199e-07, - "logits/chosen": 0.5726549625396729, - "logits/rejected": 0.4411008358001709, - "logps/accuracies": 0.5, - "logps/chosen": -123.8817138671875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -125.0975570678711, - "logps/ref_rejected": -92.3137435913086, - "logps/rejected": -92.87496948242188, - "loss": 0.6302, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.06079201400279999, - "rewards/grad_term": 0.02389412932097912, - "rewards/margins": 0.08885356783866882, - "rewards/rejected": -0.028061550110578537, - "step": 33 - }, - { - "epoch": 0.07284413497589716, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 10.769954405702693, - "learning_rate": 5.43317677044495e-07, - "logits/chosen": 0.8887495994567871, - "logits/rejected": 0.7524275779724121, - "logps/accuracies": 0.5, - "logps/chosen": -233.1437530517578, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -235.97918701171875, - "logps/ref_rejected": -232.49705505371094, - "logps/rejected": -233.62205505371094, - "loss": 0.6253, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.1417713165283203, - "rewards/grad_term": 0.022557333111763, - "rewards/margins": 0.1980208307504654, - "rewards/rejected": -0.056249529123306274, - "step": 34 - }, - { - "epoch": 0.07498660953401179, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 10.153566860628073, - "learning_rate": 5.477838798298528e-07, - "logits/chosen": 0.9315862655639648, - "logits/rejected": 1.0292893648147583, - "logps/accuracies": 0.5, - "logps/chosen": -176.9609375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -178.0975799560547, - "logps/ref_rejected": -238.39404296875, - "logps/rejected": -238.72592163085938, - "loss": 0.6173, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.05683126673102379, - "rewards/grad_term": 0.02408386766910553, - "rewards/margins": 0.07342477142810822, - "rewards/rejected": -0.016593504697084427, - "step": 35 - }, - { - "epoch": 0.0771290840921264, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 11.29602350540091, - "learning_rate": 5.521242572039213e-07, - "logits/chosen": 1.1964863538742065, - "logits/rejected": 1.0984711647033691, - "logps/accuracies": 0.5, - "logps/chosen": -322.5604248046875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -322.1260986328125, - "logps/ref_rejected": -287.303466796875, - "logps/rejected": -293.40313720703125, - "loss": 0.6001, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.02171630784869194, - "rewards/grad_term": 0.021510563790798187, - "rewards/margins": 0.28326815366744995, - "rewards/rejected": -0.3049844801425934, - "step": 36 - }, - { - "epoch": 0.07927155865024103, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 11.139112324828863, - "learning_rate": 5.563457050409681e-07, - "logits/chosen": 1.173073410987854, - "logits/rejected": 1.0843687057495117, - "logps/accuracies": 0.75, - "logps/chosen": -273.99566650390625, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -275.65478515625, - "logps/ref_rejected": -284.9312438964844, - "logps/rejected": -284.83282470703125, - "loss": 0.6029, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.08295460045337677, - "rewards/grad_term": 0.024028297513723373, - "rewards/margins": 0.07803288102149963, - "rewards/rejected": 0.0049217212945222855, - "step": 37 - }, - { - "epoch": 0.08141403320835565, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 11.36236689748956, - "learning_rate": 5.604545674089489e-07, - "logits/chosen": 1.127380609512329, - "logits/rejected": 0.8692578077316284, - "logps/accuracies": 0.25, - "logps/chosen": -628.7327270507812, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -633.2546997070312, - "logps/ref_rejected": -454.33013916015625, - "logps/rejected": -454.3887939453125, - "loss": 0.6011, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.22609534859657288, - "rewards/grad_term": 0.022296732291579247, - "rewards/margins": 0.22902806103229523, - "rewards/rejected": -0.0029327282682061195, - "step": 38 - }, - { - "epoch": 0.08355650776647028, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 10.40205719855862, - "learning_rate": 5.644566939170593e-07, - "logits/chosen": 1.1477112770080566, - "logits/rejected": 0.7197964191436768, - "logps/accuracies": 0.0, - "logps/chosen": -342.099609375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -345.7111511230469, - "logps/ref_rejected": -252.51214599609375, - "logps/rejected": -257.9091796875, - "loss": 0.5936, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.18057651817798615, - "rewards/grad_term": 0.01955207623541355, - "rewards/margins": 0.45042720437049866, - "rewards/rejected": -0.2698506712913513, - "step": 39 - }, - { - "epoch": 0.0856989823245849, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 10.52698957163884, - "learning_rate": 5.683574898016152e-07, - "logits/chosen": 1.155718207359314, - "logits/rejected": 0.9540653228759766, - "logps/accuracies": 0.25, - "logps/chosen": -353.8412780761719, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -355.0787353515625, - "logps/ref_rejected": -313.3082580566406, - "logps/rejected": -315.2515869140625, - "loss": 0.5912, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.0618743896484375, - "rewards/grad_term": 0.023020360618829727, - "rewards/margins": 0.15904179215431213, - "rewards/rejected": -0.09716740250587463, - "step": 40 - }, - { - "epoch": 0.08784145688269952, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 14.94332500203197, - "learning_rate": 5.721619598264776e-07, - "logits/chosen": 1.1048097610473633, - "logits/rejected": 0.8005577325820923, - "logps/accuracies": 0.0, - "logps/chosen": -317.59912109375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -313.39208984375, - "logps/ref_rejected": -253.1265411376953, - "logps/rejected": -262.5826416015625, - "loss": 0.5854, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.2103523313999176, - "rewards/grad_term": 0.021769195795059204, - "rewards/margins": 0.2624519467353821, - "rewards/rejected": -0.4728042781352997, - "step": 41 - }, - { - "epoch": 0.08998393144081414, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 11.074479922271454, - "learning_rate": 5.758747468926328e-07, - "logits/chosen": 0.8858407735824585, - "logits/rejected": 0.7222996354103088, - "logps/accuracies": 0.5, - "logps/chosen": -298.509521484375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -300.38153076171875, - "logps/ref_rejected": -263.8077087402344, - "logps/rejected": -261.6027526855469, - "loss": 0.5645, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.09360065311193466, - "rewards/grad_term": 0.025205716490745544, - "rewards/margins": -0.01664828509092331, - "rewards/rejected": 0.11024895310401917, - "step": 42 - }, - { - "epoch": 0.09212640599892877, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 10.322575540859543, - "learning_rate": 5.795001661041298e-07, - "logits/chosen": 1.0528746843338013, - "logits/rejected": 0.8013145923614502, - "logps/accuracies": 0.25, - "logps/chosen": -357.80035400390625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -354.52734375, - "logps/ref_rejected": -306.75, - "logps/rejected": -324.1343994140625, - "loss": 0.5605, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.16364938020706177, - "rewards/grad_term": 0.016722146421670914, - "rewards/margins": 0.7055709958076477, - "rewards/rejected": -0.8692203760147095, - "step": 43 - }, - { - "epoch": 0.09426888055704338, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.88902402136389, - "learning_rate": 5.830422349172938e-07, - "logits/chosen": 1.1213502883911133, - "logits/rejected": 0.7059850692749023, - "logps/accuracies": 0.25, - "logps/chosen": -306.6339416503906, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -308.36224365234375, - "logps/ref_rejected": -222.45750427246094, - "logps/rejected": -234.72048950195312, - "loss": 0.5277, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.08641643077135086, - "rewards/grad_term": 0.016946371644735336, - "rewards/margins": 0.6995644569396973, - "rewards/rejected": -0.6131480932235718, - "step": 44 - }, - { - "epoch": 0.09641135511515801, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.994736837402936, - "learning_rate": 5.865046999014789e-07, - "logits/chosen": 1.0356709957122803, - "logits/rejected": 0.9350219368934631, - "logps/accuracies": 0.25, - "logps/chosen": -449.03717041015625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -447.7042236328125, - "logps/ref_rejected": -399.0120544433594, - "logps/rejected": -408.2356262207031, - "loss": 0.537, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.06664619594812393, - "rewards/grad_term": 0.0201456006616354, - "rewards/margins": 0.3945322036743164, - "rewards/rejected": -0.46117842197418213, - "step": 45 - }, - { - "epoch": 0.09855382967327263, - "flips/correct->correct": 1.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.684993785164789, - "learning_rate": 5.898910605583271e-07, - "logits/chosen": 0.8869616389274597, - "logits/rejected": 0.9591479301452637, - "logps/accuracies": 1.0, - "logps/chosen": -190.82815551757812, - "logps/ref_accuracies": 1.0, - "logps/ref_chosen": -184.7421112060547, - "logps/ref_rejected": -232.9263153076172, - "logps/rejected": -237.15127563476562, - "loss": 0.5444, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.30430203676223755, - "rewards/grad_term": 0.026146598160266876, - "rewards/margins": -0.0930541530251503, - "rewards/rejected": -0.21124787628650665, - "step": 46 - }, - { - "epoch": 0.10069630423138726, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 10.511687293551553, - "learning_rate": 5.932045905791884e-07, - "logits/chosen": 0.9791369438171387, - "logits/rejected": 0.7344577312469482, - "logps/accuracies": 0.0, - "logps/chosen": -380.52587890625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -378.03131103515625, - "logps/ref_rejected": -319.94256591796875, - "logps/rejected": -337.8666687011719, - "loss": 0.5447, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.12472762912511826, - "rewards/grad_term": 0.01607554219663143, - "rewards/margins": 0.7714786529541016, - "rewards/rejected": -0.896206259727478, - "step": 47 - }, - { - "epoch": 0.10283877878950187, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 10.055964810316027, - "learning_rate": 5.964483568643951e-07, - "logits/chosen": 0.8970568776130676, - "logits/rejected": 0.48237085342407227, - "logps/accuracies": 0.5, - "logps/chosen": -349.3356018066406, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -343.4019775390625, - "logps/ref_rejected": -257.5508117675781, - "logps/rejected": -269.60809326171875, - "loss": 0.5329, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.2966794967651367, - "rewards/grad_term": 0.021424874663352966, - "rewards/margins": 0.30618318915367126, - "rewards/rejected": -0.6028627157211304, - "step": 48 - }, - { - "epoch": 0.1049812533476165, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 10.617890748457961, - "learning_rate": 5.996252365813443e-07, - "logits/chosen": 0.9411455392837524, - "logits/rejected": 0.5716761350631714, - "logps/accuracies": 0.0, - "logps/chosen": -416.63934326171875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -415.9534606933594, - "logps/ref_rejected": -247.12747192382812, - "logps/rejected": -250.519287109375, - "loss": 0.5287, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.03429294005036354, - "rewards/grad_term": 0.023422496393322945, - "rewards/margins": 0.1352972686290741, - "rewards/rejected": -0.16959019005298615, - "step": 49 - }, - { - "epoch": 0.10712372790573112, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 10.56249795289356, - "learning_rate": 6.02737932499173e-07, - "logits/chosen": 0.7994714975357056, - "logits/rejected": 0.42442572116851807, - "logps/accuracies": 0.0, - "logps/chosen": -441.199951171875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -438.83892822265625, - "logps/ref_rejected": -322.400146484375, - "logps/rejected": -338.1896667480469, - "loss": 0.5245, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.11804847419261932, - "rewards/grad_term": 0.016969487071037292, - "rewards/margins": 0.6714283227920532, - "rewards/rejected": -0.789476752281189, - "step": 50 - }, - { - "epoch": 0.10926620246384575, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.886334758313449, - "learning_rate": 6.057889868048325e-07, - "logits/chosen": 1.0163636207580566, - "logits/rejected": 0.8965986967086792, - "logps/accuracies": 0.25, - "logps/chosen": -416.2098083496094, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -416.4342041015625, - "logps/ref_rejected": -324.2137145996094, - "logps/rejected": -339.7252502441406, - "loss": 0.5316, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.011221121996641159, - "rewards/grad_term": 0.01711445301771164, - "rewards/margins": 0.7867982983589172, - "rewards/rejected": -0.775577187538147, - "step": 51 - }, - { - "epoch": 0.11140867702196036, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 10.545934339053032, - "learning_rate": 6.087807935775333e-07, - "logits/chosen": 0.5185865759849548, - "logits/rejected": 0.3614073395729065, - "logps/accuracies": 0.25, - "logps/chosen": -229.86053466796875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -228.54452514648438, - "logps/ref_rejected": -195.83494567871094, - "logps/rejected": -206.37149047851562, - "loss": 0.5191, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.06580007076263428, - "rewards/grad_term": 0.019602250307798386, - "rewards/margins": 0.461027592420578, - "rewards/rejected": -0.5268276929855347, - "step": 52 - }, - { - "epoch": 0.11355115158007499, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.703023398000125, - "learning_rate": 6.117156100749175e-07, - "logits/chosen": 0.9196311235427856, - "logits/rejected": 0.8846197128295898, - "logps/accuracies": 0.25, - "logps/chosen": -436.0137023925781, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -429.5511474609375, - "logps/ref_rejected": -402.48614501953125, - "logps/rejected": -423.6464538574219, - "loss": 0.5158, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.32312700152397156, - "rewards/grad_term": 0.016992483288049698, - "rewards/margins": 0.7348867654800415, - "rewards/rejected": -1.0580137968063354, - "step": 53 - }, - { - "epoch": 0.1156936261381896, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.613596866402307, - "learning_rate": 6.145955669642588e-07, - "logits/chosen": 1.0864933729171753, - "logits/rejected": 1.0046788454055786, - "logps/accuracies": 0.75, - "logps/chosen": -408.01947021484375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -394.4793395996094, - "logps/ref_rejected": -375.3567199707031, - "logps/rejected": -408.60504150390625, - "loss": 0.5135, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6770049929618835, - "rewards/grad_term": 0.013774032704532146, - "rewards/margins": 0.9854103326797485, - "rewards/rejected": -1.6624153852462769, - "step": 54 - }, - { - "epoch": 0.11783610069630424, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.495375001244799, - "learning_rate": 6.174226776148516e-07, - "logits/chosen": 0.8830907940864563, - "logits/rejected": 0.7350561618804932, - "logps/accuracies": 0.25, - "logps/chosen": -330.6471252441406, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -317.4583740234375, - "logps/ref_rejected": -287.886962890625, - "logps/rejected": -309.34942626953125, - "loss": 0.4821, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.6594364643096924, - "rewards/grad_term": 0.020037367939949036, - "rewards/margins": 0.41368618607521057, - "rewards/rejected": -1.0731226205825806, - "step": 55 - }, - { - "epoch": 0.11997857525441885, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 10.135520472135779, - "learning_rate": 6.201988465531067e-07, - "logits/chosen": 0.6756561994552612, - "logits/rejected": 0.6564769744873047, - "logps/accuracies": 0.5, - "logps/chosen": -181.22642517089844, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -179.98342895507812, - "logps/ref_rejected": -201.45867919921875, - "logps/rejected": -211.14109802246094, - "loss": 0.5161, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.06215019151568413, - "rewards/grad_term": 0.019981056451797485, - "rewards/margins": 0.421970933675766, - "rewards/rejected": -0.4841211438179016, - "step": 56 - }, - { - "epoch": 0.12212104981253348, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.52811690075287, - "learning_rate": 6.229258771692866e-07, - "logits/chosen": 0.7419092655181885, - "logits/rejected": 0.8074868321418762, - "logps/accuracies": 0.5, - "logps/chosen": -186.41204833984375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -182.34783935546875, - "logps/ref_rejected": -165.51622009277344, - "logps/rejected": -176.84945678710938, - "loss": 0.4952, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.2032102644443512, - "rewards/grad_term": 0.02091275155544281, - "rewards/margins": 0.3634513318538666, - "rewards/rejected": -0.5666615962982178, - "step": 57 - }, - { - "epoch": 0.1242635243706481, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.647643500777985, - "learning_rate": 6.256054787539818e-07, - "logits/chosen": 1.005966067314148, - "logits/rejected": 0.8792574405670166, - "logps/accuracies": 0.25, - "logps/chosen": -421.94464111328125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -406.43414306640625, - "logps/ref_rejected": -377.0458068847656, - "logps/rejected": -405.022705078125, - "loss": 0.4958, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.7755249738693237, - "rewards/grad_term": 0.017923269420862198, - "rewards/margins": 0.623319149017334, - "rewards/rejected": -1.3988441228866577, - "step": 58 - }, - { - "epoch": 0.12640599892876273, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.581762363344323, - "learning_rate": 6.282392729330889e-07, - "logits/chosen": 0.788644552230835, - "logits/rejected": 0.8738614916801453, - "logps/accuracies": 0.5, - "logps/chosen": -299.53521728515625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -293.8477783203125, - "logps/ref_rejected": -257.6317443847656, - "logps/rejected": -273.8648681640625, - "loss": 0.4967, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.284370094537735, - "rewards/grad_term": 0.019201159477233887, - "rewards/margins": 0.5272856950759888, - "rewards/rejected": -0.8116558194160461, - "step": 59 - }, - { - "epoch": 0.12854847348687734, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.152545681984156, - "learning_rate": 6.308287995619528e-07, - "logits/chosen": 1.099388837814331, - "logits/rejected": 0.9928939342498779, - "logps/accuracies": 0.75, - "logps/chosen": -435.457763671875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -415.12860107421875, - "logps/ref_rejected": -380.19720458984375, - "logps/rejected": -426.2452392578125, - "loss": 0.5009, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.0164591073989868, - "rewards/grad_term": 0.011328812688589096, - "rewards/margins": 1.2859418392181396, - "rewards/rejected": -2.302400827407837, - "step": 60 - }, - { - "epoch": 0.13069094804499196, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 10.54221239337406, - "learning_rate": 6.33375522132322e-07, - "logits/chosen": 0.9639657735824585, - "logits/rejected": 0.8224814534187317, - "logps/accuracies": 0.25, - "logps/chosen": -348.4953918457031, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -334.9384765625, - "logps/ref_rejected": -316.9508056640625, - "logps/rejected": -356.79949951171875, - "loss": 0.4842, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6778436899185181, - "rewards/grad_term": 0.011656711809337139, - "rewards/margins": 1.314591884613037, - "rewards/rejected": -1.9924354553222656, - "step": 61 - }, - { - "epoch": 0.1328334226031066, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 9.022752821926046, - "learning_rate": 6.358808327396516e-07, - "logits/chosen": 1.0160002708435059, - "logits/rejected": 0.616927981376648, - "logps/accuracies": 0.0, - "logps/chosen": -329.97100830078125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -312.87115478515625, - "logps/ref_rejected": -194.55523681640625, - "logps/rejected": -234.106201171875, - "loss": 0.4768, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.8549936413764954, - "rewards/grad_term": 0.013443742878735065, - "rewards/margins": 1.1225550174713135, - "rewards/rejected": -1.9775487184524536, - "step": 62 - }, - { - "epoch": 0.13497589716122121, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.278955604267708, - "learning_rate": 6.383460566529704e-07, - "logits/chosen": 1.092294454574585, - "logits/rejected": 0.9165109395980835, - "logps/accuracies": 0.25, - "logps/chosen": -447.57257080078125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -433.4320983886719, - "logps/ref_rejected": -357.5963439941406, - "logps/rejected": -389.59075927734375, - "loss": 0.467, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.7070247530937195, - "rewards/grad_term": 0.014869745820760727, - "rewards/margins": 0.8926937580108643, - "rewards/rejected": -1.5997185707092285, - "step": 63 - }, - { - "epoch": 0.13711837171933583, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.629680006671775, - "learning_rate": 6.407724565248689e-07, - "logits/chosen": 1.0533897876739502, - "logits/rejected": 0.7830870151519775, - "logps/accuracies": 0.5, - "logps/chosen": -331.41888427734375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -315.2283020019531, - "logps/ref_rejected": -263.9187316894531, - "logps/rejected": -279.4315185546875, - "loss": 0.4427, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.8095288276672363, - "rewards/grad_term": 0.025076739490032196, - "rewards/margins": -0.03389042615890503, - "rewards/rejected": -0.7756383419036865, - "step": 64 - }, - { - "epoch": 0.13711837171933583, - "eval_flips/correct->correct": 0.1599999964237213, - "eval_flips/correct->incorrect": 0.0, - "eval_flips/incorrect->correct": 0.019999999552965164, - "eval_flips/incorrect->incorrect": 0.8199999928474426, - "eval_logits/chosen": 0.9344247579574585, - "eval_logits/rejected": 0.7796935439109802, - "eval_logps/accuracies": 0.18000000715255737, - "eval_logps/chosen": -337.09112548828125, - "eval_logps/ref_accuracies": 0.1599999964237213, - "eval_logps/ref_chosen": -323.51568603515625, - "eval_logps/ref_rejected": -258.70098876953125, - "eval_logps/rejected": -284.0068664550781, - "eval_loss": 0.4775756597518921, - "eval_rewards/accuracies": 0.7599999904632568, - "eval_rewards/chosen": -0.6787735819816589, - "eval_rewards/grad_term": 0.018939374014735222, - "eval_rewards/margins": 0.5865211486816406, - "eval_rewards/rejected": -1.2652947902679443, - "eval_runtime": 374.3115, - "eval_samples_per_second": 4.221, - "eval_steps_per_second": 0.134, - "step": 64 - }, - { - "epoch": 0.13926084627745045, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 10.614066894777759, - "learning_rate": 6.431612362750908e-07, - "logits/chosen": 0.8821598887443542, - "logits/rejected": 0.7168709635734558, - "logps/accuracies": 0.25, - "logps/chosen": -355.70806884765625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -342.8082275390625, - "logps/ref_rejected": -322.1506652832031, - "logps/rejected": -350.2712097167969, - "loss": 0.4728, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6449924111366272, - "rewards/grad_term": 0.016398221254348755, - "rewards/margins": 0.7610346674919128, - "rewards/rejected": -1.40602707862854, - "step": 65 - }, - { - "epoch": 0.1414033208355651, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.855172230097049, - "learning_rate": 6.455135446776313e-07, - "logits/chosen": 0.7938796281814575, - "logits/rejected": 0.872632622718811, - "logps/accuracies": 0.5, - "logps/chosen": -310.7122802734375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -288.7535400390625, - "logps/ref_rejected": -302.68560791015625, - "logps/rejected": -338.1268615722656, - "loss": 0.4281, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.097936749458313, - "rewards/grad_term": 0.017016390338540077, - "rewards/margins": 0.6741248369216919, - "rewards/rejected": -1.7720615863800049, - "step": 66 - }, - { - "epoch": 0.1435457953936797, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 9.759871054620625, - "learning_rate": 6.478304786780968e-07, - "logits/chosen": 0.6220331192016602, - "logits/rejected": 0.5919966697692871, - "logps/accuracies": 0.0, - "logps/chosen": -368.77520751953125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -351.3057556152344, - "logps/ref_rejected": -261.40069580078125, - "logps/rejected": -304.46539306640625, - "loss": 0.475, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.8734725713729858, - "rewards/grad_term": 0.014455066993832588, - "rewards/margins": 1.2797632217407227, - "rewards/rejected": -2.153235912322998, - "step": 67 - }, - { - "epoch": 0.14568826995179432, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.285309028552831, - "learning_rate": 6.501130864653065e-07, - "logits/chosen": 1.0729789733886719, - "logits/rejected": 1.0459753274917603, - "logps/accuracies": 0.5, - "logps/chosen": -303.7723388671875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -290.2609558105469, - "logps/ref_rejected": -262.08392333984375, - "logps/rejected": -290.4940185546875, - "loss": 0.4704, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.6755678057670593, - "rewards/grad_term": 0.016569742932915688, - "rewards/margins": 0.7449362277984619, - "rewards/rejected": -1.420504093170166, - "step": 68 - }, - { - "epoch": 0.14783074450990893, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.513566366172148, - "learning_rate": 6.523623703186648e-07, - "logits/chosen": 0.894729495048523, - "logits/rejected": 1.0188794136047363, - "logps/accuracies": 0.75, - "logps/chosen": -317.3673095703125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -297.9476013183594, - "logps/ref_rejected": -303.1460876464844, - "logps/rejected": -348.55316162109375, - "loss": 0.4135, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9709864258766174, - "rewards/grad_term": 0.013456292450428009, - "rewards/margins": 1.2993673086166382, - "rewards/rejected": -2.2703537940979004, - "step": 69 - }, - { - "epoch": 0.14997321906802358, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.31200023242295, - "learning_rate": 6.545792892506645e-07, - "logits/chosen": 0.7371494174003601, - "logits/rejected": 0.9037774801254272, - "logps/accuracies": 0.5, - "logps/chosen": -312.41497802734375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -298.4285888671875, - "logps/ref_rejected": -268.10101318359375, - "logps/rejected": -295.3829345703125, - "loss": 0.3791, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.6993191242218018, - "rewards/grad_term": 0.017996307462453842, - "rewards/margins": 0.6647781729698181, - "rewards/rejected": -1.3640973567962646, - "step": 70 - }, - { - "epoch": 0.1521156936261382, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.814713554670275, - "learning_rate": 6.567647614619587e-07, - "logits/chosen": 0.8867220878601074, - "logits/rejected": 0.8295634984970093, - "logps/accuracies": 0.5, - "logps/chosen": -343.9578552246094, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -321.64984130859375, - "logps/ref_rejected": -283.7974548339844, - "logps/rejected": -332.0108337402344, - "loss": 0.4731, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.1153992414474487, - "rewards/grad_term": 0.012294553220272064, - "rewards/margins": 1.2952699661254883, - "rewards/rejected": -2.4106695652008057, - "step": 71 - }, - { - "epoch": 0.1542581681842528, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.137092152998001, - "learning_rate": 6.589196666247328e-07, - "logits/chosen": 0.9599927663803101, - "logits/rejected": 0.6932302713394165, - "logps/accuracies": 0.25, - "logps/chosen": -317.6519775390625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -303.6397705078125, - "logps/ref_rejected": -240.10572814941406, - "logps/rejected": -283.4850158691406, - "loss": 0.4246, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.700611412525177, - "rewards/grad_term": 0.012517021037638187, - "rewards/margins": 1.4683525562286377, - "rewards/rejected": -2.16896390914917, - "step": 72 - }, - { - "epoch": 0.15640064274236742, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 10.33751492174849, - "learning_rate": 6.610448480085853e-07, - "logits/chosen": 0.4570969045162201, - "logits/rejected": 0.4160915017127991, - "logps/accuracies": 0.25, - "logps/chosen": -212.59051513671875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -212.9862060546875, - "logps/ref_rejected": -181.0455322265625, - "logps/rejected": -190.78341674804688, - "loss": 0.4513, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.01978490501642227, - "rewards/grad_term": 0.019215064123272896, - "rewards/margins": 0.5066791772842407, - "rewards/rejected": -0.48689424991607666, - "step": 73 - }, - { - "epoch": 0.15854311730048207, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.366156800768572, - "learning_rate": 6.631411144617796e-07, - "logits/chosen": 0.7912124395370483, - "logits/rejected": 0.7016277313232422, - "logps/accuracies": 0.5, - "logps/chosen": -337.2362365722656, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -318.22076416015625, - "logps/ref_rejected": -285.63134765625, - "logps/rejected": -356.1361999511719, - "loss": 0.3717, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9507730603218079, - "rewards/grad_term": 0.008385634049773216, - "rewards/margins": 2.574469804763794, - "rewards/rejected": -3.525242805480957, - "step": 74 - }, - { - "epoch": 0.16068559185859668, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.408007517521634, - "learning_rate": 6.652092422595104e-07, - "logits/chosen": 0.8023636937141418, - "logits/rejected": 0.7735366821289062, - "logps/accuracies": 0.75, - "logps/chosen": -306.0715026855469, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -275.71649169921875, - "logps/ref_rejected": -242.64198303222656, - "logps/rejected": -290.594970703125, - "loss": 0.3573, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.5177501440048218, - "rewards/grad_term": 0.018539991229772568, - "rewards/margins": 0.8798991441726685, - "rewards/rejected": -2.3976492881774902, - "step": 75 - }, - { - "epoch": 0.1628280664167113, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.365050151926509, - "learning_rate": 6.672499768297604e-07, - "logits/chosen": 0.8398016691207886, - "logits/rejected": 0.8039026260375977, - "logps/accuracies": 0.25, - "logps/chosen": -338.2458801269531, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -300.6439208984375, - "logps/ref_rejected": -266.9490661621094, - "logps/rejected": -321.94207763671875, - "loss": 0.4334, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.8800976276397705, - "rewards/grad_term": 0.016930393874645233, - "rewards/margins": 0.8695545196533203, - "rewards/rejected": -2.749652147293091, - "step": 76 - }, - { - "epoch": 0.1649705409748259, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.75229443985931, - "learning_rate": 6.692640343663431e-07, - "logits/chosen": 0.8696576356887817, - "logits/rejected": 0.876376211643219, - "logps/accuracies": 0.5, - "logps/chosen": -341.6177978515625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -313.14349365234375, - "logps/ref_rejected": -307.5328369140625, - "logps/rejected": -359.84185791015625, - "loss": 0.3678, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.4237148761749268, - "rewards/grad_term": 0.014790714718401432, - "rewards/margins": 1.1917363405227661, - "rewards/rejected": -2.6154510974884033, - "step": 77 - }, - { - "epoch": 0.16711301553294056, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 8.819014293843647, - "learning_rate": 6.712521033378708e-07, - "logits/chosen": 1.0659058094024658, - "logits/rejected": 0.6623323559761047, - "logps/accuracies": 0.0, - "logps/chosen": -331.59368896484375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -292.36590576171875, - "logps/ref_rejected": -146.4788360595703, - "logps/rejected": -199.08897399902344, - "loss": 0.4175, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.9613897800445557, - "rewards/grad_term": 0.020100167021155357, - "rewards/margins": 0.6691172122955322, - "rewards/rejected": -2.630506992340088, - "step": 78 - }, - { - "epoch": 0.16925549009105517, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 10.100776831917964, - "learning_rate": 6.732148459006032e-07, - "logits/chosen": 0.7964029312133789, - "logits/rejected": 0.6478776931762695, - "logps/accuracies": 0.25, - "logps/chosen": -320.44476318359375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -274.2988586425781, - "logps/ref_rejected": -246.58741760253906, - "logps/rejected": -303.3105773925781, - "loss": 0.394, - "rewards/accuracies": 0.5, - "rewards/chosen": -2.3072948455810547, - "rewards/grad_term": 0.022354397922754288, - "rewards/margins": 0.5288637280464172, - "rewards/rejected": -2.8361587524414062, - "step": 79 - }, - { - "epoch": 0.1713979646491698, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.215506702371561, - "learning_rate": 6.751528992224267e-07, - "logits/chosen": 1.0231748819351196, - "logits/rejected": 0.9256105422973633, - "logps/accuracies": 0.5, - "logps/chosen": -371.5207824707031, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -323.4173583984375, - "logps/ref_rejected": -287.026611328125, - "logps/rejected": -385.26141357421875, - "loss": 0.358, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.4051713943481445, - "rewards/grad_term": 0.008784506469964981, - "rewards/margins": 2.5065674781799316, - "rewards/rejected": -4.911738872528076, - "step": 80 - }, - { - "epoch": 0.1735404392072844, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.286461530064006, - "learning_rate": 6.770668767245965e-07, - "logits/chosen": 0.9782469868659973, - "logits/rejected": 0.6938868761062622, - "logps/accuracies": 0.5, - "logps/chosen": -289.8148498535156, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -267.3828125, - "logps/ref_rejected": -228.83135986328125, - "logps/rejected": -291.4294128417969, - "loss": 0.3489, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.1216013431549072, - "rewards/grad_term": 0.012826542370021343, - "rewards/margins": 2.0083022117614746, - "rewards/rejected": -3.129903554916382, - "step": 81 - }, - { - "epoch": 0.17568291376539905, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.751825989493291, - "learning_rate": 6.789573692472892e-07, - "logits/chosen": 0.8200634717941284, - "logits/rejected": 0.9688655138015747, - "logps/accuracies": 0.5, - "logps/chosen": -390.98150634765625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -348.5020751953125, - "logps/ref_rejected": -350.07159423828125, - "logps/rejected": -411.6036682128906, - "loss": 0.393, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.123972177505493, - "rewards/grad_term": 0.014323998242616653, - "rewards/margins": 0.9526323080062866, - "rewards/rejected": -3.0766046047210693, - "step": 82 - }, - { - "epoch": 0.17782538832351366, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.216588835288912, - "learning_rate": 6.808249461445122e-07, - "logits/chosen": 1.072934627532959, - "logits/rejected": 0.911789059638977, - "logps/accuracies": 0.75, - "logps/chosen": -479.0382080078125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -419.0920104980469, - "logps/ref_rejected": -390.1745300292969, - "logps/rejected": -485.1689453125, - "loss": 0.3724, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.997309684753418, - "rewards/grad_term": 0.0095378328114748, - "rewards/margins": 1.7524113655090332, - "rewards/rejected": -4.749721050262451, - "step": 83 - }, - { - "epoch": 0.17996786288162828, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.25160372674251, - "learning_rate": 6.826701563134442e-07, - "logits/chosen": 0.8092617988586426, - "logits/rejected": 0.7646486163139343, - "logps/accuracies": 0.25, - "logps/chosen": -410.1537780761719, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -384.58575439453125, - "logps/ref_rejected": -304.9964904785156, - "logps/rejected": -354.8568115234375, - "loss": 0.3661, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.2784030437469482, - "rewards/grad_term": 0.012526333332061768, - "rewards/margins": 1.2146127223968506, - "rewards/rejected": -2.493015766143799, - "step": 84 - }, - { - "epoch": 0.1821103374397429, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.118700174990803, - "learning_rate": 6.844935291628642e-07, - "logits/chosen": 1.0619235038757324, - "logits/rejected": 0.9984962344169617, - "logps/accuracies": 0.25, - "logps/chosen": -409.0619201660156, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -377.1236267089844, - "logps/ref_rejected": -326.3685302734375, - "logps/rejected": -405.38037109375, - "loss": 0.3594, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.596914291381836, - "rewards/grad_term": 0.009607160463929176, - "rewards/margins": 2.353677988052368, - "rewards/rejected": -3.950592041015625, - "step": 85 - }, - { - "epoch": 0.18425281199785754, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.571514403852135, - "learning_rate": 6.862955755249413e-07, - "logits/chosen": 0.9191405773162842, - "logits/rejected": 0.801410436630249, - "logps/accuracies": 0.5, - "logps/chosen": -312.10894775390625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -278.7945556640625, - "logps/ref_rejected": -234.3353271484375, - "logps/rejected": -277.7581787109375, - "loss": 0.3311, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.6657218933105469, - "rewards/grad_term": 0.019233139231801033, - "rewards/margins": 0.5054203271865845, - "rewards/rejected": -2.171142339706421, - "step": 86 - }, - { - "epoch": 0.18639528655597215, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 8.757747519851693, - "learning_rate": 6.880767885143194e-07, - "logits/chosen": 0.8738288283348083, - "logits/rejected": 0.684519350528717, - "logps/accuracies": 0.25, - "logps/chosen": -410.3459167480469, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -376.2248840332031, - "logps/ref_rejected": -312.1935729980469, - "logps/rejected": -370.3998107910156, - "loss": 0.3656, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7060527801513672, - "rewards/grad_term": 0.01283444557338953, - "rewards/margins": 1.2042596340179443, - "rewards/rejected": -2.9103124141693115, - "step": 87 - }, - { - "epoch": 0.18853776111408677, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 8.985961972583743, - "learning_rate": 6.898376443381053e-07, - "logits/chosen": 0.9711716771125793, - "logits/rejected": 0.8526138067245483, - "logps/accuracies": 0.25, - "logps/chosen": -403.53948974609375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -360.1213073730469, - "logps/ref_rejected": -311.68292236328125, - "logps/rejected": -381.7286376953125, - "loss": 0.3717, - "rewards/accuracies": 0.5, - "rewards/chosen": -2.170908212661743, - "rewards/grad_term": 0.018295947462320328, - "rewards/margins": 1.3313778638839722, - "rewards/rejected": -3.502286195755005, - "step": 88 - }, - { - "epoch": 0.19068023567220138, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.683570806703395, - "learning_rate": 6.915786030600927e-07, - "logits/chosen": 0.843113899230957, - "logits/rejected": 0.7101360559463501, - "logps/accuracies": 0.5, - "logps/chosen": -428.8794250488281, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -389.9796447753906, - "logps/ref_rejected": -322.93255615234375, - "logps/rejected": -415.9242248535156, - "loss": 0.3549, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.9449876546859741, - "rewards/grad_term": 0.004990034736692905, - "rewards/margins": 2.704596996307373, - "rewards/rejected": -4.649584770202637, - "step": 89 - }, - { - "epoch": 0.19282271023031602, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.498185901157015, - "learning_rate": 6.933001093222904e-07, - "logits/chosen": 0.9054147601127625, - "logits/rejected": 0.7073743939399719, - "logps/accuracies": 0.75, - "logps/chosen": -294.0045471191406, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -280.5016174316406, - "logps/ref_rejected": -278.77313232421875, - "logps/rejected": -316.6849060058594, - "loss": 0.3481, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6751471161842346, - "rewards/grad_term": 0.011688041500747204, - "rewards/margins": 1.2204415798187256, - "rewards/rejected": -1.895588755607605, - "step": 90 - }, - { - "epoch": 0.19496518478843064, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.490840097910208, - "learning_rate": 6.950025930265823e-07, - "logits/chosen": 0.8930804133415222, - "logits/rejected": 0.9076898097991943, - "logps/accuracies": 0.75, - "logps/chosen": -353.33563232421875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -318.02215576171875, - "logps/ref_rejected": -333.869140625, - "logps/rejected": -411.1112365722656, - "loss": 0.3822, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7656757831573486, - "rewards/grad_term": 0.006808947771787643, - "rewards/margins": 2.0964293479919434, - "rewards/rejected": -3.862105369567871, - "step": 91 - }, - { - "epoch": 0.19710765934654526, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.412294871691426, - "learning_rate": 6.966864699791386e-07, - "logits/chosen": 0.7138292789459229, - "logits/rejected": 0.6344163417816162, - "logps/accuracies": 0.75, - "logps/chosen": -284.339111328125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -267.2784423828125, - "logps/ref_rejected": -271.889892578125, - "logps/rejected": -323.4517822265625, - "loss": 0.3446, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8530327081680298, - "rewards/grad_term": 0.008454471826553345, - "rewards/margins": 1.7250609397888184, - "rewards/rejected": -2.5780937671661377, - "step": 92 - }, - { - "epoch": 0.19925013390465987, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 7.851393127918492, - "learning_rate": 6.983521424999892e-07, - "logits/chosen": 1.1121702194213867, - "logits/rejected": 1.0464541912078857, - "logps/accuracies": 0.0, - "logps/chosen": -323.29986572265625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -300.96435546875, - "logps/ref_rejected": -263.5765686035156, - "logps/rejected": -319.5726318359375, - "loss": 0.3308, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.1167749166488647, - "rewards/grad_term": 0.01233928557485342, - "rewards/margins": 1.683027982711792, - "rewards/rejected": -2.7998030185699463, - "step": 93 - }, - { - "epoch": 0.20139260846277451, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 7.7736296892204395, - "learning_rate": 7e-07, - "logits/chosen": 0.8960837125778198, - "logits/rejected": 0.761021614074707, - "logps/accuracies": 0.25, - "logps/chosen": -377.28021240234375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -330.466064453125, - "logps/ref_rejected": -286.0745849609375, - "logps/rejected": -351.7135009765625, - "loss": 0.296, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.3407070636749268, - "rewards/grad_term": 0.015900662168860435, - "rewards/margins": 0.9412397742271423, - "rewards/rejected": -3.281947135925293, - "step": 94 - }, - { - "epoch": 0.20353508302088913, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.371159163521241, - "learning_rate": 7e-07, - "logits/chosen": 0.9651041030883789, - "logits/rejected": 0.7860502004623413, - "logps/accuracies": 0.25, - "logps/chosen": -425.8303527832031, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -374.81097412109375, - "logps/ref_rejected": -321.1005859375, - "logps/rejected": -400.951171875, - "loss": 0.3427, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.5509684085845947, - "rewards/grad_term": 0.0106906583532691, - "rewards/margins": 1.4415616989135742, - "rewards/rejected": -3.99252986907959, - "step": 95 - }, - { - "epoch": 0.20567755757900374, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 10.177619063233454, - "learning_rate": 6.991646778042959e-07, - "logits/chosen": 0.7661604881286621, - "logits/rejected": 0.6532018780708313, - "logps/accuracies": 0.5, - "logps/chosen": -257.8993835449219, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -240.183349609375, - "logps/ref_rejected": -198.9567413330078, - "logps/rejected": -253.74404907226562, - "loss": 0.3882, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8858024477958679, - "rewards/grad_term": 0.011164636351168156, - "rewards/margins": 1.8535633087158203, - "rewards/rejected": -2.739365816116333, - "step": 96 - }, - { - "epoch": 0.20782003213711836, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.250952702236187, - "learning_rate": 6.983293556085918e-07, - "logits/chosen": 0.8017215728759766, - "logits/rejected": 0.6558822393417358, - "logps/accuracies": 0.5, - "logps/chosen": -316.91131591796875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -272.19842529296875, - "logps/ref_rejected": -228.5543975830078, - "logps/rejected": -309.4710693359375, - "loss": 0.336, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.235644817352295, - "rewards/grad_term": 0.010754971764981747, - "rewards/margins": 1.810187816619873, - "rewards/rejected": -4.045832633972168, - "step": 97 - }, - { - "epoch": 0.209962506695233, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.58529126245538, - "learning_rate": 6.974940334128877e-07, - "logits/chosen": 0.8682336807250977, - "logits/rejected": 0.7378227710723877, - "logps/accuracies": 0.5, - "logps/chosen": -253.228515625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -229.12966918945312, - "logps/ref_rejected": -178.55392456054688, - "logps/rejected": -221.88421630859375, - "loss": 0.3899, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.2049428224563599, - "rewards/grad_term": 0.014865662902593613, - "rewards/margins": 0.9615722894668579, - "rewards/rejected": -2.1665151119232178, - "step": 98 - }, - { - "epoch": 0.21210498125334762, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.572183229995163, - "learning_rate": 6.966587112171838e-07, - "logits/chosen": 0.8709318041801453, - "logits/rejected": 0.7755447626113892, - "logps/accuracies": 0.5, - "logps/chosen": -408.872802734375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -363.9078369140625, - "logps/ref_rejected": -299.89794921875, - "logps/rejected": -368.15155029296875, - "loss": 0.3652, - "rewards/accuracies": 0.5, - "rewards/chosen": -2.2482473850250244, - "rewards/grad_term": 0.018564339727163315, - "rewards/margins": 1.1644330024719238, - "rewards/rejected": -3.4126803874969482, - "step": 99 - }, - { - "epoch": 0.21424745581146223, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.624545109237621, - "learning_rate": 6.958233890214797e-07, - "logits/chosen": 0.9372926354408264, - "logits/rejected": 0.8751659989356995, - "logps/accuracies": 0.5, - "logps/chosen": -474.0740966796875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -424.33819580078125, - "logps/ref_rejected": -390.84747314453125, - "logps/rejected": -506.8907165527344, - "loss": 0.3056, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.4867947101593018, - "rewards/grad_term": 0.004845261108130217, - "rewards/margins": 3.3153672218322754, - "rewards/rejected": -5.802162170410156, - "step": 100 - }, - { - "epoch": 0.21638993036957685, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.067295914483129, - "learning_rate": 6.949880668257756e-07, - "logits/chosen": 0.9544820785522461, - "logits/rejected": 0.9464821815490723, - "logps/accuracies": 0.5, - "logps/chosen": -494.697509765625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -454.8968505859375, - "logps/ref_rejected": -400.18536376953125, - "logps/rejected": -466.2589111328125, - "loss": 0.358, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.990033745765686, - "rewards/grad_term": 0.013565966859459877, - "rewards/margins": 1.3136428594589233, - "rewards/rejected": -3.3036766052246094, - "step": 101 - }, - { - "epoch": 0.2185324049276915, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.960375031224479, - "learning_rate": 6.941527446300716e-07, - "logits/chosen": 0.8813360333442688, - "logits/rejected": 0.695158839225769, - "logps/accuracies": 0.5, - "logps/chosen": -291.07232666015625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -251.7209930419922, - "logps/ref_rejected": -190.36097717285156, - "logps/rejected": -257.9454345703125, - "loss": 0.3402, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.9675672054290771, - "rewards/grad_term": 0.011471440084278584, - "rewards/margins": 1.411657452583313, - "rewards/rejected": -3.3792247772216797, - "step": 102 - }, - { - "epoch": 0.2206748794858061, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.26805795080682, - "learning_rate": 6.933174224343675e-07, - "logits/chosen": 0.8939443826675415, - "logits/rejected": 0.625817596912384, - "logps/accuracies": 0.75, - "logps/chosen": -145.8295440673828, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -133.74105834960938, - "logps/ref_rejected": -113.23294067382812, - "logps/rejected": -148.43218994140625, - "loss": 0.3759, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6044239401817322, - "rewards/grad_term": 0.014196785166859627, - "rewards/margins": 1.1555386781692505, - "rewards/rejected": -1.7599626779556274, - "step": 103 - }, - { - "epoch": 0.22281735404392072, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.93695871637079, - "learning_rate": 6.924821002386635e-07, - "logits/chosen": 1.0364937782287598, - "logits/rejected": 0.563789427280426, - "logps/accuracies": 0.5, - "logps/chosen": -199.2762451171875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -178.82568359375, - "logps/ref_rejected": -115.96835327148438, - "logps/rejected": -148.9056396484375, - "loss": 0.3053, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.0225275754928589, - "rewards/grad_term": 0.019687440246343613, - "rewards/margins": 0.6243367195129395, - "rewards/rejected": -1.6468642950057983, - "step": 104 - }, - { - "epoch": 0.22495982860203534, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 8.863344087781872, - "learning_rate": 6.916467780429593e-07, - "logits/chosen": 0.869300901889801, - "logits/rejected": 0.7735204100608826, - "logps/accuracies": 0.25, - "logps/chosen": -459.5062561035156, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -410.751708984375, - "logps/ref_rejected": -351.19830322265625, - "logps/rejected": -436.1498718261719, - "loss": 0.3494, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.4377286434173584, - "rewards/grad_term": 0.008553780615329742, - "rewards/margins": 1.8098492622375488, - "rewards/rejected": -4.247577667236328, - "step": 105 - }, - { - "epoch": 0.22710230316014998, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.525931151845846, - "learning_rate": 6.908114558472554e-07, - "logits/chosen": 0.9212764501571655, - "logits/rejected": 0.7540117502212524, - "logps/accuracies": 0.75, - "logps/chosen": -369.387939453125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -333.4745788574219, - "logps/ref_rejected": -299.4930419921875, - "logps/rejected": -394.82952880859375, - "loss": 0.3309, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7956693172454834, - "rewards/grad_term": 0.00758711900562048, - "rewards/margins": 2.9711527824401855, - "rewards/rejected": -4.76682186126709, - "step": 106 - }, - { - "epoch": 0.2292447777182646, - "flips/correct->correct": 1.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.335424081368634, - "learning_rate": 6.899761336515513e-07, - "logits/chosen": 0.7336133718490601, - "logits/rejected": 0.8765916228294373, - "logps/accuracies": 1.0, - "logps/chosen": -244.1851348876953, - "logps/ref_accuracies": 1.0, - "logps/ref_chosen": -227.0002899169922, - "logps/ref_rejected": -286.24835205078125, - "logps/rejected": -336.29425048828125, - "loss": 0.3244, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8592423796653748, - "rewards/grad_term": 0.009657299146056175, - "rewards/margins": 1.643053412437439, - "rewards/rejected": -2.502295970916748, - "step": 107 - }, - { - "epoch": 0.2313872522763792, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.973679697099532, - "learning_rate": 6.891408114558472e-07, - "logits/chosen": 0.9743479490280151, - "logits/rejected": 1.0019692182540894, - "logps/accuracies": 0.75, - "logps/chosen": -339.78680419921875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -297.22393798828125, - "logps/ref_rejected": -304.42578125, - "logps/rejected": -374.98779296875, - "loss": 0.2818, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.1281425952911377, - "rewards/grad_term": 0.01099520642310381, - "rewards/margins": 1.3999576568603516, - "rewards/rejected": -3.5281002521514893, - "step": 108 - }, - { - "epoch": 0.23352972683449383, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.989727540849666, - "learning_rate": 6.883054892601431e-07, - "logits/chosen": 1.1120461225509644, - "logits/rejected": 0.7604467272758484, - "logps/accuracies": 0.5, - "logps/chosen": -414.3526916503906, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -393.2984619140625, - "logps/ref_rejected": -310.2208251953125, - "logps/rejected": -371.7498779296875, - "loss": 0.3362, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.052709937095642, - "rewards/grad_term": 0.010077744722366333, - "rewards/margins": 2.023743152618408, - "rewards/rejected": -3.07645320892334, - "step": 109 - }, - { - "epoch": 0.23567220139260847, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 10.51051941007351, - "learning_rate": 6.874701670644392e-07, - "logits/chosen": 0.8338203430175781, - "logits/rejected": 0.677147388458252, - "logps/accuracies": 0.5, - "logps/chosen": -344.638671875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -336.9551086425781, - "logps/ref_rejected": -251.71192932128906, - "logps/rejected": -278.19818115234375, - "loss": 0.3953, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.3841773271560669, - "rewards/grad_term": 0.01569775864481926, - "rewards/margins": 0.9401355981826782, - "rewards/rejected": -1.3243129253387451, - "step": 110 - }, - { - "epoch": 0.2378146759507231, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.352097570099665, - "learning_rate": 6.866348448687351e-07, - "logits/chosen": 1.0819525718688965, - "logits/rejected": 0.9638932943344116, - "logps/accuracies": 0.5, - "logps/chosen": -340.05609130859375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -319.25897216796875, - "logps/ref_rejected": -289.39013671875, - "logps/rejected": -384.5931091308594, - "loss": 0.2425, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.0398552417755127, - "rewards/grad_term": 0.007856637239456177, - "rewards/margins": 3.7202935218811035, - "rewards/rejected": -4.760149002075195, - "step": 111 - }, - { - "epoch": 0.2399571505088377, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 8.759463222456692, - "learning_rate": 6.85799522673031e-07, - "logits/chosen": 0.6596381664276123, - "logits/rejected": 0.4419824481010437, - "logps/accuracies": 0.25, - "logps/chosen": -287.6408996582031, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -243.9873046875, - "logps/ref_rejected": -173.07931518554688, - "logps/rejected": -247.95965576171875, - "loss": 0.3367, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.1826789379119873, - "rewards/grad_term": 0.011957314796745777, - "rewards/margins": 1.5613383054733276, - "rewards/rejected": -3.7440171241760254, - "step": 112 - }, - { - "epoch": 0.24209962506695232, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 8.026439048323674, - "learning_rate": 6.849642004773269e-07, - "logits/chosen": 0.9305301904678345, - "logits/rejected": 0.8421428203582764, - "logps/accuracies": 0.25, - "logps/chosen": -441.80340576171875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -359.161865234375, - "logps/ref_rejected": -317.730712890625, - "logps/rejected": -430.5499267578125, - "loss": 0.2678, - "rewards/accuracies": 0.5, - "rewards/chosen": -4.132076263427734, - "rewards/grad_term": 0.01556328870356083, - "rewards/margins": 1.508885145187378, - "rewards/rejected": -5.640961647033691, - "step": 113 - }, - { - "epoch": 0.24424209962506696, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 8.200400573622296, - "learning_rate": 6.841288782816229e-07, - "logits/chosen": 0.957403838634491, - "logits/rejected": 0.7602821588516235, - "logps/accuracies": 0.25, - "logps/chosen": -317.9858703613281, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -303.8868713378906, - "logps/ref_rejected": -208.02256774902344, - "logps/rejected": -256.6693115234375, - "loss": 0.283, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.7049498558044434, - "rewards/grad_term": 0.009732572361826897, - "rewards/margins": 1.7273874282836914, - "rewards/rejected": -2.4323372840881348, - "step": 114 - }, - { - "epoch": 0.24638457418318158, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 11.032492821291632, - "learning_rate": 6.832935560859188e-07, - "logits/chosen": 0.8847552537918091, - "logits/rejected": 0.8709891438484192, - "logps/accuracies": 0.75, - "logps/chosen": -275.5125427246094, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -239.18618774414062, - "logps/ref_rejected": -209.97958374023438, - "logps/rejected": -299.82904052734375, - "loss": 0.335, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.816316843032837, - "rewards/grad_term": 0.007570344489067793, - "rewards/margins": 2.6761562824249268, - "rewards/rejected": -4.492473125457764, - "step": 115 - }, - { - "epoch": 0.2485270487412962, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.054789967348844, - "learning_rate": 6.824582338902147e-07, - "logits/chosen": 0.9024197459220886, - "logits/rejected": 0.7947896718978882, - "logps/accuracies": 1.0, - "logps/chosen": -347.18023681640625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -307.41802978515625, - "logps/ref_rejected": -293.6680908203125, - "logps/rejected": -407.90380859375, - "loss": 0.3347, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.9881106615066528, - "rewards/grad_term": 0.0060688708908855915, - "rewards/margins": 3.7236742973327637, - "rewards/rejected": -5.711784839630127, - "step": 116 - }, - { - "epoch": 0.25066952329941083, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 8.668010213567692, - "learning_rate": 6.816229116945108e-07, - "logits/chosen": 0.9976560473442078, - "logits/rejected": 0.6948249340057373, - "logps/accuracies": 0.0, - "logps/chosen": -401.39923095703125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -337.87274169921875, - "logps/ref_rejected": -240.39515686035156, - "logps/rejected": -330.98101806640625, - "loss": 0.3004, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.17632794380188, - "rewards/grad_term": 0.013613393530249596, - "rewards/margins": 1.3529647588729858, - "rewards/rejected": -4.529292583465576, - "step": 117 - }, - { - "epoch": 0.25281199785752545, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.847556657578917, - "learning_rate": 6.807875894988067e-07, - "logits/chosen": 0.9075788259506226, - "logits/rejected": 0.7977707982063293, - "logps/accuracies": 0.75, - "logps/chosen": -375.44140625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -321.0868225097656, - "logps/ref_rejected": -267.4314270019531, - "logps/rejected": -378.83599853515625, - "loss": 0.3172, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.7177300453186035, - "rewards/grad_term": 0.004429055377840996, - "rewards/margins": 2.8524982929229736, - "rewards/rejected": -5.570228099822998, - "step": 118 - }, - { - "epoch": 0.25495447241564007, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 8.68803296697522, - "learning_rate": 6.799522673031026e-07, - "logits/chosen": 0.895818293094635, - "logits/rejected": 0.575642466545105, - "logps/accuracies": 0.0, - "logps/chosen": -318.50885009765625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -287.44415283203125, - "logps/ref_rejected": -177.8115234375, - "logps/rejected": -243.67047119140625, - "loss": 0.3158, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5532350540161133, - "rewards/grad_term": 0.010060964152216911, - "rewards/margins": 1.739712119102478, - "rewards/rejected": -3.2929470539093018, - "step": 119 - }, - { - "epoch": 0.2570969469737547, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 11.719803160063796, - "learning_rate": 6.791169451073985e-07, - "logits/chosen": 0.7772294282913208, - "logits/rejected": 0.8315998315811157, - "logps/accuracies": 1.0, - "logps/chosen": -432.6302185058594, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -355.44903564453125, - "logps/ref_rejected": -386.0659484863281, - "logps/rejected": -523.24609375, - "loss": 0.2837, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.859060287475586, - "rewards/grad_term": 0.003882521763443947, - "rewards/margins": 2.9999477863311768, - "rewards/rejected": -6.859008312225342, - "step": 120 - }, - { - "epoch": 0.2592394215318693, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 7.238978607387634, - "learning_rate": 6.782816229116945e-07, - "logits/chosen": 0.8819637894630432, - "logits/rejected": 0.7054441571235657, - "logps/accuracies": 0.25, - "logps/chosen": -393.9786071777344, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -364.44207763671875, - "logps/ref_rejected": -345.252685546875, - "logps/rejected": -389.1627502441406, - "loss": 0.2768, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.4768271446228027, - "rewards/grad_term": 0.01706988736987114, - "rewards/margins": 0.7186762094497681, - "rewards/rejected": -2.1955032348632812, - "step": 121 - }, - { - "epoch": 0.2613818960899839, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.838179827940348, - "learning_rate": 6.774463007159905e-07, - "logits/chosen": 0.7459653615951538, - "logits/rejected": 0.7466898560523987, - "logps/accuracies": 0.5, - "logps/chosen": -289.6307067871094, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -244.8365020751953, - "logps/ref_rejected": -223.6743927001953, - "logps/rejected": -318.4463195800781, - "loss": 0.273, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.2397100925445557, - "rewards/grad_term": 0.010229886509478092, - "rewards/margins": 2.4988858699798584, - "rewards/rejected": -4.738595962524414, - "step": 122 - }, - { - "epoch": 0.2635243706480985, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.71899475513759, - "learning_rate": 6.766109785202863e-07, - "logits/chosen": 0.9420760869979858, - "logits/rejected": 0.8325188755989075, - "logps/accuracies": 0.75, - "logps/chosen": -358.3894348144531, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -302.154052734375, - "logps/ref_rejected": -271.524658203125, - "logps/rejected": -382.03302001953125, - "loss": 0.2531, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.811767339706421, - "rewards/grad_term": 0.010155830532312393, - "rewards/margins": 2.7136499881744385, - "rewards/rejected": -5.525417327880859, - "step": 123 - }, - { - "epoch": 0.2656668452062132, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.827729622567765, - "learning_rate": 6.757756563245823e-07, - "logits/chosen": 0.9883730411529541, - "logits/rejected": 0.8319672346115112, - "logps/accuracies": 0.5, - "logps/chosen": -461.412109375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -367.73486328125, - "logps/ref_rejected": -298.51544189453125, - "logps/rejected": -510.0968017578125, - "loss": 0.2583, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.683863162994385, - "rewards/grad_term": 0.0023802227806299925, - "rewards/margins": 5.895203590393066, - "rewards/rejected": -10.57906723022461, - "step": 124 - }, - { - "epoch": 0.2678093197643278, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 5.875715435058411, - "learning_rate": 6.749403341288783e-07, - "logits/chosen": 0.9549310207366943, - "logits/rejected": 0.8220203518867493, - "logps/accuracies": 0.25, - "logps/chosen": -292.9206237792969, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -250.8220672607422, - "logps/ref_rejected": -188.01498413085938, - "logps/rejected": -286.50128173828125, - "loss": 0.2039, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.1049275398254395, - "rewards/grad_term": 0.006195048335939646, - "rewards/margins": 2.8193886280059814, - "rewards/rejected": -4.92431640625, - "step": 125 - }, - { - "epoch": 0.26995179432244243, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.463586322032903, - "learning_rate": 6.741050119331742e-07, - "logits/chosen": 0.9875746965408325, - "logits/rejected": 0.7808788418769836, - "logps/accuracies": 0.5, - "logps/chosen": -251.4386444091797, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -224.3441619873047, - "logps/ref_rejected": -220.13214111328125, - "logps/rejected": -279.40972900390625, - "loss": 0.3086, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.3547240495681763, - "rewards/grad_term": 0.009899970144033432, - "rewards/margins": 1.6091564893722534, - "rewards/rejected": -2.9638805389404297, - "step": 126 - }, - { - "epoch": 0.27209426888055704, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.995166618243903, - "learning_rate": 6.732696897374701e-07, - "logits/chosen": 1.0715935230255127, - "logits/rejected": 0.9156344532966614, - "logps/accuracies": 0.5, - "logps/chosen": -495.48419189453125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -411.56317138671875, - "logps/ref_rejected": -354.6976623535156, - "logps/rejected": -507.9322509765625, - "loss": 0.268, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.196050643920898, - "rewards/grad_term": 0.0026723581831902266, - "rewards/margins": 3.465679168701172, - "rewards/rejected": -7.66172981262207, - "step": 127 - }, - { - "epoch": 0.27423674343867166, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.959838246440706, - "learning_rate": 6.72434367541766e-07, - "logits/chosen": 0.9897805452346802, - "logits/rejected": 0.8163310289382935, - "logps/accuracies": 1.0, - "logps/chosen": -487.4618225097656, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -402.695556640625, - "logps/ref_rejected": -365.8092346191406, - "logps/rejected": -549.47314453125, - "loss": 0.303, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.238313674926758, - "rewards/grad_term": 0.0018401599954813719, - "rewards/margins": 4.944880485534668, - "rewards/rejected": -9.183194160461426, - "step": 128 - }, - { - "epoch": 0.27423674343867166, - "eval_flips/correct->correct": 0.1599999964237213, - "eval_flips/correct->incorrect": 0.0, - "eval_flips/incorrect->correct": 0.20000000298023224, - "eval_flips/incorrect->incorrect": 0.6399999856948853, - "eval_logits/chosen": 0.871035099029541, - "eval_logits/rejected": 0.7403469681739807, - "eval_logps/accuracies": 0.36000001430511475, - "eval_logps/chosen": -374.1800537109375, - "eval_logps/ref_accuracies": 0.1599999964237213, - "eval_logps/ref_chosen": -323.51568603515625, - "eval_logps/ref_rejected": -258.70098876953125, - "eval_logps/rejected": -354.16534423828125, - "eval_loss": 0.290331095457077, - "eval_rewards/accuracies": 0.7799999713897705, - "eval_rewards/chosen": -2.533219575881958, - "eval_rewards/grad_term": 0.011603965424001217, - "eval_rewards/margins": 2.2400009632110596, - "eval_rewards/rejected": -4.773220062255859, - "eval_runtime": 374.0534, - "eval_samples_per_second": 4.224, - "eval_steps_per_second": 0.134, - "step": 128 - }, - { - "epoch": 0.2763792179967863, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.320596690520534, - "learning_rate": 6.715990453460621e-07, - "logits/chosen": 0.959011435508728, - "logits/rejected": 0.8785006999969482, - "logps/accuracies": 1.0, - "logps/chosen": -253.6005859375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -211.31517028808594, - "logps/ref_rejected": -201.87469482421875, - "logps/rejected": -295.11090087890625, - "loss": 0.2446, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.114271402359009, - "rewards/grad_term": 0.01070532575249672, - "rewards/margins": 2.5475387573242188, - "rewards/rejected": -4.661810874938965, - "step": 129 - }, - { - "epoch": 0.2785216925549009, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 7.49663463971219, - "learning_rate": 6.707637231503579e-07, - "logits/chosen": 0.9510787129402161, - "logits/rejected": 0.6340673565864563, - "logps/accuracies": 0.25, - "logps/chosen": -353.7513427734375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -313.7836608886719, - "logps/ref_rejected": -263.53387451171875, - "logps/rejected": -344.3742980957031, - "loss": 0.2306, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.9983829259872437, - "rewards/grad_term": 0.007957426831126213, - "rewards/margins": 2.043639659881592, - "rewards/rejected": -4.042022228240967, - "step": 130 - }, - { - "epoch": 0.2806641671130155, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 10.934812547934818, - "learning_rate": 6.699284009546539e-07, - "logits/chosen": 0.7116758823394775, - "logits/rejected": 0.6254409551620483, - "logps/accuracies": 0.0, - "logps/chosen": -361.2596740722656, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -345.98199462890625, - "logps/ref_rejected": -231.908447265625, - "logps/rejected": -297.0892028808594, - "loss": 0.2395, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.763884425163269, - "rewards/grad_term": 0.008144252933561802, - "rewards/margins": 2.4951541423797607, - "rewards/rejected": -3.2590384483337402, - "step": 131 - }, - { - "epoch": 0.2828066416711302, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.274621875830415, - "learning_rate": 6.690930787589498e-07, - "logits/chosen": 0.9305391907691956, - "logits/rejected": 0.9035634994506836, - "logps/accuracies": 1.0, - "logps/chosen": -454.7461242675781, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -338.783447265625, - "logps/ref_rejected": -331.75201416015625, - "logps/rejected": -508.1329650878906, - "loss": 0.2374, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.79813289642334, - "rewards/grad_term": 0.0040429579094052315, - "rewards/margins": 3.020915985107422, - "rewards/rejected": -8.819048881530762, - "step": 132 - }, - { - "epoch": 0.2849491162292448, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.310190672888075, - "learning_rate": 6.682577565632458e-07, - "logits/chosen": 0.9652221202850342, - "logits/rejected": 0.8905255198478699, - "logps/accuracies": 0.75, - "logps/chosen": -474.09820556640625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -404.9132080078125, - "logps/ref_rejected": -397.7674560546875, - "logps/rejected": -542.0930786132812, - "loss": 0.2522, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.459249496459961, - "rewards/grad_term": 0.007244814652949572, - "rewards/margins": 3.7570319175720215, - "rewards/rejected": -7.216281414031982, - "step": 133 - }, - { - "epoch": 0.2870915907873594, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.377477290301309, - "learning_rate": 6.674224343675417e-07, - "logits/chosen": 0.9276644587516785, - "logits/rejected": 0.7834001779556274, - "logps/accuracies": 0.75, - "logps/chosen": -482.92828369140625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -378.0372009277344, - "logps/ref_rejected": -330.1478271484375, - "logps/rejected": -559.7905883789062, - "loss": 0.2727, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.244553089141846, - "rewards/grad_term": 0.001372232916764915, - "rewards/margins": 6.237585544586182, - "rewards/rejected": -11.482138633728027, - "step": 134 - }, - { - "epoch": 0.289234065345474, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.304875956111065, - "learning_rate": 6.665871121718377e-07, - "logits/chosen": 0.8576828837394714, - "logits/rejected": 0.6341058611869812, - "logps/accuracies": 0.5, - "logps/chosen": -337.80535888671875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -294.550537109375, - "logps/ref_rejected": -220.64215087890625, - "logps/rejected": -329.065673828125, - "loss": 0.2647, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.162740707397461, - "rewards/grad_term": 0.009863924235105515, - "rewards/margins": 3.2584362030029297, - "rewards/rejected": -5.421176910400391, - "step": 135 - }, - { - "epoch": 0.29137653990358864, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.625140040989452, - "learning_rate": 6.657517899761337e-07, - "logits/chosen": 0.8870478272438049, - "logits/rejected": 0.7576145529747009, - "logps/accuracies": 0.5, - "logps/chosen": -305.3637390136719, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -262.2385559082031, - "logps/ref_rejected": -227.50672912597656, - "logps/rejected": -304.53564453125, - "loss": 0.2461, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.1562600135803223, - "rewards/grad_term": 0.010284369811415672, - "rewards/margins": 1.6951854228973389, - "rewards/rejected": -3.8514456748962402, - "step": 136 - }, - { - "epoch": 0.29351901446170325, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 7.018484141515877, - "learning_rate": 6.649164677804296e-07, - "logits/chosen": 0.9112040400505066, - "logits/rejected": 0.7212855815887451, - "logps/accuracies": 0.25, - "logps/chosen": -343.9112243652344, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -280.6199645996094, - "logps/ref_rejected": -210.87319946289062, - "logps/rejected": -326.63885498046875, - "loss": 0.2715, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.164562225341797, - "rewards/grad_term": 0.009311579167842865, - "rewards/margins": 2.62372088432312, - "rewards/rejected": -5.788283348083496, - "step": 137 - }, - { - "epoch": 0.29566148901981787, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.034703724953653, - "learning_rate": 6.640811455847255e-07, - "logits/chosen": 0.7124534845352173, - "logits/rejected": 0.6710624694824219, - "logps/accuracies": 0.5, - "logps/chosen": -434.5594482421875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -374.4966125488281, - "logps/ref_rejected": -310.2477722167969, - "logps/rejected": -434.73895263671875, - "loss": 0.2497, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.0031418800354004, - "rewards/grad_term": 0.00873873382806778, - "rewards/margins": 3.221416473388672, - "rewards/rejected": -6.2245588302612305, - "step": 138 - }, - { - "epoch": 0.2978039635779325, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.812429176308449, - "learning_rate": 6.632458233890214e-07, - "logits/chosen": 0.9117505550384521, - "logits/rejected": 0.8502323031425476, - "logps/accuracies": 0.75, - "logps/chosen": -334.134765625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -272.14581298828125, - "logps/ref_rejected": -239.22955322265625, - "logps/rejected": -368.5514221191406, - "loss": 0.2585, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.0994479656219482, - "rewards/grad_term": 0.0053825220093131065, - "rewards/margins": 3.366644859313965, - "rewards/rejected": -6.466092586517334, - "step": 139 - }, - { - "epoch": 0.29994643813604716, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.660588605349608, - "learning_rate": 6.624105011933175e-07, - "logits/chosen": 0.6605690717697144, - "logits/rejected": 0.8247154951095581, - "logps/accuracies": 0.5, - "logps/chosen": -329.8360900878906, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -290.978271484375, - "logps/ref_rejected": -246.50843811035156, - "logps/rejected": -299.4759521484375, - "loss": 0.2595, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.94289231300354, - "rewards/grad_term": 0.01796804741024971, - "rewards/margins": 0.705483078956604, - "rewards/rejected": -2.6483755111694336, - "step": 140 - }, - { - "epoch": 0.30208891269416177, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 10.598864233990223, - "learning_rate": 6.615751789976133e-07, - "logits/chosen": 0.6485729217529297, - "logits/rejected": 0.6901566982269287, - "logps/accuracies": 1.0, - "logps/chosen": -373.37396240234375, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -334.75689697265625, - "logps/ref_rejected": -381.53717041015625, - "logps/rejected": -493.22930908203125, - "loss": 0.2639, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.930854082107544, - "rewards/grad_term": 0.0024079105351120234, - "rewards/margins": 3.6537532806396484, - "rewards/rejected": -5.584607124328613, - "step": 141 - }, - { - "epoch": 0.3042313872522764, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.28171427184855, - "learning_rate": 6.607398568019093e-07, - "logits/chosen": 0.9653871655464172, - "logits/rejected": 0.6570479273796082, - "logps/accuracies": 0.5, - "logps/chosen": -340.83685302734375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -281.6165771484375, - "logps/ref_rejected": -203.56463623046875, - "logps/rejected": -293.328857421875, - "loss": 0.2356, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.9610140323638916, - "rewards/grad_term": 0.010910983197391033, - "rewards/margins": 1.5271971225738525, - "rewards/rejected": -4.488211154937744, - "step": 142 - }, - { - "epoch": 0.306373861810391, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.851086611102091, - "learning_rate": 6.599045346062052e-07, - "logits/chosen": 0.9678068161010742, - "logits/rejected": 0.8588843941688538, - "logps/accuracies": 0.75, - "logps/chosen": -388.9454345703125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -324.3150634765625, - "logps/ref_rejected": -277.83184814453125, - "logps/rejected": -417.49212646484375, - "loss": 0.1983, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.2315189838409424, - "rewards/grad_term": 0.004653441719710827, - "rewards/margins": 3.7514941692352295, - "rewards/rejected": -6.983013153076172, - "step": 143 - }, - { - "epoch": 0.3085163363685056, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.817695728935396, - "learning_rate": 6.590692124105012e-07, - "logits/chosen": 0.9185338616371155, - "logits/rejected": 0.8464267253875732, - "logps/accuracies": 0.75, - "logps/chosen": -355.24176025390625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -319.5701599121094, - "logps/ref_rejected": -291.1425476074219, - "logps/rejected": -393.3614196777344, - "loss": 0.2415, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7835807800292969, - "rewards/grad_term": 0.002678102580830455, - "rewards/margins": 3.327362060546875, - "rewards/rejected": -5.110942840576172, - "step": 144 - }, - { - "epoch": 0.31065881092662023, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 11.370643366730324, - "learning_rate": 6.582338902147971e-07, - "logits/chosen": 0.5740557909011841, - "logits/rejected": 0.39975208044052124, - "logps/accuracies": 0.25, - "logps/chosen": -487.64642333984375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -415.9956359863281, - "logps/ref_rejected": -316.01708984375, - "logps/rejected": -444.40020751953125, - "loss": 0.3096, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.5825393199920654, - "rewards/grad_term": 0.013735326007008553, - "rewards/margins": 2.836615562438965, - "rewards/rejected": -6.419155120849609, - "step": 145 - }, - { - "epoch": 0.31280128548473485, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.25, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.223483927318776, - "learning_rate": 6.57398568019093e-07, - "logits/chosen": 0.7999095320701599, - "logits/rejected": 0.5960132479667664, - "logps/accuracies": 0.75, - "logps/chosen": -299.7486572265625, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -218.3001251220703, - "logps/ref_rejected": -201.54733276367188, - "logps/rejected": -300.24505615234375, - "loss": 0.2041, - "rewards/accuracies": 0.75, - "rewards/chosen": -4.0724263191223145, - "rewards/grad_term": 0.02213701605796814, - "rewards/margins": 0.8624599575996399, - "rewards/rejected": -4.9348859786987305, - "step": 146 - }, - { - "epoch": 0.3149437600428495, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.579748942980807, - "learning_rate": 6.56563245823389e-07, - "logits/chosen": 0.926275372505188, - "logits/rejected": 0.8347383737564087, - "logps/accuracies": 0.75, - "logps/chosen": -363.90753173828125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -273.9169616699219, - "logps/ref_rejected": -306.166015625, - "logps/rejected": -483.9077453613281, - "loss": 0.257, - "rewards/accuracies": 0.75, - "rewards/chosen": -4.4995293617248535, - "rewards/grad_term": 0.007000477984547615, - "rewards/margins": 4.387556076049805, - "rewards/rejected": -8.887085914611816, - "step": 147 - }, - { - "epoch": 0.31708623460096413, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.401901301955716, - "learning_rate": 6.557279236276849e-07, - "logits/chosen": 0.8783835172653198, - "logits/rejected": 0.748361349105835, - "logps/accuracies": 0.75, - "logps/chosen": -439.5329284667969, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -380.14080810546875, - "logps/ref_rejected": -311.24652099609375, - "logps/rejected": -461.9928283691406, - "loss": 0.2612, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.969606399536133, - "rewards/grad_term": 0.007177918218076229, - "rewards/margins": 4.567710876464844, - "rewards/rejected": -7.53731632232666, - "step": 148 - }, - { - "epoch": 0.31922870915907875, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 10.72349764158799, - "learning_rate": 6.548926014319809e-07, - "logits/chosen": 0.889015793800354, - "logits/rejected": 0.6200151443481445, - "logps/accuracies": 0.25, - "logps/chosen": -367.63616943359375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -328.658203125, - "logps/ref_rejected": -265.4963073730469, - "logps/rejected": -371.23028564453125, - "loss": 0.2875, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.9488979578018188, - "rewards/grad_term": 0.003809453221037984, - "rewards/margins": 3.337800979614258, - "rewards/rejected": -5.286699295043945, - "step": 149 - }, - { - "epoch": 0.32137118371719336, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.234046149517951, - "learning_rate": 6.540572792362768e-07, - "logits/chosen": 0.7394288182258606, - "logits/rejected": 0.7122300863265991, - "logps/accuracies": 0.75, - "logps/chosen": -405.5599365234375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -309.9801940917969, - "logps/ref_rejected": -281.0711975097656, - "logps/rejected": -448.6300048828125, - "loss": 0.2894, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.778985977172852, - "rewards/grad_term": 0.004231796134263277, - "rewards/margins": 3.5989530086517334, - "rewards/rejected": -8.377939224243164, - "step": 150 - }, - { - "epoch": 0.323513658275308, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 12.025800847575807, - "learning_rate": 6.532219570405727e-07, - "logits/chosen": 0.5386347770690918, - "logits/rejected": 0.35092538595199585, - "logps/accuracies": 0.5, - "logps/chosen": -216.28070068359375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -184.84176635742188, - "logps/ref_rejected": -179.14764404296875, - "logps/rejected": -240.31222534179688, - "loss": 0.2617, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5719472169876099, - "rewards/grad_term": 0.012875164858996868, - "rewards/margins": 1.4862821102142334, - "rewards/rejected": -3.058229446411133, - "step": 151 - }, - { - "epoch": 0.3256561328334226, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.899213029036904, - "learning_rate": 6.523866348448687e-07, - "logits/chosen": 0.9789618253707886, - "logits/rejected": 0.363411545753479, - "logps/accuracies": 0.5, - "logps/chosen": -279.868408203125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -243.69374084472656, - "logps/ref_rejected": -157.2808380126953, - "logps/rejected": -250.97132873535156, - "loss": 0.241, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.8087329864501953, - "rewards/grad_term": 0.009951984509825706, - "rewards/margins": 2.875791072845459, - "rewards/rejected": -4.6845245361328125, - "step": 152 - }, - { - "epoch": 0.3277986073915372, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.25, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.604763643820585, - "learning_rate": 6.515513126491647e-07, - "logits/chosen": 0.8571368455886841, - "logits/rejected": 0.25795796513557434, - "logps/accuracies": 0.25, - "logps/chosen": -449.5162658691406, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -410.160400390625, - "logps/ref_rejected": -145.2788543701172, - "logps/rejected": -226.03286743164062, - "loss": 0.2287, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.9677932262420654, - "rewards/grad_term": 0.015731465071439743, - "rewards/margins": 2.0699081420898438, - "rewards/rejected": -4.03770112991333, - "step": 153 - }, - { - "epoch": 0.3299410819496518, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 8.719226366200353, - "learning_rate": 6.507159904534606e-07, - "logits/chosen": 0.6871969103813171, - "logits/rejected": 0.4975748658180237, - "logps/accuracies": 0.25, - "logps/chosen": -325.2784423828125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -273.6944580078125, - "logps/ref_rejected": -270.9206237792969, - "logps/rejected": -343.53662109375, - "loss": 0.2694, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.5791993141174316, - "rewards/grad_term": 0.015888353809714317, - "rewards/margins": 1.0516000986099243, - "rewards/rejected": -3.6307995319366455, - "step": 154 - }, - { - "epoch": 0.3320835565077665, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 6.321361613018943, - "learning_rate": 6.498806682577566e-07, - "logits/chosen": 0.703808069229126, - "logits/rejected": 0.6969115734100342, - "logps/accuracies": 1.0, - "logps/chosen": -476.7120361328125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -397.993896484375, - "logps/ref_rejected": -352.0650329589844, - "logps/rejected": -520.11376953125, - "loss": 0.1933, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.935906171798706, - "rewards/grad_term": 0.005700279027223587, - "rewards/margins": 4.466530799865723, - "rewards/rejected": -8.402437210083008, - "step": 155 - }, - { - "epoch": 0.3342260310658811, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 8.480714775934256, - "learning_rate": 6.490453460620525e-07, - "logits/chosen": 1.0573246479034424, - "logits/rejected": 0.7870907187461853, - "logps/accuracies": 0.25, - "logps/chosen": -498.836669921875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -427.01885986328125, - "logps/ref_rejected": -330.6030578613281, - "logps/rejected": -500.70513916015625, - "loss": 0.2106, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.590888023376465, - "rewards/grad_term": 0.0047126878052949905, - "rewards/margins": 4.914216995239258, - "rewards/rejected": -8.505105018615723, - "step": 156 - }, - { - "epoch": 0.33636850562399573, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.421964128197317, - "learning_rate": 6.482100238663484e-07, - "logits/chosen": 0.8880590796470642, - "logits/rejected": 0.7068474888801575, - "logps/accuracies": 0.75, - "logps/chosen": -279.9468994140625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -263.3426513671875, - "logps/ref_rejected": -203.75613403320312, - "logps/rejected": -287.427978515625, - "loss": 0.2597, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8302121162414551, - "rewards/grad_term": 0.008612211793661118, - "rewards/margins": 3.353381395339966, - "rewards/rejected": -4.18359375, - "step": 157 - }, - { - "epoch": 0.33851098018211034, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.8752213374271, - "learning_rate": 6.473747016706444e-07, - "logits/chosen": 0.7713953852653503, - "logits/rejected": 0.7244459986686707, - "logps/accuracies": 1.0, - "logps/chosen": -343.670654296875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -289.24114990234375, - "logps/ref_rejected": -304.27593994140625, - "logps/rejected": -411.5351257324219, - "loss": 0.2123, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.7214746475219727, - "rewards/grad_term": 0.007066743914037943, - "rewards/margins": 2.641486406326294, - "rewards/rejected": -5.362961292266846, - "step": 158 - }, - { - "epoch": 0.34065345474022496, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.961209126432363, - "learning_rate": 6.465393794749403e-07, - "logits/chosen": 0.4089430570602417, - "logits/rejected": 0.48331207036972046, - "logps/accuracies": 1.0, - "logps/chosen": -227.35739135742188, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -194.3919219970703, - "logps/ref_rejected": -197.94259643554688, - "logps/rejected": -291.18634033203125, - "loss": 0.2271, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.648273229598999, - "rewards/grad_term": 0.007467413786798716, - "rewards/margins": 3.0139148235321045, - "rewards/rejected": -4.6621880531311035, - "step": 159 - }, - { - "epoch": 0.3427959292983396, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.623215056313119, - "learning_rate": 6.457040572792363e-07, - "logits/chosen": 0.7995873093605042, - "logits/rejected": 0.8776368498802185, - "logps/accuracies": 0.75, - "logps/chosen": -625.0630493164062, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -586.06396484375, - "logps/ref_rejected": -436.4676818847656, - "logps/rejected": -507.0144348144531, - "loss": 0.1998, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.9499545097351074, - "rewards/grad_term": 0.008956504985690117, - "rewards/margins": 1.5773828029632568, - "rewards/rejected": -3.527337074279785, - "step": 160 - }, - { - "epoch": 0.3449384038564542, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.887366061229715, - "learning_rate": 6.448687350835322e-07, - "logits/chosen": 0.7675126791000366, - "logits/rejected": 0.530870258808136, - "logps/accuracies": 0.5, - "logps/chosen": -353.36944580078125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -317.55523681640625, - "logps/ref_rejected": -319.5533447265625, - "logps/rejected": -376.0810546875, - "loss": 0.2781, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.790710687637329, - "rewards/grad_term": 0.01590893603861332, - "rewards/margins": 1.035674810409546, - "rewards/rejected": -2.826385498046875, - "step": 161 - }, - { - "epoch": 0.3470808784145688, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.945357899951022, - "learning_rate": 6.440334128878281e-07, - "logits/chosen": 0.586737871170044, - "logits/rejected": 0.4655018150806427, - "logps/accuracies": 0.5, - "logps/chosen": -474.0818786621094, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -397.549560546875, - "logps/ref_rejected": -303.9313049316406, - "logps/rejected": -453.242431640625, - "loss": 0.178, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.826618194580078, - "rewards/grad_term": 0.005110522732138634, - "rewards/margins": 3.6389386653900146, - "rewards/rejected": -7.465556621551514, - "step": 162 - }, - { - "epoch": 0.3492233529726835, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.421904565149697, - "learning_rate": 6.431980906921241e-07, - "logits/chosen": 0.8405193090438843, - "logits/rejected": 0.9586330652236938, - "logps/accuracies": 0.75, - "logps/chosen": -362.607666015625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -323.2601318359375, - "logps/ref_rejected": -350.115966796875, - "logps/rejected": -414.9684143066406, - "loss": 0.1903, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.9673755168914795, - "rewards/grad_term": 0.014036407694220543, - "rewards/margins": 1.2752478122711182, - "rewards/rejected": -3.2426233291625977, - "step": 163 - }, - { - "epoch": 0.3513658275307981, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.31758932737487, - "learning_rate": 6.4236276849642e-07, - "logits/chosen": 0.9069796800613403, - "logits/rejected": 0.8120055198669434, - "logps/accuracies": 0.5, - "logps/chosen": -529.5686645507812, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -436.9043273925781, - "logps/ref_rejected": -385.4817199707031, - "logps/rejected": -571.4048461914062, - "loss": 0.2857, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.633216381072998, - "rewards/grad_term": 0.001609130296856165, - "rewards/margins": 4.662940502166748, - "rewards/rejected": -9.296156883239746, - "step": 164 - }, - { - "epoch": 0.3535083020889127, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 6.7907749776023, - "learning_rate": 6.41527446300716e-07, - "logits/chosen": 0.7551314830780029, - "logits/rejected": 0.6372643709182739, - "logps/accuracies": 0.25, - "logps/chosen": -594.386962890625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -496.920166015625, - "logps/ref_rejected": -391.0464172363281, - "logps/rejected": -584.2574462890625, - "loss": 0.2221, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.873340129852295, - "rewards/grad_term": 0.0031820686999708414, - "rewards/margins": 4.787212371826172, - "rewards/rejected": -9.660552978515625, - "step": 165 - }, - { - "epoch": 0.3556507766470273, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.42834345389666, - "learning_rate": 6.406921241050118e-07, - "logits/chosen": 0.8926582336425781, - "logits/rejected": 0.6021265983581543, - "logps/accuracies": 0.5, - "logps/chosen": -535.192138671875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -454.7031555175781, - "logps/ref_rejected": -355.3533935546875, - "logps/rejected": -487.2066650390625, - "loss": 0.2189, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.024447441101074, - "rewards/grad_term": 0.007553639821708202, - "rewards/margins": 2.56821608543396, - "rewards/rejected": -6.592663764953613, - "step": 166 - }, - { - "epoch": 0.35779325120514194, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 12.44062714406834, - "learning_rate": 6.398568019093079e-07, - "logits/chosen": 0.9791277647018433, - "logits/rejected": 0.7142946720123291, - "logps/accuracies": 0.5, - "logps/chosen": -324.64971923828125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -303.8039245605469, - "logps/ref_rejected": -202.87620544433594, - "logps/rejected": -278.5081481933594, - "loss": 0.2451, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.0422909259796143, - "rewards/grad_term": 0.009761723689734936, - "rewards/margins": 2.7393064498901367, - "rewards/rejected": -3.781597137451172, - "step": 167 - }, - { - "epoch": 0.35993572576325655, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.985736298029532, - "learning_rate": 6.390214797136038e-07, - "logits/chosen": 0.6620911955833435, - "logits/rejected": 0.6493997573852539, - "logps/accuracies": 0.75, - "logps/chosen": -189.7664794921875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -157.066650390625, - "logps/ref_rejected": -141.00611877441406, - "logps/rejected": -213.98069763183594, - "loss": 0.2257, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.6349915266036987, - "rewards/grad_term": 0.010831539519131184, - "rewards/margins": 2.013737201690674, - "rewards/rejected": -3.648728370666504, - "step": 168 - }, - { - "epoch": 0.36207820032137117, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.312051247371532, - "learning_rate": 6.381861575178997e-07, - "logits/chosen": 0.9859127998352051, - "logits/rejected": 0.7602252960205078, - "logps/accuracies": 1.0, - "logps/chosen": -367.61090087890625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -327.2779541015625, - "logps/ref_rejected": -278.348876953125, - "logps/rejected": -416.47991943359375, - "loss": 0.2197, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.016645908355713, - "rewards/grad_term": 0.005300410091876984, - "rewards/margins": 4.88990592956543, - "rewards/rejected": -6.906552314758301, - "step": 169 - }, - { - "epoch": 0.3642206748794858, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.642763111324385, - "learning_rate": 6.373508353221956e-07, - "logits/chosen": 0.8475183248519897, - "logits/rejected": 0.7893280386924744, - "logps/accuracies": 1.0, - "logps/chosen": -466.7734375, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -354.476806640625, - "logps/ref_rejected": -379.0105285644531, - "logps/rejected": -591.1034545898438, - "loss": 0.2312, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.614831924438477, - "rewards/grad_term": 0.0010664674919098616, - "rewards/margins": 4.989813804626465, - "rewards/rejected": -10.604645729064941, - "step": 170 - }, - { - "epoch": 0.36636314943760045, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.565545454910559, - "learning_rate": 6.365155131264916e-07, - "logits/chosen": 0.7403796911239624, - "logits/rejected": 0.6863211393356323, - "logps/accuracies": 0.5, - "logps/chosen": -413.04949951171875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -341.1173400878906, - "logps/ref_rejected": -277.3340148925781, - "logps/rejected": -390.7886047363281, - "loss": 0.2088, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.596607208251953, - "rewards/grad_term": 0.006689072586596012, - "rewards/margins": 2.076122283935547, - "rewards/rejected": -5.672729015350342, - "step": 171 - }, - { - "epoch": 0.36850562399571507, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.792718661583799, - "learning_rate": 6.356801909307876e-07, - "logits/chosen": 1.0259259939193726, - "logits/rejected": 0.874252438545227, - "logps/accuracies": 0.75, - "logps/chosen": -411.4170837402344, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -338.65203857421875, - "logps/ref_rejected": -322.8931579589844, - "logps/rejected": -444.8133544921875, - "loss": 0.2075, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.638251543045044, - "rewards/grad_term": 0.00537948589771986, - "rewards/margins": 2.4577579498291016, - "rewards/rejected": -6.096009254455566, - "step": 172 - }, - { - "epoch": 0.3706480985538297, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.152790670378876, - "learning_rate": 6.348448687350834e-07, - "logits/chosen": 0.9457736015319824, - "logits/rejected": 0.7330727577209473, - "logps/accuracies": 0.5, - "logps/chosen": -449.04766845703125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -385.5958251953125, - "logps/ref_rejected": -290.0688171386719, - "logps/rejected": -419.5983581542969, - "loss": 0.2484, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.1725926399230957, - "rewards/grad_term": 0.004974587354809046, - "rewards/margins": 3.3038861751556396, - "rewards/rejected": -6.476478576660156, - "step": 173 - }, - { - "epoch": 0.3727905731119443, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.514176538085659, - "learning_rate": 6.340095465393795e-07, - "logits/chosen": 0.6537495851516724, - "logits/rejected": 0.8065310716629028, - "logps/accuracies": 0.75, - "logps/chosen": -366.6497802734375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -316.4726867675781, - "logps/ref_rejected": -318.7170715332031, - "logps/rejected": -415.7598876953125, - "loss": 0.2316, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.508854389190674, - "rewards/grad_term": 0.017329072579741478, - "rewards/margins": 2.343287467956543, - "rewards/rejected": -4.852141857147217, - "step": 174 - }, - { - "epoch": 0.3749330476700589, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 11.348177913147394, - "learning_rate": 6.331742243436754e-07, - "logits/chosen": 0.8572670221328735, - "logits/rejected": 0.791947603225708, - "logps/accuracies": 0.75, - "logps/chosen": -487.83331298828125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -378.57891845703125, - "logps/ref_rejected": -352.18182373046875, - "logps/rejected": -550.4212646484375, - "loss": 0.1988, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.4627180099487305, - "rewards/grad_term": 0.0037125912494957447, - "rewards/margins": 4.449254989624023, - "rewards/rejected": -9.911972045898438, - "step": 175 - }, - { - "epoch": 0.37707552222817353, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 11.515808366477856, - "learning_rate": 6.323389021479714e-07, - "logits/chosen": 0.8945661783218384, - "logits/rejected": 0.6957411170005798, - "logps/accuracies": 0.75, - "logps/chosen": -261.3265380859375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -211.19613647460938, - "logps/ref_rejected": -165.06219482421875, - "logps/rejected": -267.1302795410156, - "loss": 0.2924, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.5065195560455322, - "rewards/grad_term": 0.011427883058786392, - "rewards/margins": 2.5968849658966064, - "rewards/rejected": -5.103404521942139, - "step": 176 - }, - { - "epoch": 0.37921799678628815, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 12.786001249289741, - "learning_rate": 6.315035799522672e-07, - "logits/chosen": 0.8739601969718933, - "logits/rejected": 0.669786274433136, - "logps/accuracies": 0.5, - "logps/chosen": -329.546142578125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -280.71832275390625, - "logps/ref_rejected": -243.82247924804688, - "logps/rejected": -331.6929931640625, - "loss": 0.3338, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.441390037536621, - "rewards/grad_term": 0.007867410778999329, - "rewards/margins": 1.952134609222412, - "rewards/rejected": -4.393524646759033, - "step": 177 - }, - { - "epoch": 0.38136047134440276, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.208154660206382, - "learning_rate": 6.306682577565633e-07, - "logits/chosen": 0.9574888944625854, - "logits/rejected": 0.8498983979225159, - "logps/accuracies": 1.0, - "logps/chosen": -543.029052734375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -424.92236328125, - "logps/ref_rejected": -352.2156066894531, - "logps/rejected": -606.2493286132812, - "loss": 0.182, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.90533447265625, - "rewards/grad_term": 8.483060082653537e-05, - "rewards/margins": 6.796352386474609, - "rewards/rejected": -12.70168685913086, - "step": 178 - }, - { - "epoch": 0.38350294590251743, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.640876069823683, - "learning_rate": 6.298329355608592e-07, - "logits/chosen": 0.9586235284805298, - "logits/rejected": 0.8088182210922241, - "logps/accuracies": 0.75, - "logps/chosen": -398.8780212402344, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -298.6359558105469, - "logps/ref_rejected": -262.9993896484375, - "logps/rejected": -415.80419921875, - "loss": 0.196, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.012104034423828, - "rewards/grad_term": 0.006371453404426575, - "rewards/margins": 2.6281375885009766, - "rewards/rejected": -7.640241622924805, - "step": 179 - }, - { - "epoch": 0.38564542046063205, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.553393120180624, - "learning_rate": 6.289976133651551e-07, - "logits/chosen": 1.0224734544754028, - "logits/rejected": 0.756576657295227, - "logps/accuracies": 0.75, - "logps/chosen": -353.6854553222656, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -302.3289794921875, - "logps/ref_rejected": -237.62245178222656, - "logps/rejected": -358.02716064453125, - "loss": 0.2158, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.5678231716156006, - "rewards/grad_term": 0.005727603565901518, - "rewards/margins": 3.452413558959961, - "rewards/rejected": -6.020236492156982, - "step": 180 - }, - { - "epoch": 0.38778789501874666, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.447493546427074, - "learning_rate": 6.28162291169451e-07, - "logits/chosen": 1.0232813358306885, - "logits/rejected": 0.7466151714324951, - "logps/accuracies": 0.5, - "logps/chosen": -400.76641845703125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -331.197998046875, - "logps/ref_rejected": -238.2156524658203, - "logps/rejected": -412.0185546875, - "loss": 0.2115, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.478421449661255, - "rewards/grad_term": 0.0015466721961274743, - "rewards/margins": 5.211723804473877, - "rewards/rejected": -8.690145492553711, - "step": 181 - }, - { - "epoch": 0.3899303695768613, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.697644696654893, - "learning_rate": 6.27326968973747e-07, - "logits/chosen": 0.9457991123199463, - "logits/rejected": 0.7241038084030151, - "logps/accuracies": 0.5, - "logps/chosen": -442.8717041015625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -379.5626525878906, - "logps/ref_rejected": -298.47650146484375, - "logps/rejected": -431.15509033203125, - "loss": 0.2081, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.165452480316162, - "rewards/grad_term": 0.008516497910022736, - "rewards/margins": 3.4684762954711914, - "rewards/rejected": -6.6339287757873535, - "step": 182 - }, - { - "epoch": 0.3920728441349759, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.25, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.8108725640856, - "learning_rate": 6.26491646778043e-07, - "logits/chosen": 1.0096490383148193, - "logits/rejected": 0.8373015522956848, - "logps/accuracies": 0.75, - "logps/chosen": -351.55303955078125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -287.40643310546875, - "logps/ref_rejected": -275.7420654296875, - "logps/rejected": -407.6207275390625, - "loss": 0.2152, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.2073283195495605, - "rewards/grad_term": 0.011961029842495918, - "rewards/margins": 3.3866024017333984, - "rewards/rejected": -6.593931198120117, - "step": 183 - }, - { - "epoch": 0.3942153186930905, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 11.474698775172937, - "learning_rate": 6.256563245823388e-07, - "logits/chosen": 0.6665371060371399, - "logits/rejected": 0.6652243137359619, - "logps/accuracies": 0.75, - "logps/chosen": -505.3188171386719, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -441.2266845703125, - "logps/ref_rejected": -416.076904296875, - "logps/rejected": -567.68310546875, - "loss": 0.2278, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.204606056213379, - "rewards/grad_term": 0.001420854590833187, - "rewards/margins": 4.375702857971191, - "rewards/rejected": -7.58030891418457, - "step": 184 - }, - { - "epoch": 0.3963577932512051, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.943275839760174, - "learning_rate": 6.248210023866348e-07, - "logits/chosen": 0.9953018426895142, - "logits/rejected": 0.795741617679596, - "logps/accuracies": 0.5, - "logps/chosen": -452.40887451171875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -384.6745910644531, - "logps/ref_rejected": -307.6947326660156, - "logps/rejected": -431.2263488769531, - "loss": 0.2104, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.386714220046997, - "rewards/grad_term": 0.0036700021009892225, - "rewards/margins": 2.7898666858673096, - "rewards/rejected": -6.176580905914307, - "step": 185 - }, - { - "epoch": 0.39850026780931974, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.413011794039116, - "learning_rate": 6.239856801909308e-07, - "logits/chosen": 0.6716040968894958, - "logits/rejected": 0.619194507598877, - "logps/accuracies": 0.5, - "logps/chosen": -467.7266845703125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -381.2197570800781, - "logps/ref_rejected": -290.17254638671875, - "logps/rejected": -458.6622314453125, - "loss": 0.1671, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.325344562530518, - "rewards/grad_term": 0.004242981784045696, - "rewards/margins": 4.0991411209106445, - "rewards/rejected": -8.424485206604004, - "step": 186 - }, - { - "epoch": 0.4006427423674344, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.119460917704207, - "learning_rate": 6.231503579952267e-07, - "logits/chosen": 0.8225597143173218, - "logits/rejected": 0.6662357449531555, - "logps/accuracies": 0.5, - "logps/chosen": -469.7325439453125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -364.77740478515625, - "logps/ref_rejected": -268.07843017578125, - "logps/rejected": -472.31781005859375, - "loss": 0.1839, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.247758388519287, - "rewards/grad_term": 0.006101151462644339, - "rewards/margins": 4.964210033416748, - "rewards/rejected": -10.211968421936035, - "step": 187 - }, - { - "epoch": 0.40278521692554903, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 10.046352079716472, - "learning_rate": 6.223150357995226e-07, - "logits/chosen": 0.8952844142913818, - "logits/rejected": 0.6202220916748047, - "logps/accuracies": 1.0, - "logps/chosen": -472.1369323730469, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -377.0249938964844, - "logps/ref_rejected": -345.5789489746094, - "logps/rejected": -568.91357421875, - "loss": 0.2002, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.755597114562988, - "rewards/grad_term": 0.0011726694647222757, - "rewards/margins": 6.411135673522949, - "rewards/rejected": -11.166732788085938, - "step": 188 - }, - { - "epoch": 0.40492769148366364, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.25, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.37154123816891, - "learning_rate": 6.214797136038185e-07, - "logits/chosen": 0.8080844879150391, - "logits/rejected": 0.843936026096344, - "logps/accuracies": 0.25, - "logps/chosen": -305.34619140625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -231.69813537597656, - "logps/ref_rejected": -215.05422973632812, - "logps/rejected": -299.5094299316406, - "loss": 0.2138, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.682403087615967, - "rewards/grad_term": 0.020120887085795403, - "rewards/margins": 0.5403570532798767, - "rewards/rejected": -4.222760200500488, - "step": 189 - }, - { - "epoch": 0.40707016604177826, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.174975913775362, - "learning_rate": 6.206443914081146e-07, - "logits/chosen": 0.7329360246658325, - "logits/rejected": 0.8288344144821167, - "logps/accuracies": 0.5, - "logps/chosen": -461.33984375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -360.3821105957031, - "logps/ref_rejected": -286.70745849609375, - "logps/rejected": -451.14569091796875, - "loss": 0.2014, - "rewards/accuracies": 0.75, - "rewards/chosen": -5.047886848449707, - "rewards/grad_term": 0.013596764765679836, - "rewards/margins": 3.1740236282348633, - "rewards/rejected": -8.22191047668457, - "step": 190 - }, - { - "epoch": 0.4092126405998929, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.272281750209924, - "learning_rate": 6.198090692124104e-07, - "logits/chosen": 0.8074694275856018, - "logits/rejected": 0.7827705144882202, - "logps/accuracies": 0.5, - "logps/chosen": -419.7196960449219, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -305.22576904296875, - "logps/ref_rejected": -257.1751708984375, - "logps/rejected": -473.9278564453125, - "loss": 0.1775, - "rewards/accuracies": 0.75, - "rewards/chosen": -5.724696159362793, - "rewards/grad_term": 0.008914883248507977, - "rewards/margins": 5.112937927246094, - "rewards/rejected": -10.837634086608887, - "step": 191 - }, - { - "epoch": 0.4113551151580075, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.921800868376472, - "learning_rate": 6.189737470167064e-07, - "logits/chosen": 0.8224954009056091, - "logits/rejected": 0.7576948404312134, - "logps/accuracies": 0.5, - "logps/chosen": -276.3191833496094, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -228.04379272460938, - "logps/ref_rejected": -168.54588317871094, - "logps/rejected": -262.92254638671875, - "loss": 0.221, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.4137697219848633, - "rewards/grad_term": 0.011381752789020538, - "rewards/margins": 2.305063247680664, - "rewards/rejected": -4.718832969665527, - "step": 192 - }, - { - "epoch": 0.4113551151580075, - "eval_flips/correct->correct": 0.1599999964237213, - "eval_flips/correct->incorrect": 0.0, - "eval_flips/incorrect->correct": 0.3199999928474426, - "eval_flips/incorrect->incorrect": 0.5199999809265137, - "eval_logits/chosen": 0.8205481767654419, - "eval_logits/rejected": 0.7034481763839722, - "eval_logps/accuracies": 0.47999998927116394, - "eval_logps/chosen": -390.3917541503906, - "eval_logps/ref_accuracies": 0.1599999964237213, - "eval_logps/ref_chosen": -323.51568603515625, - "eval_logps/ref_rejected": -258.70098876953125, - "eval_logps/rejected": -389.7598876953125, - "eval_loss": 0.23252426087856293, - "eval_rewards/accuracies": 0.8600000143051147, - "eval_rewards/chosen": -3.3438057899475098, - "eval_rewards/grad_term": 0.009308630600571632, - "eval_rewards/margins": 3.2091403007507324, - "eval_rewards/rejected": -6.5529465675354, - "eval_runtime": 375.2407, - "eval_samples_per_second": 4.211, - "eval_steps_per_second": 0.133, - "step": 192 - }, - { - "epoch": 0.4134975897161221, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 12.176142217784117, - "learning_rate": 6.181384248210024e-07, - "logits/chosen": 0.9473562836647034, - "logits/rejected": 0.800918698310852, - "logps/accuracies": 1.0, - "logps/chosen": -362.5771484375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -297.9696350097656, - "logps/ref_rejected": -254.08636474609375, - "logps/rejected": -451.26666259765625, - "loss": 0.2425, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.2303762435913086, - "rewards/grad_term": 0.0030712243169546127, - "rewards/margins": 6.628638744354248, - "rewards/rejected": -9.859014511108398, - "step": 193 - }, - { - "epoch": 0.4156400642742367, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.97633567390224, - "learning_rate": 6.173031026252983e-07, - "logits/chosen": 0.9676415324211121, - "logits/rejected": 0.9164015054702759, - "logps/accuracies": 0.5, - "logps/chosen": -323.4644470214844, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -252.95120239257812, - "logps/ref_rejected": -255.23983764648438, - "logps/rejected": -357.87921142578125, - "loss": 0.243, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.525662422180176, - "rewards/grad_term": 0.009022894315421581, - "rewards/margins": 1.606306791305542, - "rewards/rejected": -5.131969451904297, - "step": 194 - }, - { - "epoch": 0.4177825388323514, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.766543352301676, - "learning_rate": 6.164677804295942e-07, - "logits/chosen": 0.7897940278053284, - "logits/rejected": 0.8066399097442627, - "logps/accuracies": 0.5, - "logps/chosen": -236.746337890625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -197.38284301757812, - "logps/ref_rejected": -189.28884887695312, - "logps/rejected": -234.21102905273438, - "loss": 0.1805, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.9681750535964966, - "rewards/grad_term": 0.022014902904629707, - "rewards/margins": 0.2779344618320465, - "rewards/rejected": -2.2461094856262207, - "step": 195 - }, - { - "epoch": 0.419925013390466, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.405842493569402, - "learning_rate": 6.156324582338901e-07, - "logits/chosen": 1.0093345642089844, - "logits/rejected": 0.8334782719612122, - "logps/accuracies": 0.75, - "logps/chosen": -445.7866516113281, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -337.7933349609375, - "logps/ref_rejected": -296.8775329589844, - "logps/rejected": -510.7151794433594, - "loss": 0.178, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.399665832519531, - "rewards/grad_term": 0.0011895910138264298, - "rewards/margins": 5.2922163009643555, - "rewards/rejected": -10.69188117980957, - "step": 196 - }, - { - "epoch": 0.4220674879485806, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 11.920133033788117, - "learning_rate": 6.147971360381862e-07, - "logits/chosen": 0.787909209728241, - "logits/rejected": 0.75300532579422, - "logps/accuracies": 0.5, - "logps/chosen": -440.1323547363281, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -368.47412109375, - "logps/ref_rejected": -353.4744873046875, - "logps/rejected": -485.1199951171875, - "loss": 0.2506, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.5829124450683594, - "rewards/grad_term": 0.012279342859983444, - "rewards/margins": 2.9993643760681152, - "rewards/rejected": -6.582277297973633, - "step": 197 - }, - { - "epoch": 0.42420996250669524, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.419878913528332, - "learning_rate": 6.13961813842482e-07, - "logits/chosen": 1.1179535388946533, - "logits/rejected": 0.9940972924232483, - "logps/accuracies": 1.0, - "logps/chosen": -321.8353271484375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -260.9544372558594, - "logps/ref_rejected": -232.75350952148438, - "logps/rejected": -376.96441650390625, - "loss": 0.1569, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.0440444946289062, - "rewards/grad_term": 0.007131978403776884, - "rewards/margins": 4.166501045227051, - "rewards/rejected": -7.210545539855957, - "step": 198 - }, - { - "epoch": 0.42635243706480985, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.76694640949059, - "learning_rate": 6.13126491646778e-07, - "logits/chosen": 0.9783276915550232, - "logits/rejected": 0.6347091794013977, - "logps/accuracies": 0.5, - "logps/chosen": -333.9166564941406, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -295.6625671386719, - "logps/ref_rejected": -178.05490112304688, - "logps/rejected": -262.8484191894531, - "loss": 0.2251, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.9127050638198853, - "rewards/grad_term": 0.009310072287917137, - "rewards/margins": 2.3269693851470947, - "rewards/rejected": -4.2396745681762695, - "step": 199 - }, - { - "epoch": 0.42849491162292447, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 6.871363821825847, - "learning_rate": 6.122911694510739e-07, - "logits/chosen": 0.713058590888977, - "logits/rejected": 0.7966564893722534, - "logps/accuracies": 1.0, - "logps/chosen": -270.65655517578125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -239.4276580810547, - "logps/ref_rejected": -272.2807312011719, - "logps/rejected": -361.95782470703125, - "loss": 0.1946, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.561445713043213, - "rewards/grad_term": 0.006638450548052788, - "rewards/margins": 2.9224092960357666, - "rewards/rejected": -4.483855247497559, - "step": 200 - }, - { - "epoch": 0.4306373861810391, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.189710382812372, - "learning_rate": 6.1145584725537e-07, - "logits/chosen": 0.8107240200042725, - "logits/rejected": 0.6742185950279236, - "logps/accuracies": 0.75, - "logps/chosen": -461.14404296875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -375.26239013671875, - "logps/ref_rejected": -366.937744140625, - "logps/rejected": -556.6005859375, - "loss": 0.1953, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.2940826416015625, - "rewards/grad_term": 0.002399621531367302, - "rewards/margins": 5.189059734344482, - "rewards/rejected": -9.483142852783203, - "step": 201 - }, - { - "epoch": 0.4327798607391537, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.25, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.981730687281297, - "learning_rate": 6.106205250596658e-07, - "logits/chosen": 0.8751631379127502, - "logits/rejected": 0.8072733879089355, - "logps/accuracies": 0.75, - "logps/chosen": -323.69873046875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -253.70254516601562, - "logps/ref_rejected": -235.48675537109375, - "logps/rejected": -390.3155517578125, - "loss": 0.2163, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.4998087882995605, - "rewards/grad_term": 0.009065371006727219, - "rewards/margins": 4.241631031036377, - "rewards/rejected": -7.741440296173096, - "step": 202 - }, - { - "epoch": 0.43492233529726837, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.322543837095315, - "learning_rate": 6.097852028639618e-07, - "logits/chosen": 0.8359104990959167, - "logits/rejected": 0.7406368255615234, - "logps/accuracies": 1.0, - "logps/chosen": -576.8265380859375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -415.6806335449219, - "logps/ref_rejected": -388.8533935546875, - "logps/rejected": -631.3206787109375, - "loss": 0.2296, - "rewards/accuracies": 1.0, - "rewards/chosen": -8.057294845581055, - "rewards/grad_term": 0.004110483452677727, - "rewards/margins": 4.066068649291992, - "rewards/rejected": -12.123363494873047, - "step": 203 - }, - { - "epoch": 0.437064809855383, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.923150092849891, - "learning_rate": 6.089498806682577e-07, - "logits/chosen": 0.9337953329086304, - "logits/rejected": 0.5036557912826538, - "logps/accuracies": 0.75, - "logps/chosen": -321.8887939453125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -268.6906433105469, - "logps/ref_rejected": -248.19271850585938, - "logps/rejected": -372.2651062011719, - "loss": 0.2665, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.6599063873291016, - "rewards/grad_term": 0.006289066281169653, - "rewards/margins": 3.543713092803955, - "rewards/rejected": -6.203619480133057, - "step": 204 - }, - { - "epoch": 0.4392072844134976, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.602466187696242, - "learning_rate": 6.081145584725537e-07, - "logits/chosen": 0.799699604511261, - "logits/rejected": 0.7706651091575623, - "logps/accuracies": 1.0, - "logps/chosen": -431.94512939453125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -358.3432312011719, - "logps/ref_rejected": -336.4741516113281, - "logps/rejected": -577.3936157226562, - "loss": 0.2874, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.6800947189331055, - "rewards/grad_term": 0.0002930064802058041, - "rewards/margins": 8.365878105163574, - "rewards/rejected": -12.04597282409668, - "step": 205 - }, - { - "epoch": 0.4413497589716122, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.272614970451562, - "learning_rate": 6.072792362768496e-07, - "logits/chosen": 0.9661082625389099, - "logits/rejected": 0.8308844566345215, - "logps/accuracies": 0.75, - "logps/chosen": -429.0587158203125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -331.1964111328125, - "logps/ref_rejected": -307.45654296875, - "logps/rejected": -492.7054138183594, - "loss": 0.1984, - "rewards/accuracies": 0.75, - "rewards/chosen": -4.8931169509887695, - "rewards/grad_term": 0.0081776799634099, - "rewards/margins": 4.369326114654541, - "rewards/rejected": -9.262442588806152, - "step": 206 - }, - { - "epoch": 0.44349223352972683, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.721689104729112, - "learning_rate": 6.064439140811455e-07, - "logits/chosen": 0.9881528615951538, - "logits/rejected": 0.8519094586372375, - "logps/accuracies": 0.5, - "logps/chosen": -221.66915893554688, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -187.01608276367188, - "logps/ref_rejected": -149.39364624023438, - "logps/rejected": -215.79965209960938, - "loss": 0.2711, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.732654094696045, - "rewards/grad_term": 0.008831696584820747, - "rewards/margins": 1.587646245956421, - "rewards/rejected": -3.320300340652466, - "step": 207 - }, - { - "epoch": 0.44563470808784145, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 11.793737716453558, - "learning_rate": 6.056085918854416e-07, - "logits/chosen": 1.012654185295105, - "logits/rejected": 0.9926575422286987, - "logps/accuracies": 0.75, - "logps/chosen": -402.0953369140625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -353.88946533203125, - "logps/ref_rejected": -340.03839111328125, - "logps/rejected": -436.39117431640625, - "loss": 0.237, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.4102942943573, - "rewards/grad_term": 0.004451955668628216, - "rewards/margins": 2.4073448181152344, - "rewards/rejected": -4.817638874053955, - "step": 208 - }, - { - "epoch": 0.44777718264595606, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.289495153079532, - "learning_rate": 6.047732696897374e-07, - "logits/chosen": 0.8617650270462036, - "logits/rejected": 0.6589657068252563, - "logps/accuracies": 0.5, - "logps/chosen": -439.2054443359375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -381.7887268066406, - "logps/ref_rejected": -318.25933837890625, - "logps/rejected": -461.6387634277344, - "loss": 0.171, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.870835304260254, - "rewards/grad_term": 0.004674965050071478, - "rewards/margins": 4.2981367111206055, - "rewards/rejected": -7.168972015380859, - "step": 209 - }, - { - "epoch": 0.4499196572040707, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 11.537792356133702, - "learning_rate": 6.039379474940334e-07, - "logits/chosen": 0.9008455276489258, - "logits/rejected": 0.6296284198760986, - "logps/accuracies": 0.5, - "logps/chosen": -330.61273193359375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -269.29754638671875, - "logps/ref_rejected": -172.04388427734375, - "logps/rejected": -298.81842041015625, - "loss": 0.196, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.065760612487793, - "rewards/grad_term": 0.009436404332518578, - "rewards/margins": 3.272966146469116, - "rewards/rejected": -6.338726997375488, - "step": 210 - }, - { - "epoch": 0.45206213176218535, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 11.509811900422134, - "learning_rate": 6.031026252983293e-07, - "logits/chosen": 0.9384500980377197, - "logits/rejected": 0.7233452200889587, - "logps/accuracies": 0.5, - "logps/chosen": -262.9141845703125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -203.38661193847656, - "logps/ref_rejected": -200.27188110351562, - "logps/rejected": -254.28463745117188, - "loss": 0.2868, - "rewards/accuracies": 0.5, - "rewards/chosen": -2.976378917694092, - "rewards/grad_term": 0.027324385941028595, - "rewards/margins": -0.275741308927536, - "rewards/rejected": -2.7006378173828125, - "step": 211 - }, - { - "epoch": 0.45420460632029996, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.335767333746587, - "learning_rate": 6.022673031026253e-07, - "logits/chosen": 0.8327051401138306, - "logits/rejected": 0.7020426988601685, - "logps/accuracies": 0.75, - "logps/chosen": -380.3292236328125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -343.1072998046875, - "logps/ref_rejected": -288.2660217285156, - "logps/rejected": -439.3631591796875, - "loss": 0.1867, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.8610966205596924, - "rewards/grad_term": 0.005783412139862776, - "rewards/margins": 5.693758964538574, - "rewards/rejected": -7.554856300354004, - "step": 212 - }, - { - "epoch": 0.4563470808784146, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.086663858024284, - "learning_rate": 6.014319809069212e-07, - "logits/chosen": 0.8908300995826721, - "logits/rejected": 0.8858309984207153, - "logps/accuracies": 0.5, - "logps/chosen": -195.6943359375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -161.84103393554688, - "logps/ref_rejected": -157.9658966064453, - "logps/rejected": -229.5589141845703, - "loss": 0.1993, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.6926652193069458, - "rewards/grad_term": 0.008458103984594345, - "rewards/margins": 1.8869857788085938, - "rewards/rejected": -3.57965087890625, - "step": 213 - }, - { - "epoch": 0.4584895554365292, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 7.664284257646828, - "learning_rate": 6.005966587112171e-07, - "logits/chosen": 0.7465036511421204, - "logits/rejected": 0.6328434348106384, - "logps/accuracies": 0.25, - "logps/chosen": -285.3111267089844, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -251.16213989257812, - "logps/ref_rejected": -238.5145721435547, - "logps/rejected": -321.26043701171875, - "loss": 0.1983, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7074486017227173, - "rewards/grad_term": 0.007980713620781898, - "rewards/margins": 2.4298453330993652, - "rewards/rejected": -4.137293815612793, - "step": 214 - }, - { - "epoch": 0.4606320299946438, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.440221701148605, - "learning_rate": 5.997613365155131e-07, - "logits/chosen": 0.44053223729133606, - "logits/rejected": 0.45330262184143066, - "logps/accuracies": 0.5, - "logps/chosen": -552.835205078125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -504.29376220703125, - "logps/ref_rejected": -250.1544189453125, - "logps/rejected": -491.84893798828125, - "loss": 0.1717, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.4270737171173096, - "rewards/grad_term": 0.0030039078556001186, - "rewards/margins": 9.657651901245117, - "rewards/rejected": -12.084726333618164, - "step": 215 - }, - { - "epoch": 0.4627745045527584, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.9857967128079475, - "learning_rate": 5.989260143198091e-07, - "logits/chosen": 0.7373754978179932, - "logits/rejected": 0.7481766939163208, - "logps/accuracies": 0.5, - "logps/chosen": -352.7110290527344, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -312.5104675292969, - "logps/ref_rejected": -294.700439453125, - "logps/rejected": -409.80865478515625, - "loss": 0.1965, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.010028600692749, - "rewards/grad_term": 0.008961998857557774, - "rewards/margins": 3.7453832626342773, - "rewards/rejected": -5.7554121017456055, - "step": 216 - }, - { - "epoch": 0.46491697911087304, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.188711876243962, - "learning_rate": 5.98090692124105e-07, - "logits/chosen": 0.9295454025268555, - "logits/rejected": 0.6259232759475708, - "logps/accuracies": 0.5, - "logps/chosen": -415.08636474609375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -358.3822021484375, - "logps/ref_rejected": -257.28253173828125, - "logps/rejected": -432.2508544921875, - "loss": 0.22, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.8352086544036865, - "rewards/grad_term": 0.00315161794424057, - "rewards/margins": 5.913206577301025, - "rewards/rejected": -8.748414993286133, - "step": 217 - }, - { - "epoch": 0.46705945366898766, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.492749863968397, - "learning_rate": 5.972553699284009e-07, - "logits/chosen": 0.9154873490333557, - "logits/rejected": 0.7640275955200195, - "logps/accuracies": 0.5, - "logps/chosen": -264.8331604003906, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -208.0253448486328, - "logps/ref_rejected": -161.46011352539062, - "logps/rejected": -261.93133544921875, - "loss": 0.2141, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.8403897285461426, - "rewards/grad_term": 0.009426168166100979, - "rewards/margins": 2.183171510696411, - "rewards/rejected": -5.023561477661133, - "step": 218 - }, - { - "epoch": 0.4692019282271023, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.503071502702135, - "learning_rate": 5.96420047732697e-07, - "logits/chosen": 0.7595028281211853, - "logits/rejected": 0.800428032875061, - "logps/accuracies": 1.0, - "logps/chosen": -582.7049560546875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -475.7439270019531, - "logps/ref_rejected": -485.4010009765625, - "logps/rejected": -718.46337890625, - "loss": 0.2029, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.348053932189941, - "rewards/grad_term": 0.0018055308610200882, - "rewards/margins": 6.3050642013549805, - "rewards/rejected": -11.653118133544922, - "step": 219 - }, - { - "epoch": 0.47134440278521694, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.843688764755232, - "learning_rate": 5.955847255369928e-07, - "logits/chosen": 0.773861825466156, - "logits/rejected": 0.7292585372924805, - "logps/accuracies": 0.75, - "logps/chosen": -431.9044494628906, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -366.72967529296875, - "logps/ref_rejected": -330.95587158203125, - "logps/rejected": -468.05487060546875, - "loss": 0.1818, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.2587406635284424, - "rewards/grad_term": 0.006896655540913343, - "rewards/margins": 3.59621000289917, - "rewards/rejected": -6.854950904846191, - "step": 220 - }, - { - "epoch": 0.47348687734333156, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.079186736183265, - "learning_rate": 5.947494033412888e-07, - "logits/chosen": 0.9851402044296265, - "logits/rejected": 0.7796863317489624, - "logps/accuracies": 0.75, - "logps/chosen": -564.6864624023438, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -508.427734375, - "logps/ref_rejected": -418.2183837890625, - "logps/rejected": -642.1084594726562, - "loss": 0.2627, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.8129372596740723, - "rewards/grad_term": 0.00010136763739865273, - "rewards/margins": 8.381568908691406, - "rewards/rejected": -11.19450569152832, - "step": 221 - }, - { - "epoch": 0.4756293519014462, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 8.596915507884203, - "learning_rate": 5.939140811455847e-07, - "logits/chosen": 0.9327103495597839, - "logits/rejected": 0.7600051164627075, - "logps/accuracies": 0.25, - "logps/chosen": -458.21246337890625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -412.7664489746094, - "logps/ref_rejected": -339.1446533203125, - "logps/rejected": -441.3128662109375, - "loss": 0.2477, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.2723002433776855, - "rewards/grad_term": 0.0055382088758051395, - "rewards/margins": 2.8361098766326904, - "rewards/rejected": -5.108409881591797, - "step": 222 - }, - { - "epoch": 0.4777718264595608, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.540065404325883, - "learning_rate": 5.930787589498806e-07, - "logits/chosen": 0.6721053123474121, - "logits/rejected": 0.5771878957748413, - "logps/accuracies": 0.5, - "logps/chosen": -351.98284912109375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -314.6383056640625, - "logps/ref_rejected": -232.02334594726562, - "logps/rejected": -344.7238464355469, - "loss": 0.1544, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.867226243019104, - "rewards/grad_term": 0.00492095947265625, - "rewards/margins": 3.7677993774414062, - "rewards/rejected": -5.635025978088379, - "step": 223 - }, - { - "epoch": 0.4799143010176754, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 6.681718039597231, - "learning_rate": 5.922434367541766e-07, - "logits/chosen": 0.9633818864822388, - "logits/rejected": 0.7339221239089966, - "logps/accuracies": 0.25, - "logps/chosen": -494.77099609375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -442.48052978515625, - "logps/ref_rejected": -331.5393981933594, - "logps/rejected": -480.627685546875, - "loss": 0.1871, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.6145222187042236, - "rewards/grad_term": 0.000996602582745254, - "rewards/margins": 4.839890480041504, - "rewards/rejected": -7.454412937164307, - "step": 224 - }, - { - "epoch": 0.48205677557579, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 7.005924963769166, - "learning_rate": 5.914081145584725e-07, - "logits/chosen": 0.8806890845298767, - "logits/rejected": 0.6015447974205017, - "logps/accuracies": 0.25, - "logps/chosen": -366.54046630859375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -308.4888000488281, - "logps/ref_rejected": -246.00994873046875, - "logps/rejected": -355.2261047363281, - "loss": 0.1685, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.902583599090576, - "rewards/grad_term": 0.0076670260168612, - "rewards/margins": 2.558225631713867, - "rewards/rejected": -5.460808753967285, - "step": 225 - }, - { - "epoch": 0.48419925013390464, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.189114548174568, - "learning_rate": 5.905727923627685e-07, - "logits/chosen": 0.8432016968727112, - "logits/rejected": 0.4910334646701813, - "logps/accuracies": 0.5, - "logps/chosen": -529.419677734375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -399.1881103515625, - "logps/ref_rejected": -216.96324157714844, - "logps/rejected": -383.4134216308594, - "loss": 0.2154, - "rewards/accuracies": 0.75, - "rewards/chosen": -6.511577606201172, - "rewards/grad_term": 0.013775240629911423, - "rewards/margins": 1.8109302520751953, - "rewards/rejected": -8.32250690460205, - "step": 226 - }, - { - "epoch": 0.4863417246920193, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.812465251320808, - "learning_rate": 5.897374701670644e-07, - "logits/chosen": 0.9737125039100647, - "logits/rejected": 0.8655239939689636, - "logps/accuracies": 0.75, - "logps/chosen": -478.7800598144531, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -391.8121643066406, - "logps/ref_rejected": -331.19952392578125, - "logps/rejected": -510.9104309082031, - "loss": 0.1759, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.348394393920898, - "rewards/grad_term": 0.0018389918841421604, - "rewards/margins": 4.637151718139648, - "rewards/rejected": -8.985546112060547, - "step": 227 - }, - { - "epoch": 0.4884841992501339, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.807007302887406, - "learning_rate": 5.889021479713604e-07, - "logits/chosen": 0.5967141389846802, - "logits/rejected": 0.588777482509613, - "logps/accuracies": 0.75, - "logps/chosen": -175.18948364257812, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -160.39163208007812, - "logps/ref_rejected": -131.2773895263672, - "logps/rejected": -182.36181640625, - "loss": 0.1968, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.7398918867111206, - "rewards/grad_term": 0.008824177086353302, - "rewards/margins": 1.8143287897109985, - "rewards/rejected": -2.554220676422119, - "step": 228 - }, - { - "epoch": 0.49062667380824854, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.665149636062972, - "learning_rate": 5.880668257756563e-07, - "logits/chosen": 0.9138537645339966, - "logits/rejected": 0.8063441514968872, - "logps/accuracies": 0.5, - "logps/chosen": -377.0699768066406, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -307.6856384277344, - "logps/ref_rejected": -232.5096435546875, - "logps/rejected": -368.8970031738281, - "loss": 0.1393, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.4692177772521973, - "rewards/grad_term": 0.00434906966984272, - "rewards/margins": 3.3501501083374023, - "rewards/rejected": -6.819368362426758, - "step": 229 - }, - { - "epoch": 0.49276914836636315, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.623147163579497, - "learning_rate": 5.872315035799522e-07, - "logits/chosen": 0.8319353461265564, - "logits/rejected": 0.7092019319534302, - "logps/accuracies": 0.5, - "logps/chosen": -471.31524658203125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -425.01544189453125, - "logps/ref_rejected": -352.10040283203125, - "logps/rejected": -474.41741943359375, - "loss": 0.1697, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.3149900436401367, - "rewards/grad_term": 0.007428554352372885, - "rewards/margins": 3.8008623123168945, - "rewards/rejected": -6.115852355957031, - "step": 230 - }, - { - "epoch": 0.49491162292447777, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.671116246729667, - "learning_rate": 5.863961813842482e-07, - "logits/chosen": 0.7218674421310425, - "logits/rejected": 0.6530136466026306, - "logps/accuracies": 0.75, - "logps/chosen": -417.69482421875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -333.3896789550781, - "logps/ref_rejected": -285.8385009765625, - "logps/rejected": -459.2354431152344, - "loss": 0.1788, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.215256690979004, - "rewards/grad_term": 0.004520585294812918, - "rewards/margins": 4.454591274261475, - "rewards/rejected": -8.66984748840332, - "step": 231 - }, - { - "epoch": 0.4970540974825924, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.021075456821866, - "learning_rate": 5.855608591885441e-07, - "logits/chosen": 0.732083797454834, - "logits/rejected": 0.4362190365791321, - "logps/accuracies": 0.75, - "logps/chosen": -291.83062744140625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -250.48623657226562, - "logps/ref_rejected": -179.92550659179688, - "logps/rejected": -318.45306396484375, - "loss": 0.1443, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.0672202110290527, - "rewards/grad_term": 0.002242325572296977, - "rewards/margins": 4.859157562255859, - "rewards/rejected": -6.92637825012207, - "step": 232 - }, - { - "epoch": 0.499196572040707, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.428848978245107, - "learning_rate": 5.847255369928401e-07, - "logits/chosen": 0.6398648023605347, - "logits/rejected": 0.5878071784973145, - "logps/accuracies": 0.75, - "logps/chosen": -290.29071044921875, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -244.7794647216797, - "logps/ref_rejected": -252.53553771972656, - "logps/rejected": -354.963134765625, - "loss": 0.2229, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.275561571121216, - "rewards/grad_term": 0.010291634127497673, - "rewards/margins": 2.8458194732666016, - "rewards/rejected": -5.121380805969238, - "step": 233 - }, - { - "epoch": 0.5013390465988217, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 5.246143439813943, - "learning_rate": 5.83890214797136e-07, - "logits/chosen": 0.9107778072357178, - "logits/rejected": 0.7426069378852844, - "logps/accuracies": 0.75, - "logps/chosen": -474.1260986328125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -367.388671875, - "logps/ref_rejected": -272.3711242675781, - "logps/rejected": -551.78857421875, - "loss": 0.1338, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.3368730545043945, - "rewards/grad_term": 0.0010807998478412628, - "rewards/margins": 8.633999824523926, - "rewards/rejected": -13.970873832702637, - "step": 234 - }, - { - "epoch": 0.5034815211569362, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 15.830907508822516, - "learning_rate": 5.83054892601432e-07, - "logits/chosen": 0.9917585849761963, - "logits/rejected": 0.7569248080253601, - "logps/accuracies": 1.0, - "logps/chosen": -554.43359375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -480.7333068847656, - "logps/ref_rejected": -434.622314453125, - "logps/rejected": -575.601318359375, - "loss": 0.1535, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.685014486312866, - "rewards/grad_term": 0.002452064771205187, - "rewards/margins": 3.3639354705810547, - "rewards/rejected": -7.0489501953125, - "step": 235 - }, - { - "epoch": 0.5056239957150509, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.346658607688704, - "learning_rate": 5.822195704057279e-07, - "logits/chosen": 0.623228132724762, - "logits/rejected": 0.5134543180465698, - "logps/accuracies": 0.75, - "logps/chosen": -271.30902099609375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -203.46876525878906, - "logps/ref_rejected": -188.93699645996094, - "logps/rejected": -337.220947265625, - "loss": 0.1585, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.392012596130371, - "rewards/grad_term": 0.009173048660159111, - "rewards/margins": 4.022184371948242, - "rewards/rejected": -7.4141974449157715, - "step": 236 - }, - { - "epoch": 0.5077664702731655, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 10.626683894288021, - "learning_rate": 5.813842482100238e-07, - "logits/chosen": 0.8421118855476379, - "logits/rejected": 0.7152860760688782, - "logps/accuracies": 0.5, - "logps/chosen": -363.5376892089844, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -290.9403381347656, - "logps/ref_rejected": -304.65911865234375, - "logps/rejected": -451.55059814453125, - "loss": 0.239, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.629868268966675, - "rewards/grad_term": 0.005975798238068819, - "rewards/margins": 3.7147045135498047, - "rewards/rejected": -7.344573020935059, - "step": 237 - }, - { - "epoch": 0.5099089448312801, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 11.218899504442438, - "learning_rate": 5.805489260143197e-07, - "logits/chosen": 0.7464509010314941, - "logits/rejected": 0.5703651309013367, - "logps/accuracies": 0.75, - "logps/chosen": -530.9554443359375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -405.5935363769531, - "logps/ref_rejected": -343.15496826171875, - "logps/rejected": -560.3563232421875, - "loss": 0.2231, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.268095970153809, - "rewards/grad_term": 0.007501318119466305, - "rewards/margins": 4.591974258422852, - "rewards/rejected": -10.86007022857666, - "step": 238 - }, - { - "epoch": 0.5120514193893948, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 5.019653658814029, - "learning_rate": 5.797136038186157e-07, - "logits/chosen": 0.8765060901641846, - "logits/rejected": 0.5701332688331604, - "logps/accuracies": 0.5, - "logps/chosen": -290.7047424316406, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -251.8502960205078, - "logps/ref_rejected": -161.183837890625, - "logps/rejected": -263.0386962890625, - "loss": 0.1661, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.942723035812378, - "rewards/grad_term": 0.005877365358173847, - "rewards/margins": 3.1500186920166016, - "rewards/rejected": -5.092741966247559, - "step": 239 - }, - { - "epoch": 0.5141938939475094, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 5.853619053843736, - "learning_rate": 5.788782816229117e-07, - "logits/chosen": 0.6634964346885681, - "logits/rejected": 0.6507644653320312, - "logps/accuracies": 1.0, - "logps/chosen": -361.76287841796875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -260.14697265625, - "logps/ref_rejected": -247.04840087890625, - "logps/rejected": -400.8096618652344, - "loss": 0.1472, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.080793857574463, - "rewards/grad_term": 0.00737812090665102, - "rewards/margins": 2.6072704792022705, - "rewards/rejected": -7.688064098358154, - "step": 240 - }, - { - "epoch": 0.516336368505624, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 13.513765248794778, - "learning_rate": 5.780429594272076e-07, - "logits/chosen": 0.8699438571929932, - "logits/rejected": 0.7703713774681091, - "logps/accuracies": 0.75, - "logps/chosen": -379.037353515625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -295.375, - "logps/ref_rejected": -275.28857421875, - "logps/rejected": -539.343017578125, - "loss": 0.2574, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.1831183433532715, - "rewards/grad_term": 0.002768411999568343, - "rewards/margins": 9.019603729248047, - "rewards/rejected": -13.202722549438477, - "step": 241 - }, - { - "epoch": 0.5184788430637386, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.871277063040404, - "learning_rate": 5.772076372315036e-07, - "logits/chosen": 0.7803428769111633, - "logits/rejected": 0.6543869376182556, - "logps/accuracies": 0.5, - "logps/chosen": -564.8624267578125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -448.4819030761719, - "logps/ref_rejected": -379.95556640625, - "logps/rejected": -606.3104248046875, - "loss": 0.2026, - "rewards/accuracies": 0.75, - "rewards/chosen": -5.81902551651001, - "rewards/grad_term": 0.00902615487575531, - "rewards/margins": 5.498717784881592, - "rewards/rejected": -11.317742347717285, - "step": 242 - }, - { - "epoch": 0.5206213176218533, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.769246851000191, - "learning_rate": 5.763723150357995e-07, - "logits/chosen": 0.6692153811454773, - "logits/rejected": 0.896048367023468, - "logps/accuracies": 0.75, - "logps/chosen": -439.7333679199219, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -363.28973388671875, - "logps/ref_rejected": -574.2049560546875, - "logps/rejected": -756.08203125, - "loss": 0.155, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.8221817016601562, - "rewards/grad_term": 0.0039497376419603825, - "rewards/margins": 5.271674156188965, - "rewards/rejected": -9.093855857849121, - "step": 243 - }, - { - "epoch": 0.5227637921799678, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.5351054842566185, - "learning_rate": 5.755369928400955e-07, - "logits/chosen": 0.9571207761764526, - "logits/rejected": 0.7909256815910339, - "logps/accuracies": 0.75, - "logps/chosen": -415.9464111328125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -306.4901428222656, - "logps/ref_rejected": -293.0599060058594, - "logps/rejected": -455.033935546875, - "loss": 0.1491, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.472814559936523, - "rewards/grad_term": 0.005084376782178879, - "rewards/margins": 2.625887870788574, - "rewards/rejected": -8.098702430725098, - "step": 244 - }, - { - "epoch": 0.5249062667380825, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 16.74633297722825, - "learning_rate": 5.747016706443913e-07, - "logits/chosen": 0.8858977556228638, - "logits/rejected": 0.7780598998069763, - "logps/accuracies": 0.75, - "logps/chosen": -464.2574462890625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -314.8685302734375, - "logps/ref_rejected": -267.0308532714844, - "logps/rejected": -507.6094055175781, - "loss": 0.1805, - "rewards/accuracies": 1.0, - "rewards/chosen": -7.469447135925293, - "rewards/grad_term": 0.007684916723519564, - "rewards/margins": 4.559481620788574, - "rewards/rejected": -12.028928756713867, - "step": 245 - }, - { - "epoch": 0.527048741296197, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.317686067596801, - "learning_rate": 5.738663484486874e-07, - "logits/chosen": 0.30461055040359497, - "logits/rejected": 0.4746954143047333, - "logps/accuracies": 0.75, - "logps/chosen": -107.72335815429688, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -91.86939239501953, - "logps/ref_rejected": -89.95404052734375, - "logps/rejected": -137.3914031982422, - "loss": 0.22, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.792698323726654, - "rewards/grad_term": 0.012737632729113102, - "rewards/margins": 1.5791699886322021, - "rewards/rejected": -2.371868371963501, - "step": 246 - }, - { - "epoch": 0.5291912158543117, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 6.560628687601231, - "learning_rate": 5.730310262529833e-07, - "logits/chosen": 0.7160434722900391, - "logits/rejected": 0.5276747941970825, - "logps/accuracies": 1.0, - "logps/chosen": -328.2812805175781, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -279.67138671875, - "logps/ref_rejected": -251.840576171875, - "logps/rejected": -399.2860107421875, - "loss": 0.1576, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.4304940700531006, - "rewards/grad_term": 0.0015676068142056465, - "rewards/margins": 4.941778659820557, - "rewards/rejected": -7.372272968292236, - "step": 247 - }, - { - "epoch": 0.5313336904124264, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.552825805844636, - "learning_rate": 5.721957040572792e-07, - "logits/chosen": 0.665590763092041, - "logits/rejected": 0.6970337629318237, - "logps/accuracies": 1.0, - "logps/chosen": -445.61358642578125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -375.84332275390625, - "logps/ref_rejected": -374.01043701171875, - "logps/rejected": -503.5439453125, - "loss": 0.1957, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.4885122776031494, - "rewards/grad_term": 0.005766700953245163, - "rewards/margins": 2.9881629943847656, - "rewards/rejected": -6.476675510406494, - "step": 248 - }, - { - "epoch": 0.533476164970541, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 6.35131355845974, - "learning_rate": 5.713603818615751e-07, - "logits/chosen": 0.8978402614593506, - "logits/rejected": 0.5600339770317078, - "logps/accuracies": 0.25, - "logps/chosen": -461.3507385253906, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -393.03179931640625, - "logps/ref_rejected": -272.4175109863281, - "logps/rejected": -435.42462158203125, - "loss": 0.1457, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.415947675704956, - "rewards/grad_term": 0.004269158001989126, - "rewards/margins": 4.734410285949707, - "rewards/rejected": -8.150358200073242, - "step": 249 - }, - { - "epoch": 0.5356186395286556, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 17.580885514236424, - "learning_rate": 5.705250596658711e-07, - "logits/chosen": 0.6814495921134949, - "logits/rejected": 0.75849449634552, - "logps/accuracies": 0.75, - "logps/chosen": -353.9872131347656, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -253.66598510742188, - "logps/ref_rejected": -231.22552490234375, - "logps/rejected": -383.4581298828125, - "loss": 0.2176, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.016061305999756, - "rewards/grad_term": 0.007379300892353058, - "rewards/margins": 2.595566987991333, - "rewards/rejected": -7.611629009246826, - "step": 250 - }, - { - "epoch": 0.5377611140867702, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.1971499013479, - "learning_rate": 5.696897374701671e-07, - "logits/chosen": 0.6412093639373779, - "logits/rejected": 0.6849941611289978, - "logps/accuracies": 0.75, - "logps/chosen": -354.33367919921875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -277.5402526855469, - "logps/ref_rejected": -235.74111938476562, - "logps/rejected": -374.9162902832031, - "loss": 0.2137, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.839670181274414, - "rewards/grad_term": 0.01258145458996296, - "rewards/margins": 3.119089126586914, - "rewards/rejected": -6.958759307861328, - "step": 251 - }, - { - "epoch": 0.5399035886448849, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.69469276334864, - "learning_rate": 5.68854415274463e-07, - "logits/chosen": 0.8622183799743652, - "logits/rejected": 0.5919508337974548, - "logps/accuracies": 0.5, - "logps/chosen": -274.1978759765625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -238.06967163085938, - "logps/ref_rejected": -164.38018798828125, - "logps/rejected": -275.35992431640625, - "loss": 0.2049, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.806409478187561, - "rewards/grad_term": 0.010785759426653385, - "rewards/margins": 3.742577075958252, - "rewards/rejected": -5.548986434936523, - "step": 252 - }, - { - "epoch": 0.5420460632029994, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.021225410718323, - "learning_rate": 5.680190930787589e-07, - "logits/chosen": 0.7470685243606567, - "logits/rejected": 0.6984888911247253, - "logps/accuracies": 0.75, - "logps/chosen": -221.68431091308594, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -187.86117553710938, - "logps/ref_rejected": -150.86964416503906, - "logps/rejected": -256.9394836425781, - "loss": 0.1673, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.6911565065383911, - "rewards/grad_term": 0.011116426438093185, - "rewards/margins": 3.612335205078125, - "rewards/rejected": -5.303491592407227, - "step": 253 - }, - { - "epoch": 0.5441885377611141, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.151951865155123, - "learning_rate": 5.671837708830549e-07, - "logits/chosen": 0.22945332527160645, - "logits/rejected": 0.5243977308273315, - "logps/accuracies": 0.5, - "logps/chosen": -282.75384521484375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -227.72982788085938, - "logps/ref_rejected": -320.92535400390625, - "logps/rejected": -430.75787353515625, - "loss": 0.1473, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.7512013912200928, - "rewards/grad_term": 0.013240108266472816, - "rewards/margins": 2.740424394607544, - "rewards/rejected": -5.491625785827637, - "step": 254 - }, - { - "epoch": 0.5463310123192288, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 11.292561948011548, - "learning_rate": 5.663484486873508e-07, - "logits/chosen": 0.7049826979637146, - "logits/rejected": 0.7030065059661865, - "logps/accuracies": 1.0, - "logps/chosen": -465.6855773925781, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -391.0091552734375, - "logps/ref_rejected": -373.00775146484375, - "logps/rejected": -555.9122314453125, - "loss": 0.1945, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.733822822570801, - "rewards/grad_term": 0.001258535892702639, - "rewards/margins": 5.411401271820068, - "rewards/rejected": -9.145223617553711, - "step": 255 - }, - { - "epoch": 0.5484734868773433, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.25, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 12.415742091600887, - "learning_rate": 5.655131264916467e-07, - "logits/chosen": 0.9153692722320557, - "logits/rejected": 0.5475519895553589, - "logps/accuracies": 0.5, - "logps/chosen": -304.13330078125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -245.09902954101562, - "logps/ref_rejected": -193.71463012695312, - "logps/rejected": -393.2826843261719, - "loss": 0.2106, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.951713800430298, - "rewards/grad_term": 0.011270977556705475, - "rewards/margins": 7.026688575744629, - "rewards/rejected": -9.978402137756348, - "step": 256 - }, - { - "epoch": 0.5484734868773433, - "eval_flips/correct->correct": 0.14000000059604645, - "eval_flips/correct->incorrect": 0.019999999552965164, - "eval_flips/incorrect->correct": 0.4399999976158142, - "eval_flips/incorrect->incorrect": 0.4000000059604645, - "eval_logits/chosen": 0.7667725086212158, - "eval_logits/rejected": 0.6504298448562622, - "eval_logps/accuracies": 0.5799999833106995, - "eval_logps/chosen": -395.9922790527344, - "eval_logps/ref_accuracies": 0.1599999964237213, - "eval_logps/ref_chosen": -323.51568603515625, - "eval_logps/ref_rejected": -258.70098876953125, - "eval_logps/rejected": -407.676025390625, - "eval_loss": 0.19521716237068176, - "eval_rewards/accuracies": 0.8399999737739563, - "eval_rewards/chosen": -3.6238298416137695, - "eval_rewards/grad_term": 0.008681231178343296, - "eval_rewards/margins": 3.824923038482666, - "eval_rewards/rejected": -7.4487528800964355, - "eval_runtime": 372.955, - "eval_samples_per_second": 4.236, - "eval_steps_per_second": 0.134, - "step": 256 - }, - { - "epoch": 0.550615961435458, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.31965267361933, - "learning_rate": 5.646778042959426e-07, - "logits/chosen": 0.9101255536079407, - "logits/rejected": 0.8786407113075256, - "logps/accuracies": 0.75, - "logps/chosen": -516.9441528320312, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -426.1374206542969, - "logps/ref_rejected": -390.5966796875, - "logps/rejected": -595.52685546875, - "loss": 0.1809, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.540337085723877, - "rewards/grad_term": 0.0009564714273437858, - "rewards/margins": 5.706172466278076, - "rewards/rejected": -10.246509552001953, - "step": 257 - }, - { - "epoch": 0.5527584359935725, - "flips/correct->correct": 1.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.682059725515248, - "learning_rate": 5.638424821002387e-07, - "logits/chosen": 0.8498424291610718, - "logits/rejected": 0.8489320874214172, - "logps/accuracies": 1.0, - "logps/chosen": -413.9257507324219, - "logps/ref_accuracies": 1.0, - "logps/ref_chosen": -327.97869873046875, - "logps/ref_rejected": -369.13482666015625, - "logps/rejected": -582.4822998046875, - "loss": 0.169, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.2973527908325195, - "rewards/grad_term": 0.005528903566300869, - "rewards/margins": 6.370021820068359, - "rewards/rejected": -10.667373657226562, - "step": 258 - }, - { - "epoch": 0.5549009105516872, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.707978512883248, - "learning_rate": 5.630071599045346e-07, - "logits/chosen": 0.7243056297302246, - "logits/rejected": 0.6144933104515076, - "logps/accuracies": 0.75, - "logps/chosen": -426.29638671875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -336.075439453125, - "logps/ref_rejected": -338.01348876953125, - "logps/rejected": -613.092041015625, - "loss": 0.1699, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.511048316955566, - "rewards/grad_term": 0.00518822530284524, - "rewards/margins": 9.242880821228027, - "rewards/rejected": -13.753929138183594, - "step": 259 - }, - { - "epoch": 0.5570433851098018, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.457819309849194, - "learning_rate": 5.621718377088305e-07, - "logits/chosen": 0.7747801542282104, - "logits/rejected": 0.6980942487716675, - "logps/accuracies": 1.0, - "logps/chosen": -333.2770080566406, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -263.5532531738281, - "logps/ref_rejected": -270.31927490234375, - "logps/rejected": -400.2752990722656, - "loss": 0.1812, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.486187696456909, - "rewards/grad_term": 0.014478763565421104, - "rewards/margins": 3.0116138458251953, - "rewards/rejected": -6.497801780700684, - "step": 260 - }, - { - "epoch": 0.5591858596679165, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 12.791539513426143, - "learning_rate": 5.613365155131265e-07, - "logits/chosen": 1.010023832321167, - "logits/rejected": 0.8089584708213806, - "logps/accuracies": 0.5, - "logps/chosen": -434.7796325683594, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -369.63201904296875, - "logps/ref_rejected": -312.5323791503906, - "logps/rejected": -433.6856994628906, - "loss": 0.2167, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.2573814392089844, - "rewards/grad_term": 0.011615730822086334, - "rewards/margins": 2.800283908843994, - "rewards/rejected": -6.057665824890137, - "step": 261 - }, - { - "epoch": 0.561328334226031, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 5.761871953548487, - "learning_rate": 5.605011933174224e-07, - "logits/chosen": 0.7621825337409973, - "logits/rejected": 0.6671872138977051, - "logps/accuracies": 1.0, - "logps/chosen": -520.250732421875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -421.12890625, - "logps/ref_rejected": -388.18841552734375, - "logps/rejected": -616.8232421875, - "loss": 0.1195, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.956088066101074, - "rewards/grad_term": 0.0011122592259198427, - "rewards/margins": 6.475651741027832, - "rewards/rejected": -11.431740760803223, - "step": 262 - }, - { - "epoch": 0.5634708087841457, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.615576943912552, - "learning_rate": 5.596658711217183e-07, - "logits/chosen": 0.7528675198554993, - "logits/rejected": 0.5586297512054443, - "logps/accuracies": 1.0, - "logps/chosen": -235.1770477294922, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -193.3452606201172, - "logps/ref_rejected": -156.8057861328125, - "logps/rejected": -284.637939453125, - "loss": 0.1786, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.0915887355804443, - "rewards/grad_term": 0.012236223556101322, - "rewards/margins": 4.300019264221191, - "rewards/rejected": -6.391608238220215, - "step": 263 - }, - { - "epoch": 0.5656132833422604, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 5.642389085683032, - "learning_rate": 5.588305489260142e-07, - "logits/chosen": 0.7206822633743286, - "logits/rejected": 0.6255587339401245, - "logps/accuracies": 1.0, - "logps/chosen": -403.0487976074219, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -323.03924560546875, - "logps/ref_rejected": -263.4703369140625, - "logps/rejected": -451.4239196777344, - "loss": 0.1584, - "rewards/accuracies": 0.75, - "rewards/chosen": -4.000478267669678, - "rewards/grad_term": 0.009540688246488571, - "rewards/margins": 5.3972015380859375, - "rewards/rejected": -9.397679328918457, - "step": 264 - }, - { - "epoch": 0.5677557579003749, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 14.507972697356939, - "learning_rate": 5.579952267303103e-07, - "logits/chosen": 0.8435995578765869, - "logits/rejected": 0.30664098262786865, - "logps/accuracies": 0.25, - "logps/chosen": -513.014404296875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -417.4500427246094, - "logps/ref_rejected": -269.4376220703125, - "logps/rejected": -427.7491760253906, - "loss": 0.2097, - "rewards/accuracies": 0.75, - "rewards/chosen": -4.7782206535339355, - "rewards/grad_term": 0.010889217257499695, - "rewards/margins": 3.137356996536255, - "rewards/rejected": -7.9155778884887695, - "step": 265 - }, - { - "epoch": 0.5698982324584896, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.002273528640038, - "learning_rate": 5.571599045346062e-07, - "logits/chosen": 0.6676109433174133, - "logits/rejected": 0.7191418409347534, - "logps/accuracies": 0.75, - "logps/chosen": -496.98822021484375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -340.63250732421875, - "logps/ref_rejected": -327.24847412109375, - "logps/rejected": -582.9611206054688, - "loss": 0.1535, - "rewards/accuracies": 0.75, - "rewards/chosen": -7.817786693572998, - "rewards/grad_term": 0.011708484031260014, - "rewards/margins": 4.9678449630737305, - "rewards/rejected": -12.785632133483887, - "step": 266 - }, - { - "epoch": 0.5720407070166041, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 5.543702357825273, - "learning_rate": 5.563245823389021e-07, - "logits/chosen": 0.8063835501670837, - "logits/rejected": 0.6988131999969482, - "logps/accuracies": 0.75, - "logps/chosen": -403.5315856933594, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -330.3784484863281, - "logps/ref_rejected": -266.2269287109375, - "logps/rejected": -429.4187316894531, - "loss": 0.1285, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.6576569080352783, - "rewards/grad_term": 0.006236384157091379, - "rewards/margins": 4.501932621002197, - "rewards/rejected": -8.159589767456055, - "step": 267 - }, - { - "epoch": 0.5741831815747188, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.10211268941143, - "learning_rate": 5.55489260143198e-07, - "logits/chosen": 0.6175810694694519, - "logits/rejected": 0.4528239965438843, - "logps/accuracies": 0.75, - "logps/chosen": -391.7451477050781, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -323.01617431640625, - "logps/ref_rejected": -281.732421875, - "logps/rejected": -443.7324523925781, - "loss": 0.1603, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.4364490509033203, - "rewards/grad_term": 0.003603234887123108, - "rewards/margins": 4.663552284240723, - "rewards/rejected": -8.100001335144043, - "step": 268 - }, - { - "epoch": 0.5763256561328334, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.454583306377835, - "learning_rate": 5.546539379474941e-07, - "logits/chosen": 0.5018086433410645, - "logits/rejected": 0.3208431601524353, - "logps/accuracies": 0.75, - "logps/chosen": -292.32537841796875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -235.02024841308594, - "logps/ref_rejected": -239.4210968017578, - "logps/rejected": -393.904541015625, - "loss": 0.1477, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.8652560710906982, - "rewards/grad_term": 0.005141068249940872, - "rewards/margins": 4.858916282653809, - "rewards/rejected": -7.724172115325928, - "step": 269 - }, - { - "epoch": 0.578468130690948, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.116562665988194, - "learning_rate": 5.5381861575179e-07, - "logits/chosen": 0.8009479641914368, - "logits/rejected": 0.5304053425788879, - "logps/accuracies": 0.75, - "logps/chosen": -567.357666015625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -447.6595458984375, - "logps/ref_rejected": -327.67388916015625, - "logps/rejected": -578.609619140625, - "loss": 0.181, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.984908580780029, - "rewards/grad_term": 0.001012351829558611, - "rewards/margins": 6.561877727508545, - "rewards/rejected": -12.546786308288574, - "step": 270 - }, - { - "epoch": 0.5806106052490627, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.286921940219483, - "learning_rate": 5.529832935560859e-07, - "logits/chosen": 0.8755187392234802, - "logits/rejected": 0.7794501781463623, - "logps/accuracies": 0.75, - "logps/chosen": -271.8689880371094, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -215.5362548828125, - "logps/ref_rejected": -208.42568969726562, - "logps/rejected": -326.4412536621094, - "loss": 0.1667, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.8166375160217285, - "rewards/grad_term": 0.006757371127605438, - "rewards/margins": 3.084141254425049, - "rewards/rejected": -5.900778770446777, - "step": 271 - }, - { - "epoch": 0.5827530798071773, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.808675787712881, - "learning_rate": 5.521479713603818e-07, - "logits/chosen": 0.5810420513153076, - "logits/rejected": 0.5697520971298218, - "logps/accuracies": 0.75, - "logps/chosen": -252.84695434570312, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -222.0087890625, - "logps/ref_rejected": -220.40602111816406, - "logps/rejected": -321.25408935546875, - "loss": 0.1784, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5419079065322876, - "rewards/grad_term": 0.006449728738516569, - "rewards/margins": 3.500495433807373, - "rewards/rejected": -5.042403697967529, - "step": 272 - }, - { - "epoch": 0.584895554365292, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 6.461051225414454, - "learning_rate": 5.513126491646778e-07, - "logits/chosen": 0.6986079812049866, - "logits/rejected": 0.7241477370262146, - "logps/accuracies": 1.0, - "logps/chosen": -478.98333740234375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -346.74542236328125, - "logps/ref_rejected": -319.70550537109375, - "logps/rejected": -641.8858642578125, - "loss": 0.1217, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.61189603805542, - "rewards/grad_term": 0.004527165554463863, - "rewards/margins": 9.497122764587402, - "rewards/rejected": -16.109020233154297, - "step": 273 - }, - { - "epoch": 0.5870380289234065, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.28604619396496, - "learning_rate": 5.504773269689737e-07, - "logits/chosen": 0.9160727262496948, - "logits/rejected": 0.6226189732551575, - "logps/accuracies": 0.75, - "logps/chosen": -482.9158630371094, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -380.3866882324219, - "logps/ref_rejected": -318.1195068359375, - "logps/rejected": -563.2010498046875, - "loss": 0.1824, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.126457691192627, - "rewards/grad_term": 0.0009991895640268922, - "rewards/margins": 7.12761926651001, - "rewards/rejected": -12.254076957702637, - "step": 274 - }, - { - "epoch": 0.5891805034815212, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.25, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.96745166065974, - "learning_rate": 5.496420047732696e-07, - "logits/chosen": 0.7636332511901855, - "logits/rejected": 0.7924087643623352, - "logps/accuracies": 0.75, - "logps/chosen": -232.80300903320312, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -201.30563354492188, - "logps/ref_rejected": -200.0108184814453, - "logps/rejected": -281.12127685546875, - "loss": 0.19, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5748703479766846, - "rewards/grad_term": 0.011370973661541939, - "rewards/margins": 2.480652332305908, - "rewards/rejected": -4.055522918701172, - "step": 275 - }, - { - "epoch": 0.5913229780396357, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 10.556600598116974, - "learning_rate": 5.488066825775657e-07, - "logits/chosen": 0.3400154113769531, - "logits/rejected": 0.8462868332862854, - "logps/accuracies": 1.0, - "logps/chosen": -457.0469970703125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -380.0611877441406, - "logps/ref_rejected": -327.03466796875, - "logps/rejected": -536.0282592773438, - "loss": 0.171, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.8492913246154785, - "rewards/grad_term": 0.0008406995330005884, - "rewards/margins": 6.6003899574279785, - "rewards/rejected": -10.449681282043457, - "step": 276 - }, - { - "epoch": 0.5934654525977504, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 10.78258861657989, - "learning_rate": 5.479713603818616e-07, - "logits/chosen": 0.5925794839859009, - "logits/rejected": 0.3707428276538849, - "logps/accuracies": 0.5, - "logps/chosen": -392.9906921386719, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -328.11785888671875, - "logps/ref_rejected": -267.7380676269531, - "logps/rejected": -379.7018737792969, - "loss": 0.2255, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.243643283843994, - "rewards/grad_term": 0.007178822532296181, - "rewards/margins": 2.354548215866089, - "rewards/rejected": -5.598191261291504, - "step": 277 - }, - { - "epoch": 0.595607927155865, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.592007769382429, - "learning_rate": 5.471360381861575e-07, - "logits/chosen": 0.6001948714256287, - "logits/rejected": 0.4797150790691376, - "logps/accuracies": 0.75, - "logps/chosen": -329.23004150390625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -272.231201171875, - "logps/ref_rejected": -260.5545349121094, - "logps/rejected": -435.8937072753906, - "loss": 0.201, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.8499412536621094, - "rewards/grad_term": 0.005573004484176636, - "rewards/margins": 5.917016506195068, - "rewards/rejected": -8.766958236694336, - "step": 278 - }, - { - "epoch": 0.5977504017139796, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.70464881667517, - "learning_rate": 5.463007159904534e-07, - "logits/chosen": 0.9618555307388306, - "logits/rejected": 0.7959021329879761, - "logps/accuracies": 0.75, - "logps/chosen": -419.58148193359375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -360.5795593261719, - "logps/ref_rejected": -305.4864196777344, - "logps/rejected": -494.42156982421875, - "loss": 0.1709, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.9500961303710938, - "rewards/grad_term": 0.002942422404885292, - "rewards/margins": 6.496662616729736, - "rewards/rejected": -9.446758270263672, - "step": 279 - }, - { - "epoch": 0.5998928762720943, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 13.531014928298436, - "learning_rate": 5.454653937947494e-07, - "logits/chosen": 0.5968809127807617, - "logits/rejected": 0.6607197523117065, - "logps/accuracies": 0.75, - "logps/chosen": -369.2514343261719, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -323.9585266113281, - "logps/ref_rejected": -320.01959228515625, - "logps/rejected": -448.68682861328125, - "loss": 0.2235, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.2646453380584717, - "rewards/grad_term": 0.004641966428607702, - "rewards/margins": 4.168717861175537, - "rewards/rejected": -6.433363437652588, - "step": 280 - }, - { - "epoch": 0.6020353508302089, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.57138907693324, - "learning_rate": 5.446300715990454e-07, - "logits/chosen": 1.1103699207305908, - "logits/rejected": 0.9430161714553833, - "logps/accuracies": 0.5, - "logps/chosen": -314.4698486328125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -291.67144775390625, - "logps/ref_rejected": -255.3843994140625, - "logps/rejected": -313.588134765625, - "loss": 0.1767, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.139919638633728, - "rewards/grad_term": 0.011935700662434101, - "rewards/margins": 1.7702679634094238, - "rewards/rejected": -2.9101874828338623, - "step": 281 - }, - { - "epoch": 0.6041778253883235, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.587776679123657, - "learning_rate": 5.437947494033412e-07, - "logits/chosen": 0.8917209506034851, - "logits/rejected": 0.7313340306282043, - "logps/accuracies": 0.5, - "logps/chosen": -615.8944702148438, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -534.5302734375, - "logps/ref_rejected": -437.1783447265625, - "logps/rejected": -592.5615844726562, - "loss": 0.2, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.068208694458008, - "rewards/grad_term": 0.002287252340465784, - "rewards/margins": 3.7009527683258057, - "rewards/rejected": -7.769161701202393, - "step": 282 - }, - { - "epoch": 0.6063202999464381, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.603440187653598, - "learning_rate": 5.429594272076372e-07, - "logits/chosen": 0.8125787377357483, - "logits/rejected": 0.5284969806671143, - "logps/accuracies": 0.75, - "logps/chosen": -394.273681640625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -301.3934326171875, - "logps/ref_rejected": -264.20989990234375, - "logps/rejected": -396.23590087890625, - "loss": 0.1553, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.644012451171875, - "rewards/grad_term": 0.012798898853361607, - "rewards/margins": 1.9572882652282715, - "rewards/rejected": -6.601301193237305, - "step": 283 - }, - { - "epoch": 0.6084627745045528, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.706538315869954, - "learning_rate": 5.421241050119332e-07, - "logits/chosen": 0.6700544357299805, - "logits/rejected": 0.6845322847366333, - "logps/accuracies": 1.0, - "logps/chosen": -373.3970031738281, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -308.53228759765625, - "logps/ref_rejected": -321.16717529296875, - "logps/rejected": -476.41796875, - "loss": 0.2091, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.2432353496551514, - "rewards/grad_term": 0.002791226841509342, - "rewards/margins": 4.5193047523498535, - "rewards/rejected": -7.762540340423584, - "step": 284 - }, - { - "epoch": 0.6106052490626673, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.616290838954344, - "learning_rate": 5.412887828162291e-07, - "logits/chosen": 0.8290910720825195, - "logits/rejected": 0.931686520576477, - "logps/accuracies": 1.0, - "logps/chosen": -446.44232177734375, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -383.1173400878906, - "logps/ref_rejected": -422.9083557128906, - "logps/rejected": -596.1305541992188, - "loss": 0.1758, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.1662492752075195, - "rewards/grad_term": 0.005467691924422979, - "rewards/margins": 5.49485969543457, - "rewards/rejected": -8.66110897064209, - "step": 285 - }, - { - "epoch": 0.612747723620782, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 11.608396954924636, - "learning_rate": 5.40453460620525e-07, - "logits/chosen": 0.45172828435897827, - "logits/rejected": 0.7064520120620728, - "logps/accuracies": 1.0, - "logps/chosen": -322.91387939453125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -258.16265869140625, - "logps/ref_rejected": -297.7436218261719, - "logps/rejected": -475.2723388671875, - "loss": 0.1969, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.237560272216797, - "rewards/grad_term": 0.005947392899543047, - "rewards/margins": 5.638874530792236, - "rewards/rejected": -8.876434326171875, - "step": 286 - }, - { - "epoch": 0.6148901981788967, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 10.779231877934484, - "learning_rate": 5.39618138424821e-07, - "logits/chosen": 0.9248701333999634, - "logits/rejected": 0.8036876916885376, - "logps/accuracies": 0.5, - "logps/chosen": -467.22015380859375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -380.78204345703125, - "logps/ref_rejected": -339.566162109375, - "logps/rejected": -530.2546997070312, - "loss": 0.2517, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.321907043457031, - "rewards/grad_term": 0.002679330063983798, - "rewards/margins": 5.212520599365234, - "rewards/rejected": -9.534428596496582, - "step": 287 - }, - { - "epoch": 0.6170326727370112, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.667742293114383, - "learning_rate": 5.38782816229117e-07, - "logits/chosen": 0.5336281061172485, - "logits/rejected": 0.5964027643203735, - "logps/accuracies": 1.0, - "logps/chosen": -413.2911376953125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -360.906982421875, - "logps/ref_rejected": -325.00140380859375, - "logps/rejected": -499.9887390136719, - "loss": 0.2106, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.6192078590393066, - "rewards/grad_term": 0.007026966195553541, - "rewards/margins": 6.1301589012146, - "rewards/rejected": -8.749366760253906, - "step": 288 - }, - { - "epoch": 0.6191751472951259, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 11.794004557760429, - "learning_rate": 5.379474940334129e-07, - "logits/chosen": 0.5764543414115906, - "logits/rejected": 0.49163103103637695, - "logps/accuracies": 1.0, - "logps/chosen": -346.34478759765625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -279.2290344238281, - "logps/ref_rejected": -275.8312072753906, - "logps/rejected": -466.09326171875, - "loss": 0.1684, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.3557891845703125, - "rewards/grad_term": 0.00030989584047347307, - "rewards/margins": 6.157315254211426, - "rewards/rejected": -9.513103485107422, - "step": 289 - }, - { - "epoch": 0.6213176218532405, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.147575761036771, - "learning_rate": 5.371121718377088e-07, - "logits/chosen": 0.9949532747268677, - "logits/rejected": 0.8375188708305359, - "logps/accuracies": 0.75, - "logps/chosen": -450.9200134277344, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -368.46990966796875, - "logps/ref_rejected": -288.4126281738281, - "logps/rejected": -482.5291748046875, - "loss": 0.1457, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.122506141662598, - "rewards/grad_term": 0.0010359850712120533, - "rewards/margins": 5.583320140838623, - "rewards/rejected": -9.705825805664062, - "step": 290 - }, - { - "epoch": 0.6234600964113551, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 6.480445656722806, - "learning_rate": 5.362768496420047e-07, - "logits/chosen": 0.46533939242362976, - "logits/rejected": 0.4300745725631714, - "logps/accuracies": 1.0, - "logps/chosen": -365.7827453613281, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -291.9012145996094, - "logps/ref_rejected": -255.63128662109375, - "logps/rejected": -422.51983642578125, - "loss": 0.1266, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.6940770149230957, - "rewards/grad_term": 0.004254742059856653, - "rewards/margins": 4.650350093841553, - "rewards/rejected": -8.344427108764648, - "step": 291 - }, - { - "epoch": 0.6256025709694697, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.156244989772967, - "learning_rate": 5.354415274463007e-07, - "logits/chosen": 0.9660448431968689, - "logits/rejected": 0.5434409379959106, - "logps/accuracies": 0.75, - "logps/chosen": -476.6251220703125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -414.31597900390625, - "logps/ref_rejected": -342.433837890625, - "logps/rejected": -571.8237915039062, - "loss": 0.1555, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.115457534790039, - "rewards/grad_term": 0.006694721523672342, - "rewards/margins": 8.354040145874023, - "rewards/rejected": -11.469497680664062, - "step": 292 - }, - { - "epoch": 0.6277450455275844, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 9.41012930540287, - "learning_rate": 5.346062052505966e-07, - "logits/chosen": 0.9397240877151489, - "logits/rejected": 0.7151464223861694, - "logps/accuracies": 0.5, - "logps/chosen": -548.965576171875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -421.5787353515625, - "logps/ref_rejected": -343.83221435546875, - "logps/rejected": -598.891357421875, - "loss": 0.1439, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.369344234466553, - "rewards/grad_term": 0.0019262685673311353, - "rewards/margins": 6.383614540100098, - "rewards/rejected": -12.752958297729492, - "step": 293 - }, - { - "epoch": 0.629887520085699, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 5.624072315857759, - "learning_rate": 5.337708830548926e-07, - "logits/chosen": 0.45759594440460205, - "logits/rejected": 0.47171294689178467, - "logps/accuracies": 0.5, - "logps/chosen": -386.42669677734375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -328.4192199707031, - "logps/ref_rejected": -252.86329650878906, - "logps/rejected": -425.3404541015625, - "loss": 0.1063, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.9003729820251465, - "rewards/grad_term": 0.0017550851916894317, - "rewards/margins": 5.723484039306641, - "rewards/rejected": -8.623857498168945, - "step": 294 - }, - { - "epoch": 0.6320299946438136, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.713346436229937, - "learning_rate": 5.329355608591886e-07, - "logits/chosen": 0.7498375177383423, - "logits/rejected": 0.6682128310203552, - "logps/accuracies": 0.75, - "logps/chosen": -446.07830810546875, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -382.056884765625, - "logps/ref_rejected": -337.1555480957031, - "logps/rejected": -528.916015625, - "loss": 0.158, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.201070785522461, - "rewards/grad_term": 0.006979628466069698, - "rewards/margins": 6.386953830718994, - "rewards/rejected": -9.588025093078613, - "step": 295 - }, - { - "epoch": 0.6341724692019283, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.364585833613935, - "learning_rate": 5.321002386634845e-07, - "logits/chosen": 0.7363272905349731, - "logits/rejected": 0.6899486780166626, - "logps/accuracies": 0.75, - "logps/chosen": -605.6181030273438, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -478.4359130859375, - "logps/ref_rejected": -392.2603759765625, - "logps/rejected": -669.0337524414062, - "loss": 0.176, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.359109401702881, - "rewards/grad_term": 0.0019621604587882757, - "rewards/margins": 7.479557991027832, - "rewards/rejected": -13.838666915893555, - "step": 296 - }, - { - "epoch": 0.6363149437600428, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.505205942394765, - "learning_rate": 5.312649164677804e-07, - "logits/chosen": 0.5754671096801758, - "logits/rejected": 0.594577431678772, - "logps/accuracies": 0.75, - "logps/chosen": -347.17852783203125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -265.71417236328125, - "logps/ref_rejected": -234.78533935546875, - "logps/rejected": -441.4500732421875, - "loss": 0.1279, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.07321834564209, - "rewards/grad_term": 0.004567963071167469, - "rewards/margins": 6.260017395019531, - "rewards/rejected": -10.333235740661621, - "step": 297 - }, - { - "epoch": 0.6384574183181575, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 18.711798908689843, - "learning_rate": 5.304295942720763e-07, - "logits/chosen": 0.870806872844696, - "logits/rejected": 0.6316779851913452, - "logps/accuracies": 0.75, - "logps/chosen": -419.4470520019531, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -333.89263916015625, - "logps/ref_rejected": -268.04510498046875, - "logps/rejected": -475.2600402832031, - "loss": 0.1525, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.277721405029297, - "rewards/grad_term": 0.0027266484685242176, - "rewards/margins": 6.083026885986328, - "rewards/rejected": -10.360748291015625, - "step": 298 - }, - { - "epoch": 0.6405998928762721, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 13.094138352584997, - "learning_rate": 5.295942720763724e-07, - "logits/chosen": 0.4675275683403015, - "logits/rejected": 0.606252133846283, - "logps/accuracies": 0.75, - "logps/chosen": -437.78228759765625, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -361.85552978515625, - "logps/ref_rejected": -334.0460510253906, - "logps/rejected": -510.67462158203125, - "loss": 0.162, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.7963359355926514, - "rewards/grad_term": 0.00101923244073987, - "rewards/margins": 5.035091876983643, - "rewards/rejected": -8.831427574157715, - "step": 299 - }, - { - "epoch": 0.6427423674343867, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 1.0, - "grad_norm": 10.604360471548478, - "learning_rate": 5.287589498806682e-07, - "logits/chosen": 0.7601810097694397, - "logits/rejected": 0.5254924297332764, - "logps/accuracies": 0.0, - "logps/chosen": -309.6020202636719, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -279.2956848144531, - "logps/ref_rejected": -154.2837371826172, - "logps/rejected": -275.95166015625, - "loss": 0.1947, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5153169631958008, - "rewards/grad_term": 0.00842749048024416, - "rewards/margins": 4.56807804107666, - "rewards/rejected": -6.083395004272461, - "step": 300 - }, - { - "epoch": 0.6448848419925013, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 16.672250976383555, - "learning_rate": 5.279236276849642e-07, - "logits/chosen": 1.0493147373199463, - "logits/rejected": 0.5671635270118713, - "logps/accuracies": 0.75, - "logps/chosen": -357.0266418457031, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -274.6451721191406, - "logps/ref_rejected": -204.9969482421875, - "logps/rejected": -309.4927978515625, - "loss": 0.2159, - "rewards/accuracies": 0.75, - "rewards/chosen": -4.119072914123535, - "rewards/grad_term": 0.014873827807605267, - "rewards/margins": 1.1057183742523193, - "rewards/rejected": -5.224791049957275, - "step": 301 - }, - { - "epoch": 0.647027316550616, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.772181415056538, - "learning_rate": 5.270883054892601e-07, - "logits/chosen": 0.6905862092971802, - "logits/rejected": 0.6120530366897583, - "logps/accuracies": 1.0, - "logps/chosen": -301.63970947265625, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -240.38360595703125, - "logps/ref_rejected": -259.4972839355469, - "logps/rejected": -422.0617370605469, - "loss": 0.1314, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.0628066062927246, - "rewards/grad_term": 0.002045161323621869, - "rewards/margins": 5.065417289733887, - "rewards/rejected": -8.128223419189453, - "step": 302 - }, - { - "epoch": 0.6491697911087306, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.52033177772429, - "learning_rate": 5.262529832935561e-07, - "logits/chosen": 0.5544182658195496, - "logits/rejected": 0.45686784386634827, - "logps/accuracies": 1.0, - "logps/chosen": -278.7823181152344, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -196.63429260253906, - "logps/ref_rejected": -185.2466583251953, - "logps/rejected": -335.3304443359375, - "loss": 0.1615, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.1074018478393555, - "rewards/grad_term": 0.003845647443085909, - "rewards/margins": 3.3967883586883545, - "rewards/rejected": -7.504190444946289, - "step": 303 - }, - { - "epoch": 0.6513122656668452, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.017492432602673, - "learning_rate": 5.25417661097852e-07, - "logits/chosen": 0.8323764204978943, - "logits/rejected": 0.7166695594787598, - "logps/accuracies": 1.0, - "logps/chosen": -524.2388916015625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -408.53387451171875, - "logps/ref_rejected": -332.67181396484375, - "logps/rejected": -610.9110717773438, - "loss": 0.1781, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.785252571105957, - "rewards/grad_term": 8.712082490092143e-05, - "rewards/margins": 8.12671184539795, - "rewards/rejected": -13.911964416503906, - "step": 304 - }, - { - "epoch": 0.6534547402249599, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.355578424387678, - "learning_rate": 5.245823389021479e-07, - "logits/chosen": 0.6990536451339722, - "logits/rejected": 0.6334518790245056, - "logps/accuracies": 0.75, - "logps/chosen": -439.53704833984375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -357.226318359375, - "logps/ref_rejected": -357.77783203125, - "logps/rejected": -587.9480590820312, - "loss": 0.1535, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.115536689758301, - "rewards/grad_term": 7.772783283144236e-05, - "rewards/margins": 7.392976760864258, - "rewards/rejected": -11.508513450622559, - "step": 305 - }, - { - "epoch": 0.6555972147830744, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 5.205005797383153, - "learning_rate": 5.237470167064439e-07, - "logits/chosen": 0.8662493228912354, - "logits/rejected": 0.800337553024292, - "logps/accuracies": 0.75, - "logps/chosen": -347.75860595703125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -291.16888427734375, - "logps/ref_rejected": -224.49195861816406, - "logps/rejected": -400.5769958496094, - "loss": 0.1112, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.829484701156616, - "rewards/grad_term": 0.004376052878797054, - "rewards/margins": 5.974765777587891, - "rewards/rejected": -8.804250717163086, - "step": 306 - }, - { - "epoch": 0.6577396893411891, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 13.319843033839788, - "learning_rate": 5.229116945107398e-07, - "logits/chosen": 0.6567318439483643, - "logits/rejected": 0.7602465748786926, - "logps/accuracies": 1.0, - "logps/chosen": -388.985107421875, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -338.995361328125, - "logps/ref_rejected": -395.92230224609375, - "logps/rejected": -499.59552001953125, - "loss": 0.2511, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.4994890689849854, - "rewards/grad_term": 0.0039435261860489845, - "rewards/margins": 2.684171438217163, - "rewards/rejected": -5.183660984039307, - "step": 307 - }, - { - "epoch": 0.6598821638993037, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.066998414831264, - "learning_rate": 5.220763723150358e-07, - "logits/chosen": 0.8940625786781311, - "logits/rejected": 0.903926432132721, - "logps/accuracies": 1.0, - "logps/chosen": -430.4771423339844, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -328.47900390625, - "logps/ref_rejected": -382.0928955078125, - "logps/rejected": -580.9340209960938, - "loss": 0.1645, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.099907875061035, - "rewards/grad_term": 0.0023625774774700403, - "rewards/margins": 4.842148780822754, - "rewards/rejected": -9.942056655883789, - "step": 308 - }, - { - "epoch": 0.6620246384574183, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 5.765055407846113, - "learning_rate": 5.212410501193317e-07, - "logits/chosen": 0.7993382215499878, - "logits/rejected": 0.6162198185920715, - "logps/accuracies": 0.75, - "logps/chosen": -455.43585205078125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -387.1122741699219, - "logps/ref_rejected": -310.28033447265625, - "logps/rejected": -487.04913330078125, - "loss": 0.1081, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.4161789417266846, - "rewards/grad_term": 0.0006664180546067655, - "rewards/margins": 5.422262668609619, - "rewards/rejected": -8.838441848754883, - "step": 309 - }, - { - "epoch": 0.664167113015533, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 10.526420429572342, - "learning_rate": 5.204057279236276e-07, - "logits/chosen": 0.6536291241645813, - "logits/rejected": 0.7170487642288208, - "logps/accuracies": 1.0, - "logps/chosen": -343.4113464355469, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -269.9009704589844, - "logps/ref_rejected": -301.39697265625, - "logps/rejected": -437.5945129394531, - "loss": 0.1736, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.6755175590515137, - "rewards/grad_term": 0.004929536487907171, - "rewards/margins": 3.1343586444854736, - "rewards/rejected": -6.809875965118408, - "step": 310 - }, - { - "epoch": 0.6663095875736476, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.317599300319795, - "learning_rate": 5.195704057279236e-07, - "logits/chosen": 0.7689659595489502, - "logits/rejected": 0.5763236284255981, - "logps/accuracies": 0.75, - "logps/chosen": -359.8375244140625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -294.6138916015625, - "logps/ref_rejected": -236.17083740234375, - "logps/rejected": -379.31927490234375, - "loss": 0.1194, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.2611827850341797, - "rewards/grad_term": 0.003922306001186371, - "rewards/margins": 3.896237850189209, - "rewards/rejected": -7.157420635223389, - "step": 311 - }, - { - "epoch": 0.6684520621317622, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.491312588715147, - "learning_rate": 5.187350835322196e-07, - "logits/chosen": 0.6590454578399658, - "logits/rejected": 0.3621766269207001, - "logps/accuracies": 0.75, - "logps/chosen": -279.87823486328125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -219.8527374267578, - "logps/ref_rejected": -189.44700622558594, - "logps/rejected": -309.5084533691406, - "loss": 0.2011, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.001275062561035, - "rewards/grad_term": 0.0161147378385067, - "rewards/margins": 3.0017971992492676, - "rewards/rejected": -6.003072261810303, - "step": 312 - }, - { - "epoch": 0.6705945366898768, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.580563906265365, - "learning_rate": 5.178997613365155e-07, - "logits/chosen": 0.8489370942115784, - "logits/rejected": 0.7335869073867798, - "logps/accuracies": 0.75, - "logps/chosen": -423.52874755859375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -348.4372863769531, - "logps/ref_rejected": -324.8551025390625, - "logps/rejected": -563.6224975585938, - "loss": 0.131, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.754572629928589, - "rewards/grad_term": 0.00203010905534029, - "rewards/margins": 8.183797836303711, - "rewards/rejected": -11.938370704650879, - "step": 313 - }, - { - "epoch": 0.6727370112479915, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 8.743399053571068, - "learning_rate": 5.170644391408115e-07, - "logits/chosen": 0.940852165222168, - "logits/rejected": 0.7845810651779175, - "logps/accuracies": 0.25, - "logps/chosen": -512.4435424804688, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -435.47698974609375, - "logps/ref_rejected": -288.2704162597656, - "logps/rejected": -490.13836669921875, - "loss": 0.1653, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.84832763671875, - "rewards/grad_term": 0.00016607397992629558, - "rewards/margins": 6.24506950378418, - "rewards/rejected": -10.09339714050293, - "step": 314 - }, - { - "epoch": 0.674879485806106, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.878420297933516, - "learning_rate": 5.162291169451074e-07, - "logits/chosen": 0.7630524039268494, - "logits/rejected": 0.6900883913040161, - "logps/accuracies": 0.75, - "logps/chosen": -469.132080078125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -356.17041015625, - "logps/ref_rejected": -346.5165100097656, - "logps/rejected": -575.3904418945312, - "loss": 0.1708, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.648085594177246, - "rewards/grad_term": 0.0032002755906432867, - "rewards/margins": 5.795612335205078, - "rewards/rejected": -11.443696975708008, - "step": 315 - }, - { - "epoch": 0.6770219603642207, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.731907474004624, - "learning_rate": 5.153937947494033e-07, - "logits/chosen": 0.5569190979003906, - "logits/rejected": 0.6198952794075012, - "logps/accuracies": 1.0, - "logps/chosen": -371.6037292480469, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -293.22381591796875, - "logps/ref_rejected": -300.0587463378906, - "logps/rejected": -534.5460205078125, - "loss": 0.1858, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.9189953804016113, - "rewards/grad_term": 0.00014798434858676046, - "rewards/margins": 7.8053669929504395, - "rewards/rejected": -11.72436237335205, - "step": 316 - }, - { - "epoch": 0.6791644349223352, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.47786260702623, - "learning_rate": 5.145584725536993e-07, - "logits/chosen": 0.8376766443252563, - "logits/rejected": 0.7569788098335266, - "logps/accuracies": 0.75, - "logps/chosen": -496.47161865234375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -384.5050048828125, - "logps/ref_rejected": -329.0260925292969, - "logps/rejected": -620.14892578125, - "loss": 0.1397, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.598330497741699, - "rewards/grad_term": 0.0014825062826275826, - "rewards/margins": 8.95781135559082, - "rewards/rejected": -14.556142807006836, - "step": 317 - }, - { - "epoch": 0.6813069094804499, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 6.306648680307601, - "learning_rate": 5.137231503579952e-07, - "logits/chosen": 0.6440654993057251, - "logits/rejected": 0.5563719868659973, - "logps/accuracies": 1.0, - "logps/chosen": -464.43914794921875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -345.64239501953125, - "logps/ref_rejected": -302.1645202636719, - "logps/rejected": -522.718505859375, - "loss": 0.1143, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.939836502075195, - "rewards/grad_term": 0.003235024632886052, - "rewards/margins": 5.087862968444824, - "rewards/rejected": -11.02769947052002, - "step": 318 - }, - { - "epoch": 0.6834493840385646, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 11.209754726841052, - "learning_rate": 5.128878281622912e-07, - "logits/chosen": 0.7397335171699524, - "logits/rejected": 0.6839704513549805, - "logps/accuracies": 0.75, - "logps/chosen": -304.3392639160156, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -254.27145385742188, - "logps/ref_rejected": -232.0565948486328, - "logps/rejected": -400.621337890625, - "loss": 0.179, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.503389835357666, - "rewards/grad_term": 0.005540979094803333, - "rewards/margins": 5.924847602844238, - "rewards/rejected": -8.428237915039062, - "step": 319 - }, - { - "epoch": 0.6855918585966791, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.033781332110546, - "learning_rate": 5.120525059665871e-07, - "logits/chosen": 0.7168871164321899, - "logits/rejected": 0.7381715774536133, - "logps/accuracies": 1.0, - "logps/chosen": -508.3505859375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -435.618896484375, - "logps/ref_rejected": -405.5649719238281, - "logps/rejected": -566.0382080078125, - "loss": 0.1406, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.636585235595703, - "rewards/grad_term": 0.0022972766309976578, - "rewards/margins": 4.387078762054443, - "rewards/rejected": -8.023663520812988, - "step": 320 - }, - { - "epoch": 0.6855918585966791, - "eval_flips/correct->correct": 0.14000000059604645, - "eval_flips/correct->incorrect": 0.019999999552965164, - "eval_flips/incorrect->correct": 0.5400000214576721, - "eval_flips/incorrect->incorrect": 0.30000001192092896, - "eval_logits/chosen": 0.7168383002281189, - "eval_logits/rejected": 0.599204957485199, - "eval_logps/accuracies": 0.6800000071525574, - "eval_logps/chosen": -391.7984619140625, - "eval_logps/ref_accuracies": 0.1599999964237213, - "eval_logps/ref_chosen": -323.51568603515625, - "eval_logps/ref_rejected": -258.70098876953125, - "eval_logps/rejected": -410.0682678222656, - "eval_loss": 0.17036853730678558, - "eval_rewards/accuracies": 0.8999999761581421, - "eval_rewards/chosen": -3.414141893386841, - "eval_rewards/grad_term": 0.007643704302608967, - "eval_rewards/margins": 4.154223442077637, - "eval_rewards/rejected": -7.568365097045898, - "eval_runtime": 374.585, - "eval_samples_per_second": 4.218, - "eval_steps_per_second": 0.133, - "step": 320 - }, - { - "epoch": 0.6877343331547938, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.190622967503248, - "learning_rate": 5.11217183770883e-07, - "logits/chosen": 0.8069337606430054, - "logits/rejected": 0.7580575942993164, - "logps/accuracies": 1.0, - "logps/chosen": -503.8867492675781, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -380.65765380859375, - "logps/ref_rejected": -327.27337646484375, - "logps/rejected": -596.4686279296875, - "loss": 0.1443, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.1614532470703125, - "rewards/grad_term": 0.00022571110457647592, - "rewards/margins": 7.298309326171875, - "rewards/rejected": -13.459762573242188, - "step": 321 - }, - { - "epoch": 0.6898768077129084, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 13.881500506001192, - "learning_rate": 5.10381861575179e-07, - "logits/chosen": 0.9965633749961853, - "logits/rejected": 0.7457981109619141, - "logps/accuracies": 0.75, - "logps/chosen": -474.2216491699219, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -421.9150390625, - "logps/ref_rejected": -353.48065185546875, - "logps/rejected": -548.97265625, - "loss": 0.1638, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.6153299808502197, - "rewards/grad_term": 0.0007248412584885955, - "rewards/margins": 7.159272193908691, - "rewards/rejected": -9.774601936340332, - "step": 322 - }, - { - "epoch": 0.692019282271023, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.643673103869992, - "learning_rate": 5.095465393794749e-07, - "logits/chosen": 0.49757474660873413, - "logits/rejected": 0.41455477476119995, - "logps/accuracies": 0.5, - "logps/chosen": -357.2688293457031, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -293.4722900390625, - "logps/ref_rejected": -242.71273803710938, - "logps/rejected": -381.94488525390625, - "loss": 0.1431, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.1898274421691895, - "rewards/grad_term": 0.0033772799652069807, - "rewards/margins": 3.771780490875244, - "rewards/rejected": -6.961607933044434, - "step": 323 - }, - { - "epoch": 0.6941617568291376, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.731140676983227, - "learning_rate": 5.087112171837709e-07, - "logits/chosen": 0.8970646858215332, - "logits/rejected": 0.7831761240959167, - "logps/accuracies": 1.0, - "logps/chosen": -555.628173828125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -494.30938720703125, - "logps/ref_rejected": -413.06451416015625, - "logps/rejected": -587.2745971679688, - "loss": 0.1652, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.065938711166382, - "rewards/grad_term": 0.002578580752015114, - "rewards/margins": 5.644565582275391, - "rewards/rejected": -8.710504531860352, - "step": 324 - }, - { - "epoch": 0.6963042313872523, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 6.5010699802203655, - "learning_rate": 5.078758949880667e-07, - "logits/chosen": 0.8070268630981445, - "logits/rejected": 0.6490368247032166, - "logps/accuracies": 1.0, - "logps/chosen": -577.15478515625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -460.09661865234375, - "logps/ref_rejected": -362.884033203125, - "logps/rejected": -633.048095703125, - "loss": 0.1418, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.852906227111816, - "rewards/grad_term": 0.00013052637223154306, - "rewards/margins": 7.655299186706543, - "rewards/rejected": -13.50820541381836, - "step": 325 - }, - { - "epoch": 0.698446705945367, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.546015605698586, - "learning_rate": 5.070405727923628e-07, - "logits/chosen": 0.5231800079345703, - "logits/rejected": 0.4889012575149536, - "logps/accuracies": 0.75, - "logps/chosen": -264.241455078125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -208.79318237304688, - "logps/ref_rejected": -202.1544189453125, - "logps/rejected": -345.9344787597656, - "loss": 0.1226, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.7724146842956543, - "rewards/grad_term": 0.007728134281933308, - "rewards/margins": 4.41658878326416, - "rewards/rejected": -7.189002990722656, - "step": 326 - }, - { - "epoch": 0.7005891805034815, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 12.648714009679955, - "learning_rate": 5.062052505966587e-07, - "logits/chosen": 0.7718223333358765, - "logits/rejected": 0.7055037021636963, - "logps/accuracies": 0.75, - "logps/chosen": -459.9751892089844, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -358.0395812988281, - "logps/ref_rejected": -338.55364990234375, - "logps/rejected": -556.436279296875, - "loss": 0.1803, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.096780776977539, - "rewards/grad_term": 0.0066120820119977, - "rewards/margins": 5.797348976135254, - "rewards/rejected": -10.89413070678711, - "step": 327 - }, - { - "epoch": 0.7027316550615962, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.30205872449101, - "learning_rate": 5.053699284009546e-07, - "logits/chosen": 0.6625626683235168, - "logits/rejected": 0.6732759475708008, - "logps/accuracies": 0.75, - "logps/chosen": -315.85107421875, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -257.46136474609375, - "logps/ref_rejected": -273.6905212402344, - "logps/rejected": -384.72711181640625, - "loss": 0.1767, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.919485092163086, - "rewards/grad_term": 0.007900861091911793, - "rewards/margins": 2.6323447227478027, - "rewards/rejected": -5.551829814910889, - "step": 328 - }, - { - "epoch": 0.7048741296197107, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 10.156605830327571, - "learning_rate": 5.045346062052505e-07, - "logits/chosen": 0.7717033624649048, - "logits/rejected": 0.6601508855819702, - "logps/accuracies": 1.0, - "logps/chosen": -463.640380859375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -382.90325927734375, - "logps/ref_rejected": -358.9565734863281, - "logps/rejected": -566.164306640625, - "loss": 0.1797, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.036858558654785, - "rewards/grad_term": 0.0012353091733530164, - "rewards/margins": 6.323529243469238, - "rewards/rejected": -10.360387802124023, - "step": 329 - }, - { - "epoch": 0.7070166041778254, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.641900480934386, - "learning_rate": 5.036992840095465e-07, - "logits/chosen": 0.6714078187942505, - "logits/rejected": 0.5740436315536499, - "logps/accuracies": 1.0, - "logps/chosen": -453.4899597167969, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -362.24810791015625, - "logps/ref_rejected": -388.57666015625, - "logps/rejected": -607.985595703125, - "loss": 0.1501, - "rewards/accuracies": 0.75, - "rewards/chosen": -4.562093257904053, - "rewards/grad_term": 0.011679948307573795, - "rewards/margins": 6.408352851867676, - "rewards/rejected": -10.97044563293457, - "step": 330 - }, - { - "epoch": 0.70915907873594, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.768091277395346, - "learning_rate": 5.028639618138425e-07, - "logits/chosen": 0.4849713146686554, - "logits/rejected": 0.3761657476425171, - "logps/accuracies": 0.75, - "logps/chosen": -472.632080078125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -394.3712158203125, - "logps/ref_rejected": -320.7559814453125, - "logps/rejected": -576.399658203125, - "loss": 0.1809, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.9130444526672363, - "rewards/grad_term": 0.0002858239458873868, - "rewards/margins": 8.869141578674316, - "rewards/rejected": -12.782186508178711, - "step": 331 - }, - { - "epoch": 0.7113015532940546, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.361339079624875, - "learning_rate": 5.020286396181383e-07, - "logits/chosen": 0.7538321614265442, - "logits/rejected": 0.619208574295044, - "logps/accuracies": 0.5, - "logps/chosen": -538.201416015625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -406.4328308105469, - "logps/ref_rejected": -341.9276123046875, - "logps/rejected": -551.7299194335938, - "loss": 0.1106, - "rewards/accuracies": 0.75, - "rewards/chosen": -6.588429927825928, - "rewards/grad_term": 0.009247011505067348, - "rewards/margins": 3.9016823768615723, - "rewards/rejected": -10.4901123046875, - "step": 332 - }, - { - "epoch": 0.7134440278521692, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.211461201062549, - "learning_rate": 5.011933174224344e-07, - "logits/chosen": 0.9981447458267212, - "logits/rejected": 0.6901638507843018, - "logps/accuracies": 0.75, - "logps/chosen": -473.5837707519531, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -413.8009948730469, - "logps/ref_rejected": -332.0254211425781, - "logps/rejected": -507.0432434082031, - "loss": 0.1231, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.9891393184661865, - "rewards/grad_term": 0.0005725694936700165, - "rewards/margins": 5.761752605438232, - "rewards/rejected": -8.75089168548584, - "step": 333 - }, - { - "epoch": 0.7155865024102839, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.170697657476269, - "learning_rate": 5.003579952267303e-07, - "logits/chosen": 0.826168954372406, - "logits/rejected": 0.6049452424049377, - "logps/accuracies": 0.5, - "logps/chosen": -268.6787109375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -210.28704833984375, - "logps/ref_rejected": -152.79354858398438, - "logps/rejected": -252.23291015625, - "loss": 0.1311, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.919583797454834, - "rewards/grad_term": 0.009706183336675167, - "rewards/margins": 2.0523836612701416, - "rewards/rejected": -4.9719672203063965, - "step": 334 - }, - { - "epoch": 0.7177289769683985, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.060208207763219, - "learning_rate": 4.995226730310263e-07, - "logits/chosen": 0.8894015550613403, - "logits/rejected": 0.6564974784851074, - "logps/accuracies": 1.0, - "logps/chosen": -500.45635986328125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -414.86962890625, - "logps/ref_rejected": -351.6375732421875, - "logps/rejected": -605.1538696289062, - "loss": 0.1278, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.2793378829956055, - "rewards/grad_term": 0.0034387409687042236, - "rewards/margins": 8.396476745605469, - "rewards/rejected": -12.675814628601074, - "step": 335 - }, - { - "epoch": 0.7198714515265131, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.25, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.56000560457653, - "learning_rate": 4.986873508353221e-07, - "logits/chosen": 0.6553113460540771, - "logits/rejected": 0.6264476180076599, - "logps/accuracies": 0.75, - "logps/chosen": -260.0503234863281, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -188.68035888671875, - "logps/ref_rejected": -198.7422332763672, - "logps/rejected": -327.9527282714844, - "loss": 0.157, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.5684990882873535, - "rewards/grad_term": 0.014257272705435753, - "rewards/margins": 2.8920247554779053, - "rewards/rejected": -6.46052360534668, - "step": 336 - }, - { - "epoch": 0.7220139260846278, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 5.247249685460771, - "learning_rate": 4.978520286396182e-07, - "logits/chosen": 0.8244426250457764, - "logits/rejected": 0.6587880849838257, - "logps/accuracies": 0.75, - "logps/chosen": -550.476318359375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -413.40692138671875, - "logps/ref_rejected": -312.9790954589844, - "logps/rejected": -601.8764038085938, - "loss": 0.1166, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.853466987609863, - "rewards/grad_term": 0.0013105407124385238, - "rewards/margins": 7.591399192810059, - "rewards/rejected": -14.444866180419922, - "step": 337 - }, - { - "epoch": 0.7241564006427423, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.3962044900308355, - "learning_rate": 4.970167064439141e-07, - "logits/chosen": 0.36316683888435364, - "logits/rejected": 0.26318785548210144, - "logps/accuracies": 0.5, - "logps/chosen": -459.98480224609375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -393.48419189453125, - "logps/ref_rejected": -242.56094360351562, - "logps/rejected": -432.6356506347656, - "loss": 0.1479, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.3250298500061035, - "rewards/grad_term": 0.0004948938149027526, - "rewards/margins": 6.178703784942627, - "rewards/rejected": -9.50373363494873, - "step": 338 - }, - { - "epoch": 0.726298875200857, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.583520613174011, - "learning_rate": 4.9618138424821e-07, - "logits/chosen": 0.6698114275932312, - "logits/rejected": 0.6079827547073364, - "logps/accuracies": 1.0, - "logps/chosen": -316.6778869628906, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -285.0715637207031, - "logps/ref_rejected": -286.4187316894531, - "logps/rejected": -387.62554931640625, - "loss": 0.1442, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5803159475326538, - "rewards/grad_term": 0.007696358487010002, - "rewards/margins": 3.480024814605713, - "rewards/rejected": -5.060340881347656, - "step": 339 - }, - { - "epoch": 0.7284413497589716, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.838033966465567, - "learning_rate": 4.953460620525059e-07, - "logits/chosen": 0.7049505710601807, - "logits/rejected": 0.6646623015403748, - "logps/accuracies": 1.0, - "logps/chosen": -391.0535888671875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -312.54132080078125, - "logps/ref_rejected": -331.8018798828125, - "logps/rejected": -517.3717041015625, - "loss": 0.146, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.925614833831787, - "rewards/grad_term": 0.007477066479623318, - "rewards/margins": 5.35287618637085, - "rewards/rejected": -9.278491020202637, - "step": 340 - }, - { - "epoch": 0.7305838243170862, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.719543349851092, - "learning_rate": 4.945107398568019e-07, - "logits/chosen": 0.6794430017471313, - "logits/rejected": 0.5714715719223022, - "logps/accuracies": 1.0, - "logps/chosen": -349.4491271972656, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -272.2052001953125, - "logps/ref_rejected": -253.03768920898438, - "logps/rejected": -437.6853942871094, - "loss": 0.1376, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.862197160720825, - "rewards/grad_term": 0.004704746417701244, - "rewards/margins": 5.3701887130737305, - "rewards/rejected": -9.232385635375977, - "step": 341 - }, - { - "epoch": 0.7327262988752009, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.451256306905016, - "learning_rate": 4.936754176610979e-07, - "logits/chosen": 0.6575830578804016, - "logits/rejected": 0.6638262271881104, - "logps/accuracies": 0.75, - "logps/chosen": -314.11114501953125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -257.2018737792969, - "logps/ref_rejected": -241.7303009033203, - "logps/rejected": -397.7080078125, - "loss": 0.1602, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.845463275909424, - "rewards/grad_term": 0.009044932201504707, - "rewards/margins": 4.953423023223877, - "rewards/rejected": -7.798886299133301, - "step": 342 - }, - { - "epoch": 0.7348687734333155, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.459693770757402, - "learning_rate": 4.928400954653937e-07, - "logits/chosen": 0.529110312461853, - "logits/rejected": 0.46302855014801025, - "logps/accuracies": 0.75, - "logps/chosen": -384.07061767578125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -316.386962890625, - "logps/ref_rejected": -298.5597839355469, - "logps/rejected": -452.8936767578125, - "loss": 0.1482, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.3841824531555176, - "rewards/grad_term": 0.0015116020804271102, - "rewards/margins": 4.332512378692627, - "rewards/rejected": -7.7166948318481445, - "step": 343 - }, - { - "epoch": 0.7370112479914301, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.920848833475498, - "learning_rate": 4.920047732696897e-07, - "logits/chosen": 0.7953596711158752, - "logits/rejected": 0.7348934412002563, - "logps/accuracies": 0.75, - "logps/chosen": -594.716552734375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -465.59210205078125, - "logps/ref_rejected": -436.11572265625, - "logps/rejected": -708.698974609375, - "loss": 0.1812, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.456222057342529, - "rewards/grad_term": 0.0025254576466977596, - "rewards/margins": 7.172940731048584, - "rewards/rejected": -13.629162788391113, - "step": 344 - }, - { - "epoch": 0.7391537225495447, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.924633562541802, - "learning_rate": 4.911694510739857e-07, - "logits/chosen": 0.5842578411102295, - "logits/rejected": 0.6216070652008057, - "logps/accuracies": 1.0, - "logps/chosen": -463.14154052734375, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -373.7330322265625, - "logps/ref_rejected": -408.91278076171875, - "logps/rejected": -685.0960693359375, - "loss": 0.1078, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.470427513122559, - "rewards/grad_term": 0.0002844279515556991, - "rewards/margins": 9.338738441467285, - "rewards/rejected": -13.809165000915527, - "step": 345 - }, - { - "epoch": 0.7412961971076594, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 12.414755725942422, - "learning_rate": 4.903341288782816e-07, - "logits/chosen": 0.8328725099563599, - "logits/rejected": 0.7645402550697327, - "logps/accuracies": 0.75, - "logps/chosen": -414.1654968261719, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -368.20654296875, - "logps/ref_rejected": -308.8746337890625, - "logps/rejected": -470.432373046875, - "loss": 0.2259, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.297947883605957, - "rewards/grad_term": 0.00027060159482061863, - "rewards/margins": 5.779940605163574, - "rewards/rejected": -8.077888488769531, - "step": 346 - }, - { - "epoch": 0.7434386716657739, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.169966513004652, - "learning_rate": 4.894988066825775e-07, - "logits/chosen": 0.554535984992981, - "logits/rejected": 0.265200138092041, - "logps/accuracies": 0.75, - "logps/chosen": -356.061767578125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -315.3848876953125, - "logps/ref_rejected": -268.03143310546875, - "logps/rejected": -405.561767578125, - "loss": 0.1324, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.033844470977783, - "rewards/grad_term": 0.005571221467107534, - "rewards/margins": 4.842672348022461, - "rewards/rejected": -6.876516342163086, - "step": 347 - }, - { - "epoch": 0.7455811462238886, - "flips/correct->correct": 1.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 6.180681687024289, - "learning_rate": 4.886634844868734e-07, - "logits/chosen": 0.6118199229240417, - "logits/rejected": 0.6814651489257812, - "logps/accuracies": 1.0, - "logps/chosen": -246.84640502929688, - "logps/ref_accuracies": 1.0, - "logps/ref_chosen": -195.61514282226562, - "logps/ref_rejected": -214.57460021972656, - "logps/rejected": -361.24395751953125, - "loss": 0.1331, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.5615625381469727, - "rewards/grad_term": 0.007279230747371912, - "rewards/margins": 4.77190637588501, - "rewards/rejected": -7.333469390869141, - "step": 348 - }, - { - "epoch": 0.7477236207820032, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.210731357114345, - "learning_rate": 4.878281622911695e-07, - "logits/chosen": 1.0106936693191528, - "logits/rejected": 0.45131033658981323, - "logps/accuracies": 0.5, - "logps/chosen": -454.847412109375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -324.7191162109375, - "logps/ref_rejected": -223.54873657226562, - "logps/rejected": -440.8221740722656, - "loss": 0.1168, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.506414890289307, - "rewards/grad_term": 0.009222909808158875, - "rewards/margins": 4.357255935668945, - "rewards/rejected": -10.86367130279541, - "step": 349 - }, - { - "epoch": 0.7498660953401178, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.362818321342788, - "learning_rate": 4.869928400954653e-07, - "logits/chosen": 0.586777925491333, - "logits/rejected": 0.5780112147331238, - "logps/accuracies": 1.0, - "logps/chosen": -262.08038330078125, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -232.11459350585938, - "logps/ref_rejected": -231.54193115234375, - "logps/rejected": -376.2002868652344, - "loss": 0.1459, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.4982905387878418, - "rewards/grad_term": 0.006848993711173534, - "rewards/margins": 5.734626770019531, - "rewards/rejected": -7.232917785644531, - "step": 350 - }, - { - "epoch": 0.7520085698982325, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.297264703165888, - "learning_rate": 4.861575178997613e-07, - "logits/chosen": 0.5312216877937317, - "logits/rejected": 0.5838853716850281, - "logps/accuracies": 1.0, - "logps/chosen": -411.0349426269531, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -309.9207763671875, - "logps/ref_rejected": -289.38275146484375, - "logps/rejected": -511.20758056640625, - "loss": 0.1103, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.055708885192871, - "rewards/grad_term": 0.0036189735401421785, - "rewards/margins": 6.035533428192139, - "rewards/rejected": -11.091242790222168, - "step": 351 - }, - { - "epoch": 0.7541510444563471, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.720091390841414, - "learning_rate": 4.853221957040573e-07, - "logits/chosen": 0.8272877931594849, - "logits/rejected": 0.6150888204574585, - "logps/accuracies": 0.75, - "logps/chosen": -382.7689514160156, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -322.699951171875, - "logps/ref_rejected": -250.23941040039062, - "logps/rejected": -417.6976623535156, - "loss": 0.1311, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.0034492015838623, - "rewards/grad_term": 0.0010191942565143108, - "rewards/margins": 5.369463920593262, - "rewards/rejected": -8.372913360595703, - "step": 352 - }, - { - "epoch": 0.7562935190144617, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.819345327687853, - "learning_rate": 4.844868735083532e-07, - "logits/chosen": 0.8716313242912292, - "logits/rejected": 0.7711046934127808, - "logps/accuracies": 0.75, - "logps/chosen": -352.4783020019531, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -285.69561767578125, - "logps/ref_rejected": -241.1641845703125, - "logps/rejected": -428.366943359375, - "loss": 0.131, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.3391335010528564, - "rewards/grad_term": 0.006919885985553265, - "rewards/margins": 6.021005153656006, - "rewards/rejected": -9.360138893127441, - "step": 353 - }, - { - "epoch": 0.7584359935725763, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.544013532966439, - "learning_rate": 4.836515513126491e-07, - "logits/chosen": 0.8605263233184814, - "logits/rejected": 0.7548648715019226, - "logps/accuracies": 0.75, - "logps/chosen": -513.380126953125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -414.0445861816406, - "logps/ref_rejected": -390.1202392578125, - "logps/rejected": -633.4596557617188, - "loss": 0.1838, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.966775417327881, - "rewards/grad_term": 0.005854703951627016, - "rewards/margins": 7.2001953125, - "rewards/rejected": -12.166970252990723, - "step": 354 - }, - { - "epoch": 0.760578468130691, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.840668647752036, - "learning_rate": 4.82816229116945e-07, - "logits/chosen": 0.49805212020874023, - "logits/rejected": 0.5557568669319153, - "logps/accuracies": 0.75, - "logps/chosen": -327.806884765625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -277.4596252441406, - "logps/ref_rejected": -261.7403869628906, - "logps/rejected": -415.97454833984375, - "loss": 0.083, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.5173633098602295, - "rewards/grad_term": 0.0034056631848216057, - "rewards/margins": 5.194344997406006, - "rewards/rejected": -7.711708068847656, - "step": 355 - }, - { - "epoch": 0.7627209426888055, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 6.886486362606452, - "learning_rate": 4.819809069212411e-07, - "logits/chosen": 0.523609459400177, - "logits/rejected": 0.3943213224411011, - "logps/accuracies": 1.0, - "logps/chosen": -449.4044494628906, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -386.558349609375, - "logps/ref_rejected": -362.3239440917969, - "logps/rejected": -540.4671020507812, - "loss": 0.1275, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.1423020362854004, - "rewards/grad_term": 0.000673395290505141, - "rewards/margins": 5.76485538482666, - "rewards/rejected": -8.907157897949219, - "step": 356 - }, - { - "epoch": 0.7648634172469202, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.772834608130855, - "learning_rate": 4.811455847255369e-07, - "logits/chosen": 0.7413110733032227, - "logits/rejected": 0.6600308418273926, - "logps/accuracies": 0.75, - "logps/chosen": -372.83050537109375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -308.9735107421875, - "logps/ref_rejected": -233.7576904296875, - "logps/rejected": -412.01788330078125, - "loss": 0.107, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.192850351333618, - "rewards/grad_term": 0.0023573609068989754, - "rewards/margins": 5.720157623291016, - "rewards/rejected": -8.913007736206055, - "step": 357 - }, - { - "epoch": 0.7670058918050349, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.586255491609359, - "learning_rate": 4.803102625298329e-07, - "logits/chosen": 0.7194008827209473, - "logits/rejected": 0.529259443283081, - "logps/accuracies": 0.25, - "logps/chosen": -470.25994873046875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -412.6962890625, - "logps/ref_rejected": -344.50018310546875, - "logps/rejected": -454.2938537597656, - "loss": 0.1335, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.8781819343566895, - "rewards/grad_term": 0.006395381409674883, - "rewards/margins": 2.6115007400512695, - "rewards/rejected": -5.489683151245117, - "step": 358 - }, - { - "epoch": 0.7691483663631494, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.427948430933896, - "learning_rate": 4.794749403341288e-07, - "logits/chosen": 0.5997850298881531, - "logits/rejected": 0.39114004373550415, - "logps/accuracies": 0.5, - "logps/chosen": -450.5962219238281, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -360.58770751953125, - "logps/ref_rejected": -266.60211181640625, - "logps/rejected": -491.2527770996094, - "loss": 0.1088, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.500424861907959, - "rewards/grad_term": 0.0012759572127833962, - "rewards/margins": 6.7321085929870605, - "rewards/rejected": -11.232534408569336, - "step": 359 - }, - { - "epoch": 0.7712908409212641, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 11.10147872579214, - "learning_rate": 4.786396181384249e-07, - "logits/chosen": 0.6526762247085571, - "logits/rejected": 0.406318336725235, - "logps/accuracies": 0.75, - "logps/chosen": -362.89215087890625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -327.1744079589844, - "logps/ref_rejected": -287.7540588378906, - "logps/rejected": -391.6522216796875, - "loss": 0.1822, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7858877182006836, - "rewards/grad_term": 0.003739753272384405, - "rewards/margins": 3.4090189933776855, - "rewards/rejected": -5.194906234741211, - "step": 360 - }, - { - "epoch": 0.7734333154793787, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.009801801350886, - "learning_rate": 4.778042959427207e-07, - "logits/chosen": 0.9286985993385315, - "logits/rejected": 0.75478196144104, - "logps/accuracies": 0.75, - "logps/chosen": -341.165771484375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -286.45068359375, - "logps/ref_rejected": -226.50819396972656, - "logps/rejected": -388.7208251953125, - "loss": 0.1502, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.7357535362243652, - "rewards/grad_term": 0.005150636192411184, - "rewards/margins": 5.374879360198975, - "rewards/rejected": -8.11063289642334, - "step": 361 - }, - { - "epoch": 0.7755757900374933, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.079915978073272, - "learning_rate": 4.769689737470167e-07, - "logits/chosen": 0.15154126286506653, - "logits/rejected": 0.23431162536144257, - "logps/accuracies": 0.75, - "logps/chosen": -141.20352172851562, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -122.65444946289062, - "logps/ref_rejected": -115.94908142089844, - "logps/rejected": -179.49026489257812, - "loss": 0.1532, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9274539351463318, - "rewards/grad_term": 0.005933484528213739, - "rewards/margins": 2.249605417251587, - "rewards/rejected": -3.1770591735839844, - "step": 362 - }, - { - "epoch": 0.7777182645956079, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.620781698654811, - "learning_rate": 4.7613365155131265e-07, - "logits/chosen": 0.6806411147117615, - "logits/rejected": 0.5877007842063904, - "logps/accuracies": 1.0, - "logps/chosen": -305.52105712890625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -225.72100830078125, - "logps/ref_rejected": -230.88204956054688, - "logps/rejected": -416.0865783691406, - "loss": 0.1264, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.9900035858154297, - "rewards/grad_term": 0.0007602861733175814, - "rewards/margins": 5.270223617553711, - "rewards/rejected": -9.26022720336914, - "step": 363 - }, - { - "epoch": 0.7798607391537226, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.162163225237173, - "learning_rate": 4.752983293556086e-07, - "logits/chosen": 0.3107702434062958, - "logits/rejected": 0.46145111322402954, - "logps/accuracies": 0.75, - "logps/chosen": -376.48138427734375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -298.7530517578125, - "logps/ref_rejected": -248.01364135742188, - "logps/rejected": -483.236572265625, - "loss": 0.1518, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.8864171504974365, - "rewards/grad_term": 0.0027246952522546053, - "rewards/margins": 7.874729156494141, - "rewards/rejected": -11.761146545410156, - "step": 364 - }, - { - "epoch": 0.7820032137118371, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.682442207333185, - "learning_rate": 4.744630071599045e-07, - "logits/chosen": 0.8286511301994324, - "logits/rejected": 0.7338289618492126, - "logps/accuracies": 0.75, - "logps/chosen": -475.2028503417969, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -419.9889221191406, - "logps/ref_rejected": -337.7398681640625, - "logps/rejected": -524.1643676757812, - "loss": 0.1149, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.7606959342956543, - "rewards/grad_term": 0.0009419883135706186, - "rewards/margins": 6.5605316162109375, - "rewards/rejected": -9.321227073669434, - "step": 365 - }, - { - "epoch": 0.7841456882699518, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.969521309776873, - "learning_rate": 4.736276849642005e-07, - "logits/chosen": 0.4039073884487152, - "logits/rejected": 0.4401341676712036, - "logps/accuracies": 0.75, - "logps/chosen": -332.14044189453125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -283.09283447265625, - "logps/ref_rejected": -206.31539916992188, - "logps/rejected": -350.80340576171875, - "loss": 0.1418, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.4523794651031494, - "rewards/grad_term": 0.0033193090930581093, - "rewards/margins": 4.772021293640137, - "rewards/rejected": -7.224400997161865, - "step": 366 - }, - { - "epoch": 0.7862881628280665, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.54876490811146, - "learning_rate": 4.727923627684964e-07, - "logits/chosen": 0.3236476182937622, - "logits/rejected": 0.3070971667766571, - "logps/accuracies": 0.5, - "logps/chosen": -317.3150329589844, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -236.6580810546875, - "logps/ref_rejected": -158.20616149902344, - "logps/rejected": -292.539794921875, - "loss": 0.1308, - "rewards/accuracies": 0.75, - "rewards/chosen": -4.0328474044799805, - "rewards/grad_term": 0.016018129885196686, - "rewards/margins": 2.683833599090576, - "rewards/rejected": -6.716681003570557, - "step": 367 - }, - { - "epoch": 0.788430637386181, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 10.13550622656799, - "learning_rate": 4.7195704057279233e-07, - "logits/chosen": 0.762974739074707, - "logits/rejected": 0.506515622138977, - "logps/accuracies": 0.5, - "logps/chosen": -476.596923828125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -387.86663818359375, - "logps/ref_rejected": -268.920166015625, - "logps/rejected": -499.3272705078125, - "loss": 0.1571, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.4365129470825195, - "rewards/grad_term": 0.0013110407162457705, - "rewards/margins": 7.083842754364014, - "rewards/rejected": -11.520356178283691, - "step": 368 - }, - { - "epoch": 0.7905731119442957, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.573319632626368, - "learning_rate": 4.7112171837708825e-07, - "logits/chosen": 0.9512593150138855, - "logits/rejected": 0.7041115164756775, - "logps/accuracies": 0.75, - "logps/chosen": -443.4069519042969, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -363.83258056640625, - "logps/ref_rejected": -308.1307373046875, - "logps/rejected": -442.51458740234375, - "loss": 0.1507, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.9787182807922363, - "rewards/grad_term": 0.005531268659979105, - "rewards/margins": 2.7404749393463135, - "rewards/rejected": -6.719193458557129, - "step": 369 - }, - { - "epoch": 0.7927155865024103, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 5.584409310189419, - "learning_rate": 4.7028639618138423e-07, - "logits/chosen": 0.6789947152137756, - "logits/rejected": 0.46235227584838867, - "logps/accuracies": 0.5, - "logps/chosen": -433.4268798828125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -329.707763671875, - "logps/ref_rejected": -254.446533203125, - "logps/rejected": -444.1405334472656, - "loss": 0.1189, - "rewards/accuracies": 0.75, - "rewards/chosen": -5.185955047607422, - "rewards/grad_term": 0.010698116384446621, - "rewards/margins": 4.2987446784973145, - "rewards/rejected": -9.484700202941895, - "step": 370 - }, - { - "epoch": 0.7948580610605249, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 11.82663440003344, - "learning_rate": 4.694510739856802e-07, - "logits/chosen": 0.8684903383255005, - "logits/rejected": 0.864480197429657, - "logps/accuracies": 1.0, - "logps/chosen": -616.7009887695312, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -439.9715270996094, - "logps/ref_rejected": -483.77447509765625, - "logps/rejected": -717.4370727539062, - "loss": 0.1749, - "rewards/accuracies": 0.75, - "rewards/chosen": -8.83647346496582, - "rewards/grad_term": 0.009361391887068748, - "rewards/margins": 2.84665846824646, - "rewards/rejected": -11.68313217163086, - "step": 371 - }, - { - "epoch": 0.7970005356186395, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.700184569183158, - "learning_rate": 4.686157517899761e-07, - "logits/chosen": 0.3798208236694336, - "logits/rejected": 0.18705379962921143, - "logps/accuracies": 0.75, - "logps/chosen": -290.2025451660156, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -256.36285400390625, - "logps/ref_rejected": -187.69712829589844, - "logps/rejected": -312.7464599609375, - "loss": 0.1903, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.6919848918914795, - "rewards/grad_term": 0.0075987353920936584, - "rewards/margins": 4.560481548309326, - "rewards/rejected": -6.252466201782227, - "step": 372 - }, - { - "epoch": 0.7991430101767542, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.246215247843911, - "learning_rate": 4.6778042959427206e-07, - "logits/chosen": 0.8069986701011658, - "logits/rejected": 0.7122650146484375, - "logps/accuracies": 1.0, - "logps/chosen": -267.30328369140625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -210.80979919433594, - "logps/ref_rejected": -209.55477905273438, - "logps/rejected": -342.04205322265625, - "loss": 0.1123, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.8246755599975586, - "rewards/grad_term": 0.003649149788543582, - "rewards/margins": 3.799687385559082, - "rewards/rejected": -6.624362468719482, - "step": 373 - }, - { - "epoch": 0.8012854847348688, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.128291731066904, - "learning_rate": 4.6694510739856804e-07, - "logits/chosen": 0.8819460868835449, - "logits/rejected": 0.760127067565918, - "logps/accuracies": 0.75, - "logps/chosen": -614.6334228515625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -528.1080932617188, - "logps/ref_rejected": -390.6625061035156, - "logps/rejected": -600.1669921875, - "loss": 0.1455, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.326266288757324, - "rewards/grad_term": 0.0019579888321459293, - "rewards/margins": 6.148959159851074, - "rewards/rejected": -10.475224494934082, - "step": 374 - }, - { - "epoch": 0.8034279592929834, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 15.828594310560685, - "learning_rate": 4.661097852028639e-07, - "logits/chosen": 0.8489161729812622, - "logits/rejected": 0.753060519695282, - "logps/accuracies": 0.5, - "logps/chosen": -382.72735595703125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -323.9566650390625, - "logps/ref_rejected": -298.6670227050781, - "logps/rejected": -467.1494140625, - "loss": 0.1952, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.938535213470459, - "rewards/grad_term": 0.004120782017707825, - "rewards/margins": 5.485583305358887, - "rewards/rejected": -8.424118041992188, - "step": 375 - }, - { - "epoch": 0.8055704338510981, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 10.583125585454663, - "learning_rate": 4.652744630071599e-07, - "logits/chosen": 0.5661925673484802, - "logits/rejected": 0.3597102761268616, - "logps/accuracies": 0.5, - "logps/chosen": -306.2489318847656, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -252.90496826171875, - "logps/ref_rejected": -201.43569946289062, - "logps/rejected": -383.89324951171875, - "loss": 0.1964, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.6671996116638184, - "rewards/grad_term": 0.0024612259585410357, - "rewards/margins": 6.455678462982178, - "rewards/rejected": -9.122878074645996, - "step": 376 - }, - { - "epoch": 0.8077129084092126, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.4392788570254815, - "learning_rate": 4.644391408114558e-07, - "logits/chosen": 0.3249109983444214, - "logits/rejected": 0.32942840456962585, - "logps/accuracies": 0.75, - "logps/chosen": -448.28216552734375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -387.504638671875, - "logps/ref_rejected": -307.7704162597656, - "logps/rejected": -473.29437255859375, - "loss": 0.1221, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.0388755798339844, - "rewards/grad_term": 0.0016592949395999312, - "rewards/margins": 5.237322807312012, - "rewards/rejected": -8.276198387145996, - "step": 377 - }, - { - "epoch": 0.8098553829673273, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.329181119972007, - "learning_rate": 4.636038186157518e-07, - "logits/chosen": 0.404056191444397, - "logits/rejected": 0.6022278070449829, - "logps/accuracies": 1.0, - "logps/chosen": -382.3240051269531, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -305.3075256347656, - "logps/ref_rejected": -321.82904052734375, - "logps/rejected": -545.7882080078125, - "loss": 0.1446, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.8508248329162598, - "rewards/grad_term": 0.0002711013949010521, - "rewards/margins": 7.347134590148926, - "rewards/rejected": -11.197959899902344, - "step": 378 - }, - { - "epoch": 0.8119978575254418, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.175949590736797, - "learning_rate": 4.6276849642004767e-07, - "logits/chosen": 0.751959502696991, - "logits/rejected": 0.5633019804954529, - "logps/accuracies": 1.0, - "logps/chosen": -502.3103942871094, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -433.314697265625, - "logps/ref_rejected": -358.736083984375, - "logps/rejected": -592.6541748046875, - "loss": 0.1823, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.449784278869629, - "rewards/grad_term": 4.590416210703552e-05, - "rewards/margins": 8.24611759185791, - "rewards/rejected": -11.695901870727539, - "step": 379 - }, - { - "epoch": 0.8141403320835565, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 11.61637016421897, - "learning_rate": 4.6193317422434364e-07, - "logits/chosen": 0.5796254873275757, - "logits/rejected": 0.6662420630455017, - "logps/accuracies": 1.0, - "logps/chosen": -412.05316162109375, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -339.7696533203125, - "logps/ref_rejected": -340.8929748535156, - "logps/rejected": -524.9725341796875, - "loss": 0.1719, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.61417555809021, - "rewards/grad_term": 0.0034167964477092028, - "rewards/margins": 5.589802265167236, - "rewards/rejected": -9.203977584838867, - "step": 380 - }, - { - "epoch": 0.8162828066416711, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 6.094201585504359, - "learning_rate": 4.610978520286396e-07, - "logits/chosen": 0.7151302695274353, - "logits/rejected": 0.6419375538825989, - "logps/accuracies": 1.0, - "logps/chosen": -305.9559326171875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -270.7724609375, - "logps/ref_rejected": -270.0223083496094, - "logps/rejected": -445.2619934082031, - "loss": 0.125, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7591745853424072, - "rewards/grad_term": 0.0021109317895025015, - "rewards/margins": 7.002810001373291, - "rewards/rejected": -8.761983871459961, - "step": 381 - }, - { - "epoch": 0.8184252811997857, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 14.362881277342565, - "learning_rate": 4.602625298329356e-07, - "logits/chosen": 0.8437117338180542, - "logits/rejected": 0.7384243607521057, - "logps/accuracies": 0.5, - "logps/chosen": -461.12603759765625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -386.73773193359375, - "logps/ref_rejected": -304.27459716796875, - "logps/rejected": -513.4872436523438, - "loss": 0.1601, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.7194161415100098, - "rewards/grad_term": 0.00258549302816391, - "rewards/margins": 6.741215705871582, - "rewards/rejected": -10.46063232421875, - "step": 382 - }, - { - "epoch": 0.8205677557579004, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.456611514133404, - "learning_rate": 4.594272076372315e-07, - "logits/chosen": 0.7477813363075256, - "logits/rejected": 0.3820553123950958, - "logps/accuracies": 0.75, - "logps/chosen": -507.67462158203125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -389.72613525390625, - "logps/ref_rejected": -367.64569091796875, - "logps/rejected": -579.4849243164062, - "loss": 0.1604, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.89742374420166, - "rewards/grad_term": 0.007863424718379974, - "rewards/margins": 4.694538593292236, - "rewards/rejected": -10.591961860656738, - "step": 383 - }, - { - "epoch": 0.822710230316015, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 5.8619420532382325, - "learning_rate": 4.5859188544152745e-07, - "logits/chosen": 0.7787905931472778, - "logits/rejected": 0.767812967300415, - "logps/accuracies": 0.75, - "logps/chosen": -384.25, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -312.675537109375, - "logps/ref_rejected": -270.84429931640625, - "logps/rejected": -437.036376953125, - "loss": 0.1155, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.5787243843078613, - "rewards/grad_term": 0.003301040269434452, - "rewards/margins": 4.730880260467529, - "rewards/rejected": -8.30960464477539, - "step": 384 - }, - { - "epoch": 0.822710230316015, - "eval_flips/correct->correct": 0.14000000059604645, - "eval_flips/correct->incorrect": 0.019999999552965164, - "eval_flips/incorrect->correct": 0.5400000214576721, - "eval_flips/incorrect->incorrect": 0.30000001192092896, - "eval_logits/chosen": 0.6912816762924194, - "eval_logits/rejected": 0.5833743810653687, - "eval_logps/accuracies": 0.6800000071525574, - "eval_logps/chosen": -383.8902282714844, - "eval_logps/ref_accuracies": 0.1599999964237213, - "eval_logps/ref_chosen": -323.51568603515625, - "eval_logps/ref_rejected": -258.70098876953125, - "eval_logps/rejected": -404.5517578125, - "eval_loss": 0.15501657128334045, - "eval_rewards/accuracies": 0.8799999952316284, - "eval_rewards/chosen": -3.018728256225586, - "eval_rewards/grad_term": 0.00715669384226203, - "eval_rewards/margins": 4.273808002471924, - "eval_rewards/rejected": -7.292536735534668, - "eval_runtime": 372.7419, - "eval_samples_per_second": 4.239, - "eval_steps_per_second": 0.134, - "step": 384 - }, - { - "epoch": 0.8248527048741296, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 8.34443643722511, - "learning_rate": 4.577565632458234e-07, - "logits/chosen": 0.6030393242835999, - "logits/rejected": 0.5619946718215942, - "logps/accuracies": 0.5, - "logps/chosen": -445.7275695800781, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -341.169921875, - "logps/ref_rejected": -280.4991455078125, - "logps/rejected": -452.7607116699219, - "loss": 0.1376, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.227883815765381, - "rewards/grad_term": 0.003972820471972227, - "rewards/margins": 3.3851945400238037, - "rewards/rejected": -8.613078117370605, - "step": 385 - }, - { - "epoch": 0.8269951794322442, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.067319624450543, - "learning_rate": 4.569212410501193e-07, - "logits/chosen": 0.6664745211601257, - "logits/rejected": 0.6004454493522644, - "logps/accuracies": 0.5, - "logps/chosen": -461.43505859375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -375.47607421875, - "logps/ref_rejected": -329.8832092285156, - "logps/rejected": -491.4964599609375, - "loss": 0.1263, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.297948360443115, - "rewards/grad_term": 0.006247952580451965, - "rewards/margins": 3.7827141284942627, - "rewards/rejected": -8.080662727355957, - "step": 386 - }, - { - "epoch": 0.8291376539903589, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.42647077537532, - "learning_rate": 4.5608591885441523e-07, - "logits/chosen": 0.6861754655838013, - "logits/rejected": 0.39711299538612366, - "logps/accuracies": 0.75, - "logps/chosen": -266.0076904296875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -222.40213012695312, - "logps/ref_rejected": -151.0525665283203, - "logps/rejected": -257.8247985839844, - "loss": 0.1474, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.1802759170532227, - "rewards/grad_term": 0.0029023890383541584, - "rewards/margins": 3.1583352088928223, - "rewards/rejected": -5.338611125946045, - "step": 387 - }, - { - "epoch": 0.8312801285484734, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.54998768600324, - "learning_rate": 4.552505966587112e-07, - "logits/chosen": 0.7813968658447266, - "logits/rejected": 0.675197184085846, - "logps/accuracies": 0.75, - "logps/chosen": -384.80474853515625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -335.5448913574219, - "logps/ref_rejected": -288.6382141113281, - "logps/rejected": -429.424072265625, - "loss": 0.105, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.462993621826172, - "rewards/grad_term": 0.004009037744253874, - "rewards/margins": 4.57629919052124, - "rewards/rejected": -7.039292335510254, - "step": 388 - }, - { - "epoch": 0.8334226031065881, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.3771391402744175, - "learning_rate": 4.544152744630072e-07, - "logits/chosen": 0.5336940884590149, - "logits/rejected": 0.61911940574646, - "logps/accuracies": 1.0, - "logps/chosen": -435.4273986816406, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -310.7679443359375, - "logps/ref_rejected": -220.2869110107422, - "logps/rejected": -539.0606079101562, - "loss": 0.1034, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.232973098754883, - "rewards/grad_term": 0.002168377162888646, - "rewards/margins": 9.705711364746094, - "rewards/rejected": -15.938684463500977, - "step": 389 - }, - { - "epoch": 0.8355650776647028, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 5.772490840913882, - "learning_rate": 4.5357995226730306e-07, - "logits/chosen": 0.7710881233215332, - "logits/rejected": 0.52399080991745, - "logps/accuracies": 0.75, - "logps/chosen": -394.009033203125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -299.70672607421875, - "logps/ref_rejected": -240.57080078125, - "logps/rejected": -437.76953125, - "loss": 0.0987, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.715115070343018, - "rewards/grad_term": 0.0006557138403877616, - "rewards/margins": 5.14482307434082, - "rewards/rejected": -9.85993766784668, - "step": 390 - }, - { - "epoch": 0.8377075522228173, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.038299673221383, - "learning_rate": 4.5274463007159904e-07, - "logits/chosen": 0.825120747089386, - "logits/rejected": 0.6199119687080383, - "logps/accuracies": 0.25, - "logps/chosen": -434.44329833984375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -317.6658630371094, - "logps/ref_rejected": -241.26890563964844, - "logps/rejected": -444.71636962890625, - "loss": 0.1355, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.838871002197266, - "rewards/grad_term": 0.001953305210918188, - "rewards/margins": 4.333502769470215, - "rewards/rejected": -10.172372817993164, - "step": 391 - }, - { - "epoch": 0.839850026780932, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.157799585200374, - "learning_rate": 4.5190930787589496e-07, - "logits/chosen": 0.7274588346481323, - "logits/rejected": 0.658560037612915, - "logps/accuracies": 1.0, - "logps/chosen": -377.17510986328125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -304.23382568359375, - "logps/ref_rejected": -287.71441650390625, - "logps/rejected": -464.94073486328125, - "loss": 0.1512, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.6470634937286377, - "rewards/grad_term": 0.0007109759608283639, - "rewards/margins": 5.214252948760986, - "rewards/rejected": -8.861316680908203, - "step": 392 - }, - { - "epoch": 0.8419925013390466, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.274433302537343, - "learning_rate": 4.510739856801909e-07, - "logits/chosen": 0.5300111770629883, - "logits/rejected": 0.4925421476364136, - "logps/accuracies": 0.75, - "logps/chosen": -535.7554321289062, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -405.7460021972656, - "logps/ref_rejected": -341.57281494140625, - "logps/rejected": -637.828369140625, - "loss": 0.1401, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.500471115112305, - "rewards/grad_term": 0.0015781800029799342, - "rewards/margins": 8.312305450439453, - "rewards/rejected": -14.812776565551758, - "step": 393 - }, - { - "epoch": 0.8441349758971612, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 6.636233486579053, - "learning_rate": 4.502386634844868e-07, - "logits/chosen": 0.4728167653083801, - "logits/rejected": 0.7265413999557495, - "logps/accuracies": 1.0, - "logps/chosen": -329.78082275390625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -262.4573669433594, - "logps/ref_rejected": -250.28829956054688, - "logps/rejected": -432.49798583984375, - "loss": 0.1203, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.3661720752716064, - "rewards/grad_term": 0.002081993967294693, - "rewards/margins": 5.744311332702637, - "rewards/rejected": -9.110483169555664, - "step": 394 - }, - { - "epoch": 0.8462774504552758, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 11.191471059477506, - "learning_rate": 4.494033412887828e-07, - "logits/chosen": 0.5292783975601196, - "logits/rejected": 0.2622219920158386, - "logps/accuracies": 0.75, - "logps/chosen": -296.090576171875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -242.09730529785156, - "logps/ref_rejected": -185.52487182617188, - "logps/rejected": -350.0084228515625, - "loss": 0.1594, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.6996638774871826, - "rewards/grad_term": 0.004369885195046663, - "rewards/margins": 5.5245137214660645, - "rewards/rejected": -8.224177360534668, - "step": 395 - }, - { - "epoch": 0.8484199250133905, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.697422074983217, - "learning_rate": 4.4856801909307877e-07, - "logits/chosen": 0.5663112998008728, - "logits/rejected": 0.41860711574554443, - "logps/accuracies": 0.75, - "logps/chosen": -370.9486083984375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -322.1562805175781, - "logps/ref_rejected": -241.59457397460938, - "logps/rejected": -408.6671142578125, - "loss": 0.1077, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.439617156982422, - "rewards/grad_term": 0.0069171166978776455, - "rewards/margins": 5.914010047912598, - "rewards/rejected": -8.353628158569336, - "step": 396 - }, - { - "epoch": 0.850562399571505, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 13.015020922003387, - "learning_rate": 4.4773269689737464e-07, - "logits/chosen": 0.7214323282241821, - "logits/rejected": 0.633575975894928, - "logps/accuracies": 1.0, - "logps/chosen": -596.1569213867188, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -488.6678161621094, - "logps/ref_rejected": -393.7802734375, - "logps/rejected": -751.7131958007812, - "loss": 0.2131, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.374455451965332, - "rewards/grad_term": 6.355544996949902e-07, - "rewards/margins": 12.522188186645508, - "rewards/rejected": -17.896644592285156, - "step": 397 - }, - { - "epoch": 0.8527048741296197, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 4.892748358005199, - "learning_rate": 4.468973747016706e-07, - "logits/chosen": 0.7805695533752441, - "logits/rejected": 0.6413770914077759, - "logps/accuracies": 1.0, - "logps/chosen": -511.5725402832031, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -438.80120849609375, - "logps/ref_rejected": -409.7392578125, - "logps/rejected": -677.0660400390625, - "loss": 0.0722, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.6385655403137207, - "rewards/grad_term": 2.607361057016533e-05, - "rewards/margins": 9.727773666381836, - "rewards/rejected": -13.366338729858398, - "step": 398 - }, - { - "epoch": 0.8548473486877344, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 12.448276262372078, - "learning_rate": 4.460620525059666e-07, - "logits/chosen": 0.7015663385391235, - "logits/rejected": 0.5971590280532837, - "logps/accuracies": 1.0, - "logps/chosen": -475.1459655761719, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -387.7801818847656, - "logps/ref_rejected": -357.5787353515625, - "logps/rejected": -657.3110961914062, - "loss": 0.1691, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.368288993835449, - "rewards/grad_term": 0.0006424501189030707, - "rewards/margins": 10.618330955505371, - "rewards/rejected": -14.98661994934082, - "step": 399 - }, - { - "epoch": 0.8569898232458489, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.52996684599927, - "learning_rate": 4.452267303102625e-07, - "logits/chosen": 0.5823007822036743, - "logits/rejected": 0.5328406095504761, - "logps/accuracies": 1.0, - "logps/chosen": -302.14312744140625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -234.80886840820312, - "logps/ref_rejected": -250.11843872070312, - "logps/rejected": -457.92333984375, - "loss": 0.1442, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.3667118549346924, - "rewards/grad_term": 0.0009050341905094683, - "rewards/margins": 7.023532867431641, - "rewards/rejected": -10.390244483947754, - "step": 400 - }, - { - "epoch": 0.8591322978039636, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.537952351751538, - "learning_rate": 4.4439140811455845e-07, - "logits/chosen": 0.929060161113739, - "logits/rejected": 0.7344148755073547, - "logps/accuracies": 0.5, - "logps/chosen": -484.63690185546875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -397.1466064453125, - "logps/ref_rejected": -278.0077819824219, - "logps/rejected": -507.8617858886719, - "loss": 0.1082, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.374514102935791, - "rewards/grad_term": 0.0002766952384263277, - "rewards/margins": 7.11818790435791, - "rewards/rejected": -11.492701530456543, - "step": 401 - }, - { - "epoch": 0.8612747723620782, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 5.644686107983424, - "learning_rate": 4.435560859188544e-07, - "logits/chosen": 0.771916389465332, - "logits/rejected": 0.6290404200553894, - "logps/accuracies": 0.75, - "logps/chosen": -315.5869140625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -250.51466369628906, - "logps/ref_rejected": -214.51107788085938, - "logps/rejected": -375.64447021484375, - "loss": 0.1285, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.253613233566284, - "rewards/grad_term": 0.0008995306561701, - "rewards/margins": 4.803055763244629, - "rewards/rejected": -8.056669235229492, - "step": 402 - }, - { - "epoch": 0.8634172469201928, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.733257087221838, - "learning_rate": 4.4272076372315035e-07, - "logits/chosen": 0.7453795075416565, - "logits/rejected": 0.6315554976463318, - "logps/accuracies": 1.0, - "logps/chosen": -232.2364501953125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -196.71493530273438, - "logps/ref_rejected": -155.52903747558594, - "logps/rejected": -270.3780822753906, - "loss": 0.0766, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7760753631591797, - "rewards/grad_term": 0.002003555418923497, - "rewards/margins": 3.966376543045044, - "rewards/rejected": -5.7424516677856445, - "step": 403 - }, - { - "epoch": 0.8655597214783074, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.60971738924586, - "learning_rate": 4.418854415274462e-07, - "logits/chosen": 0.7506588697433472, - "logits/rejected": 0.6142828464508057, - "logps/accuracies": 1.0, - "logps/chosen": -397.05242919921875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -317.8343505859375, - "logps/ref_rejected": -303.29852294921875, - "logps/rejected": -455.8990783691406, - "loss": 0.1456, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.9609031677246094, - "rewards/grad_term": 0.0018456345424056053, - "rewards/margins": 3.6691231727600098, - "rewards/rejected": -7.630025863647461, - "step": 404 - }, - { - "epoch": 0.8677021960364221, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.056320190778132, - "learning_rate": 4.410501193317422e-07, - "logits/chosen": 0.5372669696807861, - "logits/rejected": 0.5302236676216125, - "logps/accuracies": 0.75, - "logps/chosen": -419.73486328125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -331.661865234375, - "logps/ref_rejected": -297.73260498046875, - "logps/rejected": -506.001220703125, - "loss": 0.1351, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.403650283813477, - "rewards/grad_term": 0.0054626427590847015, - "rewards/margins": 6.0097808837890625, - "rewards/rejected": -10.413432121276855, - "step": 405 - }, - { - "epoch": 0.8698446705945367, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.174234002962357, - "learning_rate": 4.402147971360382e-07, - "logits/chosen": 0.7001669406890869, - "logits/rejected": 0.39515483379364014, - "logps/accuracies": 0.75, - "logps/chosen": -257.07257080078125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -186.89627075195312, - "logps/ref_rejected": -187.3160400390625, - "logps/rejected": -366.7100524902344, - "loss": 0.1342, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.508815288543701, - "rewards/grad_term": 0.0030996918212622404, - "rewards/margins": 5.4608845710754395, - "rewards/rejected": -8.96969985961914, - "step": 406 - }, - { - "epoch": 0.8719871451526513, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 5.813541027309185, - "learning_rate": 4.3937947494033416e-07, - "logits/chosen": 0.7106292247772217, - "logits/rejected": 0.17329132556915283, - "logps/accuracies": 0.5, - "logps/chosen": -406.671142578125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -309.2118835449219, - "logps/ref_rejected": -226.40023803710938, - "logps/rejected": -371.4858093261719, - "loss": 0.1001, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.872962474822998, - "rewards/grad_term": 0.007784712128341198, - "rewards/margins": 2.381316900253296, - "rewards/rejected": -7.254279136657715, - "step": 407 - }, - { - "epoch": 0.874129619710766, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 5.142665145862327, - "learning_rate": 4.3854415274463003e-07, - "logits/chosen": 0.6431873440742493, - "logits/rejected": 0.5117607116699219, - "logps/accuracies": 0.5, - "logps/chosen": -388.5247802734375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -360.99249267578125, - "logps/ref_rejected": -295.61029052734375, - "logps/rejected": -396.9288330078125, - "loss": 0.0839, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.3766143321990967, - "rewards/grad_term": 0.008713779971003532, - "rewards/margins": 3.689311981201172, - "rewards/rejected": -5.0659260749816895, - "step": 408 - }, - { - "epoch": 0.8762720942688805, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.302023436651439, - "learning_rate": 4.37708830548926e-07, - "logits/chosen": 0.7255518436431885, - "logits/rejected": 0.6550572514533997, - "logps/accuracies": 1.0, - "logps/chosen": -503.82879638671875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -395.3870849609375, - "logps/ref_rejected": -354.33709716796875, - "logps/rejected": -625.1456298828125, - "loss": 0.1401, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.422085285186768, - "rewards/grad_term": 0.0003362699644640088, - "rewards/margins": 8.118339538574219, - "rewards/rejected": -13.540424346923828, - "step": 409 - }, - { - "epoch": 0.8784145688269952, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 6.292464980810114, - "learning_rate": 4.3687350835322194e-07, - "logits/chosen": 0.6224971413612366, - "logits/rejected": 0.7813397645950317, - "logps/accuracies": 1.0, - "logps/chosen": -246.21495056152344, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -178.90476989746094, - "logps/ref_rejected": -192.27903747558594, - "logps/rejected": -309.7457275390625, - "loss": 0.0869, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.365509033203125, - "rewards/grad_term": 0.005968024022877216, - "rewards/margins": 2.5078253746032715, - "rewards/rejected": -5.8733344078063965, - "step": 410 - }, - { - "epoch": 0.8805570433851098, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.994486636309479, - "learning_rate": 4.360381861575179e-07, - "logits/chosen": 0.5689117908477783, - "logits/rejected": 0.4422415494918823, - "logps/accuracies": 0.75, - "logps/chosen": -327.41168212890625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -288.2694091796875, - "logps/ref_rejected": -235.6809844970703, - "logps/rejected": -354.2411804199219, - "loss": 0.1481, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.9571146965026855, - "rewards/grad_term": 0.01035099197179079, - "rewards/margins": 3.9708945751190186, - "rewards/rejected": -5.928009033203125, - "step": 411 - }, - { - "epoch": 0.8826995179432244, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.858727469447434, - "learning_rate": 4.352028639618138e-07, - "logits/chosen": 0.758124589920044, - "logits/rejected": 0.7636011242866516, - "logps/accuracies": 1.0, - "logps/chosen": -433.85906982421875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -363.50360107421875, - "logps/ref_rejected": -367.9163513183594, - "logps/rejected": -580.310791015625, - "loss": 0.0963, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.5177736282348633, - "rewards/grad_term": 0.0005504750879481435, - "rewards/margins": 7.101946830749512, - "rewards/rejected": -10.619720458984375, - "step": 412 - }, - { - "epoch": 0.884841992501339, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.354863325065816, - "learning_rate": 4.3436754176610977e-07, - "logits/chosen": 0.6459320783615112, - "logits/rejected": 0.5770057439804077, - "logps/accuracies": 1.0, - "logps/chosen": -574.0004272460938, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -458.69244384765625, - "logps/ref_rejected": -463.82720947265625, - "logps/rejected": -754.6375122070312, - "loss": 0.0889, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.7653985023498535, - "rewards/grad_term": 0.00017192790983244777, - "rewards/margins": 8.775115966796875, - "rewards/rejected": -14.540514945983887, - "step": 413 - }, - { - "epoch": 0.8869844670594537, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.641937024128401, - "learning_rate": 4.3353221957040575e-07, - "logits/chosen": 0.6838860511779785, - "logits/rejected": 0.5406701564788818, - "logps/accuracies": 0.75, - "logps/chosen": -305.5446472167969, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -251.28353881835938, - "logps/ref_rejected": -207.93319702148438, - "logps/rejected": -345.9137268066406, - "loss": 0.1227, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.7130556106567383, - "rewards/grad_term": 0.000992645276710391, - "rewards/margins": 4.185970306396484, - "rewards/rejected": -6.899025917053223, - "step": 414 - }, - { - "epoch": 0.8891269416175683, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.853463086288327, - "learning_rate": 4.326968973747016e-07, - "logits/chosen": 0.6326106786727905, - "logits/rejected": 0.5018441677093506, - "logps/accuracies": 1.0, - "logps/chosen": -350.4998779296875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -291.879150390625, - "logps/ref_rejected": -234.96595764160156, - "logps/rejected": -414.79351806640625, - "loss": 0.1072, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.931034564971924, - "rewards/grad_term": 0.007088256999850273, - "rewards/margins": 6.060344696044922, - "rewards/rejected": -8.991378784179688, - "step": 415 - }, - { - "epoch": 0.8912694161756829, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 10.353678579870046, - "learning_rate": 4.318615751789976e-07, - "logits/chosen": 0.7541974186897278, - "logits/rejected": 0.6350035667419434, - "logps/accuracies": 1.0, - "logps/chosen": -488.8894348144531, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -403.95233154296875, - "logps/ref_rejected": -311.3324890136719, - "logps/rejected": -574.9945678710938, - "loss": 0.1862, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.246854782104492, - "rewards/grad_term": 0.0008102880092337728, - "rewards/margins": 8.936247825622559, - "rewards/rejected": -13.18310260772705, - "step": 416 - }, - { - "epoch": 0.8934118907337976, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 10.150177255687934, - "learning_rate": 4.310262529832935e-07, - "logits/chosen": 0.7215501666069031, - "logits/rejected": 0.7029194235801697, - "logps/accuracies": 1.0, - "logps/chosen": -449.3460693359375, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -361.64422607421875, - "logps/ref_rejected": -362.3080139160156, - "logps/rejected": -613.3135375976562, - "loss": 0.1859, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.385092258453369, - "rewards/grad_term": 0.0001239069679286331, - "rewards/margins": 8.165184020996094, - "rewards/rejected": -12.550276756286621, - "step": 417 - }, - { - "epoch": 0.8955543652919121, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 7.057751158218558, - "learning_rate": 4.301909307875895e-07, - "logits/chosen": 0.49071842432022095, - "logits/rejected": 0.4239951968193054, - "logps/accuracies": 0.25, - "logps/chosen": -355.53564453125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -259.5860900878906, - "logps/ref_rejected": -194.45254516601562, - "logps/rejected": -348.5555725097656, - "loss": 0.1136, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.797478675842285, - "rewards/grad_term": 0.01176033727824688, - "rewards/margins": 2.9076716899871826, - "rewards/rejected": -7.705150127410889, - "step": 418 - }, - { - "epoch": 0.8976968398500268, - "flips/correct->correct": 0.75, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.128643742839769, - "learning_rate": 4.2935560859188537e-07, - "logits/chosen": 0.5078434944152832, - "logits/rejected": 0.3587588369846344, - "logps/accuracies": 0.75, - "logps/chosen": -334.2418518066406, - "logps/ref_accuracies": 0.75, - "logps/ref_chosen": -276.947998046875, - "logps/ref_rejected": -290.69976806640625, - "logps/rejected": -465.8039245605469, - "loss": 0.1086, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.864691972732544, - "rewards/grad_term": 0.001770102884620428, - "rewards/margins": 5.890515327453613, - "rewards/rejected": -8.755208015441895, - "step": 419 - }, - { - "epoch": 0.8998393144081414, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.460646869207045, - "learning_rate": 4.2852028639618135e-07, - "logits/chosen": 0.6151570081710815, - "logits/rejected": 0.5713327527046204, - "logps/accuracies": 0.75, - "logps/chosen": -357.33209228515625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -289.7698974609375, - "logps/ref_rejected": -239.04769897460938, - "logps/rejected": -366.888427734375, - "loss": 0.1259, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.378108501434326, - "rewards/grad_term": 0.004311670083552599, - "rewards/margins": 3.013929843902588, - "rewards/rejected": -6.392038345336914, - "step": 420 - }, - { - "epoch": 0.901981788966256, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 11.054391516220369, - "learning_rate": 4.2768496420047733e-07, - "logits/chosen": 0.6619023084640503, - "logits/rejected": 0.6487151980400085, - "logps/accuracies": 0.75, - "logps/chosen": -419.2369689941406, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -322.2930603027344, - "logps/ref_rejected": -283.63616943359375, - "logps/rejected": -483.73504638671875, - "loss": 0.1269, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.847194671630859, - "rewards/grad_term": 0.0035053249448537827, - "rewards/margins": 5.157749652862549, - "rewards/rejected": -10.004944801330566, - "step": 421 - }, - { - "epoch": 0.9041242635243707, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.627822881128733, - "learning_rate": 4.268496420047732e-07, - "logits/chosen": 0.7480889558792114, - "logits/rejected": 0.6404839754104614, - "logps/accuracies": 0.75, - "logps/chosen": -348.94525146484375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -275.64013671875, - "logps/ref_rejected": -239.08200073242188, - "logps/rejected": -421.51513671875, - "loss": 0.0891, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.665255546569824, - "rewards/grad_term": 0.008442794904112816, - "rewards/margins": 5.456399917602539, - "rewards/rejected": -9.121655464172363, - "step": 422 - }, - { - "epoch": 0.9062667380824853, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 7.335769347864956, - "learning_rate": 4.260143198090692e-07, - "logits/chosen": 0.6401829123497009, - "logits/rejected": 0.7815831899642944, - "logps/accuracies": 0.5, - "logps/chosen": -379.6376647949219, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -265.36968994140625, - "logps/ref_rejected": -255.16949462890625, - "logps/rejected": -414.13201904296875, - "loss": 0.1133, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.713399410247803, - "rewards/grad_term": 0.007248271256685257, - "rewards/margins": 2.2347278594970703, - "rewards/rejected": -7.948126792907715, - "step": 423 - }, - { - "epoch": 0.9084092126405999, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 9.129137286638404, - "learning_rate": 4.2517899761336516e-07, - "logits/chosen": 0.6732430458068848, - "logits/rejected": 0.5181256532669067, - "logps/accuracies": 0.25, - "logps/chosen": -605.279541015625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -473.38079833984375, - "logps/ref_rejected": -406.4195861816406, - "logps/rejected": -641.9754638671875, - "loss": 0.1132, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.594937801361084, - "rewards/grad_term": 0.0023317153099924326, - "rewards/margins": 5.182857513427734, - "rewards/rejected": -11.777795791625977, - "step": 424 - }, - { - "epoch": 0.9105516871987145, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 14.734996244623405, - "learning_rate": 4.243436754176611e-07, - "logits/chosen": 0.48378825187683105, - "logits/rejected": 0.3249673843383789, - "logps/accuracies": 0.75, - "logps/chosen": -461.893310546875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -390.8454895019531, - "logps/ref_rejected": -320.3680419921875, - "logps/rejected": -476.0527038574219, - "loss": 0.1884, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.552391290664673, - "rewards/grad_term": 0.004285210277885199, - "rewards/margins": 4.231842041015625, - "rewards/rejected": -7.784233093261719, - "step": 425 - }, - { - "epoch": 0.9126941617568292, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.389398153896957, - "learning_rate": 4.23508353221957e-07, - "logits/chosen": 0.7646510004997253, - "logits/rejected": 0.669806957244873, - "logps/accuracies": 1.0, - "logps/chosen": -448.4755859375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -365.5307922363281, - "logps/ref_rejected": -346.1925354003906, - "logps/rejected": -590.5164184570312, - "loss": 0.1623, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.147238254547119, - "rewards/grad_term": 0.00027649421826936305, - "rewards/margins": 8.068957328796387, - "rewards/rejected": -12.216196060180664, - "step": 426 - }, - { - "epoch": 0.9148366363149437, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 11.370091633753265, - "learning_rate": 4.2267303102625293e-07, - "logits/chosen": 0.7482293844223022, - "logits/rejected": 0.6367194652557373, - "logps/accuracies": 1.0, - "logps/chosen": -521.5593872070312, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -416.5623779296875, - "logps/ref_rejected": -381.7167663574219, - "logps/rejected": -664.441162109375, - "loss": 0.1458, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.249850273132324, - "rewards/grad_term": 0.0006000981666147709, - "rewards/margins": 8.886370658874512, - "rewards/rejected": -14.136220932006836, - "step": 427 - }, - { - "epoch": 0.9169791108730584, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 7.038400084548858, - "learning_rate": 4.218377088305489e-07, - "logits/chosen": 0.8124886751174927, - "logits/rejected": 0.47504645586013794, - "logps/accuracies": 0.25, - "logps/chosen": -516.41015625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -438.93255615234375, - "logps/ref_rejected": -273.8423156738281, - "logps/rejected": -499.37445068359375, - "loss": 0.145, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.8738789558410645, - "rewards/grad_term": 0.0022317732218652964, - "rewards/margins": 7.402729034423828, - "rewards/rejected": -11.276607513427734, - "step": 428 - }, - { - "epoch": 0.919121585431173, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.572014319270594, - "learning_rate": 4.210023866348449e-07, - "logits/chosen": 0.6831299066543579, - "logits/rejected": 0.6394398212432861, - "logps/accuracies": 1.0, - "logps/chosen": -486.3926086425781, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -410.18096923828125, - "logps/ref_rejected": -368.5516357421875, - "logps/rejected": -578.9801635742188, - "loss": 0.1459, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.8105835914611816, - "rewards/grad_term": 0.00010745471809059381, - "rewards/margins": 6.710842609405518, - "rewards/rejected": -10.521427154541016, - "step": 429 - }, - { - "epoch": 0.9212640599892876, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 11.689438797952636, - "learning_rate": 4.2016706443914076e-07, - "logits/chosen": 0.7243601083755493, - "logits/rejected": 0.6393451690673828, - "logps/accuracies": 1.0, - "logps/chosen": -562.4426879882812, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -483.3052978515625, - "logps/ref_rejected": -426.74920654296875, - "logps/rejected": -627.9085693359375, - "loss": 0.186, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.9568703174591064, - "rewards/grad_term": 0.000488718505948782, - "rewards/margins": 6.101097106933594, - "rewards/rejected": -10.057967185974121, - "step": 430 - }, - { - "epoch": 0.9234065345474023, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 5.8721575355892615, - "learning_rate": 4.1933174224343674e-07, - "logits/chosen": 0.7541458010673523, - "logits/rejected": 0.6002135276794434, - "logps/accuracies": 1.0, - "logps/chosen": -398.8825378417969, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -333.1788330078125, - "logps/ref_rejected": -287.464599609375, - "logps/rejected": -493.71929931640625, - "loss": 0.0908, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.285184860229492, - "rewards/grad_term": 0.0002467437880113721, - "rewards/margins": 7.027547359466553, - "rewards/rejected": -10.312731742858887, - "step": 431 - }, - { - "epoch": 0.9255490091055169, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 5.73075697487602, - "learning_rate": 4.184964200477327e-07, - "logits/chosen": 0.5785739421844482, - "logits/rejected": 0.6086280345916748, - "logps/accuracies": 1.0, - "logps/chosen": -356.680419921875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -259.8116149902344, - "logps/ref_rejected": -233.67578125, - "logps/rejected": -508.74652099609375, - "loss": 0.0957, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.843441009521484, - "rewards/grad_term": 0.003247784450650215, - "rewards/margins": 8.91009521484375, - "rewards/rejected": -13.75353717803955, - "step": 432 - }, - { - "epoch": 0.9276914836636315, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.580143703823092, - "learning_rate": 4.176610978520286e-07, - "logits/chosen": 0.5819364190101624, - "logits/rejected": 0.6432737708091736, - "logps/accuracies": 0.75, - "logps/chosen": -228.54739379882812, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -149.87022399902344, - "logps/ref_rejected": -146.04185485839844, - "logps/rejected": -256.3289489746094, - "loss": 0.1975, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.933859348297119, - "rewards/grad_term": 0.014459378086030483, - "rewards/margins": 1.580496072769165, - "rewards/rejected": -5.514355659484863, - "step": 433 - }, - { - "epoch": 0.9298339582217461, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.738131094632276, - "learning_rate": 4.1682577565632457e-07, - "logits/chosen": 0.6606433391571045, - "logits/rejected": 0.7345594167709351, - "logps/accuracies": 0.75, - "logps/chosen": -257.4786376953125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -191.63543701171875, - "logps/ref_rejected": -178.1594696044922, - "logps/rejected": -321.8557434082031, - "loss": 0.1658, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.2921600341796875, - "rewards/grad_term": 0.003366851480677724, - "rewards/margins": 3.8926539421081543, - "rewards/rejected": -7.184814453125, - "step": 434 - }, - { - "epoch": 0.9319764327798608, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.358516114828291, - "learning_rate": 4.159904534606205e-07, - "logits/chosen": 0.7316790819168091, - "logits/rejected": 0.4060133099555969, - "logps/accuracies": 0.75, - "logps/chosen": -300.3427734375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -247.51315307617188, - "logps/ref_rejected": -180.51429748535156, - "logps/rejected": -308.9923400878906, - "loss": 0.1218, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.6414809226989746, - "rewards/grad_term": 0.010488856583833694, - "rewards/margins": 3.7824203968048096, - "rewards/rejected": -6.423901081085205, - "step": 435 - }, - { - "epoch": 0.9341189073379753, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.691996285863658, - "learning_rate": 4.151551312649165e-07, - "logits/chosen": 0.7785161733627319, - "logits/rejected": 0.6878386735916138, - "logps/accuracies": 0.75, - "logps/chosen": -333.62005615234375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -292.43572998046875, - "logps/ref_rejected": -241.63511657714844, - "logps/rejected": -372.54913330078125, - "loss": 0.1926, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.059215545654297, - "rewards/grad_term": 0.0018129110103473067, - "rewards/margins": 4.486485481262207, - "rewards/rejected": -6.545701026916504, - "step": 436 - }, - { - "epoch": 0.93626138189609, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 6.403058396602235, - "learning_rate": 4.1431980906921235e-07, - "logits/chosen": 0.6294098496437073, - "logits/rejected": 0.4260585904121399, - "logps/accuracies": 1.0, - "logps/chosen": -306.869384765625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -250.37619018554688, - "logps/ref_rejected": -255.60894775390625, - "logps/rejected": -436.3756103515625, - "loss": 0.1025, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.824659824371338, - "rewards/grad_term": 0.0016512804431840777, - "rewards/margins": 6.2136712074279785, - "rewards/rejected": -9.038331031799316, - "step": 437 - }, - { - "epoch": 0.9384038564542047, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 10.33335056458741, - "learning_rate": 4.1348448687350833e-07, - "logits/chosen": 0.5839954018592834, - "logits/rejected": 0.5881964564323425, - "logps/accuracies": 0.75, - "logps/chosen": -430.4659118652344, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -370.88543701171875, - "logps/ref_rejected": -322.64886474609375, - "logps/rejected": -474.2005615234375, - "loss": 0.1119, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.9790241718292236, - "rewards/grad_term": 0.003231536131352186, - "rewards/margins": 4.598560810089111, - "rewards/rejected": -7.577584743499756, - "step": 438 - }, - { - "epoch": 0.9405463310123192, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.524534921674278, - "learning_rate": 4.126491646778043e-07, - "logits/chosen": 0.8089597225189209, - "logits/rejected": 0.6896387338638306, - "logps/accuracies": 1.0, - "logps/chosen": -643.2845458984375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -516.3448486328125, - "logps/ref_rejected": -450.1552734375, - "logps/rejected": -712.5276489257812, - "loss": 0.1106, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.346982955932617, - "rewards/grad_term": 0.0003070410166401416, - "rewards/margins": 6.771636486053467, - "rewards/rejected": -13.118619918823242, - "step": 439 - }, - { - "epoch": 0.9426888055704339, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.35386030961725, - "learning_rate": 4.118138424821002e-07, - "logits/chosen": 0.4550638198852539, - "logits/rejected": 0.11324800550937653, - "logps/accuracies": 0.75, - "logps/chosen": -256.8120422363281, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -193.059326171875, - "logps/ref_rejected": -195.65859985351562, - "logps/rejected": -337.16546630859375, - "loss": 0.1343, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.1876347064971924, - "rewards/grad_term": 0.007398877292871475, - "rewards/margins": 3.8877077102661133, - "rewards/rejected": -7.075342655181885, - "step": 440 - }, - { - "epoch": 0.9448312801285484, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 11.730894891356659, - "learning_rate": 4.1097852028639616e-07, - "logits/chosen": 0.7687960267066956, - "logits/rejected": 0.6161290407180786, - "logps/accuracies": 1.0, - "logps/chosen": -429.12823486328125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -342.45672607421875, - "logps/ref_rejected": -322.35748291015625, - "logps/rejected": -519.225341796875, - "loss": 0.1435, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.3335771560668945, - "rewards/grad_term": 0.0011160215362906456, - "rewards/margins": 5.5098161697387695, - "rewards/rejected": -9.843393325805664, - "step": 441 - }, - { - "epoch": 0.9469737546866631, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.1151510120037, - "learning_rate": 4.1014319809069213e-07, - "logits/chosen": 0.5211978554725647, - "logits/rejected": 0.4055239260196686, - "logps/accuracies": 0.75, - "logps/chosen": -252.66140747070312, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -219.652099609375, - "logps/ref_rejected": -156.125244140625, - "logps/rejected": -275.7467956542969, - "loss": 0.1075, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.6504652500152588, - "rewards/grad_term": 0.005798153579235077, - "rewards/margins": 4.330613136291504, - "rewards/rejected": -5.981078624725342, - "step": 442 - }, - { - "epoch": 0.9491162292447777, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.643922368612464, - "learning_rate": 4.0930787589498806e-07, - "logits/chosen": 0.7781935930252075, - "logits/rejected": 0.3824615180492401, - "logps/accuracies": 0.75, - "logps/chosen": -381.4145202636719, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -311.0098571777344, - "logps/ref_rejected": -217.48388671875, - "logps/rejected": -411.19970703125, - "loss": 0.0984, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.520233631134033, - "rewards/grad_term": 0.00042740529170259833, - "rewards/margins": 6.1655592918396, - "rewards/rejected": -9.685792922973633, - "step": 443 - }, - { - "epoch": 0.9512587038028923, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.91825980199573, - "learning_rate": 4.08472553699284e-07, - "logits/chosen": 0.6866825819015503, - "logits/rejected": 0.6368831992149353, - "logps/accuracies": 0.5, - "logps/chosen": -438.6324462890625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -397.3633728027344, - "logps/ref_rejected": -329.361572265625, - "logps/rejected": -486.6770324707031, - "loss": 0.133, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.06345272064209, - "rewards/grad_term": 0.0025374325923621655, - "rewards/margins": 5.80232048034668, - "rewards/rejected": -7.865772724151611, - "step": 444 - }, - { - "epoch": 0.953401178361007, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 5.821241307927092, - "learning_rate": 4.076372315035799e-07, - "logits/chosen": 0.8077370524406433, - "logits/rejected": 0.5723898410797119, - "logps/accuracies": 0.75, - "logps/chosen": -629.1956787109375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -488.9486389160156, - "logps/ref_rejected": -358.0147705078125, - "logps/rejected": -686.6176147460938, - "loss": 0.1339, - "rewards/accuracies": 1.0, - "rewards/chosen": -7.012351036071777, - "rewards/grad_term": 8.414402145717759e-06, - "rewards/margins": 9.417790412902832, - "rewards/rejected": -16.430139541625977, - "step": 445 - }, - { - "epoch": 0.9555436529191216, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.65211568963401, - "learning_rate": 4.068019093078759e-07, - "logits/chosen": 0.7759550213813782, - "logits/rejected": 0.7140344977378845, - "logps/accuracies": 1.0, - "logps/chosen": -505.0233459472656, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -406.8385009765625, - "logps/ref_rejected": -350.51312255859375, - "logps/rejected": -550.5768432617188, - "loss": 0.1625, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.909243106842041, - "rewards/grad_term": 0.0008991943905130029, - "rewards/margins": 5.0939459800720215, - "rewards/rejected": -10.003189086914062, - "step": 446 - }, - { - "epoch": 0.9576861274772362, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.561770484711239, - "learning_rate": 4.0596658711217187e-07, - "logits/chosen": 0.761772632598877, - "logits/rejected": 0.5069707632064819, - "logps/accuracies": 0.75, - "logps/chosen": -414.2353210449219, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -330.43389892578125, - "logps/ref_rejected": -286.1955261230469, - "logps/rejected": -513.2139892578125, - "loss": 0.1028, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.190073490142822, - "rewards/grad_term": 0.0025082218926399946, - "rewards/margins": 7.160850524902344, - "rewards/rejected": -11.350923538208008, - "step": 447 - }, - { - "epoch": 0.9598286020353508, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 11.50375604154419, - "learning_rate": 4.0513126491646774e-07, - "logits/chosen": 0.7668182849884033, - "logits/rejected": 0.6712717413902283, - "logps/accuracies": 0.75, - "logps/chosen": -451.71124267578125, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -316.8441467285156, - "logps/ref_rejected": -260.1050720214844, - "logps/rejected": -519.7904052734375, - "loss": 0.1387, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.743355751037598, - "rewards/grad_term": 0.0019724913872778416, - "rewards/margins": 6.240912437438965, - "rewards/rejected": -12.984268188476562, - "step": 448 - }, - { - "epoch": 0.9598286020353508, - "eval_flips/correct->correct": 0.14000000059604645, - "eval_flips/correct->incorrect": 0.019999999552965164, - "eval_flips/incorrect->correct": 0.5799999833106995, - "eval_flips/incorrect->incorrect": 0.25999999046325684, - "eval_logits/chosen": 0.6544824838638306, - "eval_logits/rejected": 0.5577883720397949, - "eval_logps/accuracies": 0.7200000286102295, - "eval_logps/chosen": -392.440185546875, - "eval_logps/ref_accuracies": 0.1599999964237213, - "eval_logps/ref_chosen": -323.51568603515625, - "eval_logps/ref_rejected": -258.70098876953125, - "eval_logps/rejected": -422.9569091796875, - "eval_loss": 0.13714653253555298, - "eval_rewards/accuracies": 0.9200000166893005, - "eval_rewards/chosen": -3.446227788925171, - "eval_rewards/grad_term": 0.006565955467522144, - "eval_rewards/margins": 4.766568660736084, - "eval_rewards/rejected": -8.212796211242676, - "eval_runtime": 373.2435, - "eval_samples_per_second": 4.233, - "eval_steps_per_second": 0.134, - "step": 448 - }, - { - "epoch": 0.9619710765934655, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.75, - "grad_norm": 7.143313955724084, - "learning_rate": 4.042959427207637e-07, - "logits/chosen": 0.6099262833595276, - "logits/rejected": 0.41160258650779724, - "logps/accuracies": 0.25, - "logps/chosen": -495.52197265625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -402.29571533203125, - "logps/ref_rejected": -238.91366577148438, - "logps/rejected": -411.18255615234375, - "loss": 0.0955, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.661313056945801, - "rewards/grad_term": 0.008577575907111168, - "rewards/margins": 3.9521327018737793, - "rewards/rejected": -8.613445281982422, - "step": 449 - }, - { - "epoch": 0.96411355115158, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.099073379071028, - "learning_rate": 4.0346062052505964e-07, - "logits/chosen": 0.7317103147506714, - "logits/rejected": 0.5711554884910583, - "logps/accuracies": 0.75, - "logps/chosen": -509.8515319824219, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -422.57989501953125, - "logps/ref_rejected": -335.5193176269531, - "logps/rejected": -622.3995971679688, - "loss": 0.1295, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.363581657409668, - "rewards/grad_term": 4.9737202061805874e-05, - "rewards/margins": 9.980432510375977, - "rewards/rejected": -14.344014167785645, - "step": 450 - }, - { - "epoch": 0.9662560257096947, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.501700427807891, - "learning_rate": 4.0262529832935557e-07, - "logits/chosen": 0.7478474974632263, - "logits/rejected": 0.6434618234634399, - "logps/accuracies": 1.0, - "logps/chosen": -299.7742004394531, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -228.0120849609375, - "logps/ref_rejected": -183.03488159179688, - "logps/rejected": -342.20123291015625, - "loss": 0.1458, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.588106632232666, - "rewards/grad_term": 0.003840662771835923, - "rewards/margins": 4.370211124420166, - "rewards/rejected": -7.958317279815674, - "step": 451 - }, - { - "epoch": 0.9683985002678093, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.391268423681992, - "learning_rate": 4.017899761336515e-07, - "logits/chosen": 0.4842509627342224, - "logits/rejected": 0.3438522517681122, - "logps/accuracies": 0.5, - "logps/chosen": -378.8311767578125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -306.450439453125, - "logps/ref_rejected": -239.88665771484375, - "logps/rejected": -436.4774475097656, - "loss": 0.1131, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.61903715133667, - "rewards/grad_term": 0.0038894114550203085, - "rewards/margins": 6.210503578186035, - "rewards/rejected": -9.829540252685547, - "step": 452 - }, - { - "epoch": 0.9705409748259239, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 13.121406224741564, - "learning_rate": 4.0095465393794747e-07, - "logits/chosen": 0.7076852321624756, - "logits/rejected": 0.5975925922393799, - "logps/accuracies": 0.75, - "logps/chosen": -533.9036254882812, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -425.0966796875, - "logps/ref_rejected": -362.5286560058594, - "logps/rejected": -631.1089477539062, - "loss": 0.162, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.4403462409973145, - "rewards/grad_term": 7.804081542417407e-05, - "rewards/margins": 7.988667011260986, - "rewards/rejected": -13.4290132522583, - "step": 453 - }, - { - "epoch": 0.9726834493840386, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 11.264528709557153, - "learning_rate": 4.0011933174224345e-07, - "logits/chosen": 0.5773014426231384, - "logits/rejected": 0.37737879157066345, - "logps/accuracies": 0.5, - "logps/chosen": -441.2142639160156, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -361.462890625, - "logps/ref_rejected": -243.72628784179688, - "logps/rejected": -388.4267272949219, - "loss": 0.1549, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.9875683784484863, - "rewards/grad_term": 0.0044038868509233, - "rewards/margins": 3.2474539279937744, - "rewards/rejected": -7.23502254486084, - "step": 454 - }, - { - "epoch": 0.9748259239421532, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 17.35451032981818, - "learning_rate": 3.992840095465393e-07, - "logits/chosen": 0.7966763377189636, - "logits/rejected": 0.6966639161109924, - "logps/accuracies": 0.75, - "logps/chosen": -576.56640625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -470.6474304199219, - "logps/ref_rejected": -458.44146728515625, - "logps/rejected": -663.11572265625, - "loss": 0.2188, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.295950889587402, - "rewards/grad_term": 0.0004940250655636191, - "rewards/margins": 4.937762260437012, - "rewards/rejected": -10.233713150024414, - "step": 455 - }, - { - "epoch": 0.9769683985002678, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.682054456108627, - "learning_rate": 3.984486873508353e-07, - "logits/chosen": 0.9120834469795227, - "logits/rejected": 0.6336569786071777, - "logps/accuracies": 1.0, - "logps/chosen": -432.57769775390625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -369.40673828125, - "logps/ref_rejected": -295.18524169921875, - "logps/rejected": -496.9414978027344, - "loss": 0.1222, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.1585474014282227, - "rewards/grad_term": 8.77337297424674e-05, - "rewards/margins": 6.929265022277832, - "rewards/rejected": -10.087812423706055, - "step": 456 - }, - { - "epoch": 0.9791108730583824, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.798800443633192, - "learning_rate": 3.976133651551313e-07, - "logits/chosen": 0.7685093879699707, - "logits/rejected": 0.6625791788101196, - "logps/accuracies": 0.75, - "logps/chosen": -364.49139404296875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -279.5849609375, - "logps/ref_rejected": -255.69351196289062, - "logps/rejected": -443.4805908203125, - "loss": 0.1135, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.245321750640869, - "rewards/grad_term": 0.007468358147889376, - "rewards/margins": 5.144031524658203, - "rewards/rejected": -9.389352798461914, - "step": 457 - }, - { - "epoch": 0.9812533476164971, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 10.393895759274342, - "learning_rate": 3.967780429594272e-07, - "logits/chosen": 0.7607497572898865, - "logits/rejected": 0.7056368589401245, - "logps/accuracies": 1.0, - "logps/chosen": -333.776123046875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -276.8760986328125, - "logps/ref_rejected": -238.05545043945312, - "logps/rejected": -385.234375, - "loss": 0.1466, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.8449995517730713, - "rewards/grad_term": 0.010424750857055187, - "rewards/margins": 4.513948917388916, - "rewards/rejected": -7.358948230743408, - "step": 458 - }, - { - "epoch": 0.9833958221746116, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 13.14222406723579, - "learning_rate": 3.9594272076372313e-07, - "logits/chosen": 0.5317684412002563, - "logits/rejected": 0.3798009753227234, - "logps/accuracies": 0.5, - "logps/chosen": -388.8833923339844, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -284.9061584472656, - "logps/ref_rejected": -228.88584899902344, - "logps/rejected": -419.9346923828125, - "loss": 0.2091, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.1988606452941895, - "rewards/grad_term": 0.006651153787970543, - "rewards/margins": 4.35358190536499, - "rewards/rejected": -9.55244255065918, - "step": 459 - }, - { - "epoch": 0.9855382967327263, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 9.290117062157519, - "learning_rate": 3.9510739856801906e-07, - "logits/chosen": 0.7666717171669006, - "logits/rejected": 0.6038914918899536, - "logps/accuracies": 0.75, - "logps/chosen": -517.7335205078125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -393.77362060546875, - "logps/ref_rejected": -367.8737487792969, - "logps/rejected": -612.9761962890625, - "loss": 0.1298, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.197994232177734, - "rewards/grad_term": 0.002154412679374218, - "rewards/margins": 6.057126045227051, - "rewards/rejected": -12.255121231079102, - "step": 460 - }, - { - "epoch": 0.987680771290841, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 8.418543548161525, - "learning_rate": 3.9427207637231504e-07, - "logits/chosen": 0.4025576114654541, - "logits/rejected": 0.24721147119998932, - "logps/accuracies": 0.75, - "logps/chosen": -312.319091796875, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -264.7703857421875, - "logps/ref_rejected": -213.37884521484375, - "logps/rejected": -414.29150390625, - "loss": 0.1181, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.3774335384368896, - "rewards/grad_term": 0.002074267016723752, - "rewards/margins": 7.668199062347412, - "rewards/rejected": -10.045632362365723, - "step": 461 - }, - { - "epoch": 0.9898232458489555, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 5.924066455521681, - "learning_rate": 3.934367541766109e-07, - "logits/chosen": 0.7529336810112, - "logits/rejected": 0.5446640253067017, - "logps/accuracies": 0.5, - "logps/chosen": -391.60406494140625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -341.4349365234375, - "logps/ref_rejected": -242.84326171875, - "logps/rejected": -389.7578125, - "loss": 0.0904, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.5084564685821533, - "rewards/grad_term": 0.0086339320987463, - "rewards/margins": 4.837271690368652, - "rewards/rejected": -7.345727920532227, - "step": 462 - }, - { - "epoch": 0.9919657204070702, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.341264657659469, - "learning_rate": 3.926014319809069e-07, - "logits/chosen": 0.5941088199615479, - "logits/rejected": 0.5042127370834351, - "logps/accuracies": 1.0, - "logps/chosen": -501.0787353515625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -397.529541015625, - "logps/ref_rejected": -341.83441162109375, - "logps/rejected": -633.3097534179688, - "loss": 0.1559, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.177460193634033, - "rewards/grad_term": 4.0835613617673516e-05, - "rewards/margins": 9.396307945251465, - "rewards/rejected": -14.573768615722656, - "step": 463 - }, - { - "epoch": 0.9941081949651848, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.207024818309272, - "learning_rate": 3.9176610978520286e-07, - "logits/chosen": 0.3791959285736084, - "logits/rejected": 0.4412694573402405, - "logps/accuracies": 1.0, - "logps/chosen": -288.90289306640625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -218.5885772705078, - "logps/ref_rejected": -224.43006896972656, - "logps/rejected": -435.8794860839844, - "loss": 0.1242, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.5157151222229004, - "rewards/grad_term": 0.0030132310930639505, - "rewards/margins": 7.056755065917969, - "rewards/rejected": -10.572470664978027, - "step": 464 - }, - { - "epoch": 0.9962506695232994, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.607745217245667, - "learning_rate": 3.9093078758949884e-07, - "logits/chosen": 0.5457050800323486, - "logits/rejected": 0.5041043162345886, - "logps/accuracies": 1.0, - "logps/chosen": -283.53179931640625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -227.90573120117188, - "logps/ref_rejected": -220.95458984375, - "logps/rejected": -366.6824951171875, - "loss": 0.1246, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.781303882598877, - "rewards/grad_term": 0.003597520524635911, - "rewards/margins": 4.50508975982666, - "rewards/rejected": -7.286393642425537, - "step": 465 - }, - { - "epoch": 0.998393144081414, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 7.293270758501063, - "learning_rate": 3.900954653937947e-07, - "logits/chosen": 0.6813356280326843, - "logits/rejected": 0.6149817109107971, - "logps/accuracies": 0.75, - "logps/chosen": -369.541748046875, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -312.1705017089844, - "logps/ref_rejected": -293.1113586425781, - "logps/rejected": -475.52325439453125, - "loss": 0.1107, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.8685624599456787, - "rewards/grad_term": 0.005431050434708595, - "rewards/margins": 6.25203275680542, - "rewards/rejected": -9.120595932006836, - "step": 466 - }, - { - "epoch": 1.0005356186395287, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 7.806449837480457, - "learning_rate": 3.892601431980907e-07, - "logits/chosen": 0.5919859409332275, - "logits/rejected": 0.5162959694862366, - "logps/accuracies": 1.0, - "logps/chosen": -398.9145812988281, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -307.1631774902344, - "logps/ref_rejected": -271.4671630859375, - "logps/rejected": -484.1536865234375, - "loss": 0.0956, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.5875701904296875, - "rewards/grad_term": 0.0008761522476561368, - "rewards/margins": 6.046757221221924, - "rewards/rejected": -10.63432788848877, - "step": 467 - }, - { - "epoch": 1.0026780931976433, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 3.522329323680681, - "learning_rate": 3.884248210023866e-07, - "logits/chosen": 0.6553293466567993, - "logits/rejected": 0.5018194913864136, - "logps/accuracies": 0.75, - "logps/chosen": -324.5142517089844, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -261.5387878417969, - "logps/ref_rejected": -222.56907653808594, - "logps/rejected": -426.75311279296875, - "loss": 0.0507, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.1487741470336914, - "rewards/grad_term": 0.0020364022348076105, - "rewards/margins": 7.060428619384766, - "rewards/rejected": -10.20920181274414, - "step": 468 - }, - { - "epoch": 1.004820567755758, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 6.612470098819514, - "learning_rate": 3.8758949880668254e-07, - "logits/chosen": 0.6158171892166138, - "logits/rejected": 0.5050429105758667, - "logps/accuracies": 0.5, - "logps/chosen": -406.8590087890625, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -289.06756591796875, - "logps/ref_rejected": -229.218994140625, - "logps/rejected": -454.9029846191406, - "loss": 0.1037, - "rewards/accuracies": 1.0, - "rewards/chosen": -5.889570713043213, - "rewards/grad_term": 0.0034782271832227707, - "rewards/margins": 5.39462947845459, - "rewards/rejected": -11.284200668334961, - "step": 469 - }, - { - "epoch": 1.0069630423138725, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 3.539220855844499, - "learning_rate": 3.8675417661097847e-07, - "logits/chosen": 0.7073222994804382, - "logits/rejected": 0.671281635761261, - "logps/accuracies": 0.75, - "logps/chosen": -430.43096923828125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -342.07391357421875, - "logps/ref_rejected": -324.9189453125, - "logps/rejected": -591.3267211914062, - "loss": 0.041, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.417852401733398, - "rewards/grad_term": 0.00558136124163866, - "rewards/margins": 8.902535438537598, - "rewards/rejected": -13.320388793945312, - "step": 470 - }, - { - "epoch": 1.0091055168719871, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 6.354661123726489, - "learning_rate": 3.8591885441527445e-07, - "logits/chosen": 0.7619997262954712, - "logits/rejected": 0.706304669380188, - "logps/accuracies": 0.75, - "logps/chosen": -384.338623046875, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -320.0787658691406, - "logps/ref_rejected": -228.5266876220703, - "logps/rejected": -378.6219482421875, - "loss": 0.0982, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.212991237640381, - "rewards/grad_term": 0.0007301281439140439, - "rewards/margins": 4.291770935058594, - "rewards/rejected": -7.504762649536133, - "step": 471 - }, - { - "epoch": 1.0112479914301018, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 4.790880655224275, - "learning_rate": 3.8508353221957043e-07, - "logits/chosen": 0.7498035430908203, - "logits/rejected": 0.518231987953186, - "logps/accuracies": 0.75, - "logps/chosen": -450.41552734375, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -375.4889221191406, - "logps/ref_rejected": -315.0843200683594, - "logps/rejected": -485.2691650390625, - "loss": 0.0852, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.7463297843933105, - "rewards/grad_term": 0.0010887248208746314, - "rewards/margins": 4.762913227081299, - "rewards/rejected": -8.50924301147461, - "step": 472 - }, - { - "epoch": 1.0133904659882165, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 5.7481678574942885, - "learning_rate": 3.842482100238663e-07, - "logits/chosen": 0.5814411640167236, - "logits/rejected": 0.48274481296539307, - "logps/accuracies": 1.0, - "logps/chosen": -477.7788391113281, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -356.0843200683594, - "logps/ref_rejected": -339.63299560546875, - "logps/rejected": -642.8599853515625, - "loss": 0.1017, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.084725856781006, - "rewards/grad_term": 1.688380325504113e-05, - "rewards/margins": 9.07662296295166, - "rewards/rejected": -15.16135025024414, - "step": 473 - }, - { - "epoch": 1.015532940546331, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.361744125881987, - "learning_rate": 3.834128878281623e-07, - "logits/chosen": 0.9029494524002075, - "logits/rejected": 0.8422863483428955, - "logps/accuracies": 1.0, - "logps/chosen": -642.5513916015625, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -501.9047546386719, - "logps/ref_rejected": -415.24755859375, - "logps/rejected": -698.61962890625, - "loss": 0.104, - "rewards/accuracies": 1.0, - "rewards/chosen": -7.0323286056518555, - "rewards/grad_term": 0.001934091211296618, - "rewards/margins": 7.1362762451171875, - "rewards/rejected": -14.16860580444336, - "step": 474 - }, - { - "epoch": 1.0176754151044456, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 4.64410207259052, - "learning_rate": 3.8257756563245826e-07, - "logits/chosen": 0.5650697946548462, - "logits/rejected": 0.5173856019973755, - "logps/accuracies": 1.0, - "logps/chosen": -540.633056640625, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -457.04205322265625, - "logps/ref_rejected": -435.2861633300781, - "logps/rejected": -643.3292846679688, - "loss": 0.0748, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.1795477867126465, - "rewards/grad_term": 0.0007545886328443885, - "rewards/margins": 6.2226080894470215, - "rewards/rejected": -10.402155876159668, - "step": 475 - }, - { - "epoch": 1.0198178896625603, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.25, - "flips/incorrect->incorrect": 0.25, - "grad_norm": 4.573143939908089, - "learning_rate": 3.817422434367542e-07, - "logits/chosen": 0.5865468978881836, - "logits/rejected": 0.7002366185188293, - "logps/accuracies": 0.75, - "logps/chosen": -433.32452392578125, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -351.2342529296875, - "logps/ref_rejected": -321.6819152832031, - "logps/rejected": -513.435791015625, - "loss": 0.0725, - "rewards/accuracies": 1.0, - "rewards/chosen": -4.104512691497803, - "rewards/grad_term": 0.003012165194377303, - "rewards/margins": 5.483182430267334, - "rewards/rejected": -9.587695121765137, - "step": 476 - }, - { - "epoch": 1.021960364220675, - "flips/correct->correct": 0.25, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.75, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 8.11310764037292, - "learning_rate": 3.809069212410501e-07, - "logits/chosen": 0.4245867133140564, - "logits/rejected": 0.4632042348384857, - "logps/accuracies": 1.0, - "logps/chosen": -426.9747314453125, - "logps/ref_accuracies": 0.25, - "logps/ref_chosen": -358.86248779296875, - "logps/ref_rejected": -292.428466796875, - "logps/rejected": -462.8052673339844, - "loss": 0.0946, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.4056124687194824, - "rewards/grad_term": 0.001515088020823896, - "rewards/margins": 5.113227844238281, - "rewards/rejected": -8.518840789794922, - "step": 477 - }, - { - "epoch": 1.0241028387787896, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.5, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 4.791373581896772, - "learning_rate": 3.8007159904534603e-07, - "logits/chosen": 0.5890440940856934, - "logits/rejected": 0.36139726638793945, - "logps/accuracies": 0.5, - "logps/chosen": -490.9143371582031, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -361.0188903808594, - "logps/ref_rejected": -289.797119140625, - "logps/rejected": -517.291259765625, - "loss": 0.0621, - "rewards/accuracies": 1.0, - "rewards/chosen": -6.494773864746094, - "rewards/grad_term": 0.0016336999833583832, - "rewards/margins": 4.879931449890137, - "rewards/rejected": -11.37470531463623, - "step": 478 - }, - { - "epoch": 1.026245313336904, - "flips/correct->correct": 0.0, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 1.0, - "flips/incorrect->incorrect": 0.0, - "grad_norm": 9.419742623980941, - "learning_rate": 3.79236276849642e-07, - "logits/chosen": 0.6677709817886353, - "logits/rejected": 0.6186258792877197, - "logps/accuracies": 1.0, - "logps/chosen": -357.1605224609375, - "logps/ref_accuracies": 0.0, - "logps/ref_chosen": -280.62677001953125, - "logps/ref_rejected": -218.89462280273438, - "logps/rejected": -433.9444274902344, - "loss": 0.0807, - "rewards/accuracies": 1.0, - "rewards/chosen": -3.826688766479492, - "rewards/grad_term": 0.004192608408629894, - "rewards/margins": 6.925801753997803, - "rewards/rejected": -10.752490043640137, - "step": 479 - }, - { - "epoch": 1.0283877878950187, - "flips/correct->correct": 0.5, - "flips/correct->incorrect": 0.0, - "flips/incorrect->correct": 0.0, - "flips/incorrect->incorrect": 0.5, - "grad_norm": 5.500558735593387, - "learning_rate": 3.784009546539379e-07, - "logits/chosen": 0.5895904302597046, - "logits/rejected": 0.5445250868797302, - "logps/accuracies": 0.5, - "logps/chosen": -155.37521362304688, - "logps/ref_accuracies": 0.5, - "logps/ref_chosen": -109.69457244873047, - "logps/ref_rejected": -98.66912078857422, - "logps/rejected": -187.35279846191406, - "loss": 0.0968, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.284031867980957, - "rewards/grad_term": 0.0074999695643782616, - "rewards/margins": 2.1501517295837402, - "rewards/rejected": -4.4341840744018555, - "step": 480 - } - ], - "logging_steps": 1, - "max_steps": 932, - "num_input_tokens_seen": 0, - "num_train_epochs": 2, - "save_steps": 96, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 0.0, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}