diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8352 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 5811, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 8.591065292096219e-10, + "logits/chosen": -2.5853981971740723, + "logits/rejected": -2.470163345336914, + "logps/chosen": -144.5498046875, + "logps/rejected": -91.19886779785156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 8.59106529209622e-09, + "logits/chosen": -2.9029548168182373, + "logits/rejected": -2.959444522857666, + "logps/chosen": -362.0794982910156, + "logps/rejected": -262.45947265625, + "loss": 0.6935, + "rewards/accuracies": 0.4444444477558136, + "rewards/chosen": 0.005129138007760048, + "rewards/margins": 0.016535116359591484, + "rewards/rejected": -0.011405976489186287, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": -2.7615294456481934, + "logits/rejected": -2.725064754486084, + "logps/chosen": -269.28619384765625, + "logps/rejected": -202.5450897216797, + "loss": 0.6933, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": -0.019564008340239525, + "rewards/margins": -0.026170048862695694, + "rewards/rejected": 0.006606035865843296, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 2.5773195876288656e-08, + "logits/chosen": -2.966618776321411, + "logits/rejected": -2.9526820182800293, + "logps/chosen": -271.7405090332031, + "logps/rejected": -236.72415161132812, + "loss": 0.689, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.009709501639008522, + "rewards/margins": -0.013466158881783485, + "rewards/rejected": 0.0037566572427749634, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": -2.7589166164398193, + "logits/rejected": -2.7409415245056152, + "logps/chosen": -278.0064697265625, + "logps/rejected": -256.8775939941406, + "loss": 0.6829, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.011239729821681976, + "rewards/margins": 3.3503398299217224e-05, + "rewards/rejected": -0.011273231357336044, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 4.29553264604811e-08, + "logits/chosen": -2.9569365978240967, + "logits/rejected": -2.904877185821533, + "logps/chosen": -293.72796630859375, + "logps/rejected": -220.7270050048828, + "loss": 0.6639, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.036680832505226135, + "rewards/margins": 0.07365237176418304, + "rewards/rejected": -0.03697153925895691, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": -2.841106414794922, + "logits/rejected": -2.8289294242858887, + "logps/chosen": -277.8804016113281, + "logps/rejected": -258.37542724609375, + "loss": 0.6525, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.057021915912628174, + "rewards/margins": 0.10140474885702133, + "rewards/rejected": -0.044382836669683456, + "step": 60 + }, + { + "epoch": 0.04, + "learning_rate": 6.013745704467354e-08, + "logits/chosen": -2.944255828857422, + "logits/rejected": -2.9709508419036865, + "logps/chosen": -341.34271240234375, + "logps/rejected": -246.50521850585938, + "loss": 0.6418, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0887238085269928, + "rewards/margins": 0.16679176688194275, + "rewards/rejected": -0.07806795090436935, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 6.872852233676976e-08, + "logits/chosen": -2.864830493927002, + "logits/rejected": -2.8411145210266113, + "logps/chosen": -332.96881103515625, + "logps/rejected": -261.268798828125, + "loss": 0.6062, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09977231174707413, + "rewards/margins": 0.1671280413866043, + "rewards/rejected": -0.06735573709011078, + "step": 80 + }, + { + "epoch": 0.05, + "learning_rate": 7.731958762886598e-08, + "logits/chosen": -2.901690721511841, + "logits/rejected": -2.916935682296753, + "logps/chosen": -302.84075927734375, + "logps/rejected": -185.1248321533203, + "loss": 0.6045, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0786793902516365, + "rewards/margins": 0.2470569908618927, + "rewards/rejected": -0.1683776080608368, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 8.59106529209622e-08, + "logits/chosen": -2.76128888130188, + "logits/rejected": -2.780914068222046, + "logps/chosen": -248.6319122314453, + "logps/rejected": -200.15841674804688, + "loss": 0.5933, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.028152083978056908, + "rewards/margins": 0.2552409768104553, + "rewards/rejected": -0.22708889842033386, + "step": 100 + }, + { + "epoch": 0.06, + "learning_rate": 9.450171821305841e-08, + "logits/chosen": -2.8498265743255615, + "logits/rejected": -2.935537576675415, + "logps/chosen": -282.4373779296875, + "logps/rejected": -253.15170288085938, + "loss": 0.5897, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03426863253116608, + "rewards/margins": 0.40692123770713806, + "rewards/rejected": -0.3726526200771332, + "step": 110 + }, + { + "epoch": 0.06, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": -2.805190324783325, + "logits/rejected": -2.925807476043701, + "logps/chosen": -213.3897247314453, + "logps/rejected": -224.468017578125, + "loss": 0.5578, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.009538640268146992, + "rewards/margins": 0.30600637197494507, + "rewards/rejected": -0.3155450224876404, + "step": 120 + }, + { + "epoch": 0.07, + "learning_rate": 1.1168384879725086e-07, + "logits/chosen": -2.9988574981689453, + "logits/rejected": -2.885953426361084, + "logps/chosen": -334.03802490234375, + "logps/rejected": -216.3173370361328, + "loss": 0.5695, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.3172129988670349, + "rewards/margins": 0.7236066460609436, + "rewards/rejected": -0.4063936173915863, + "step": 130 + }, + { + "epoch": 0.07, + "learning_rate": 1.202749140893471e-07, + "logits/chosen": -2.866483449935913, + "logits/rejected": -2.8898892402648926, + "logps/chosen": -259.0267639160156, + "logps/rejected": -230.06961059570312, + "loss": 0.5536, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.06076876074075699, + "rewards/margins": 0.6484912633895874, + "rewards/rejected": -0.7092598676681519, + "step": 140 + }, + { + "epoch": 0.08, + "learning_rate": 1.2886597938144328e-07, + "logits/chosen": -2.9667582511901855, + "logits/rejected": -2.937042236328125, + "logps/chosen": -295.5273742675781, + "logps/rejected": -256.44903564453125, + "loss": 0.5298, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.3221905827522278, + "rewards/margins": 0.941267192363739, + "rewards/rejected": -0.619076669216156, + "step": 150 + }, + { + "epoch": 0.08, + "learning_rate": 1.3745704467353952e-07, + "logits/chosen": -2.8627123832702637, + "logits/rejected": -2.849468946456909, + "logps/chosen": -308.3032531738281, + "logps/rejected": -260.61444091796875, + "loss": 0.5207, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.28995996713638306, + "rewards/margins": 0.9411754608154297, + "rewards/rejected": -0.6512155532836914, + "step": 160 + }, + { + "epoch": 0.09, + "learning_rate": 1.4604810996563573e-07, + "logits/chosen": -2.8260326385498047, + "logits/rejected": -2.799665927886963, + "logps/chosen": -239.97119140625, + "logps/rejected": -209.0448760986328, + "loss": 0.4792, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.3469822108745575, + "rewards/margins": 1.0210093259811401, + "rewards/rejected": -0.6740272045135498, + "step": 170 + }, + { + "epoch": 0.09, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": -2.8760595321655273, + "logits/rejected": -2.821604013442993, + "logps/chosen": -206.1426239013672, + "logps/rejected": -221.94351196289062, + "loss": 0.4943, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24601764976978302, + "rewards/margins": 0.881085991859436, + "rewards/rejected": -0.6350683569908142, + "step": 180 + }, + { + "epoch": 0.1, + "learning_rate": 1.6323024054982818e-07, + "logits/chosen": -2.867799758911133, + "logits/rejected": -2.9438815116882324, + "logps/chosen": -249.8555145263672, + "logps/rejected": -231.462158203125, + "loss": 0.5777, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.23328867554664612, + "rewards/margins": 0.6879739761352539, + "rewards/rejected": -0.45468538999557495, + "step": 190 + }, + { + "epoch": 0.1, + "learning_rate": 1.718213058419244e-07, + "logits/chosen": -2.684551954269409, + "logits/rejected": -2.760136604309082, + "logps/chosen": -290.9544372558594, + "logps/rejected": -195.71920776367188, + "loss": 0.4786, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.26035887002944946, + "rewards/margins": 0.8763397336006165, + "rewards/rejected": -0.615980863571167, + "step": 200 + }, + { + "epoch": 0.11, + "learning_rate": 1.804123711340206e-07, + "logits/chosen": -2.7473480701446533, + "logits/rejected": -2.615626811981201, + "logps/chosen": -260.44232177734375, + "logps/rejected": -212.9258575439453, + "loss": 0.5687, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.25246724486351013, + "rewards/margins": 0.5547892451286316, + "rewards/rejected": -0.8072565197944641, + "step": 210 + }, + { + "epoch": 0.11, + "learning_rate": 1.8900343642611682e-07, + "logits/chosen": -2.894285202026367, + "logits/rejected": -2.859070301055908, + "logps/chosen": -330.01751708984375, + "logps/rejected": -225.8118133544922, + "loss": 0.5265, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.13710172474384308, + "rewards/margins": 0.8895937204360962, + "rewards/rejected": -0.7524920701980591, + "step": 220 + }, + { + "epoch": 0.12, + "learning_rate": 1.9759450171821303e-07, + "logits/chosen": -2.868990898132324, + "logits/rejected": -2.8889577388763428, + "logps/chosen": -262.4832458496094, + "logps/rejected": -243.00057983398438, + "loss": 0.5065, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.04553854465484619, + "rewards/margins": 0.5204902291297913, + "rewards/rejected": -0.4749516546726227, + "step": 230 + }, + { + "epoch": 0.12, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": -2.8463659286499023, + "logits/rejected": -2.8954005241394043, + "logps/chosen": -322.6001892089844, + "logps/rejected": -220.10531616210938, + "loss": 0.5306, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.08190703392028809, + "rewards/margins": 0.6268805265426636, + "rewards/rejected": -0.7087875604629517, + "step": 240 + }, + { + "epoch": 0.13, + "learning_rate": 2.1477663230240549e-07, + "logits/chosen": -2.980902671813965, + "logits/rejected": -2.9693779945373535, + "logps/chosen": -273.3463439941406, + "logps/rejected": -282.342529296875, + "loss": 0.5035, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22112369537353516, + "rewards/margins": 0.9812121391296387, + "rewards/rejected": -0.7600885033607483, + "step": 250 + }, + { + "epoch": 0.13, + "learning_rate": 2.2336769759450173e-07, + "logits/chosen": -2.956479072570801, + "logits/rejected": -2.9315035343170166, + "logps/chosen": -250.66293334960938, + "logps/rejected": -227.837646484375, + "loss": 0.54, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.41840487718582153, + "rewards/margins": 1.014347791671753, + "rewards/rejected": -0.5959428548812866, + "step": 260 + }, + { + "epoch": 0.14, + "learning_rate": 2.3195876288659794e-07, + "logits/chosen": -2.898144483566284, + "logits/rejected": -2.8302385807037354, + "logps/chosen": -291.5244140625, + "logps/rejected": -249.4423065185547, + "loss": 0.4798, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.2764865756034851, + "rewards/margins": 0.9568204879760742, + "rewards/rejected": -0.6803339719772339, + "step": 270 + }, + { + "epoch": 0.14, + "learning_rate": 2.405498281786942e-07, + "logits/chosen": -2.9385907649993896, + "logits/rejected": -2.8712496757507324, + "logps/chosen": -314.6116027832031, + "logps/rejected": -226.3020477294922, + "loss": 0.5331, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.25499850511550903, + "rewards/margins": 0.9774402379989624, + "rewards/rejected": -0.7224417924880981, + "step": 280 + }, + { + "epoch": 0.15, + "learning_rate": 2.4914089347079036e-07, + "logits/chosen": -2.724156141281128, + "logits/rejected": -2.797109603881836, + "logps/chosen": -264.0115661621094, + "logps/rejected": -231.69497680664062, + "loss": 0.5239, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.23494203388690948, + "rewards/margins": 0.8044212460517883, + "rewards/rejected": -0.5694791078567505, + "step": 290 + }, + { + "epoch": 0.15, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": -2.8613712787628174, + "logits/rejected": -2.929839849472046, + "logps/chosen": -316.16357421875, + "logps/rejected": -230.9220733642578, + "loss": 0.5286, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.43644723296165466, + "rewards/margins": 1.0967363119125366, + "rewards/rejected": -0.6602891087532043, + "step": 300 + }, + { + "epoch": 0.16, + "learning_rate": 2.663230240549828e-07, + "logits/chosen": -2.952826976776123, + "logits/rejected": -2.980451822280884, + "logps/chosen": -276.19171142578125, + "logps/rejected": -212.08193969726562, + "loss": 0.5129, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.25965097546577454, + "rewards/margins": 0.9357119798660278, + "rewards/rejected": -0.6760609149932861, + "step": 310 + }, + { + "epoch": 0.17, + "learning_rate": 2.7491408934707903e-07, + "logits/chosen": -2.910007953643799, + "logits/rejected": -2.9905753135681152, + "logps/chosen": -309.59637451171875, + "logps/rejected": -234.2860870361328, + "loss": 0.4637, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0740666389465332, + "rewards/margins": 0.9599502682685852, + "rewards/rejected": -0.885883629322052, + "step": 320 + }, + { + "epoch": 0.17, + "learning_rate": 2.835051546391752e-07, + "logits/chosen": -2.8667492866516113, + "logits/rejected": -2.910284996032715, + "logps/chosen": -275.6033935546875, + "logps/rejected": -221.75222778320312, + "loss": 0.4938, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19026069343090057, + "rewards/margins": 1.2155379056930542, + "rewards/rejected": -1.0252773761749268, + "step": 330 + }, + { + "epoch": 0.18, + "learning_rate": 2.9209621993127146e-07, + "logits/chosen": -2.825681209564209, + "logits/rejected": -2.8391976356506348, + "logps/chosen": -303.2389221191406, + "logps/rejected": -253.29177856445312, + "loss": 0.5263, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.33694472908973694, + "rewards/margins": 1.066750168800354, + "rewards/rejected": -0.7298054695129395, + "step": 340 + }, + { + "epoch": 0.18, + "learning_rate": 3.006872852233677e-07, + "logits/chosen": -2.945141315460205, + "logits/rejected": -2.9210100173950195, + "logps/chosen": -197.90316772460938, + "logps/rejected": -210.59317016601562, + "loss": 0.4741, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.015146581456065178, + "rewards/margins": 0.900895893573761, + "rewards/rejected": -0.9160425066947937, + "step": 350 + }, + { + "epoch": 0.19, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": -2.970536708831787, + "logits/rejected": -2.9317080974578857, + "logps/chosen": -322.3656921386719, + "logps/rejected": -263.9642028808594, + "loss": 0.4635, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.3962644636631012, + "rewards/margins": 1.5553843975067139, + "rewards/rejected": -1.159119963645935, + "step": 360 + }, + { + "epoch": 0.19, + "learning_rate": 3.178694158075601e-07, + "logits/chosen": -2.888442277908325, + "logits/rejected": -2.83204984664917, + "logps/chosen": -233.41043090820312, + "logps/rejected": -189.86026000976562, + "loss": 0.4999, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.4106314182281494, + "rewards/margins": 1.5635592937469482, + "rewards/rejected": -1.1529278755187988, + "step": 370 + }, + { + "epoch": 0.2, + "learning_rate": 3.2646048109965636e-07, + "logits/chosen": -2.8554654121398926, + "logits/rejected": -2.790348529815674, + "logps/chosen": -225.08139038085938, + "logps/rejected": -242.3370361328125, + "loss": 0.5777, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.026898954063653946, + "rewards/margins": 0.547136127948761, + "rewards/rejected": -0.5202370882034302, + "step": 380 + }, + { + "epoch": 0.2, + "learning_rate": 3.3505154639175255e-07, + "logits/chosen": -2.8897109031677246, + "logits/rejected": -2.8853700160980225, + "logps/chosen": -288.1002502441406, + "logps/rejected": -220.94772338867188, + "loss": 0.4449, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2607569098472595, + "rewards/margins": 1.4408096075057983, + "rewards/rejected": -1.1800527572631836, + "step": 390 + }, + { + "epoch": 0.21, + "learning_rate": 3.436426116838488e-07, + "logits/chosen": -2.944121837615967, + "logits/rejected": -2.8930344581604004, + "logps/chosen": -233.96829223632812, + "logps/rejected": -208.0071258544922, + "loss": 0.5594, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.15474456548690796, + "rewards/margins": 0.6924557685852051, + "rewards/rejected": -0.8472002744674683, + "step": 400 + }, + { + "epoch": 0.21, + "learning_rate": 3.5223367697594503e-07, + "logits/chosen": -2.7703442573547363, + "logits/rejected": -2.687839984893799, + "logps/chosen": -319.0751953125, + "logps/rejected": -231.11868286132812, + "loss": 0.4436, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28324756026268005, + "rewards/margins": 1.0761455297470093, + "rewards/rejected": -0.7928978204727173, + "step": 410 + }, + { + "epoch": 0.22, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": -2.8674497604370117, + "logits/rejected": -2.8678717613220215, + "logps/chosen": -281.7210388183594, + "logps/rejected": -257.6501159667969, + "loss": 0.5064, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.058492355048656464, + "rewards/margins": 1.1918013095855713, + "rewards/rejected": -1.2502937316894531, + "step": 420 + }, + { + "epoch": 0.22, + "learning_rate": 3.6941580756013745e-07, + "logits/chosen": -2.916782855987549, + "logits/rejected": -2.892936944961548, + "logps/chosen": -205.8938751220703, + "logps/rejected": -157.27175903320312, + "loss": 0.5008, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.06811252981424332, + "rewards/margins": 1.2915557622909546, + "rewards/rejected": -1.3596681356430054, + "step": 430 + }, + { + "epoch": 0.23, + "learning_rate": 3.7800687285223364e-07, + "logits/chosen": -2.830094814300537, + "logits/rejected": -2.850872278213501, + "logps/chosen": -303.74920654296875, + "logps/rejected": -294.87884521484375, + "loss": 0.6483, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.01641862280666828, + "rewards/margins": 0.8298453092575073, + "rewards/rejected": -0.8462640643119812, + "step": 440 + }, + { + "epoch": 0.23, + "learning_rate": 3.865979381443299e-07, + "logits/chosen": -2.8804867267608643, + "logits/rejected": -2.8648791313171387, + "logps/chosen": -284.320556640625, + "logps/rejected": -258.94122314453125, + "loss": 0.5104, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2616979479789734, + "rewards/margins": 0.7781568169593811, + "rewards/rejected": -1.0398547649383545, + "step": 450 + }, + { + "epoch": 0.24, + "learning_rate": 3.9518900343642607e-07, + "logits/chosen": -2.888380527496338, + "logits/rejected": -2.9484517574310303, + "logps/chosen": -274.451416015625, + "logps/rejected": -266.95623779296875, + "loss": 0.5914, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.061025023460388184, + "rewards/margins": 1.4974867105484009, + "rewards/rejected": -1.558511734008789, + "step": 460 + }, + { + "epoch": 0.24, + "learning_rate": 4.037800687285223e-07, + "logits/chosen": -2.905761480331421, + "logits/rejected": -2.9156107902526855, + "logps/chosen": -322.3829650878906, + "logps/rejected": -198.50205993652344, + "loss": 0.4749, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05220061540603638, + "rewards/margins": 1.2210773229599, + "rewards/rejected": -1.2732778787612915, + "step": 470 + }, + { + "epoch": 0.25, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": -2.93009614944458, + "logits/rejected": -2.9415712356567383, + "logps/chosen": -253.5304718017578, + "logps/rejected": -240.3785400390625, + "loss": 0.5669, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2351093739271164, + "rewards/margins": 0.7879358530044556, + "rewards/rejected": -0.5528265237808228, + "step": 480 + }, + { + "epoch": 0.25, + "learning_rate": 4.209621993127148e-07, + "logits/chosen": -2.872180461883545, + "logits/rejected": -2.8901145458221436, + "logps/chosen": -302.1181640625, + "logps/rejected": -239.98046875, + "loss": 0.4494, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07415000349283218, + "rewards/margins": 1.3110253810882568, + "rewards/rejected": -1.236875295639038, + "step": 490 + }, + { + "epoch": 0.26, + "learning_rate": 4.2955326460481097e-07, + "logits/chosen": -2.9795875549316406, + "logits/rejected": -2.9768428802490234, + "logps/chosen": -296.80718994140625, + "logps/rejected": -264.24664306640625, + "loss": 0.5631, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.027627941220998764, + "rewards/margins": 1.2383382320404053, + "rewards/rejected": -1.2107102870941162, + "step": 500 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -2.957718849182129, + "eval_logits/rejected": -2.9466848373413086, + "eval_logps/chosen": -298.3453369140625, + "eval_logps/rejected": -251.98329162597656, + "eval_loss": 0.5260158777236938, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": 0.028815001249313354, + "eval_rewards/margins": 1.2370529174804688, + "eval_rewards/rejected": -1.208237886428833, + "eval_runtime": 218.0881, + "eval_samples_per_second": 9.171, + "eval_steps_per_second": 0.289, + "step": 500 + }, + { + "epoch": 0.26, + "learning_rate": 4.381443298969072e-07, + "logits/chosen": -2.8311476707458496, + "logits/rejected": -2.818819284439087, + "logps/chosen": -294.35272216796875, + "logps/rejected": -245.2067413330078, + "loss": 0.6016, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3445579707622528, + "rewards/margins": 0.7032971978187561, + "rewards/rejected": -1.047855257987976, + "step": 510 + }, + { + "epoch": 0.27, + "learning_rate": 4.4673539518900345e-07, + "logits/chosen": -2.9695799350738525, + "logits/rejected": -3.041219711303711, + "logps/chosen": -228.91506958007812, + "logps/rejected": -206.9320831298828, + "loss": 0.5592, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.006498217582702637, + "rewards/margins": 1.6344130039215088, + "rewards/rejected": -1.6279146671295166, + "step": 520 + }, + { + "epoch": 0.27, + "learning_rate": 4.5532646048109964e-07, + "logits/chosen": -2.9132516384124756, + "logits/rejected": -2.9083011150360107, + "logps/chosen": -263.5189208984375, + "logps/rejected": -193.60177612304688, + "loss": 0.5477, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0026242255698889494, + "rewards/margins": 1.3856732845306396, + "rewards/rejected": -1.3830487728118896, + "step": 530 + }, + { + "epoch": 0.28, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": -2.9057679176330566, + "logits/rejected": -2.905860424041748, + "logps/chosen": -238.1947784423828, + "logps/rejected": -240.0609130859375, + "loss": 0.6173, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4448246955871582, + "rewards/margins": 0.9073933362960815, + "rewards/rejected": -1.3522180318832397, + "step": 540 + }, + { + "epoch": 0.28, + "learning_rate": 4.7250859106529206e-07, + "logits/chosen": -2.9397683143615723, + "logits/rejected": -2.9519639015197754, + "logps/chosen": -315.84698486328125, + "logps/rejected": -261.22149658203125, + "loss": 0.6392, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08306878060102463, + "rewards/margins": 1.4375866651535034, + "rewards/rejected": -1.5206555128097534, + "step": 550 + }, + { + "epoch": 0.29, + "learning_rate": 4.810996563573884e-07, + "logits/chosen": -2.993764877319336, + "logits/rejected": -2.986448049545288, + "logps/chosen": -309.62298583984375, + "logps/rejected": -256.41461181640625, + "loss": 0.5609, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2558799386024475, + "rewards/margins": 1.2214542627334595, + "rewards/rejected": -1.4773342609405518, + "step": 560 + }, + { + "epoch": 0.29, + "learning_rate": 4.896907216494845e-07, + "logits/chosen": -2.9518275260925293, + "logits/rejected": -2.824982166290283, + "logps/chosen": -334.22686767578125, + "logps/rejected": -242.75833129882812, + "loss": 0.5569, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5716447830200195, + "rewards/margins": 0.9872225522994995, + "rewards/rejected": -1.5588672161102295, + "step": 570 + }, + { + "epoch": 0.3, + "learning_rate": 4.982817869415807e-07, + "logits/chosen": -2.9013009071350098, + "logits/rejected": -3.0167527198791504, + "logps/chosen": -269.56573486328125, + "logps/rejected": -236.23318481445312, + "loss": 0.6727, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1722039431333542, + "rewards/margins": 1.2019895315170288, + "rewards/rejected": -1.374193549156189, + "step": 580 + }, + { + "epoch": 0.3, + "learning_rate": 4.992350353796136e-07, + "logits/chosen": -2.9110538959503174, + "logits/rejected": -3.0132861137390137, + "logps/chosen": -226.79293823242188, + "logps/rejected": -207.0228271484375, + "loss": 0.5651, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3664969801902771, + "rewards/margins": 1.9526106119155884, + "rewards/rejected": -1.586113691329956, + "step": 590 + }, + { + "epoch": 0.31, + "learning_rate": 4.982788296041308e-07, + "logits/chosen": -2.8610830307006836, + "logits/rejected": -2.8405697345733643, + "logps/chosen": -256.3024597167969, + "logps/rejected": -247.03085327148438, + "loss": 0.5457, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07645522803068161, + "rewards/margins": 1.704345464706421, + "rewards/rejected": -1.6278903484344482, + "step": 600 + }, + { + "epoch": 0.31, + "learning_rate": 4.973226238286479e-07, + "logits/chosen": -2.923488140106201, + "logits/rejected": -2.950122117996216, + "logps/chosen": -353.5453186035156, + "logps/rejected": -298.95477294921875, + "loss": 0.5858, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.09489689767360687, + "rewards/margins": 1.4458928108215332, + "rewards/rejected": -1.5407898426055908, + "step": 610 + }, + { + "epoch": 0.32, + "learning_rate": 4.96366418053165e-07, + "logits/chosen": -2.972114086151123, + "logits/rejected": -3.030524253845215, + "logps/chosen": -311.63153076171875, + "logps/rejected": -240.6847381591797, + "loss": 0.6625, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.015180367045104504, + "rewards/margins": 1.1330465078353882, + "rewards/rejected": -1.1482269763946533, + "step": 620 + }, + { + "epoch": 0.33, + "learning_rate": 4.954102122776821e-07, + "logits/chosen": -2.8993327617645264, + "logits/rejected": -2.9407591819763184, + "logps/chosen": -220.35940551757812, + "logps/rejected": -179.3049774169922, + "loss": 0.5138, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.39531436562538147, + "rewards/margins": 1.0542871952056885, + "rewards/rejected": -1.4496015310287476, + "step": 630 + }, + { + "epoch": 0.33, + "learning_rate": 4.944540065021993e-07, + "logits/chosen": -2.7829689979553223, + "logits/rejected": -2.729430675506592, + "logps/chosen": -244.7413330078125, + "logps/rejected": -210.92080688476562, + "loss": 0.5693, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.12804606556892395, + "rewards/margins": 1.9121665954589844, + "rewards/rejected": -1.784120798110962, + "step": 640 + }, + { + "epoch": 0.34, + "learning_rate": 4.934978007267163e-07, + "logits/chosen": -2.859757900238037, + "logits/rejected": -2.943504571914673, + "logps/chosen": -283.7872314453125, + "logps/rejected": -251.78982543945312, + "loss": 0.5549, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.581717848777771, + "rewards/margins": 1.621364951133728, + "rewards/rejected": -1.039647102355957, + "step": 650 + }, + { + "epoch": 0.34, + "learning_rate": 4.925415949512335e-07, + "logits/chosen": -2.8729991912841797, + "logits/rejected": -2.932633876800537, + "logps/chosen": -305.8207702636719, + "logps/rejected": -248.5808868408203, + "loss": 0.5541, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.07670129835605621, + "rewards/margins": 1.1095455884933472, + "rewards/rejected": -1.0328443050384521, + "step": 660 + }, + { + "epoch": 0.35, + "learning_rate": 4.915853891757506e-07, + "logits/chosen": -2.7519607543945312, + "logits/rejected": -2.755323886871338, + "logps/chosen": -168.88475036621094, + "logps/rejected": -209.848876953125, + "loss": 0.6278, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.14984068274497986, + "rewards/margins": 0.7654241323471069, + "rewards/rejected": -0.6155833601951599, + "step": 670 + }, + { + "epoch": 0.35, + "learning_rate": 4.906291834002677e-07, + "logits/chosen": -2.7901368141174316, + "logits/rejected": -2.767921209335327, + "logps/chosen": -291.7945251464844, + "logps/rejected": -237.0704803466797, + "loss": 0.5762, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1925654113292694, + "rewards/margins": 0.9999873042106628, + "rewards/rejected": -1.1925528049468994, + "step": 680 + }, + { + "epoch": 0.36, + "learning_rate": 4.896729776247848e-07, + "logits/chosen": -2.902186155319214, + "logits/rejected": -2.9201323986053467, + "logps/chosen": -265.80633544921875, + "logps/rejected": -185.19403076171875, + "loss": 0.6003, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.33004412055015564, + "rewards/margins": 1.19220769405365, + "rewards/rejected": -1.522251844406128, + "step": 690 + }, + { + "epoch": 0.36, + "learning_rate": 4.88716771849302e-07, + "logits/chosen": -2.8365702629089355, + "logits/rejected": -2.880140781402588, + "logps/chosen": -322.38421630859375, + "logps/rejected": -299.58990478515625, + "loss": 0.5984, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.023646574467420578, + "rewards/margins": 1.6279172897338867, + "rewards/rejected": -1.6515636444091797, + "step": 700 + }, + { + "epoch": 0.37, + "learning_rate": 4.87760566073819e-07, + "logits/chosen": -2.8931069374084473, + "logits/rejected": -2.9269511699676514, + "logps/chosen": -271.537841796875, + "logps/rejected": -249.77090454101562, + "loss": 0.5546, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3147915303707123, + "rewards/margins": 1.2212846279144287, + "rewards/rejected": -1.5360761880874634, + "step": 710 + }, + { + "epoch": 0.37, + "learning_rate": 4.868043602983362e-07, + "logits/chosen": -2.847012996673584, + "logits/rejected": -2.8868696689605713, + "logps/chosen": -287.21282958984375, + "logps/rejected": -295.8669738769531, + "loss": 0.6139, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.21573197841644287, + "rewards/margins": 2.241217851638794, + "rewards/rejected": -2.4569497108459473, + "step": 720 + }, + { + "epoch": 0.38, + "learning_rate": 4.858481545228533e-07, + "logits/chosen": -2.949655771255493, + "logits/rejected": -2.920367479324341, + "logps/chosen": -289.61383056640625, + "logps/rejected": -283.63714599609375, + "loss": 0.5586, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4029797911643982, + "rewards/margins": 1.6155755519866943, + "rewards/rejected": -2.0185556411743164, + "step": 730 + }, + { + "epoch": 0.38, + "learning_rate": 4.848919487473704e-07, + "logits/chosen": -2.8386573791503906, + "logits/rejected": -2.833543300628662, + "logps/chosen": -272.10321044921875, + "logps/rejected": -241.79733276367188, + "loss": 0.6765, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6702455878257751, + "rewards/margins": 0.9414161443710327, + "rewards/rejected": -1.6116619110107422, + "step": 740 + }, + { + "epoch": 0.39, + "learning_rate": 4.839357429718875e-07, + "logits/chosen": -2.8777010440826416, + "logits/rejected": -2.9025700092315674, + "logps/chosen": -286.5823974609375, + "logps/rejected": -260.0926208496094, + "loss": 0.5844, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.14236275851726532, + "rewards/margins": 2.0217671394348145, + "rewards/rejected": -2.1641299724578857, + "step": 750 + }, + { + "epoch": 0.39, + "learning_rate": 4.829795371964047e-07, + "logits/chosen": -2.8598580360412598, + "logits/rejected": -2.91709566116333, + "logps/chosen": -309.28656005859375, + "logps/rejected": -249.3534698486328, + "loss": 0.6013, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.171111062169075, + "rewards/margins": 1.5951813459396362, + "rewards/rejected": -1.7662923336029053, + "step": 760 + }, + { + "epoch": 0.4, + "learning_rate": 4.820233314209217e-07, + "logits/chosen": -2.7693912982940674, + "logits/rejected": -2.7938313484191895, + "logps/chosen": -234.9104766845703, + "logps/rejected": -214.0842742919922, + "loss": 0.6131, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1327628642320633, + "rewards/margins": 1.7671611309051514, + "rewards/rejected": -1.8999239206314087, + "step": 770 + }, + { + "epoch": 0.4, + "learning_rate": 4.810671256454389e-07, + "logits/chosen": -2.7556939125061035, + "logits/rejected": -2.7517266273498535, + "logps/chosen": -295.79791259765625, + "logps/rejected": -242.08694458007812, + "loss": 0.5581, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.006750267930328846, + "rewards/margins": 2.7443861961364746, + "rewards/rejected": -2.751136302947998, + "step": 780 + }, + { + "epoch": 0.41, + "learning_rate": 4.80110919869956e-07, + "logits/chosen": -2.7430715560913086, + "logits/rejected": -2.8337535858154297, + "logps/chosen": -257.67437744140625, + "logps/rejected": -220.5835418701172, + "loss": 0.6226, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.34579893946647644, + "rewards/margins": 1.5945725440979004, + "rewards/rejected": -1.9403712749481201, + "step": 790 + }, + { + "epoch": 0.41, + "learning_rate": 4.791547140944731e-07, + "logits/chosen": -2.6196367740631104, + "logits/rejected": -2.7076234817504883, + "logps/chosen": -232.72207641601562, + "logps/rejected": -255.9217987060547, + "loss": 0.64, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.08904242515563965, + "rewards/margins": 1.5046972036361694, + "rewards/rejected": -1.4156547784805298, + "step": 800 + }, + { + "epoch": 0.42, + "learning_rate": 4.781985083189902e-07, + "logits/chosen": -2.788634777069092, + "logits/rejected": -2.8340401649475098, + "logps/chosen": -240.91201782226562, + "logps/rejected": -233.1719512939453, + "loss": 0.6055, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.03584769368171692, + "rewards/margins": 1.7759116888046265, + "rewards/rejected": -1.811759352684021, + "step": 810 + }, + { + "epoch": 0.42, + "learning_rate": 4.772423025435074e-07, + "logits/chosen": -2.765285015106201, + "logits/rejected": -2.7674434185028076, + "logps/chosen": -253.8185577392578, + "logps/rejected": -223.858642578125, + "loss": 0.6672, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.6709930300712585, + "rewards/margins": 0.025269517675042152, + "rewards/rejected": -0.6962625980377197, + "step": 820 + }, + { + "epoch": 0.43, + "learning_rate": 4.762860967680244e-07, + "logits/chosen": -2.8905787467956543, + "logits/rejected": -2.8797848224639893, + "logps/chosen": -206.35836791992188, + "logps/rejected": -200.47171020507812, + "loss": 0.6469, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09678085148334503, + "rewards/margins": 0.6925548315048218, + "rewards/rejected": -0.7893357276916504, + "step": 830 + }, + { + "epoch": 0.43, + "learning_rate": 4.7532989099254154e-07, + "logits/chosen": -2.965175151824951, + "logits/rejected": -2.9960315227508545, + "logps/chosen": -281.5271911621094, + "logps/rejected": -253.3331298828125, + "loss": 0.6327, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5533876419067383, + "rewards/margins": 1.022857666015625, + "rewards/rejected": -1.5762451887130737, + "step": 840 + }, + { + "epoch": 0.44, + "learning_rate": 4.7437368521705866e-07, + "logits/chosen": -2.822181224822998, + "logits/rejected": -2.9410040378570557, + "logps/chosen": -230.7578582763672, + "logps/rejected": -242.03726196289062, + "loss": 0.6023, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11291544139385223, + "rewards/margins": 1.7286335229873657, + "rewards/rejected": -1.615717887878418, + "step": 850 + }, + { + "epoch": 0.44, + "learning_rate": 4.7341747944157577e-07, + "logits/chosen": -2.80778431892395, + "logits/rejected": -2.801071882247925, + "logps/chosen": -237.62393188476562, + "logps/rejected": -235.6117401123047, + "loss": 0.6497, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22725868225097656, + "rewards/margins": 1.3354547023773193, + "rewards/rejected": -1.5627135038375854, + "step": 860 + }, + { + "epoch": 0.45, + "learning_rate": 4.724612736660929e-07, + "logits/chosen": -2.878175735473633, + "logits/rejected": -2.906575918197632, + "logps/chosen": -274.377197265625, + "logps/rejected": -188.13107299804688, + "loss": 0.5401, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.6024242043495178, + "rewards/margins": 2.734527587890625, + "rewards/rejected": -2.132103204727173, + "step": 870 + }, + { + "epoch": 0.45, + "learning_rate": 4.7150506789061006e-07, + "logits/chosen": -2.790346622467041, + "logits/rejected": -2.76853609085083, + "logps/chosen": -254.47323608398438, + "logps/rejected": -271.0780944824219, + "loss": 0.5927, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1932486593723297, + "rewards/margins": 2.158299446105957, + "rewards/rejected": -1.9650509357452393, + "step": 880 + }, + { + "epoch": 0.46, + "learning_rate": 4.7054886211512717e-07, + "logits/chosen": -2.8026018142700195, + "logits/rejected": -2.8157997131347656, + "logps/chosen": -284.82037353515625, + "logps/rejected": -261.57513427734375, + "loss": 0.6557, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.060963429510593414, + "rewards/margins": 0.8900277018547058, + "rewards/rejected": -0.8290642499923706, + "step": 890 + }, + { + "epoch": 0.46, + "learning_rate": 4.695926563396443e-07, + "logits/chosen": -2.9850142002105713, + "logits/rejected": -3.0215537548065186, + "logps/chosen": -235.89859008789062, + "logps/rejected": -222.79476928710938, + "loss": 0.595, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18551968038082123, + "rewards/margins": 1.2160269021987915, + "rewards/rejected": -1.4015467166900635, + "step": 900 + }, + { + "epoch": 0.47, + "learning_rate": 4.686364505641614e-07, + "logits/chosen": -2.9402737617492676, + "logits/rejected": -2.9274373054504395, + "logps/chosen": -294.6170349121094, + "logps/rejected": -215.5508575439453, + "loss": 0.5671, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08820157498121262, + "rewards/margins": 1.4755268096923828, + "rewards/rejected": -1.5637282133102417, + "step": 910 + }, + { + "epoch": 0.47, + "learning_rate": 4.676802447886785e-07, + "logits/chosen": -2.8315272331237793, + "logits/rejected": -2.892688512802124, + "logps/chosen": -266.54571533203125, + "logps/rejected": -204.66717529296875, + "loss": 0.6862, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.32338160276412964, + "rewards/margins": 1.7928798198699951, + "rewards/rejected": -1.4694981575012207, + "step": 920 + }, + { + "epoch": 0.48, + "learning_rate": 4.6672403901319564e-07, + "logits/chosen": -2.8591322898864746, + "logits/rejected": -2.8798279762268066, + "logps/chosen": -284.8063659667969, + "logps/rejected": -212.33853149414062, + "loss": 0.5101, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.332103967666626, + "rewards/margins": 2.160444736480713, + "rewards/rejected": -1.8283405303955078, + "step": 930 + }, + { + "epoch": 0.49, + "learning_rate": 4.6576783323771275e-07, + "logits/chosen": -2.767289638519287, + "logits/rejected": -2.8271992206573486, + "logps/chosen": -251.9027099609375, + "logps/rejected": -190.04098510742188, + "loss": 0.5335, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.32957202196121216, + "rewards/margins": 1.1496002674102783, + "rewards/rejected": -1.4791723489761353, + "step": 940 + }, + { + "epoch": 0.49, + "learning_rate": 4.6481162746222987e-07, + "logits/chosen": -2.817572832107544, + "logits/rejected": -2.8548407554626465, + "logps/chosen": -295.7524719238281, + "logps/rejected": -238.9110107421875, + "loss": 0.5596, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.19818595051765442, + "rewards/margins": 2.651278257369995, + "rewards/rejected": -2.453092336654663, + "step": 950 + }, + { + "epoch": 0.5, + "learning_rate": 4.63855421686747e-07, + "logits/chosen": -2.7868804931640625, + "logits/rejected": -2.8993027210235596, + "logps/chosen": -293.16888427734375, + "logps/rejected": -254.9773406982422, + "loss": 0.5739, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.07755409181118011, + "rewards/margins": 1.6788995265960693, + "rewards/rejected": -1.601345419883728, + "step": 960 + }, + { + "epoch": 0.5, + "learning_rate": 4.628992159112641e-07, + "logits/chosen": -2.8409125804901123, + "logits/rejected": -2.7442398071289062, + "logps/chosen": -285.7856140136719, + "logps/rejected": -226.0821533203125, + "loss": 0.6429, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05490158870816231, + "rewards/margins": 1.3684966564178467, + "rewards/rejected": -1.4233982563018799, + "step": 970 + }, + { + "epoch": 0.51, + "learning_rate": 4.6194301013578116e-07, + "logits/chosen": -2.872086763381958, + "logits/rejected": -2.79237699508667, + "logps/chosen": -339.43365478515625, + "logps/rejected": -262.66766357421875, + "loss": 0.5547, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.03792442008852959, + "rewards/margins": 2.0819664001464844, + "rewards/rejected": -2.0440421104431152, + "step": 980 + }, + { + "epoch": 0.51, + "learning_rate": 4.609868043602983e-07, + "logits/chosen": -2.854628324508667, + "logits/rejected": -2.843848705291748, + "logps/chosen": -233.026611328125, + "logps/rejected": -227.99008178710938, + "loss": 0.5319, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.05646134167909622, + "rewards/margins": 1.7762739658355713, + "rewards/rejected": -1.7198127508163452, + "step": 990 + }, + { + "epoch": 0.52, + "learning_rate": 4.600305985848154e-07, + "logits/chosen": -2.756126880645752, + "logits/rejected": -2.9117398262023926, + "logps/chosen": -217.8252410888672, + "logps/rejected": -191.711669921875, + "loss": 0.5432, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2646777033805847, + "rewards/margins": 0.8738463521003723, + "rewards/rejected": -1.1385241746902466, + "step": 1000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.847585678100586, + "eval_logits/rejected": -2.8465394973754883, + "eval_logps/chosen": -298.6568298339844, + "eval_logps/rejected": -255.18309020996094, + "eval_loss": 0.5888190269470215, + "eval_rewards/accuracies": 0.7539682388305664, + "eval_rewards/chosen": -0.03348641097545624, + "eval_rewards/margins": 1.8147083520889282, + "eval_rewards/rejected": -1.8481948375701904, + "eval_runtime": 217.5601, + "eval_samples_per_second": 9.193, + "eval_steps_per_second": 0.29, + "step": 1000 + }, + { + "epoch": 0.52, + "learning_rate": 4.590743928093325e-07, + "logits/chosen": -2.80271315574646, + "logits/rejected": -2.7094879150390625, + "logps/chosen": -297.522705078125, + "logps/rejected": -240.9547576904297, + "loss": 0.6857, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8173992037773132, + "rewards/margins": 0.8880969285964966, + "rewards/rejected": -1.705496072769165, + "step": 1010 + }, + { + "epoch": 0.53, + "learning_rate": 4.581181870338497e-07, + "logits/chosen": -2.8086001873016357, + "logits/rejected": -2.9041824340820312, + "logps/chosen": -263.6119384765625, + "logps/rejected": -251.357421875, + "loss": 0.6192, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.15594568848609924, + "rewards/margins": 1.9435104131698608, + "rewards/rejected": -1.7875648736953735, + "step": 1020 + }, + { + "epoch": 0.53, + "learning_rate": 4.571619812583668e-07, + "logits/chosen": -2.837925910949707, + "logits/rejected": -2.8686470985412598, + "logps/chosen": -250.3289337158203, + "logps/rejected": -233.561767578125, + "loss": 0.5624, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.02483411133289337, + "rewards/margins": 2.197080135345459, + "rewards/rejected": -2.172245502471924, + "step": 1030 + }, + { + "epoch": 0.54, + "learning_rate": 4.562057754828839e-07, + "logits/chosen": -2.7975571155548096, + "logits/rejected": -2.8900492191314697, + "logps/chosen": -248.82400512695312, + "logps/rejected": -262.0280456542969, + "loss": 0.6051, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04311724752187729, + "rewards/margins": 2.246140241622925, + "rewards/rejected": -2.2030229568481445, + "step": 1040 + }, + { + "epoch": 0.54, + "learning_rate": 4.55249569707401e-07, + "logits/chosen": -2.8506252765655518, + "logits/rejected": -2.853006601333618, + "logps/chosen": -238.81204223632812, + "logps/rejected": -221.2905731201172, + "loss": 0.5727, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1139549016952515, + "rewards/margins": 1.4150480031967163, + "rewards/rejected": -2.5290026664733887, + "step": 1050 + }, + { + "epoch": 0.55, + "learning_rate": 4.5429336393191814e-07, + "logits/chosen": -2.769659996032715, + "logits/rejected": -2.8557846546173096, + "logps/chosen": -247.4389190673828, + "logps/rejected": -191.01991271972656, + "loss": 0.6656, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2663023769855499, + "rewards/margins": 1.4014960527420044, + "rewards/rejected": -1.6677982807159424, + "step": 1060 + }, + { + "epoch": 0.55, + "learning_rate": 4.5333715815643525e-07, + "logits/chosen": -2.843215227127075, + "logits/rejected": -2.920480489730835, + "logps/chosen": -302.1959533691406, + "logps/rejected": -237.12496948242188, + "loss": 0.6203, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5315932035446167, + "rewards/margins": 1.479473352432251, + "rewards/rejected": -2.011066436767578, + "step": 1070 + }, + { + "epoch": 0.56, + "learning_rate": 4.5238095238095237e-07, + "logits/chosen": -2.90852427482605, + "logits/rejected": -2.9359192848205566, + "logps/chosen": -252.1295928955078, + "logps/rejected": -236.31625366210938, + "loss": 0.6002, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6265767812728882, + "rewards/margins": 1.0830936431884766, + "rewards/rejected": -1.7096704244613647, + "step": 1080 + }, + { + "epoch": 0.56, + "learning_rate": 4.514247466054695e-07, + "logits/chosen": -2.874357223510742, + "logits/rejected": -2.8359124660491943, + "logps/chosen": -220.4912109375, + "logps/rejected": -213.4995880126953, + "loss": 0.5834, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7575775384902954, + "rewards/margins": 1.3033814430236816, + "rewards/rejected": -2.0609588623046875, + "step": 1090 + }, + { + "epoch": 0.57, + "learning_rate": 4.504685408299866e-07, + "logits/chosen": -2.8305556774139404, + "logits/rejected": -2.8685882091522217, + "logps/chosen": -304.5544128417969, + "logps/rejected": -278.60675048828125, + "loss": 0.6029, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8350940942764282, + "rewards/margins": 1.1936551332473755, + "rewards/rejected": -2.0287489891052246, + "step": 1100 + }, + { + "epoch": 0.57, + "learning_rate": 4.495123350545037e-07, + "logits/chosen": -2.8294761180877686, + "logits/rejected": -2.833220958709717, + "logps/chosen": -306.3173828125, + "logps/rejected": -255.7045135498047, + "loss": 0.5678, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4116579592227936, + "rewards/margins": 1.4458935260772705, + "rewards/rejected": -1.8575513362884521, + "step": 1110 + }, + { + "epoch": 0.58, + "learning_rate": 4.4855612927902083e-07, + "logits/chosen": -2.7491233348846436, + "logits/rejected": -2.7054574489593506, + "logps/chosen": -285.42779541015625, + "logps/rejected": -257.0852355957031, + "loss": 1.2474, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2621387541294098, + "rewards/margins": 1.8200814723968506, + "rewards/rejected": -2.0822200775146484, + "step": 1120 + }, + { + "epoch": 0.58, + "learning_rate": 4.4759992350353795e-07, + "logits/chosen": -2.77081561088562, + "logits/rejected": -2.829399824142456, + "logps/chosen": -267.37384033203125, + "logps/rejected": -256.3465270996094, + "loss": 0.5097, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.4821946620941162, + "rewards/margins": 1.6790558099746704, + "rewards/rejected": -2.161250591278076, + "step": 1130 + }, + { + "epoch": 0.59, + "learning_rate": 4.46643717728055e-07, + "logits/chosen": -2.660407304763794, + "logits/rejected": -2.7651076316833496, + "logps/chosen": -228.09249877929688, + "logps/rejected": -246.08883666992188, + "loss": 0.5661, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7026903033256531, + "rewards/margins": 1.6223033666610718, + "rewards/rejected": -2.32499361038208, + "step": 1140 + }, + { + "epoch": 0.59, + "learning_rate": 4.4568751195257213e-07, + "logits/chosen": -2.778298854827881, + "logits/rejected": -2.8174757957458496, + "logps/chosen": -298.8221435546875, + "logps/rejected": -201.99879455566406, + "loss": 0.6314, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3881188929080963, + "rewards/margins": 1.6859171390533447, + "rewards/rejected": -2.074036121368408, + "step": 1150 + }, + { + "epoch": 0.6, + "learning_rate": 4.447313061770893e-07, + "logits/chosen": -2.791391134262085, + "logits/rejected": -2.630458354949951, + "logps/chosen": -269.7861633300781, + "logps/rejected": -264.52484130859375, + "loss": 0.6332, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6700159907341003, + "rewards/margins": 1.6704028844833374, + "rewards/rejected": -2.340418815612793, + "step": 1160 + }, + { + "epoch": 0.6, + "learning_rate": 4.437751004016064e-07, + "logits/chosen": -2.573615789413452, + "logits/rejected": -2.726407766342163, + "logps/chosen": -216.47705078125, + "logps/rejected": -234.5958251953125, + "loss": 0.5339, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.06952729821205139, + "rewards/margins": 1.8820451498031616, + "rewards/rejected": -1.9515727758407593, + "step": 1170 + }, + { + "epoch": 0.61, + "learning_rate": 4.4281889462612353e-07, + "logits/chosen": -2.8222270011901855, + "logits/rejected": -2.877793312072754, + "logps/chosen": -277.1562194824219, + "logps/rejected": -207.08447265625, + "loss": 0.5704, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3084719181060791, + "rewards/margins": 1.7400152683258057, + "rewards/rejected": -2.0484869480133057, + "step": 1180 + }, + { + "epoch": 0.61, + "learning_rate": 4.4186268885064064e-07, + "logits/chosen": -2.9520726203918457, + "logits/rejected": -2.902529239654541, + "logps/chosen": -282.25469970703125, + "logps/rejected": -226.1280975341797, + "loss": 0.6336, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8003433346748352, + "rewards/margins": 1.045416235923767, + "rewards/rejected": -1.845759630203247, + "step": 1190 + }, + { + "epoch": 0.62, + "learning_rate": 4.4090648307515776e-07, + "logits/chosen": -2.8176891803741455, + "logits/rejected": -2.8945231437683105, + "logps/chosen": -213.12429809570312, + "logps/rejected": -175.6876983642578, + "loss": 0.5776, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2775752544403076, + "rewards/margins": 0.9386798739433289, + "rewards/rejected": -2.216254949569702, + "step": 1200 + }, + { + "epoch": 0.62, + "learning_rate": 4.399502772996749e-07, + "logits/chosen": -2.8975610733032227, + "logits/rejected": -2.8700690269470215, + "logps/chosen": -280.55194091796875, + "logps/rejected": -269.6097717285156, + "loss": 0.5842, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7831735610961914, + "rewards/margins": 1.660369873046875, + "rewards/rejected": -2.4435436725616455, + "step": 1210 + }, + { + "epoch": 0.63, + "learning_rate": 4.38994071524192e-07, + "logits/chosen": -2.883453130722046, + "logits/rejected": -2.962209939956665, + "logps/chosen": -285.9277648925781, + "logps/rejected": -229.5140838623047, + "loss": 0.7004, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7747269868850708, + "rewards/margins": 1.7977510690689087, + "rewards/rejected": -2.5724778175354004, + "step": 1220 + }, + { + "epoch": 0.64, + "learning_rate": 4.380378657487091e-07, + "logits/chosen": -2.9898650646209717, + "logits/rejected": -2.9422051906585693, + "logps/chosen": -321.74664306640625, + "logps/rejected": -337.42620849609375, + "loss": 0.5917, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.40765270590782166, + "rewards/margins": 1.6362041234970093, + "rewards/rejected": -2.0438568592071533, + "step": 1230 + }, + { + "epoch": 0.64, + "learning_rate": 4.370816599732262e-07, + "logits/chosen": -2.9018731117248535, + "logits/rejected": -2.9015612602233887, + "logps/chosen": -271.43804931640625, + "logps/rejected": -233.69143676757812, + "loss": 0.6393, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.1562955379486084, + "rewards/margins": 1.2955151796340942, + "rewards/rejected": -2.451810598373413, + "step": 1240 + }, + { + "epoch": 0.65, + "learning_rate": 4.3612545419774334e-07, + "logits/chosen": -2.7853074073791504, + "logits/rejected": -2.750446319580078, + "logps/chosen": -241.9004669189453, + "logps/rejected": -301.3990783691406, + "loss": 0.6279, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9854481816291809, + "rewards/margins": 1.3398268222808838, + "rewards/rejected": -2.32527494430542, + "step": 1250 + }, + { + "epoch": 0.65, + "learning_rate": 4.3516924842226045e-07, + "logits/chosen": -2.740088939666748, + "logits/rejected": -2.8612887859344482, + "logps/chosen": -291.9454650878906, + "logps/rejected": -230.01675415039062, + "loss": 0.6272, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7180348038673401, + "rewards/margins": 1.8446018695831299, + "rewards/rejected": -2.5626368522644043, + "step": 1260 + }, + { + "epoch": 0.66, + "learning_rate": 4.3421304264677757e-07, + "logits/chosen": -2.8873705863952637, + "logits/rejected": -2.757368803024292, + "logps/chosen": -262.8088684082031, + "logps/rejected": -228.0838623046875, + "loss": 0.5759, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9115442037582397, + "rewards/margins": 1.8955144882202148, + "rewards/rejected": -2.807058572769165, + "step": 1270 + }, + { + "epoch": 0.66, + "learning_rate": 4.332568368712947e-07, + "logits/chosen": -2.869631767272949, + "logits/rejected": -2.9278321266174316, + "logps/chosen": -300.78094482421875, + "logps/rejected": -286.11224365234375, + "loss": 0.7003, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4378373622894287, + "rewards/margins": 1.4002349376678467, + "rewards/rejected": -2.8380722999572754, + "step": 1280 + }, + { + "epoch": 0.67, + "learning_rate": 4.323006310958118e-07, + "logits/chosen": -2.889538288116455, + "logits/rejected": -2.952970027923584, + "logps/chosen": -342.14373779296875, + "logps/rejected": -262.52960205078125, + "loss": 0.5762, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7595783472061157, + "rewards/margins": 1.8629432916641235, + "rewards/rejected": -2.6225216388702393, + "step": 1290 + }, + { + "epoch": 0.67, + "learning_rate": 4.313444253203289e-07, + "logits/chosen": -2.895522117614746, + "logits/rejected": -2.7695717811584473, + "logps/chosen": -251.97232055664062, + "logps/rejected": -249.22775268554688, + "loss": 0.5322, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9817905426025391, + "rewards/margins": 1.8322546482086182, + "rewards/rejected": -2.814044952392578, + "step": 1300 + }, + { + "epoch": 0.68, + "learning_rate": 4.3038821954484603e-07, + "logits/chosen": -2.882139205932617, + "logits/rejected": -2.8528878688812256, + "logps/chosen": -244.2353973388672, + "logps/rejected": -250.3789520263672, + "loss": 0.5442, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.030928373336792, + "rewards/margins": 1.146545171737671, + "rewards/rejected": -2.177473545074463, + "step": 1310 + }, + { + "epoch": 0.68, + "learning_rate": 4.2943201376936315e-07, + "logits/chosen": -2.8502607345581055, + "logits/rejected": -2.83538556098938, + "logps/chosen": -267.75384521484375, + "logps/rejected": -256.6767578125, + "loss": 0.6736, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8556143045425415, + "rewards/margins": 1.707345962524414, + "rewards/rejected": -2.562960386276245, + "step": 1320 + }, + { + "epoch": 0.69, + "learning_rate": 4.2847580799388026e-07, + "logits/chosen": -2.8500895500183105, + "logits/rejected": -2.85542368888855, + "logps/chosen": -261.53204345703125, + "logps/rejected": -243.7827911376953, + "loss": 0.6892, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5335882902145386, + "rewards/margins": 0.587089478969574, + "rewards/rejected": -2.120677947998047, + "step": 1330 + }, + { + "epoch": 0.69, + "learning_rate": 4.275196022183974e-07, + "logits/chosen": -2.8312344551086426, + "logits/rejected": -2.8955295085906982, + "logps/chosen": -351.7067565917969, + "logps/rejected": -237.8406524658203, + "loss": 0.6193, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3393104076385498, + "rewards/margins": 0.9419676065444946, + "rewards/rejected": -2.281277656555176, + "step": 1340 + }, + { + "epoch": 0.7, + "learning_rate": 4.265633964429145e-07, + "logits/chosen": -2.8178093433380127, + "logits/rejected": -2.8798580169677734, + "logps/chosen": -290.2537536621094, + "logps/rejected": -217.72128295898438, + "loss": 0.7113, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7937359809875488, + "rewards/margins": 1.3023329973220825, + "rewards/rejected": -2.096068859100342, + "step": 1350 + }, + { + "epoch": 0.7, + "learning_rate": 4.256071906674316e-07, + "logits/chosen": -2.9177136421203613, + "logits/rejected": -2.876124143600464, + "logps/chosen": -345.4211120605469, + "logps/rejected": -294.0853271484375, + "loss": 0.7343, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7909644842147827, + "rewards/margins": 1.53469717502594, + "rewards/rejected": -2.3256616592407227, + "step": 1360 + }, + { + "epoch": 0.71, + "learning_rate": 4.246509848919487e-07, + "logits/chosen": -2.9307923316955566, + "logits/rejected": -2.9114553928375244, + "logps/chosen": -313.3360595703125, + "logps/rejected": -229.73330688476562, + "loss": 0.6555, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8673244714736938, + "rewards/margins": 1.4944034814834595, + "rewards/rejected": -2.3617279529571533, + "step": 1370 + }, + { + "epoch": 0.71, + "learning_rate": 4.2369477911646584e-07, + "logits/chosen": -2.8465042114257812, + "logits/rejected": -2.927686929702759, + "logps/chosen": -271.71807861328125, + "logps/rejected": -244.21066284179688, + "loss": 0.6364, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0488981008529663, + "rewards/margins": 1.2555683851242065, + "rewards/rejected": -2.304466485977173, + "step": 1380 + }, + { + "epoch": 0.72, + "learning_rate": 4.2273857334098296e-07, + "logits/chosen": -2.851271152496338, + "logits/rejected": -2.84839129447937, + "logps/chosen": -274.51861572265625, + "logps/rejected": -216.4557647705078, + "loss": 0.651, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1298227310180664, + "rewards/margins": 1.343589186668396, + "rewards/rejected": -2.473412036895752, + "step": 1390 + }, + { + "epoch": 0.72, + "learning_rate": 4.2178236756550007e-07, + "logits/chosen": -2.8680837154388428, + "logits/rejected": -2.8205838203430176, + "logps/chosen": -285.1997375488281, + "logps/rejected": -279.20831298828125, + "loss": 0.7663, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7554957270622253, + "rewards/margins": 0.6390464901924133, + "rewards/rejected": -1.3945419788360596, + "step": 1400 + }, + { + "epoch": 0.73, + "learning_rate": 4.208261617900172e-07, + "logits/chosen": -2.8597347736358643, + "logits/rejected": -2.916684627532959, + "logps/chosen": -277.04498291015625, + "logps/rejected": -220.5397491455078, + "loss": 0.5349, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8732470273971558, + "rewards/margins": 1.7728040218353271, + "rewards/rejected": -2.6460509300231934, + "step": 1410 + }, + { + "epoch": 0.73, + "learning_rate": 4.198699560145343e-07, + "logits/chosen": -2.795240879058838, + "logits/rejected": -2.7627010345458984, + "logps/chosen": -254.906494140625, + "logps/rejected": -226.43496704101562, + "loss": 0.6975, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9776231646537781, + "rewards/margins": 1.5017640590667725, + "rewards/rejected": -2.4793875217437744, + "step": 1420 + }, + { + "epoch": 0.74, + "learning_rate": 4.189137502390514e-07, + "logits/chosen": -2.8324992656707764, + "logits/rejected": -2.779787063598633, + "logps/chosen": -252.84201049804688, + "logps/rejected": -293.3067932128906, + "loss": 0.6673, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8433632850646973, + "rewards/margins": 0.9194127321243286, + "rewards/rejected": -1.7627757787704468, + "step": 1430 + }, + { + "epoch": 0.74, + "learning_rate": 4.179575444635686e-07, + "logits/chosen": -2.825773000717163, + "logits/rejected": -2.8201498985290527, + "logps/chosen": -339.226318359375, + "logps/rejected": -279.8710021972656, + "loss": 0.7201, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3010396361351013, + "rewards/margins": 1.9969072341918945, + "rewards/rejected": -2.2979469299316406, + "step": 1440 + }, + { + "epoch": 0.75, + "learning_rate": 4.170013386880857e-07, + "logits/chosen": -2.7612528800964355, + "logits/rejected": -2.843858242034912, + "logps/chosen": -241.0049285888672, + "logps/rejected": -258.4814453125, + "loss": 0.5739, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.837487518787384, + "rewards/margins": 1.5697721242904663, + "rewards/rejected": -2.407259702682495, + "step": 1450 + }, + { + "epoch": 0.75, + "learning_rate": 4.1604513291260277e-07, + "logits/chosen": -2.8152997493743896, + "logits/rejected": -2.84366774559021, + "logps/chosen": -245.88607788085938, + "logps/rejected": -237.09329223632812, + "loss": 0.7015, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.6707072257995605, + "rewards/margins": 1.8989670276641846, + "rewards/rejected": -2.569674253463745, + "step": 1460 + }, + { + "epoch": 0.76, + "learning_rate": 4.150889271371199e-07, + "logits/chosen": -2.8110063076019287, + "logits/rejected": -2.838031530380249, + "logps/chosen": -261.3493347167969, + "logps/rejected": -255.3643341064453, + "loss": 0.6881, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8193343281745911, + "rewards/margins": 1.385074257850647, + "rewards/rejected": -2.2044084072113037, + "step": 1470 + }, + { + "epoch": 0.76, + "learning_rate": 4.14132721361637e-07, + "logits/chosen": -2.832944869995117, + "logits/rejected": -2.9401321411132812, + "logps/chosen": -316.7156677246094, + "logps/rejected": -231.45065307617188, + "loss": 0.5062, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6513665318489075, + "rewards/margins": 2.372331142425537, + "rewards/rejected": -3.0236973762512207, + "step": 1480 + }, + { + "epoch": 0.77, + "learning_rate": 4.131765155861541e-07, + "logits/chosen": -2.6967649459838867, + "logits/rejected": -2.842420816421509, + "logps/chosen": -249.27249145507812, + "logps/rejected": -204.11135864257812, + "loss": 0.5513, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2104392945766449, + "rewards/margins": 2.1391537189483643, + "rewards/rejected": -2.349592924118042, + "step": 1490 + }, + { + "epoch": 0.77, + "learning_rate": 4.1222030981067123e-07, + "logits/chosen": -2.775503635406494, + "logits/rejected": -2.7504892349243164, + "logps/chosen": -320.334228515625, + "logps/rejected": -291.64080810546875, + "loss": 0.5368, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4635298252105713, + "rewards/margins": 0.9672223925590515, + "rewards/rejected": -2.4307522773742676, + "step": 1500 + }, + { + "epoch": 0.77, + "eval_logits/chosen": -2.8444581031799316, + "eval_logits/rejected": -2.845541000366211, + "eval_logps/chosen": -300.9072570800781, + "eval_logps/rejected": -257.592041015625, + "eval_loss": 0.5860165953636169, + "eval_rewards/accuracies": 0.761904776096344, + "eval_rewards/chosen": -0.4835660457611084, + "eval_rewards/margins": 1.8464233875274658, + "eval_rewards/rejected": -2.329989194869995, + "eval_runtime": 217.5823, + "eval_samples_per_second": 9.192, + "eval_steps_per_second": 0.29, + "step": 1500 + }, + { + "epoch": 0.78, + "learning_rate": 4.1126410403518835e-07, + "logits/chosen": -2.781780242919922, + "logits/rejected": -2.6765735149383545, + "logps/chosen": -261.59130859375, + "logps/rejected": -262.6100158691406, + "loss": 0.6188, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7182345986366272, + "rewards/margins": 2.0275397300720215, + "rewards/rejected": -2.745774269104004, + "step": 1510 + }, + { + "epoch": 0.78, + "learning_rate": 4.1030789825970546e-07, + "logits/chosen": -2.8515138626098633, + "logits/rejected": -2.8549370765686035, + "logps/chosen": -265.7052307128906, + "logps/rejected": -269.73602294921875, + "loss": 0.6769, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.644514262676239, + "rewards/margins": 1.1639376878738403, + "rewards/rejected": -1.8084518909454346, + "step": 1520 + }, + { + "epoch": 0.79, + "learning_rate": 4.093516924842226e-07, + "logits/chosen": -2.63688325881958, + "logits/rejected": -2.6839888095855713, + "logps/chosen": -241.33218383789062, + "logps/rejected": -242.0855712890625, + "loss": 0.55, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.38341224193573, + "rewards/margins": 1.7259324789047241, + "rewards/rejected": -2.109344959259033, + "step": 1530 + }, + { + "epoch": 0.8, + "learning_rate": 4.083954867087397e-07, + "logits/chosen": -2.9009692668914795, + "logits/rejected": -2.900012493133545, + "logps/chosen": -228.05783081054688, + "logps/rejected": -256.7281494140625, + "loss": 0.5737, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.7317920923233032, + "rewards/margins": 0.674118161201477, + "rewards/rejected": -1.4059102535247803, + "step": 1540 + }, + { + "epoch": 0.8, + "learning_rate": 4.074392809332568e-07, + "logits/chosen": -2.7877659797668457, + "logits/rejected": -2.843630790710449, + "logps/chosen": -314.5323486328125, + "logps/rejected": -220.87265014648438, + "loss": 0.6052, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5956573486328125, + "rewards/margins": 2.074176788330078, + "rewards/rejected": -2.6698338985443115, + "step": 1550 + }, + { + "epoch": 0.81, + "learning_rate": 4.064830751577739e-07, + "logits/chosen": -2.8032829761505127, + "logits/rejected": -2.897947311401367, + "logps/chosen": -218.86270141601562, + "logps/rejected": -240.29110717773438, + "loss": 0.6209, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6553669571876526, + "rewards/margins": 1.5234209299087524, + "rewards/rejected": -2.17878794670105, + "step": 1560 + }, + { + "epoch": 0.81, + "learning_rate": 4.0552686938229104e-07, + "logits/chosen": -2.8207037448883057, + "logits/rejected": -2.805596351623535, + "logps/chosen": -247.5675811767578, + "logps/rejected": -219.2466278076172, + "loss": 0.5239, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06444098055362701, + "rewards/margins": 2.0179104804992676, + "rewards/rejected": -2.0823514461517334, + "step": 1570 + }, + { + "epoch": 0.82, + "learning_rate": 4.045706636068082e-07, + "logits/chosen": -2.8735411167144775, + "logits/rejected": -2.7937862873077393, + "logps/chosen": -249.93789672851562, + "logps/rejected": -250.5798797607422, + "loss": 0.5408, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.504967987537384, + "rewards/margins": 2.1664693355560303, + "rewards/rejected": -2.6714372634887695, + "step": 1580 + }, + { + "epoch": 0.82, + "learning_rate": 4.036144578313253e-07, + "logits/chosen": -2.7197256088256836, + "logits/rejected": -2.771433115005493, + "logps/chosen": -276.3327331542969, + "logps/rejected": -262.82745361328125, + "loss": 0.6799, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6452814340591431, + "rewards/margins": 1.896369218826294, + "rewards/rejected": -2.5416505336761475, + "step": 1590 + }, + { + "epoch": 0.83, + "learning_rate": 4.0265825205584244e-07, + "logits/chosen": -2.7867817878723145, + "logits/rejected": -2.8295509815216064, + "logps/chosen": -258.8713073730469, + "logps/rejected": -244.15274047851562, + "loss": 0.6208, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5796841979026794, + "rewards/margins": 1.6541448831558228, + "rewards/rejected": -2.2338290214538574, + "step": 1600 + }, + { + "epoch": 0.83, + "learning_rate": 4.0170204628035956e-07, + "logits/chosen": -2.7563138008117676, + "logits/rejected": -2.8291854858398438, + "logps/chosen": -204.390380859375, + "logps/rejected": -203.91371154785156, + "loss": 0.6143, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8684107661247253, + "rewards/margins": 1.3346433639526367, + "rewards/rejected": -2.203054189682007, + "step": 1610 + }, + { + "epoch": 0.84, + "learning_rate": 4.007458405048766e-07, + "logits/chosen": -2.867945432662964, + "logits/rejected": -2.9335618019104004, + "logps/chosen": -336.86126708984375, + "logps/rejected": -275.42034912109375, + "loss": 0.5998, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6653950214385986, + "rewards/margins": 1.793474793434143, + "rewards/rejected": -2.458869695663452, + "step": 1620 + }, + { + "epoch": 0.84, + "learning_rate": 3.9978963472939373e-07, + "logits/chosen": -2.7078988552093506, + "logits/rejected": -2.733445644378662, + "logps/chosen": -242.2951202392578, + "logps/rejected": -237.00595092773438, + "loss": 0.605, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5769153833389282, + "rewards/margins": 1.0708200931549072, + "rewards/rejected": -1.647735357284546, + "step": 1630 + }, + { + "epoch": 0.85, + "learning_rate": 3.9883342895391085e-07, + "logits/chosen": -2.801527738571167, + "logits/rejected": -2.7758069038391113, + "logps/chosen": -322.3177490234375, + "logps/rejected": -236.0830078125, + "loss": 0.6412, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8352056741714478, + "rewards/margins": 1.7625186443328857, + "rewards/rejected": -2.597724199295044, + "step": 1640 + }, + { + "epoch": 0.85, + "learning_rate": 3.9787722317842796e-07, + "logits/chosen": -2.877664089202881, + "logits/rejected": -2.956833839416504, + "logps/chosen": -298.65057373046875, + "logps/rejected": -194.07290649414062, + "loss": 0.6583, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.11642839759588242, + "rewards/margins": 2.321591854095459, + "rewards/rejected": -2.4380202293395996, + "step": 1650 + }, + { + "epoch": 0.86, + "learning_rate": 3.969210174029451e-07, + "logits/chosen": -2.8761415481567383, + "logits/rejected": -2.8178863525390625, + "logps/chosen": -224.84701538085938, + "logps/rejected": -250.48324584960938, + "loss": 0.6195, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8668567538261414, + "rewards/margins": 1.9914608001708984, + "rewards/rejected": -2.8583176136016846, + "step": 1660 + }, + { + "epoch": 0.86, + "learning_rate": 3.959648116274622e-07, + "logits/chosen": -2.726407527923584, + "logits/rejected": -2.817777395248413, + "logps/chosen": -228.1207733154297, + "logps/rejected": -203.44581604003906, + "loss": 0.7566, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3446123600006104, + "rewards/margins": 0.8250246047973633, + "rewards/rejected": -2.1696372032165527, + "step": 1670 + }, + { + "epoch": 0.87, + "learning_rate": 3.950086058519793e-07, + "logits/chosen": -2.7981009483337402, + "logits/rejected": -2.838646650314331, + "logps/chosen": -253.6993408203125, + "logps/rejected": -226.14205932617188, + "loss": 0.7449, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7549091577529907, + "rewards/margins": 1.5974199771881104, + "rewards/rejected": -2.3523292541503906, + "step": 1680 + }, + { + "epoch": 0.87, + "learning_rate": 3.9405240007649643e-07, + "logits/chosen": -2.8806567192077637, + "logits/rejected": -2.896660327911377, + "logps/chosen": -249.8539276123047, + "logps/rejected": -246.86587524414062, + "loss": 0.5332, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.41273418068885803, + "rewards/margins": 2.331174612045288, + "rewards/rejected": -2.7439088821411133, + "step": 1690 + }, + { + "epoch": 0.88, + "learning_rate": 3.9309619430101354e-07, + "logits/chosen": -2.867082118988037, + "logits/rejected": -2.85441255569458, + "logps/chosen": -236.4873809814453, + "logps/rejected": -197.86598205566406, + "loss": 0.651, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5962778925895691, + "rewards/margins": 1.6670020818710327, + "rewards/rejected": -2.263280153274536, + "step": 1700 + }, + { + "epoch": 0.88, + "learning_rate": 3.9213998852553066e-07, + "logits/chosen": -2.912658214569092, + "logits/rejected": -2.878035545349121, + "logps/chosen": -301.4076843261719, + "logps/rejected": -269.9781494140625, + "loss": 0.6361, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7174623012542725, + "rewards/margins": 1.713463544845581, + "rewards/rejected": -2.4309258460998535, + "step": 1710 + }, + { + "epoch": 0.89, + "learning_rate": 3.9118378275004783e-07, + "logits/chosen": -2.9461216926574707, + "logits/rejected": -2.939534902572632, + "logps/chosen": -305.78533935546875, + "logps/rejected": -254.8824005126953, + "loss": 0.6718, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6530225872993469, + "rewards/margins": 1.5185779333114624, + "rewards/rejected": -2.171600818634033, + "step": 1720 + }, + { + "epoch": 0.89, + "learning_rate": 3.9022757697456494e-07, + "logits/chosen": -2.822605609893799, + "logits/rejected": -2.845618963241577, + "logps/chosen": -341.7474365234375, + "logps/rejected": -285.81787109375, + "loss": 0.5988, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.632053017616272, + "rewards/margins": 1.855086088180542, + "rewards/rejected": -2.4871389865875244, + "step": 1730 + }, + { + "epoch": 0.9, + "learning_rate": 3.8927137119908206e-07, + "logits/chosen": -2.87507963180542, + "logits/rejected": -2.9020707607269287, + "logps/chosen": -315.70379638671875, + "logps/rejected": -205.2082061767578, + "loss": 0.5804, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7224348783493042, + "rewards/margins": 1.805499792098999, + "rewards/rejected": -2.527935028076172, + "step": 1740 + }, + { + "epoch": 0.9, + "learning_rate": 3.883151654235992e-07, + "logits/chosen": -2.858518362045288, + "logits/rejected": -2.8852877616882324, + "logps/chosen": -345.0863952636719, + "logps/rejected": -283.49554443359375, + "loss": 0.6036, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5720993876457214, + "rewards/margins": 2.419344902038574, + "rewards/rejected": -2.9914443492889404, + "step": 1750 + }, + { + "epoch": 0.91, + "learning_rate": 3.873589596481163e-07, + "logits/chosen": -2.9621615409851074, + "logits/rejected": -2.916796922683716, + "logps/chosen": -300.87945556640625, + "logps/rejected": -227.98318481445312, + "loss": 0.6589, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0655115842819214, + "rewards/margins": 1.4646036624908447, + "rewards/rejected": -2.5301153659820557, + "step": 1760 + }, + { + "epoch": 0.91, + "learning_rate": 3.864027538726334e-07, + "logits/chosen": -2.869260787963867, + "logits/rejected": -2.9153895378112793, + "logps/chosen": -231.5424346923828, + "logps/rejected": -251.66708374023438, + "loss": 0.6614, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7593679428100586, + "rewards/margins": 2.1713595390319824, + "rewards/rejected": -2.930727243423462, + "step": 1770 + }, + { + "epoch": 0.92, + "learning_rate": 3.8544654809715047e-07, + "logits/chosen": -2.89825439453125, + "logits/rejected": -2.8985471725463867, + "logps/chosen": -336.5444030761719, + "logps/rejected": -288.14251708984375, + "loss": 0.6138, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2097070217132568, + "rewards/margins": 2.103384017944336, + "rewards/rejected": -3.313091278076172, + "step": 1780 + }, + { + "epoch": 0.92, + "learning_rate": 3.844903423216676e-07, + "logits/chosen": -2.8448703289031982, + "logits/rejected": -2.8999435901641846, + "logps/chosen": -230.13436889648438, + "logps/rejected": -220.8369140625, + "loss": 0.6235, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8140104413032532, + "rewards/margins": 1.2665331363677979, + "rewards/rejected": -2.0805437564849854, + "step": 1790 + }, + { + "epoch": 0.93, + "learning_rate": 3.835341365461847e-07, + "logits/chosen": -2.8749191761016846, + "logits/rejected": -2.901770830154419, + "logps/chosen": -269.7549133300781, + "logps/rejected": -208.7238311767578, + "loss": 0.593, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1777963638305664, + "rewards/margins": 2.0235533714294434, + "rewards/rejected": -2.2013497352600098, + "step": 1800 + }, + { + "epoch": 0.93, + "learning_rate": 3.825779307707018e-07, + "logits/chosen": -2.8150510787963867, + "logits/rejected": -2.7607598304748535, + "logps/chosen": -152.94459533691406, + "logps/rejected": -186.07669067382812, + "loss": 0.6177, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5417309403419495, + "rewards/margins": 1.2329695224761963, + "rewards/rejected": -1.7747001647949219, + "step": 1810 + }, + { + "epoch": 0.94, + "learning_rate": 3.8162172499521893e-07, + "logits/chosen": -2.8732874393463135, + "logits/rejected": -2.834174871444702, + "logps/chosen": -263.18890380859375, + "logps/rejected": -225.11953735351562, + "loss": 0.6484, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9690064191818237, + "rewards/margins": 1.3681566715240479, + "rewards/rejected": -2.3371634483337402, + "step": 1820 + }, + { + "epoch": 0.94, + "learning_rate": 3.8066551921973605e-07, + "logits/chosen": -2.811633825302124, + "logits/rejected": -2.837749719619751, + "logps/chosen": -246.95175170898438, + "logps/rejected": -214.7392578125, + "loss": 0.5281, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3380643427371979, + "rewards/margins": 1.5887610912322998, + "rewards/rejected": -1.926825761795044, + "step": 1830 + }, + { + "epoch": 0.95, + "learning_rate": 3.7970931344425316e-07, + "logits/chosen": -2.8069844245910645, + "logits/rejected": -2.8743038177490234, + "logps/chosen": -272.55523681640625, + "logps/rejected": -217.64169311523438, + "loss": 0.5512, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6675989031791687, + "rewards/margins": 0.8704580068588257, + "rewards/rejected": -1.5380569696426392, + "step": 1840 + }, + { + "epoch": 0.96, + "learning_rate": 3.787531076687703e-07, + "logits/chosen": -2.632551670074463, + "logits/rejected": -2.784874200820923, + "logps/chosen": -280.067138671875, + "logps/rejected": -218.7887725830078, + "loss": 0.688, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8508871793746948, + "rewards/margins": 0.9731461405754089, + "rewards/rejected": -1.824033498764038, + "step": 1850 + }, + { + "epoch": 0.96, + "learning_rate": 3.7779690189328745e-07, + "logits/chosen": -2.927870512008667, + "logits/rejected": -2.959735870361328, + "logps/chosen": -283.48797607421875, + "logps/rejected": -238.8175506591797, + "loss": 0.5138, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4625054895877838, + "rewards/margins": 1.5064427852630615, + "rewards/rejected": -1.9689483642578125, + "step": 1860 + }, + { + "epoch": 0.97, + "learning_rate": 3.7684069611780456e-07, + "logits/chosen": -2.826491117477417, + "logits/rejected": -2.9152965545654297, + "logps/chosen": -269.662841796875, + "logps/rejected": -202.84742736816406, + "loss": 0.562, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6988733410835266, + "rewards/margins": 1.1059156656265259, + "rewards/rejected": -1.8047891855239868, + "step": 1870 + }, + { + "epoch": 0.97, + "learning_rate": 3.758844903423217e-07, + "logits/chosen": -2.8463423252105713, + "logits/rejected": -2.8597524166107178, + "logps/chosen": -257.9284973144531, + "logps/rejected": -242.43838500976562, + "loss": 0.6034, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.051194429397583, + "rewards/margins": 1.7188211679458618, + "rewards/rejected": -2.7700157165527344, + "step": 1880 + }, + { + "epoch": 0.98, + "learning_rate": 3.749282845668388e-07, + "logits/chosen": -2.8787481784820557, + "logits/rejected": -2.8675646781921387, + "logps/chosen": -279.16253662109375, + "logps/rejected": -263.882568359375, + "loss": 0.5665, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7178214192390442, + "rewards/margins": 1.551257848739624, + "rewards/rejected": -2.2690792083740234, + "step": 1890 + }, + { + "epoch": 0.98, + "learning_rate": 3.739720787913559e-07, + "logits/chosen": -2.8247861862182617, + "logits/rejected": -2.97629976272583, + "logps/chosen": -266.18304443359375, + "logps/rejected": -228.7005615234375, + "loss": 0.5644, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5098610520362854, + "rewards/margins": 1.5873671770095825, + "rewards/rejected": -2.0972282886505127, + "step": 1900 + }, + { + "epoch": 0.99, + "learning_rate": 3.73015873015873e-07, + "logits/chosen": -2.952709197998047, + "logits/rejected": -2.942049503326416, + "logps/chosen": -287.5732421875, + "logps/rejected": -249.9008026123047, + "loss": 0.6396, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4959273338317871, + "rewards/margins": 0.7513211369514465, + "rewards/rejected": -1.2472484111785889, + "step": 1910 + }, + { + "epoch": 0.99, + "learning_rate": 3.7205966724039014e-07, + "logits/chosen": -2.792046546936035, + "logits/rejected": -2.8531010150909424, + "logps/chosen": -281.8520202636719, + "logps/rejected": -232.72994995117188, + "loss": 0.5827, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7002347707748413, + "rewards/margins": 1.4923832416534424, + "rewards/rejected": -2.1926181316375732, + "step": 1920 + }, + { + "epoch": 1.0, + "learning_rate": 3.711034614649072e-07, + "logits/chosen": -2.8734848499298096, + "logits/rejected": -2.806027889251709, + "logps/chosen": -275.4158935546875, + "logps/rejected": -222.2928009033203, + "loss": 0.5198, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.6465750932693481, + "rewards/margins": 2.2874600887298584, + "rewards/rejected": -2.934035301208496, + "step": 1930 + }, + { + "epoch": 1.0, + "learning_rate": 3.701472556894243e-07, + "logits/chosen": -2.883420944213867, + "logits/rejected": -2.8368773460388184, + "logps/chosen": -223.2138214111328, + "logps/rejected": -249.0345916748047, + "loss": 0.4761, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.6173292398452759, + "rewards/margins": 4.069156169891357, + "rewards/rejected": -3.451826810836792, + "step": 1940 + }, + { + "epoch": 1.01, + "learning_rate": 3.6919104991394144e-07, + "logits/chosen": -2.9283018112182617, + "logits/rejected": -2.9259495735168457, + "logps/chosen": -214.7909393310547, + "logps/rejected": -229.945068359375, + "loss": 0.0768, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.4348597526550293, + "rewards/margins": 8.571413040161133, + "rewards/rejected": -6.136553764343262, + "step": 1950 + }, + { + "epoch": 1.01, + "learning_rate": 3.6823484413845855e-07, + "logits/chosen": -2.8366754055023193, + "logits/rejected": -2.852508306503296, + "logps/chosen": -272.7106628417969, + "logps/rejected": -307.4251708984375, + "loss": 0.0683, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3836749792099, + "rewards/margins": 7.853079795837402, + "rewards/rejected": -6.469404697418213, + "step": 1960 + }, + { + "epoch": 1.02, + "learning_rate": 3.6727863836297567e-07, + "logits/chosen": -2.834152936935425, + "logits/rejected": -2.8273613452911377, + "logps/chosen": -244.5872344970703, + "logps/rejected": -234.9590301513672, + "loss": 0.0905, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.32602596282959, + "rewards/margins": 8.823614120483398, + "rewards/rejected": -6.497588157653809, + "step": 1970 + }, + { + "epoch": 1.02, + "learning_rate": 3.663224325874928e-07, + "logits/chosen": -2.8812031745910645, + "logits/rejected": -2.834937572479248, + "logps/chosen": -242.8521728515625, + "logps/rejected": -279.22393798828125, + "loss": 0.082, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.4316458702087402, + "rewards/margins": 6.866148471832275, + "rewards/rejected": -5.434502601623535, + "step": 1980 + }, + { + "epoch": 1.03, + "learning_rate": 3.653662268120099e-07, + "logits/chosen": -2.7576725482940674, + "logits/rejected": -2.815948963165283, + "logps/chosen": -240.4783477783203, + "logps/rejected": -309.4148864746094, + "loss": 0.0902, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.815195322036743, + "rewards/margins": 8.677802085876465, + "rewards/rejected": -5.862607002258301, + "step": 1990 + }, + { + "epoch": 1.03, + "learning_rate": 3.6441002103652707e-07, + "logits/chosen": -2.766345262527466, + "logits/rejected": -2.7719063758850098, + "logps/chosen": -239.728515625, + "logps/rejected": -268.7380676269531, + "loss": 0.0615, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.324842691421509, + "rewards/margins": 8.273411750793457, + "rewards/rejected": -5.948569297790527, + "step": 2000 + }, + { + "epoch": 1.03, + "eval_logits/chosen": -2.8639116287231445, + "eval_logits/rejected": -2.8686611652374268, + "eval_logps/chosen": -301.4748840332031, + "eval_logps/rejected": -259.4017639160156, + "eval_loss": 0.6024442315101624, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -0.5970897078514099, + "eval_rewards/margins": 2.0948448181152344, + "eval_rewards/rejected": -2.691934823989868, + "eval_runtime": 217.645, + "eval_samples_per_second": 9.189, + "eval_steps_per_second": 0.289, + "step": 2000 + }, + { + "epoch": 1.04, + "learning_rate": 3.634538152610442e-07, + "logits/chosen": -2.8193464279174805, + "logits/rejected": -2.942736864089966, + "logps/chosen": -268.42852783203125, + "logps/rejected": -279.1741027832031, + "loss": 0.046, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.551281452178955, + "rewards/margins": 9.481260299682617, + "rewards/rejected": -5.9299798011779785, + "step": 2010 + }, + { + "epoch": 1.04, + "learning_rate": 3.624976094855613e-07, + "logits/chosen": -2.844984292984009, + "logits/rejected": -2.822812557220459, + "logps/chosen": -246.75039672851562, + "logps/rejected": -248.1599578857422, + "loss": 0.0642, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.2839322090148926, + "rewards/margins": 9.039632797241211, + "rewards/rejected": -6.755700588226318, + "step": 2020 + }, + { + "epoch": 1.05, + "learning_rate": 3.615414037100784e-07, + "logits/chosen": -2.83333683013916, + "logits/rejected": -2.878652334213257, + "logps/chosen": -288.70623779296875, + "logps/rejected": -251.38748168945312, + "loss": 0.0691, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2578727006912231, + "rewards/margins": 7.318972110748291, + "rewards/rejected": -6.061100006103516, + "step": 2030 + }, + { + "epoch": 1.05, + "learning_rate": 3.6058519793459553e-07, + "logits/chosen": -2.7812256813049316, + "logits/rejected": -2.7956368923187256, + "logps/chosen": -203.9931182861328, + "logps/rejected": -231.3068389892578, + "loss": 0.0671, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.1192617416381836, + "rewards/margins": 8.319083213806152, + "rewards/rejected": -6.199821472167969, + "step": 2040 + }, + { + "epoch": 1.06, + "learning_rate": 3.5962899215911265e-07, + "logits/chosen": -2.7568271160125732, + "logits/rejected": -2.823901653289795, + "logps/chosen": -245.1362762451172, + "logps/rejected": -272.96942138671875, + "loss": 0.0492, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.207984447479248, + "rewards/margins": 8.376309394836426, + "rewards/rejected": -6.1683244705200195, + "step": 2050 + }, + { + "epoch": 1.06, + "learning_rate": 3.5867278638362976e-07, + "logits/chosen": -2.7440109252929688, + "logits/rejected": -2.844982624053955, + "logps/chosen": -292.696044921875, + "logps/rejected": -340.18450927734375, + "loss": 0.0795, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5816397666931152, + "rewards/margins": 9.935778617858887, + "rewards/rejected": -7.354138374328613, + "step": 2060 + }, + { + "epoch": 1.07, + "learning_rate": 3.577165806081469e-07, + "logits/chosen": -2.864179849624634, + "logits/rejected": -2.906625270843506, + "logps/chosen": -217.19314575195312, + "logps/rejected": -231.2805938720703, + "loss": 0.0719, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5726702213287354, + "rewards/margins": 7.601313591003418, + "rewards/rejected": -6.0286431312561035, + "step": 2070 + }, + { + "epoch": 1.07, + "learning_rate": 3.56760374832664e-07, + "logits/chosen": -2.84359073638916, + "logits/rejected": -2.8455586433410645, + "logps/chosen": -296.8313293457031, + "logps/rejected": -280.0687255859375, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.307840347290039, + "rewards/margins": 12.213151931762695, + "rewards/rejected": -7.905311584472656, + "step": 2080 + }, + { + "epoch": 1.08, + "learning_rate": 3.5580416905718106e-07, + "logits/chosen": -2.7166662216186523, + "logits/rejected": -2.890003204345703, + "logps/chosen": -304.8444519042969, + "logps/rejected": -253.0063934326172, + "loss": 0.0762, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.852630138397217, + "rewards/margins": 9.45458984375, + "rewards/rejected": -6.601960182189941, + "step": 2090 + }, + { + "epoch": 1.08, + "learning_rate": 3.5484796328169817e-07, + "logits/chosen": -2.775017261505127, + "logits/rejected": -2.9256277084350586, + "logps/chosen": -213.930908203125, + "logps/rejected": -244.6073760986328, + "loss": 0.0553, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.39541494846344, + "rewards/margins": 7.692966461181641, + "rewards/rejected": -6.297551155090332, + "step": 2100 + }, + { + "epoch": 1.09, + "learning_rate": 3.538917575062153e-07, + "logits/chosen": -2.7707183361053467, + "logits/rejected": -2.860555410385132, + "logps/chosen": -222.232666015625, + "logps/rejected": -277.15673828125, + "loss": 0.0524, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.953575849533081, + "rewards/margins": 9.444633483886719, + "rewards/rejected": -7.491057395935059, + "step": 2110 + }, + { + "epoch": 1.09, + "learning_rate": 3.529355517307324e-07, + "logits/chosen": -2.778550624847412, + "logits/rejected": -2.742030382156372, + "logps/chosen": -227.607421875, + "logps/rejected": -284.8799743652344, + "loss": 0.0459, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.8729333877563477, + "rewards/margins": 9.644989967346191, + "rewards/rejected": -7.772056579589844, + "step": 2120 + }, + { + "epoch": 1.1, + "learning_rate": 3.519793459552495e-07, + "logits/chosen": -2.7962563037872314, + "logits/rejected": -2.7782444953918457, + "logps/chosen": -202.91845703125, + "logps/rejected": -298.21551513671875, + "loss": 0.0772, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.817417323589325, + "rewards/margins": 7.259305000305176, + "rewards/rejected": -6.441886901855469, + "step": 2130 + }, + { + "epoch": 1.1, + "learning_rate": 3.510231401797667e-07, + "logits/chosen": -2.763411283493042, + "logits/rejected": -2.769122838973999, + "logps/chosen": -334.50274658203125, + "logps/rejected": -307.247802734375, + "loss": 0.0751, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0035067796707153, + "rewards/margins": 10.06065559387207, + "rewards/rejected": -9.057147979736328, + "step": 2140 + }, + { + "epoch": 1.11, + "learning_rate": 3.500669344042838e-07, + "logits/chosen": -2.804309368133545, + "logits/rejected": -2.7383761405944824, + "logps/chosen": -303.35400390625, + "logps/rejected": -271.607421875, + "loss": 0.0538, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8834270238876343, + "rewards/margins": 7.979116916656494, + "rewards/rejected": -7.095690727233887, + "step": 2150 + }, + { + "epoch": 1.12, + "learning_rate": 3.491107286288009e-07, + "logits/chosen": -2.8362884521484375, + "logits/rejected": -2.798858880996704, + "logps/chosen": -219.6747589111328, + "logps/rejected": -253.2039794921875, + "loss": 0.0676, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.297351360321045, + "rewards/margins": 10.74948501586914, + "rewards/rejected": -8.452133178710938, + "step": 2160 + }, + { + "epoch": 1.12, + "learning_rate": 3.4815452285331803e-07, + "logits/chosen": -2.8299400806427, + "logits/rejected": -2.8571524620056152, + "logps/chosen": -313.93414306640625, + "logps/rejected": -275.51861572265625, + "loss": 0.0669, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.4165822267532349, + "rewards/margins": 7.2616400718688965, + "rewards/rejected": -5.845057487487793, + "step": 2170 + }, + { + "epoch": 1.13, + "learning_rate": 3.4719831707783515e-07, + "logits/chosen": -2.632450819015503, + "logits/rejected": -2.730884552001953, + "logps/chosen": -273.5526123046875, + "logps/rejected": -258.4808349609375, + "loss": 0.0667, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.159768581390381, + "rewards/margins": 8.972365379333496, + "rewards/rejected": -6.812596797943115, + "step": 2180 + }, + { + "epoch": 1.13, + "learning_rate": 3.4624211130235227e-07, + "logits/chosen": -2.8070766925811768, + "logits/rejected": -2.8275675773620605, + "logps/chosen": -169.80580139160156, + "logps/rejected": -181.44944763183594, + "loss": 0.0761, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0720124244689941, + "rewards/margins": 8.138480186462402, + "rewards/rejected": -7.06646728515625, + "step": 2190 + }, + { + "epoch": 1.14, + "learning_rate": 3.452859055268694e-07, + "logits/chosen": -2.8337655067443848, + "logits/rejected": -2.8809292316436768, + "logps/chosen": -197.19129943847656, + "logps/rejected": -220.0069122314453, + "loss": 0.0672, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7570802569389343, + "rewards/margins": 8.503564834594727, + "rewards/rejected": -7.746484279632568, + "step": 2200 + }, + { + "epoch": 1.14, + "learning_rate": 3.443296997513865e-07, + "logits/chosen": -2.881232500076294, + "logits/rejected": -2.835149049758911, + "logps/chosen": -278.3822326660156, + "logps/rejected": -304.79852294921875, + "loss": 0.065, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 2.2739017009735107, + "rewards/margins": 11.037015914916992, + "rewards/rejected": -8.763113975524902, + "step": 2210 + }, + { + "epoch": 1.15, + "learning_rate": 3.433734939759036e-07, + "logits/chosen": -2.82194447517395, + "logits/rejected": -2.8195812702178955, + "logps/chosen": -297.4380798339844, + "logps/rejected": -266.0953369140625, + "loss": 0.1081, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.1260581016540527, + "rewards/margins": 8.836637496948242, + "rewards/rejected": -6.710579872131348, + "step": 2220 + }, + { + "epoch": 1.15, + "learning_rate": 3.4241728820042073e-07, + "logits/chosen": -2.7940726280212402, + "logits/rejected": -2.7838902473449707, + "logps/chosen": -231.32003784179688, + "logps/rejected": -283.4253845214844, + "loss": 0.0814, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.5174341797828674, + "rewards/margins": 7.280348777770996, + "rewards/rejected": -6.762915134429932, + "step": 2230 + }, + { + "epoch": 1.16, + "learning_rate": 3.4146108242493784e-07, + "logits/chosen": -2.8232216835021973, + "logits/rejected": -2.8321261405944824, + "logps/chosen": -223.7116241455078, + "logps/rejected": -267.0711669921875, + "loss": 0.0835, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.0258145332336426, + "rewards/margins": 9.584681510925293, + "rewards/rejected": -7.558867454528809, + "step": 2240 + }, + { + "epoch": 1.16, + "learning_rate": 3.405048766494549e-07, + "logits/chosen": -2.687310218811035, + "logits/rejected": -2.666550397872925, + "logps/chosen": -245.4330596923828, + "logps/rejected": -249.4916534423828, + "loss": 0.1153, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9064534902572632, + "rewards/margins": 8.866552352905273, + "rewards/rejected": -6.960099220275879, + "step": 2250 + }, + { + "epoch": 1.17, + "learning_rate": 3.39548670873972e-07, + "logits/chosen": -2.7485148906707764, + "logits/rejected": -2.738548755645752, + "logps/chosen": -316.63946533203125, + "logps/rejected": -399.0462341308594, + "loss": 0.1264, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.945350170135498, + "rewards/margins": 12.575675010681152, + "rewards/rejected": -9.63032341003418, + "step": 2260 + }, + { + "epoch": 1.17, + "learning_rate": 3.3859246509848914e-07, + "logits/chosen": -2.7252113819122314, + "logits/rejected": -2.794020652770996, + "logps/chosen": -260.09027099609375, + "logps/rejected": -282.75201416015625, + "loss": 0.074, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8539409637451172, + "rewards/margins": 9.42973518371582, + "rewards/rejected": -7.5757951736450195, + "step": 2270 + }, + { + "epoch": 1.18, + "learning_rate": 3.376362593230063e-07, + "logits/chosen": -2.7548434734344482, + "logits/rejected": -2.7015717029571533, + "logps/chosen": -215.8569793701172, + "logps/rejected": -320.8585205078125, + "loss": 0.0624, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.395317792892456, + "rewards/margins": 9.333052635192871, + "rewards/rejected": -7.937734127044678, + "step": 2280 + }, + { + "epoch": 1.18, + "learning_rate": 3.366800535475234e-07, + "logits/chosen": -2.7667157649993896, + "logits/rejected": -2.743626117706299, + "logps/chosen": -330.3866271972656, + "logps/rejected": -277.27423095703125, + "loss": 0.0616, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.2812180519104004, + "rewards/margins": 10.883448600769043, + "rewards/rejected": -8.602230072021484, + "step": 2290 + }, + { + "epoch": 1.19, + "learning_rate": 3.3572384777204054e-07, + "logits/chosen": -2.783461570739746, + "logits/rejected": -2.7846217155456543, + "logps/chosen": -313.592529296875, + "logps/rejected": -256.70355224609375, + "loss": 0.0522, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5473904609680176, + "rewards/margins": 9.968744277954102, + "rewards/rejected": -7.4213547706604, + "step": 2300 + }, + { + "epoch": 1.19, + "learning_rate": 3.3476764199655765e-07, + "logits/chosen": -2.6445019245147705, + "logits/rejected": -2.675950765609741, + "logps/chosen": -212.31478881835938, + "logps/rejected": -210.2075958251953, + "loss": 0.0583, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.8660532236099243, + "rewards/margins": 8.678271293640137, + "rewards/rejected": -6.81221866607666, + "step": 2310 + }, + { + "epoch": 1.2, + "learning_rate": 3.3381143622107477e-07, + "logits/chosen": -2.7665135860443115, + "logits/rejected": -2.7206473350524902, + "logps/chosen": -334.072998046875, + "logps/rejected": -348.7887878417969, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.038207530975342, + "rewards/margins": 11.963887214660645, + "rewards/rejected": -9.925680160522461, + "step": 2320 + }, + { + "epoch": 1.2, + "learning_rate": 3.328552304455919e-07, + "logits/chosen": -2.7567098140716553, + "logits/rejected": -2.7200610637664795, + "logps/chosen": -222.26730346679688, + "logps/rejected": -237.3346405029297, + "loss": 0.0623, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.507472515106201, + "rewards/margins": 10.436367988586426, + "rewards/rejected": -7.928895473480225, + "step": 2330 + }, + { + "epoch": 1.21, + "learning_rate": 3.31899024670109e-07, + "logits/chosen": -2.7776715755462646, + "logits/rejected": -2.722181558609009, + "logps/chosen": -262.3143005371094, + "logps/rejected": -255.64309692382812, + "loss": 0.0642, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5882365703582764, + "rewards/margins": 9.245880126953125, + "rewards/rejected": -7.6576433181762695, + "step": 2340 + }, + { + "epoch": 1.21, + "learning_rate": 3.309428188946261e-07, + "logits/chosen": -2.527749538421631, + "logits/rejected": -2.5494728088378906, + "logps/chosen": -216.6414794921875, + "logps/rejected": -218.4356689453125, + "loss": 0.0808, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5761357545852661, + "rewards/margins": 6.512837886810303, + "rewards/rejected": -5.936702728271484, + "step": 2350 + }, + { + "epoch": 1.22, + "learning_rate": 3.2998661311914323e-07, + "logits/chosen": -2.7653374671936035, + "logits/rejected": -2.70037579536438, + "logps/chosen": -229.9672393798828, + "logps/rejected": -267.19720458984375, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.17948579788208, + "rewards/margins": 12.446728706359863, + "rewards/rejected": -10.267244338989258, + "step": 2360 + }, + { + "epoch": 1.22, + "learning_rate": 3.2903040734366035e-07, + "logits/chosen": -2.8731327056884766, + "logits/rejected": -2.8409879207611084, + "logps/chosen": -270.32586669921875, + "logps/rejected": -310.6203918457031, + "loss": 0.0533, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5364960432052612, + "rewards/margins": 9.386345863342285, + "rewards/rejected": -7.849849700927734, + "step": 2370 + }, + { + "epoch": 1.23, + "learning_rate": 3.2807420156817746e-07, + "logits/chosen": -2.7654192447662354, + "logits/rejected": -2.765045166015625, + "logps/chosen": -230.9626007080078, + "logps/rejected": -273.068603515625, + "loss": 0.1255, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6310447454452515, + "rewards/margins": 8.72553825378418, + "rewards/rejected": -7.094493865966797, + "step": 2380 + }, + { + "epoch": 1.23, + "learning_rate": 3.271179957926946e-07, + "logits/chosen": -2.6663436889648438, + "logits/rejected": -2.797468423843384, + "logps/chosen": -272.3541564941406, + "logps/rejected": -292.03155517578125, + "loss": 0.0919, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.019334077835083, + "rewards/margins": 9.053455352783203, + "rewards/rejected": -8.034120559692383, + "step": 2390 + }, + { + "epoch": 1.24, + "learning_rate": 3.261617900172117e-07, + "logits/chosen": -2.8381245136260986, + "logits/rejected": -2.7584311962127686, + "logps/chosen": -225.3228302001953, + "logps/rejected": -280.5133056640625, + "loss": 0.2103, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.5505833625793457, + "rewards/margins": 8.765647888183594, + "rewards/rejected": -7.21506404876709, + "step": 2400 + }, + { + "epoch": 1.24, + "learning_rate": 3.2520558424172876e-07, + "logits/chosen": -2.884033679962158, + "logits/rejected": -2.840308666229248, + "logps/chosen": -202.35057067871094, + "logps/rejected": -229.30661010742188, + "loss": 0.093, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9804633855819702, + "rewards/margins": 8.038414001464844, + "rewards/rejected": -7.057950019836426, + "step": 2410 + }, + { + "epoch": 1.25, + "learning_rate": 3.242493784662459e-07, + "logits/chosen": -2.8548994064331055, + "logits/rejected": -2.8542709350585938, + "logps/chosen": -228.2629852294922, + "logps/rejected": -269.2410583496094, + "loss": 0.0581, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.472010850906372, + "rewards/margins": 9.07557487487793, + "rewards/rejected": -7.6035637855529785, + "step": 2420 + }, + { + "epoch": 1.25, + "learning_rate": 3.2329317269076304e-07, + "logits/chosen": -2.892120838165283, + "logits/rejected": -2.798053741455078, + "logps/chosen": -267.05633544921875, + "logps/rejected": -296.55108642578125, + "loss": 0.0671, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.2721009254455566, + "rewards/margins": 10.419027328491211, + "rewards/rejected": -8.14692497253418, + "step": 2430 + }, + { + "epoch": 1.26, + "learning_rate": 3.2233696691528016e-07, + "logits/chosen": -2.8099141120910645, + "logits/rejected": -2.8631486892700195, + "logps/chosen": -252.46932983398438, + "logps/rejected": -329.03936767578125, + "loss": 0.054, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.635554790496826, + "rewards/margins": 12.528615951538086, + "rewards/rejected": -8.893061637878418, + "step": 2440 + }, + { + "epoch": 1.26, + "learning_rate": 3.2138076113979727e-07, + "logits/chosen": -2.7621617317199707, + "logits/rejected": -2.7229347229003906, + "logps/chosen": -243.74124145507812, + "logps/rejected": -298.3870544433594, + "loss": 0.1001, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0718708038330078, + "rewards/margins": 9.034757614135742, + "rewards/rejected": -7.962886810302734, + "step": 2450 + }, + { + "epoch": 1.27, + "learning_rate": 3.204245553643144e-07, + "logits/chosen": -2.8180460929870605, + "logits/rejected": -2.8537211418151855, + "logps/chosen": -277.44879150390625, + "logps/rejected": -323.1278991699219, + "loss": 0.0771, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6137592792510986, + "rewards/margins": 8.935612678527832, + "rewards/rejected": -7.321854591369629, + "step": 2460 + }, + { + "epoch": 1.28, + "learning_rate": 3.194683495888315e-07, + "logits/chosen": -2.8711585998535156, + "logits/rejected": -2.836691379547119, + "logps/chosen": -301.29248046875, + "logps/rejected": -234.97311401367188, + "loss": 0.0672, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.8811954259872437, + "rewards/margins": 9.141138076782227, + "rewards/rejected": -7.25994348526001, + "step": 2470 + }, + { + "epoch": 1.28, + "learning_rate": 3.185121438133486e-07, + "logits/chosen": -2.7138116359710693, + "logits/rejected": -2.7769246101379395, + "logps/chosen": -266.66094970703125, + "logps/rejected": -347.24310302734375, + "loss": 0.0878, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.6878013610839844, + "rewards/margins": 11.373885154724121, + "rewards/rejected": -8.68608570098877, + "step": 2480 + }, + { + "epoch": 1.29, + "learning_rate": 3.1755593803786574e-07, + "logits/chosen": -2.799628734588623, + "logits/rejected": -2.820836305618286, + "logps/chosen": -183.5235595703125, + "logps/rejected": -263.58477783203125, + "loss": 0.0918, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4970555901527405, + "rewards/margins": 7.879460334777832, + "rewards/rejected": -7.3824052810668945, + "step": 2490 + }, + { + "epoch": 1.29, + "learning_rate": 3.1659973226238285e-07, + "logits/chosen": -2.701214075088501, + "logits/rejected": -2.7091012001037598, + "logps/chosen": -210.8098907470703, + "logps/rejected": -258.5179748535156, + "loss": 0.0817, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5346753597259521, + "rewards/margins": 8.877952575683594, + "rewards/rejected": -7.343277931213379, + "step": 2500 + }, + { + "epoch": 1.29, + "eval_logits/chosen": -2.825404405593872, + "eval_logits/rejected": -2.825698137283325, + "eval_logps/chosen": -305.26666259765625, + "eval_logps/rejected": -265.15521240234375, + "eval_loss": 0.6655394434928894, + "eval_rewards/accuracies": 0.773809552192688, + "eval_rewards/chosen": -1.3554495573043823, + "eval_rewards/margins": 2.4871773719787598, + "eval_rewards/rejected": -3.8426268100738525, + "eval_runtime": 217.4133, + "eval_samples_per_second": 9.199, + "eval_steps_per_second": 0.29, + "step": 2500 + }, + { + "epoch": 1.3, + "learning_rate": 3.1564352648689997e-07, + "logits/chosen": -2.887439012527466, + "logits/rejected": -2.8315329551696777, + "logps/chosen": -249.19839477539062, + "logps/rejected": -233.50259399414062, + "loss": 0.0951, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.8421556949615479, + "rewards/margins": 8.87971305847168, + "rewards/rejected": -7.0375566482543945, + "step": 2510 + }, + { + "epoch": 1.3, + "learning_rate": 3.146873207114171e-07, + "logits/chosen": -2.830362319946289, + "logits/rejected": -2.8775646686553955, + "logps/chosen": -269.3638916015625, + "logps/rejected": -237.9186248779297, + "loss": 0.0678, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7232071161270142, + "rewards/margins": 8.593416213989258, + "rewards/rejected": -6.870208740234375, + "step": 2520 + }, + { + "epoch": 1.31, + "learning_rate": 3.137311149359342e-07, + "logits/chosen": -2.885897159576416, + "logits/rejected": -2.8861446380615234, + "logps/chosen": -329.35772705078125, + "logps/rejected": -326.6771545410156, + "loss": 0.0826, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5491175651550293, + "rewards/margins": 8.285898208618164, + "rewards/rejected": -6.736781120300293, + "step": 2530 + }, + { + "epoch": 1.31, + "learning_rate": 3.127749091604513e-07, + "logits/chosen": -2.8595659732818604, + "logits/rejected": -2.8982152938842773, + "logps/chosen": -275.5293884277344, + "logps/rejected": -290.2986755371094, + "loss": 0.0828, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.8002418279647827, + "rewards/margins": 10.767290115356445, + "rewards/rejected": -8.967049598693848, + "step": 2540 + }, + { + "epoch": 1.32, + "learning_rate": 3.1181870338496843e-07, + "logits/chosen": -2.8094890117645264, + "logits/rejected": -2.795132637023926, + "logps/chosen": -194.06175231933594, + "logps/rejected": -261.2168884277344, + "loss": 0.0996, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9291478395462036, + "rewards/margins": 8.161163330078125, + "rewards/rejected": -7.2320146560668945, + "step": 2550 + }, + { + "epoch": 1.32, + "learning_rate": 3.108624976094856e-07, + "logits/chosen": -2.7061691284179688, + "logits/rejected": -2.6929404735565186, + "logps/chosen": -278.30389404296875, + "logps/rejected": -262.9192199707031, + "loss": 0.0785, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9108970165252686, + "rewards/margins": 8.77668571472168, + "rewards/rejected": -6.865788459777832, + "step": 2560 + }, + { + "epoch": 1.33, + "learning_rate": 3.0990629183400266e-07, + "logits/chosen": -2.6665143966674805, + "logits/rejected": -2.6631417274475098, + "logps/chosen": -304.42266845703125, + "logps/rejected": -281.06915283203125, + "loss": 0.0857, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.79115891456604, + "rewards/margins": 10.763957977294922, + "rewards/rejected": -7.972798824310303, + "step": 2570 + }, + { + "epoch": 1.33, + "learning_rate": 3.089500860585198e-07, + "logits/chosen": -2.7755050659179688, + "logits/rejected": -2.7990615367889404, + "logps/chosen": -205.9984130859375, + "logps/rejected": -233.8455352783203, + "loss": 0.0925, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7710639238357544, + "rewards/margins": 7.724207401275635, + "rewards/rejected": -6.953144073486328, + "step": 2580 + }, + { + "epoch": 1.34, + "learning_rate": 3.079938802830369e-07, + "logits/chosen": -2.770127773284912, + "logits/rejected": -2.762481927871704, + "logps/chosen": -201.91415405273438, + "logps/rejected": -215.83712768554688, + "loss": 0.0562, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6502482891082764, + "rewards/margins": 7.537081718444824, + "rewards/rejected": -6.886833190917969, + "step": 2590 + }, + { + "epoch": 1.34, + "learning_rate": 3.07037674507554e-07, + "logits/chosen": -2.7648098468780518, + "logits/rejected": -2.755012035369873, + "logps/chosen": -304.14727783203125, + "logps/rejected": -344.8402404785156, + "loss": 0.1352, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7347332239151, + "rewards/margins": 10.5750150680542, + "rewards/rejected": -8.840280532836914, + "step": 2600 + }, + { + "epoch": 1.35, + "learning_rate": 3.060814687320711e-07, + "logits/chosen": -2.6266608238220215, + "logits/rejected": -2.6018710136413574, + "logps/chosen": -263.6053771972656, + "logps/rejected": -259.5660400390625, + "loss": 0.0828, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.80942964553833, + "rewards/margins": 11.295743942260742, + "rewards/rejected": -8.48631477355957, + "step": 2610 + }, + { + "epoch": 1.35, + "learning_rate": 3.0512526295658824e-07, + "logits/chosen": -2.670837879180908, + "logits/rejected": -2.6270341873168945, + "logps/chosen": -243.7268524169922, + "logps/rejected": -280.92059326171875, + "loss": 0.1222, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.4703638553619385, + "rewards/margins": 11.03471851348877, + "rewards/rejected": -8.564353942871094, + "step": 2620 + }, + { + "epoch": 1.36, + "learning_rate": 3.0416905718110536e-07, + "logits/chosen": -2.7124526500701904, + "logits/rejected": -2.7607948780059814, + "logps/chosen": -230.36959838867188, + "logps/rejected": -223.21713256835938, + "loss": 0.0758, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.9104559421539307, + "rewards/margins": 9.06286907196045, + "rewards/rejected": -7.152412414550781, + "step": 2630 + }, + { + "epoch": 1.36, + "learning_rate": 3.0321285140562247e-07, + "logits/chosen": -2.760730743408203, + "logits/rejected": -2.729675531387329, + "logps/chosen": -260.95703125, + "logps/rejected": -284.439453125, + "loss": 0.1003, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.830008864402771, + "rewards/margins": 7.717930793762207, + "rewards/rejected": -6.887921333312988, + "step": 2640 + }, + { + "epoch": 1.37, + "learning_rate": 3.022566456301396e-07, + "logits/chosen": -2.742011308670044, + "logits/rejected": -2.7945969104766846, + "logps/chosen": -255.4883575439453, + "logps/rejected": -264.54864501953125, + "loss": 0.0966, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9193328619003296, + "rewards/margins": 9.547621726989746, + "rewards/rejected": -7.628289222717285, + "step": 2650 + }, + { + "epoch": 1.37, + "learning_rate": 3.013004398546567e-07, + "logits/chosen": -2.665055513381958, + "logits/rejected": -2.681821346282959, + "logps/chosen": -316.40069580078125, + "logps/rejected": -302.1031494140625, + "loss": 0.075, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3142540454864502, + "rewards/margins": 8.91224193572998, + "rewards/rejected": -7.597988128662109, + "step": 2660 + }, + { + "epoch": 1.38, + "learning_rate": 3.003442340791738e-07, + "logits/chosen": -2.7507271766662598, + "logits/rejected": -2.7632174491882324, + "logps/chosen": -220.7988739013672, + "logps/rejected": -245.73281860351562, + "loss": 0.0561, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9091943502426147, + "rewards/margins": 9.140470504760742, + "rewards/rejected": -8.231274604797363, + "step": 2670 + }, + { + "epoch": 1.38, + "learning_rate": 2.9938802830369093e-07, + "logits/chosen": -2.761070489883423, + "logits/rejected": -2.739316701889038, + "logps/chosen": -197.91165161132812, + "logps/rejected": -252.8719024658203, + "loss": 0.0945, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6190016269683838, + "rewards/margins": 8.90224838256836, + "rewards/rejected": -8.283246994018555, + "step": 2680 + }, + { + "epoch": 1.39, + "learning_rate": 2.9843182252820805e-07, + "logits/chosen": -2.6564173698425293, + "logits/rejected": -2.6826603412628174, + "logps/chosen": -261.28460693359375, + "logps/rejected": -276.734130859375, + "loss": 0.0607, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1821815967559814, + "rewards/margins": 10.108437538146973, + "rewards/rejected": -8.92625617980957, + "step": 2690 + }, + { + "epoch": 1.39, + "learning_rate": 2.974756167527252e-07, + "logits/chosen": -2.792201519012451, + "logits/rejected": -2.789391040802002, + "logps/chosen": -249.32687377929688, + "logps/rejected": -265.88897705078125, + "loss": 0.0659, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5784503817558289, + "rewards/margins": 7.7233381271362305, + "rewards/rejected": -7.144888401031494, + "step": 2700 + }, + { + "epoch": 1.4, + "learning_rate": 2.9651941097724233e-07, + "logits/chosen": -2.7236266136169434, + "logits/rejected": -2.754272937774658, + "logps/chosen": -274.2682189941406, + "logps/rejected": -247.7708282470703, + "loss": 0.1082, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2451229095458984, + "rewards/margins": 8.562277793884277, + "rewards/rejected": -7.317155361175537, + "step": 2710 + }, + { + "epoch": 1.4, + "learning_rate": 2.9556320520175945e-07, + "logits/chosen": -2.66971755027771, + "logits/rejected": -2.7084593772888184, + "logps/chosen": -264.0753479003906, + "logps/rejected": -289.4743957519531, + "loss": 0.1105, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.5378565788269043, + "rewards/margins": 7.262510776519775, + "rewards/rejected": -7.8003668785095215, + "step": 2720 + }, + { + "epoch": 1.41, + "learning_rate": 2.946069994262765e-07, + "logits/chosen": -2.741020679473877, + "logits/rejected": -2.6548125743865967, + "logps/chosen": -313.64556884765625, + "logps/rejected": -337.761474609375, + "loss": 0.0706, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.526160717010498, + "rewards/margins": 10.978344917297363, + "rewards/rejected": -8.452184677124023, + "step": 2730 + }, + { + "epoch": 1.41, + "learning_rate": 2.9365079365079363e-07, + "logits/chosen": -2.7509303092956543, + "logits/rejected": -2.701751947402954, + "logps/chosen": -317.73651123046875, + "logps/rejected": -244.7392120361328, + "loss": 0.0993, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.711538791656494, + "rewards/margins": 11.599513053894043, + "rewards/rejected": -7.887974739074707, + "step": 2740 + }, + { + "epoch": 1.42, + "learning_rate": 2.9269458787531074e-07, + "logits/chosen": -2.766720771789551, + "logits/rejected": -2.761768341064453, + "logps/chosen": -292.88800048828125, + "logps/rejected": -321.57855224609375, + "loss": 0.1135, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4988124370574951, + "rewards/margins": 9.010907173156738, + "rewards/rejected": -7.5120954513549805, + "step": 2750 + }, + { + "epoch": 1.42, + "learning_rate": 2.9173838209982786e-07, + "logits/chosen": -2.776931047439575, + "logits/rejected": -2.808295965194702, + "logps/chosen": -234.4710693359375, + "logps/rejected": -209.95736694335938, + "loss": 0.0965, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3970197439193726, + "rewards/margins": 8.639843940734863, + "rewards/rejected": -7.242823123931885, + "step": 2760 + }, + { + "epoch": 1.43, + "learning_rate": 2.90782176324345e-07, + "logits/chosen": -2.8999898433685303, + "logits/rejected": -2.8510196208953857, + "logps/chosen": -288.8046875, + "logps/rejected": -264.20550537109375, + "loss": 0.0871, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1829469203948975, + "rewards/margins": 9.268820762634277, + "rewards/rejected": -8.0858736038208, + "step": 2770 + }, + { + "epoch": 1.44, + "learning_rate": 2.898259705488621e-07, + "logits/chosen": -2.8797316551208496, + "logits/rejected": -2.8782153129577637, + "logps/chosen": -281.7368469238281, + "logps/rejected": -233.17752075195312, + "loss": 0.0731, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6002839803695679, + "rewards/margins": 8.053850173950195, + "rewards/rejected": -7.4535651206970215, + "step": 2780 + }, + { + "epoch": 1.44, + "learning_rate": 2.888697647733792e-07, + "logits/chosen": -2.771524429321289, + "logits/rejected": -2.766451358795166, + "logps/chosen": -343.028076171875, + "logps/rejected": -322.9091491699219, + "loss": 0.0661, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.140158176422119, + "rewards/margins": 11.231983184814453, + "rewards/rejected": -9.091826438903809, + "step": 2790 + }, + { + "epoch": 1.45, + "learning_rate": 2.879135589978963e-07, + "logits/chosen": -2.820176362991333, + "logits/rejected": -2.7664897441864014, + "logps/chosen": -280.60577392578125, + "logps/rejected": -251.74691772460938, + "loss": 0.0922, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6660699844360352, + "rewards/margins": 8.87816047668457, + "rewards/rejected": -7.212090969085693, + "step": 2800 + }, + { + "epoch": 1.45, + "learning_rate": 2.8695735322241344e-07, + "logits/chosen": -2.7457170486450195, + "logits/rejected": -2.747769832611084, + "logps/chosen": -291.2057800292969, + "logps/rejected": -268.2013244628906, + "loss": 0.1417, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.9127013683319092, + "rewards/margins": 10.081315994262695, + "rewards/rejected": -8.168614387512207, + "step": 2810 + }, + { + "epoch": 1.46, + "learning_rate": 2.8600114744693055e-07, + "logits/chosen": -2.723422050476074, + "logits/rejected": -2.807495594024658, + "logps/chosen": -289.26080322265625, + "logps/rejected": -301.3398742675781, + "loss": 0.0752, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.1681814193725586, + "rewards/margins": 11.451757431030273, + "rewards/rejected": -9.283575057983398, + "step": 2820 + }, + { + "epoch": 1.46, + "learning_rate": 2.8504494167144767e-07, + "logits/chosen": -2.7436156272888184, + "logits/rejected": -2.7432618141174316, + "logps/chosen": -261.23052978515625, + "logps/rejected": -317.21588134765625, + "loss": 0.0884, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8200260400772095, + "rewards/margins": 11.07886028289795, + "rewards/rejected": -9.258834838867188, + "step": 2830 + }, + { + "epoch": 1.47, + "learning_rate": 2.8408873589596484e-07, + "logits/chosen": -2.780362367630005, + "logits/rejected": -2.7789127826690674, + "logps/chosen": -248.43612670898438, + "logps/rejected": -268.502685546875, + "loss": 0.0611, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7744905948638916, + "rewards/margins": 9.7242431640625, + "rewards/rejected": -7.9497528076171875, + "step": 2840 + }, + { + "epoch": 1.47, + "learning_rate": 2.8313253012048195e-07, + "logits/chosen": -2.705899477005005, + "logits/rejected": -2.772587537765503, + "logps/chosen": -216.2031707763672, + "logps/rejected": -305.090087890625, + "loss": 0.1077, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.7985403537750244, + "rewards/margins": 10.163469314575195, + "rewards/rejected": -8.364928245544434, + "step": 2850 + }, + { + "epoch": 1.48, + "learning_rate": 2.8217632434499907e-07, + "logits/chosen": -2.7169694900512695, + "logits/rejected": -2.6625046730041504, + "logps/chosen": -286.58148193359375, + "logps/rejected": -295.5797424316406, + "loss": 0.0609, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.99732506275177, + "rewards/margins": 8.176431655883789, + "rewards/rejected": -7.179105281829834, + "step": 2860 + }, + { + "epoch": 1.48, + "learning_rate": 2.812201185695162e-07, + "logits/chosen": -2.7098686695098877, + "logits/rejected": -2.783830165863037, + "logps/chosen": -197.52723693847656, + "logps/rejected": -286.0131530761719, + "loss": 0.079, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.9220987558364868, + "rewards/margins": 8.922266006469727, + "rewards/rejected": -7.000166416168213, + "step": 2870 + }, + { + "epoch": 1.49, + "learning_rate": 2.802639127940333e-07, + "logits/chosen": -2.8137047290802, + "logits/rejected": -2.7263553142547607, + "logps/chosen": -272.9234619140625, + "logps/rejected": -223.7790985107422, + "loss": 0.089, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.8738445043563843, + "rewards/margins": 10.640775680541992, + "rewards/rejected": -8.76693058013916, + "step": 2880 + }, + { + "epoch": 1.49, + "learning_rate": 2.7930770701855036e-07, + "logits/chosen": -2.8235201835632324, + "logits/rejected": -2.8233487606048584, + "logps/chosen": -281.5433044433594, + "logps/rejected": -245.4292449951172, + "loss": 0.1254, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.013465642929077, + "rewards/margins": 9.75710678100586, + "rewards/rejected": -7.743639945983887, + "step": 2890 + }, + { + "epoch": 1.5, + "learning_rate": 2.783515012430675e-07, + "logits/chosen": -2.6952970027923584, + "logits/rejected": -2.6778531074523926, + "logps/chosen": -227.57217407226562, + "logps/rejected": -233.82666015625, + "loss": 0.1159, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5577604174613953, + "rewards/margins": 6.89922571182251, + "rewards/rejected": -6.341464996337891, + "step": 2900 + }, + { + "epoch": 1.5, + "learning_rate": 2.773952954675846e-07, + "logits/chosen": -2.8005638122558594, + "logits/rejected": -2.869779109954834, + "logps/chosen": -288.06671142578125, + "logps/rejected": -220.03268432617188, + "loss": 0.0974, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.9301376342773438, + "rewards/margins": 8.675138473510742, + "rewards/rejected": -6.74500036239624, + "step": 2910 + }, + { + "epoch": 1.51, + "learning_rate": 2.764390896921017e-07, + "logits/chosen": -2.7519824504852295, + "logits/rejected": -2.7377419471740723, + "logps/chosen": -290.820068359375, + "logps/rejected": -291.615966796875, + "loss": 0.1115, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.328765392303467, + "rewards/margins": 12.472002029418945, + "rewards/rejected": -8.143235206604004, + "step": 2920 + }, + { + "epoch": 1.51, + "learning_rate": 2.754828839166188e-07, + "logits/chosen": -2.7895121574401855, + "logits/rejected": -2.7734270095825195, + "logps/chosen": -232.82510375976562, + "logps/rejected": -227.81820678710938, + "loss": 0.1025, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.8885366916656494, + "rewards/margins": 10.8139009475708, + "rewards/rejected": -7.925364017486572, + "step": 2930 + }, + { + "epoch": 1.52, + "learning_rate": 2.7452667814113594e-07, + "logits/chosen": -2.8098020553588867, + "logits/rejected": -2.9195146560668945, + "logps/chosen": -235.73355102539062, + "logps/rejected": -235.16580200195312, + "loss": 0.0936, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9349721074104309, + "rewards/margins": 6.663336277008057, + "rewards/rejected": -5.72836446762085, + "step": 2940 + }, + { + "epoch": 1.52, + "learning_rate": 2.7357047236565306e-07, + "logits/chosen": -2.683117389678955, + "logits/rejected": -2.7429752349853516, + "logps/chosen": -268.73150634765625, + "logps/rejected": -295.7161560058594, + "loss": 0.07, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.1788768768310547, + "rewards/margins": 8.738747596740723, + "rewards/rejected": -6.559870719909668, + "step": 2950 + }, + { + "epoch": 1.53, + "learning_rate": 2.7261426659017017e-07, + "logits/chosen": -2.747555732727051, + "logits/rejected": -2.8072476387023926, + "logps/chosen": -287.62353515625, + "logps/rejected": -246.19345092773438, + "loss": 0.0784, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.1814064979553223, + "rewards/margins": 8.878744125366211, + "rewards/rejected": -6.6973371505737305, + "step": 2960 + }, + { + "epoch": 1.53, + "learning_rate": 2.716580608146873e-07, + "logits/chosen": -2.7727088928222656, + "logits/rejected": -2.7909743785858154, + "logps/chosen": -330.4422607421875, + "logps/rejected": -276.64410400390625, + "loss": 0.0662, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.657055377960205, + "rewards/margins": 11.456523895263672, + "rewards/rejected": -7.799468994140625, + "step": 2970 + }, + { + "epoch": 1.54, + "learning_rate": 2.7070185503920446e-07, + "logits/chosen": -2.5788490772247314, + "logits/rejected": -2.6384220123291016, + "logps/chosen": -245.68936157226562, + "logps/rejected": -298.78729248046875, + "loss": 0.0635, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.223114252090454, + "rewards/margins": 10.831456184387207, + "rewards/rejected": -8.608342170715332, + "step": 2980 + }, + { + "epoch": 1.54, + "learning_rate": 2.6974564926372157e-07, + "logits/chosen": -2.7926948070526123, + "logits/rejected": -2.7871780395507812, + "logps/chosen": -282.8335876464844, + "logps/rejected": -262.1839904785156, + "loss": 0.0729, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1277525424957275, + "rewards/margins": 8.252248764038086, + "rewards/rejected": -7.124497413635254, + "step": 2990 + }, + { + "epoch": 1.55, + "learning_rate": 2.687894434882387e-07, + "logits/chosen": -2.743727684020996, + "logits/rejected": -2.712907075881958, + "logps/chosen": -246.3362274169922, + "logps/rejected": -242.5818328857422, + "loss": 0.0617, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.10309407860040665, + "rewards/margins": 6.269896984100342, + "rewards/rejected": -6.372990608215332, + "step": 3000 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -2.768277168273926, + "eval_logits/rejected": -2.7743842601776123, + "eval_logps/chosen": -304.76513671875, + "eval_logps/rejected": -264.7488098144531, + "eval_loss": 0.6421293020248413, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -1.2551521062850952, + "eval_rewards/margins": 2.5061919689178467, + "eval_rewards/rejected": -3.7613439559936523, + "eval_runtime": 217.9227, + "eval_samples_per_second": 9.178, + "eval_steps_per_second": 0.289, + "step": 3000 + }, + { + "epoch": 1.55, + "learning_rate": 2.678332377127558e-07, + "logits/chosen": -2.7960705757141113, + "logits/rejected": -2.8275809288024902, + "logps/chosen": -271.56878662109375, + "logps/rejected": -254.76614379882812, + "loss": 0.0561, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.7444472312927246, + "rewards/margins": 10.54531478881836, + "rewards/rejected": -7.800868034362793, + "step": 3010 + }, + { + "epoch": 1.56, + "learning_rate": 2.668770319372729e-07, + "logits/chosen": -2.6941845417022705, + "logits/rejected": -2.7289962768554688, + "logps/chosen": -231.16708374023438, + "logps/rejected": -205.00234985351562, + "loss": 0.0639, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.4271122217178345, + "rewards/margins": 7.393969535827637, + "rewards/rejected": -5.966856479644775, + "step": 3020 + }, + { + "epoch": 1.56, + "learning_rate": 2.6592082616179004e-07, + "logits/chosen": -2.8552334308624268, + "logits/rejected": -2.7763783931732178, + "logps/chosen": -204.55636596679688, + "logps/rejected": -252.6420440673828, + "loss": 0.087, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9946534037590027, + "rewards/margins": 8.126154899597168, + "rewards/rejected": -7.1315016746521, + "step": 3030 + }, + { + "epoch": 1.57, + "learning_rate": 2.649646203863071e-07, + "logits/chosen": -2.7002310752868652, + "logits/rejected": -2.6688289642333984, + "logps/chosen": -275.127685546875, + "logps/rejected": -310.6485290527344, + "loss": 0.0692, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.3441951274871826, + "rewards/margins": 9.95146656036377, + "rewards/rejected": -7.607271671295166, + "step": 3040 + }, + { + "epoch": 1.57, + "learning_rate": 2.640084146108242e-07, + "logits/chosen": -2.8274757862091064, + "logits/rejected": -2.7921900749206543, + "logps/chosen": -322.8690185546875, + "logps/rejected": -258.6299133300781, + "loss": 0.0643, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.8817846775054932, + "rewards/margins": 10.3388671875, + "rewards/rejected": -8.45708179473877, + "step": 3050 + }, + { + "epoch": 1.58, + "learning_rate": 2.6305220883534133e-07, + "logits/chosen": -2.8685293197631836, + "logits/rejected": -2.913038969039917, + "logps/chosen": -263.8501281738281, + "logps/rejected": -243.12026977539062, + "loss": 0.0602, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.6231520175933838, + "rewards/margins": 7.873045444488525, + "rewards/rejected": -6.249893665313721, + "step": 3060 + }, + { + "epoch": 1.58, + "learning_rate": 2.6209600305985845e-07, + "logits/chosen": -2.801408052444458, + "logits/rejected": -2.7723915576934814, + "logps/chosen": -278.9658203125, + "logps/rejected": -317.4752197265625, + "loss": 0.0815, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.718047857284546, + "rewards/margins": 14.852376937866211, + "rewards/rejected": -12.134328842163086, + "step": 3070 + }, + { + "epoch": 1.59, + "learning_rate": 2.6113979728437556e-07, + "logits/chosen": -2.9154725074768066, + "logits/rejected": -2.9344992637634277, + "logps/chosen": -283.52862548828125, + "logps/rejected": -295.277099609375, + "loss": 0.0795, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.83547043800354, + "rewards/margins": 10.664634704589844, + "rewards/rejected": -7.829165458679199, + "step": 3080 + }, + { + "epoch": 1.6, + "learning_rate": 2.601835915088927e-07, + "logits/chosen": -2.7709438800811768, + "logits/rejected": -2.8170695304870605, + "logps/chosen": -264.36090087890625, + "logps/rejected": -258.5511169433594, + "loss": 0.1273, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.3656535148620605, + "rewards/margins": 9.661222457885742, + "rewards/rejected": -7.29556941986084, + "step": 3090 + }, + { + "epoch": 1.6, + "learning_rate": 2.592273857334098e-07, + "logits/chosen": -2.757802963256836, + "logits/rejected": -2.804326057434082, + "logps/chosen": -237.77841186523438, + "logps/rejected": -249.50381469726562, + "loss": 0.1111, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.2772223949432373, + "rewards/margins": 10.901510238647461, + "rewards/rejected": -7.624286651611328, + "step": 3100 + }, + { + "epoch": 1.61, + "learning_rate": 2.582711799579269e-07, + "logits/chosen": -2.8544392585754395, + "logits/rejected": -2.903831720352173, + "logps/chosen": -302.3487854003906, + "logps/rejected": -274.150390625, + "loss": 0.0762, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.2709031105041504, + "rewards/margins": 10.55983829498291, + "rewards/rejected": -8.288934707641602, + "step": 3110 + }, + { + "epoch": 1.61, + "learning_rate": 2.573149741824441e-07, + "logits/chosen": -2.878246784210205, + "logits/rejected": -2.8712260723114014, + "logps/chosen": -292.5808410644531, + "logps/rejected": -244.95993041992188, + "loss": 0.0604, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7662618160247803, + "rewards/margins": 9.28374195098877, + "rewards/rejected": -7.517480373382568, + "step": 3120 + }, + { + "epoch": 1.62, + "learning_rate": 2.563587684069612e-07, + "logits/chosen": -2.818538188934326, + "logits/rejected": -2.8021979331970215, + "logps/chosen": -297.29669189453125, + "logps/rejected": -280.97515869140625, + "loss": 0.0523, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8910287618637085, + "rewards/margins": 9.954824447631836, + "rewards/rejected": -8.063794136047363, + "step": 3130 + }, + { + "epoch": 1.62, + "learning_rate": 2.554025626314783e-07, + "logits/chosen": -2.8551039695739746, + "logits/rejected": -2.8618719577789307, + "logps/chosen": -267.74383544921875, + "logps/rejected": -318.0247497558594, + "loss": 0.0987, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0268582105636597, + "rewards/margins": 9.777185440063477, + "rewards/rejected": -8.750327110290527, + "step": 3140 + }, + { + "epoch": 1.63, + "learning_rate": 2.544463568559954e-07, + "logits/chosen": -2.856924057006836, + "logits/rejected": -2.859058380126953, + "logps/chosen": -299.3072204589844, + "logps/rejected": -320.89056396484375, + "loss": 0.0578, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.0914320945739746, + "rewards/margins": 11.473106384277344, + "rewards/rejected": -9.381673812866211, + "step": 3150 + }, + { + "epoch": 1.63, + "learning_rate": 2.5349015108051254e-07, + "logits/chosen": -2.784090518951416, + "logits/rejected": -2.8050949573516846, + "logps/chosen": -272.7767333984375, + "logps/rejected": -272.4282531738281, + "loss": 0.067, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9306879043579102, + "rewards/margins": 10.905633926391602, + "rewards/rejected": -8.974946975708008, + "step": 3160 + }, + { + "epoch": 1.64, + "learning_rate": 2.5253394530502966e-07, + "logits/chosen": -2.6712114810943604, + "logits/rejected": -2.8075413703918457, + "logps/chosen": -356.3838195800781, + "logps/rejected": -310.9335021972656, + "loss": 0.0616, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7382097244262695, + "rewards/margins": 10.840738296508789, + "rewards/rejected": -8.102529525756836, + "step": 3170 + }, + { + "epoch": 1.64, + "learning_rate": 2.5157773952954677e-07, + "logits/chosen": -2.894787311553955, + "logits/rejected": -2.7637925148010254, + "logps/chosen": -287.5474853515625, + "logps/rejected": -285.1513977050781, + "loss": 0.0856, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.372936487197876, + "rewards/margins": 11.11128044128418, + "rewards/rejected": -9.738344192504883, + "step": 3180 + }, + { + "epoch": 1.65, + "learning_rate": 2.506215337540639e-07, + "logits/chosen": -2.830756425857544, + "logits/rejected": -2.7069759368896484, + "logps/chosen": -289.5176086425781, + "logps/rejected": -309.98321533203125, + "loss": 0.0699, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2456578016281128, + "rewards/margins": 9.056872367858887, + "rewards/rejected": -7.811214447021484, + "step": 3190 + }, + { + "epoch": 1.65, + "learning_rate": 2.4966532797858095e-07, + "logits/chosen": -2.6839346885681152, + "logits/rejected": -2.648435592651367, + "logps/chosen": -281.0553283691406, + "logps/rejected": -281.7121276855469, + "loss": 0.0623, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.267576217651367, + "rewards/margins": 12.378862380981445, + "rewards/rejected": -10.111286163330078, + "step": 3200 + }, + { + "epoch": 1.66, + "learning_rate": 2.4870912220309807e-07, + "logits/chosen": -2.7548084259033203, + "logits/rejected": -2.929920196533203, + "logps/chosen": -265.11920166015625, + "logps/rejected": -276.0952453613281, + "loss": 0.085, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.494207739830017, + "rewards/margins": 7.784449577331543, + "rewards/rejected": -6.290242671966553, + "step": 3210 + }, + { + "epoch": 1.66, + "learning_rate": 2.477529164276152e-07, + "logits/chosen": -2.8512723445892334, + "logits/rejected": -2.8269026279449463, + "logps/chosen": -289.71673583984375, + "logps/rejected": -290.6452331542969, + "loss": 0.1119, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.928057074546814, + "rewards/margins": 9.863795280456543, + "rewards/rejected": -7.935737609863281, + "step": 3220 + }, + { + "epoch": 1.67, + "learning_rate": 2.4679671065213235e-07, + "logits/chosen": -2.726294994354248, + "logits/rejected": -2.779247999191284, + "logps/chosen": -300.8678894042969, + "logps/rejected": -243.9254608154297, + "loss": 0.0955, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.6631157398223877, + "rewards/margins": 8.425924301147461, + "rewards/rejected": -6.762808322906494, + "step": 3230 + }, + { + "epoch": 1.67, + "learning_rate": 2.4584050487664947e-07, + "logits/chosen": -2.7843034267425537, + "logits/rejected": -2.866281509399414, + "logps/chosen": -301.68731689453125, + "logps/rejected": -259.0189514160156, + "loss": 0.051, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 4.0484724044799805, + "rewards/margins": 12.256793022155762, + "rewards/rejected": -8.208322525024414, + "step": 3240 + }, + { + "epoch": 1.68, + "learning_rate": 2.448842991011666e-07, + "logits/chosen": -2.700822591781616, + "logits/rejected": -2.7363715171813965, + "logps/chosen": -268.10528564453125, + "logps/rejected": -271.51776123046875, + "loss": 0.1079, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7115895748138428, + "rewards/margins": 9.137028694152832, + "rewards/rejected": -7.42543888092041, + "step": 3250 + }, + { + "epoch": 1.68, + "learning_rate": 2.439280933256837e-07, + "logits/chosen": -2.688324451446533, + "logits/rejected": -2.728954792022705, + "logps/chosen": -246.6923065185547, + "logps/rejected": -309.48809814453125, + "loss": 0.0476, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.557828664779663, + "rewards/margins": 11.568731307983398, + "rewards/rejected": -9.010904312133789, + "step": 3260 + }, + { + "epoch": 1.69, + "learning_rate": 2.429718875502008e-07, + "logits/chosen": -2.471862316131592, + "logits/rejected": -2.5184130668640137, + "logps/chosen": -242.50222778320312, + "logps/rejected": -264.163818359375, + "loss": 0.0642, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2356094121932983, + "rewards/margins": 8.016067504882812, + "rewards/rejected": -6.780457973480225, + "step": 3270 + }, + { + "epoch": 1.69, + "learning_rate": 2.420156817747179e-07, + "logits/chosen": -2.7138893604278564, + "logits/rejected": -2.7802371978759766, + "logps/chosen": -322.5417785644531, + "logps/rejected": -268.6466064453125, + "loss": 0.0542, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.545907497406006, + "rewards/margins": 9.485400199890137, + "rewards/rejected": -6.939493656158447, + "step": 3280 + }, + { + "epoch": 1.7, + "learning_rate": 2.41059475999235e-07, + "logits/chosen": -2.6953110694885254, + "logits/rejected": -2.6134727001190186, + "logps/chosen": -237.255126953125, + "logps/rejected": -310.57427978515625, + "loss": 0.0726, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.1554365158081055, + "rewards/margins": 11.051652908325195, + "rewards/rejected": -9.896215438842773, + "step": 3290 + }, + { + "epoch": 1.7, + "learning_rate": 2.4010327022375216e-07, + "logits/chosen": -2.745811939239502, + "logits/rejected": -2.7188100814819336, + "logps/chosen": -318.18414306640625, + "logps/rejected": -325.09307861328125, + "loss": 0.0684, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7801249027252197, + "rewards/margins": 11.05907917022705, + "rewards/rejected": -9.278955459594727, + "step": 3300 + }, + { + "epoch": 1.71, + "learning_rate": 2.391470644482693e-07, + "logits/chosen": -2.652722120285034, + "logits/rejected": -2.693530321121216, + "logps/chosen": -273.9671936035156, + "logps/rejected": -266.43548583984375, + "loss": 0.0536, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9491304159164429, + "rewards/margins": 9.380148887634277, + "rewards/rejected": -7.431018829345703, + "step": 3310 + }, + { + "epoch": 1.71, + "learning_rate": 2.3819085867278636e-07, + "logits/chosen": -2.6838040351867676, + "logits/rejected": -2.68790864944458, + "logps/chosen": -178.25857543945312, + "logps/rejected": -211.74545288085938, + "loss": 0.0883, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.007467269897461, + "rewards/margins": 9.49828052520752, + "rewards/rejected": -8.490812301635742, + "step": 3320 + }, + { + "epoch": 1.72, + "learning_rate": 2.3723465289730348e-07, + "logits/chosen": -2.776559352874756, + "logits/rejected": -2.745678424835205, + "logps/chosen": -306.03582763671875, + "logps/rejected": -242.4042510986328, + "loss": 0.1094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9364816546440125, + "rewards/margins": 7.912406921386719, + "rewards/rejected": -6.975924491882324, + "step": 3330 + }, + { + "epoch": 1.72, + "learning_rate": 2.362784471218206e-07, + "logits/chosen": -2.720956325531006, + "logits/rejected": -2.714653491973877, + "logps/chosen": -295.54034423828125, + "logps/rejected": -292.66986083984375, + "loss": 0.0578, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.876239061355591, + "rewards/margins": 11.462187767028809, + "rewards/rejected": -8.58594799041748, + "step": 3340 + }, + { + "epoch": 1.73, + "learning_rate": 2.353222413463377e-07, + "logits/chosen": -2.737004518508911, + "logits/rejected": -2.8274214267730713, + "logps/chosen": -223.5535125732422, + "logps/rejected": -261.5608215332031, + "loss": 0.0541, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.330844521522522, + "rewards/margins": 9.457793235778809, + "rewards/rejected": -8.126947402954102, + "step": 3350 + }, + { + "epoch": 1.73, + "learning_rate": 2.3436603557085483e-07, + "logits/chosen": -2.814901351928711, + "logits/rejected": -2.721123218536377, + "logps/chosen": -261.89306640625, + "logps/rejected": -271.2594909667969, + "loss": 0.0962, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.48820918798446655, + "rewards/margins": 7.547884941101074, + "rewards/rejected": -7.059675693511963, + "step": 3360 + }, + { + "epoch": 1.74, + "learning_rate": 2.3340982979537197e-07, + "logits/chosen": -2.756761074066162, + "logits/rejected": -2.788041830062866, + "logps/chosen": -338.298583984375, + "logps/rejected": -325.39788818359375, + "loss": 0.0672, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7895655632019043, + "rewards/margins": 11.199186325073242, + "rewards/rejected": -8.40962028503418, + "step": 3370 + }, + { + "epoch": 1.74, + "learning_rate": 2.3245362401988909e-07, + "logits/chosen": -2.832690715789795, + "logits/rejected": -2.772152900695801, + "logps/chosen": -295.143310546875, + "logps/rejected": -312.2070617675781, + "loss": 0.0745, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.858865261077881, + "rewards/margins": 12.961053848266602, + "rewards/rejected": -10.102187156677246, + "step": 3380 + }, + { + "epoch": 1.75, + "learning_rate": 2.314974182444062e-07, + "logits/chosen": -2.787862777709961, + "logits/rejected": -2.790987730026245, + "logps/chosen": -249.3990020751953, + "logps/rejected": -269.93011474609375, + "loss": 0.0478, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25441741943359375, + "rewards/margins": 9.50505256652832, + "rewards/rejected": -9.250636100769043, + "step": 3390 + }, + { + "epoch": 1.76, + "learning_rate": 2.305412124689233e-07, + "logits/chosen": -2.619658946990967, + "logits/rejected": -2.61407470703125, + "logps/chosen": -305.6173095703125, + "logps/rejected": -324.3746032714844, + "loss": 0.0704, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.354332685470581, + "rewards/margins": 13.424585342407227, + "rewards/rejected": -10.070249557495117, + "step": 3400 + }, + { + "epoch": 1.76, + "learning_rate": 2.295850066934404e-07, + "logits/chosen": -2.713583469390869, + "logits/rejected": -2.6319689750671387, + "logps/chosen": -251.8077392578125, + "logps/rejected": -264.23101806640625, + "loss": 0.0685, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.20004582405090332, + "rewards/margins": 8.175451278686523, + "rewards/rejected": -8.375497817993164, + "step": 3410 + }, + { + "epoch": 1.77, + "learning_rate": 2.2862880091795752e-07, + "logits/chosen": -2.6867470741271973, + "logits/rejected": -2.723292350769043, + "logps/chosen": -252.469482421875, + "logps/rejected": -334.07525634765625, + "loss": 0.0726, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7798032760620117, + "rewards/margins": 11.993390083312988, + "rewards/rejected": -10.213586807250977, + "step": 3420 + }, + { + "epoch": 1.77, + "learning_rate": 2.2767259514247464e-07, + "logits/chosen": -2.610609292984009, + "logits/rejected": -2.564879894256592, + "logps/chosen": -269.3361511230469, + "logps/rejected": -295.60821533203125, + "loss": 0.0734, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.9466493129730225, + "rewards/margins": 10.422674179077148, + "rewards/rejected": -8.476022720336914, + "step": 3430 + }, + { + "epoch": 1.78, + "learning_rate": 2.2671638936699178e-07, + "logits/chosen": -2.499741315841675, + "logits/rejected": -2.4898147583007812, + "logps/chosen": -280.90838623046875, + "logps/rejected": -286.2072448730469, + "loss": 0.0726, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.3896098136901855, + "rewards/margins": 10.770309448242188, + "rewards/rejected": -8.380699157714844, + "step": 3440 + }, + { + "epoch": 1.78, + "learning_rate": 2.257601835915089e-07, + "logits/chosen": -2.6703405380249023, + "logits/rejected": -2.68867826461792, + "logps/chosen": -284.33984375, + "logps/rejected": -301.51214599609375, + "loss": 0.0734, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.9578365087509155, + "rewards/margins": 10.313423156738281, + "rewards/rejected": -8.35558795928955, + "step": 3450 + }, + { + "epoch": 1.79, + "learning_rate": 2.24803977816026e-07, + "logits/chosen": -2.575701951980591, + "logits/rejected": -2.7522482872009277, + "logps/chosen": -222.56655883789062, + "logps/rejected": -213.4800262451172, + "loss": 0.1307, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.8081939220428467, + "rewards/margins": 10.595215797424316, + "rewards/rejected": -8.787023544311523, + "step": 3460 + }, + { + "epoch": 1.79, + "learning_rate": 2.2384777204054313e-07, + "logits/chosen": -2.7906508445739746, + "logits/rejected": -2.7896149158477783, + "logps/chosen": -276.54205322265625, + "logps/rejected": -303.80804443359375, + "loss": 0.1239, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.196168422698975, + "rewards/margins": 13.602836608886719, + "rewards/rejected": -9.406667709350586, + "step": 3470 + }, + { + "epoch": 1.8, + "learning_rate": 2.2289156626506022e-07, + "logits/chosen": -2.6736395359039307, + "logits/rejected": -2.5446834564208984, + "logps/chosen": -238.1494598388672, + "logps/rejected": -265.4019470214844, + "loss": 0.0708, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.9061222076416016, + "rewards/margins": 9.534143447875977, + "rewards/rejected": -7.6280198097229, + "step": 3480 + }, + { + "epoch": 1.8, + "learning_rate": 2.2193536048957733e-07, + "logits/chosen": -2.764954090118408, + "logits/rejected": -2.8380179405212402, + "logps/chosen": -241.90158081054688, + "logps/rejected": -259.67919921875, + "loss": 0.0755, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5641316175460815, + "rewards/margins": 8.558606147766113, + "rewards/rejected": -6.994473934173584, + "step": 3490 + }, + { + "epoch": 1.81, + "learning_rate": 2.2097915471409445e-07, + "logits/chosen": -2.772516965866089, + "logits/rejected": -2.728278398513794, + "logps/chosen": -304.7802734375, + "logps/rejected": -286.6618347167969, + "loss": 0.0765, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.4173293113708496, + "rewards/margins": 10.533430099487305, + "rewards/rejected": -8.116101264953613, + "step": 3500 + }, + { + "epoch": 1.81, + "eval_logits/chosen": -2.738905668258667, + "eval_logits/rejected": -2.740257501602173, + "eval_logps/chosen": -304.2354431152344, + "eval_logps/rejected": -266.13909912109375, + "eval_loss": 0.6581782102584839, + "eval_rewards/accuracies": 0.7658730149269104, + "eval_rewards/chosen": -1.149203896522522, + "eval_rewards/margins": 2.890200614929199, + "eval_rewards/rejected": -4.03940486907959, + "eval_runtime": 217.4993, + "eval_samples_per_second": 9.195, + "eval_steps_per_second": 0.29, + "step": 3500 + }, + { + "epoch": 1.81, + "learning_rate": 2.200229489386116e-07, + "logits/chosen": -2.7111144065856934, + "logits/rejected": -2.670027017593384, + "logps/chosen": -250.5125732421875, + "logps/rejected": -249.6392364501953, + "loss": 0.0825, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3286582231521606, + "rewards/margins": 8.792272567749023, + "rewards/rejected": -7.463613986968994, + "step": 3510 + }, + { + "epoch": 1.82, + "learning_rate": 2.190667431631287e-07, + "logits/chosen": -2.7528154850006104, + "logits/rejected": -2.7122788429260254, + "logps/chosen": -278.01910400390625, + "logps/rejected": -313.9847106933594, + "loss": 0.0943, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 2.2548537254333496, + "rewards/margins": 11.679734230041504, + "rewards/rejected": -9.424880027770996, + "step": 3520 + }, + { + "epoch": 1.82, + "learning_rate": 2.1811053738764582e-07, + "logits/chosen": -2.5960257053375244, + "logits/rejected": -2.6290123462677, + "logps/chosen": -204.6955108642578, + "logps/rejected": -282.3271179199219, + "loss": 0.0602, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0323585271835327, + "rewards/margins": 8.045377731323242, + "rewards/rejected": -7.013019561767578, + "step": 3530 + }, + { + "epoch": 1.83, + "learning_rate": 2.1715433161216294e-07, + "logits/chosen": -2.7657580375671387, + "logits/rejected": -2.744833469390869, + "logps/chosen": -288.4944152832031, + "logps/rejected": -280.72955322265625, + "loss": 0.055, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.4996562004089355, + "rewards/margins": 10.434306144714355, + "rewards/rejected": -7.9346513748168945, + "step": 3540 + }, + { + "epoch": 1.83, + "learning_rate": 2.1619812583668005e-07, + "logits/chosen": -2.682262897491455, + "logits/rejected": -2.7337307929992676, + "logps/chosen": -257.96453857421875, + "logps/rejected": -262.64801025390625, + "loss": 0.0884, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6263707876205444, + "rewards/margins": 8.416780471801758, + "rewards/rejected": -7.790409088134766, + "step": 3550 + }, + { + "epoch": 1.84, + "learning_rate": 2.1524192006119714e-07, + "logits/chosen": -2.66699481010437, + "logits/rejected": -2.7092580795288086, + "logps/chosen": -260.70819091796875, + "logps/rejected": -278.57171630859375, + "loss": 0.067, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.16346995532512665, + "rewards/margins": 7.7417144775390625, + "rewards/rejected": -7.578243255615234, + "step": 3560 + }, + { + "epoch": 1.84, + "learning_rate": 2.1428571428571426e-07, + "logits/chosen": -2.829007863998413, + "logits/rejected": -2.7939159870147705, + "logps/chosen": -281.33416748046875, + "logps/rejected": -275.6802978515625, + "loss": 0.0438, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2156023979187012, + "rewards/margins": 9.397723197937012, + "rewards/rejected": -8.182120323181152, + "step": 3570 + }, + { + "epoch": 1.85, + "learning_rate": 2.133295085102314e-07, + "logits/chosen": -2.5332906246185303, + "logits/rejected": -2.5238547325134277, + "logps/chosen": -265.1497497558594, + "logps/rejected": -257.6131591796875, + "loss": 0.0395, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2848436832427979, + "rewards/margins": 10.311678886413574, + "rewards/rejected": -9.026835441589355, + "step": 3580 + }, + { + "epoch": 1.85, + "learning_rate": 2.1237330273474851e-07, + "logits/chosen": -2.7879021167755127, + "logits/rejected": -2.816457509994507, + "logps/chosen": -330.1148986816406, + "logps/rejected": -284.9937438964844, + "loss": 0.0694, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7244058847427368, + "rewards/margins": 9.594953536987305, + "rewards/rejected": -7.870547294616699, + "step": 3590 + }, + { + "epoch": 1.86, + "learning_rate": 2.1141709695926563e-07, + "logits/chosen": -2.7168538570404053, + "logits/rejected": -2.7340972423553467, + "logps/chosen": -250.33151245117188, + "logps/rejected": -226.6660614013672, + "loss": 0.0641, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.10770583152771, + "rewards/margins": 7.989068508148193, + "rewards/rejected": -6.881363868713379, + "step": 3600 + }, + { + "epoch": 1.86, + "learning_rate": 2.1046089118378275e-07, + "logits/chosen": -2.746110439300537, + "logits/rejected": -2.7883448600769043, + "logps/chosen": -253.4336395263672, + "logps/rejected": -300.6252136230469, + "loss": 0.0681, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5055543184280396, + "rewards/margins": 11.491781234741211, + "rewards/rejected": -9.986227035522461, + "step": 3610 + }, + { + "epoch": 1.87, + "learning_rate": 2.0950468540829986e-07, + "logits/chosen": -2.7674827575683594, + "logits/rejected": -2.760319471359253, + "logps/chosen": -262.53131103515625, + "logps/rejected": -249.01333618164062, + "loss": 0.0827, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8487039804458618, + "rewards/margins": 9.199454307556152, + "rewards/rejected": -8.350749969482422, + "step": 3620 + }, + { + "epoch": 1.87, + "learning_rate": 2.0854847963281698e-07, + "logits/chosen": -2.604976177215576, + "logits/rejected": -2.541404962539673, + "logps/chosen": -246.70480346679688, + "logps/rejected": -250.37417602539062, + "loss": 0.0496, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.391180396080017, + "rewards/margins": 10.575980186462402, + "rewards/rejected": -9.184799194335938, + "step": 3630 + }, + { + "epoch": 1.88, + "learning_rate": 2.0759227385733407e-07, + "logits/chosen": -2.5545432567596436, + "logits/rejected": -2.6954281330108643, + "logps/chosen": -311.905029296875, + "logps/rejected": -267.7748107910156, + "loss": 0.0806, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.2542879581451416, + "rewards/margins": 9.12138557434082, + "rewards/rejected": -7.8670973777771, + "step": 3640 + }, + { + "epoch": 1.88, + "learning_rate": 2.066360680818512e-07, + "logits/chosen": -2.7910752296447754, + "logits/rejected": -2.8306946754455566, + "logps/chosen": -359.8431091308594, + "logps/rejected": -279.3539733886719, + "loss": 0.0671, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.022540330886841, + "rewards/margins": 10.497331619262695, + "rewards/rejected": -8.47479248046875, + "step": 3650 + }, + { + "epoch": 1.89, + "learning_rate": 2.0567986230636832e-07, + "logits/chosen": -2.7704267501831055, + "logits/rejected": -2.77883243560791, + "logps/chosen": -235.0912628173828, + "logps/rejected": -271.0329895019531, + "loss": 0.1054, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.019496440887451, + "rewards/margins": 10.298086166381836, + "rewards/rejected": -8.278589248657227, + "step": 3660 + }, + { + "epoch": 1.89, + "learning_rate": 2.0472365653088544e-07, + "logits/chosen": -2.720661163330078, + "logits/rejected": -2.725130558013916, + "logps/chosen": -275.77752685546875, + "logps/rejected": -312.16265869140625, + "loss": 0.0516, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19810207188129425, + "rewards/margins": 8.972552299499512, + "rewards/rejected": -8.774450302124023, + "step": 3670 + }, + { + "epoch": 1.9, + "learning_rate": 2.0376745075540256e-07, + "logits/chosen": -2.833644390106201, + "logits/rejected": -2.8849120140075684, + "logps/chosen": -320.95904541015625, + "logps/rejected": -268.92987060546875, + "loss": 0.0512, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2254230976104736, + "rewards/margins": 9.793492317199707, + "rewards/rejected": -8.568068504333496, + "step": 3680 + }, + { + "epoch": 1.91, + "learning_rate": 2.0281124497991967e-07, + "logits/chosen": -2.8426222801208496, + "logits/rejected": -2.813960313796997, + "logps/chosen": -240.68954467773438, + "logps/rejected": -312.35589599609375, + "loss": 0.0968, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.614633560180664, + "rewards/margins": 11.83614444732666, + "rewards/rejected": -9.22150993347168, + "step": 3690 + }, + { + "epoch": 1.91, + "learning_rate": 2.018550392044368e-07, + "logits/chosen": -2.8380324840545654, + "logits/rejected": -2.798767566680908, + "logps/chosen": -294.00396728515625, + "logps/rejected": -365.79522705078125, + "loss": 0.0533, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.709501266479492, + "rewards/margins": 12.373403549194336, + "rewards/rejected": -9.663900375366211, + "step": 3700 + }, + { + "epoch": 1.92, + "learning_rate": 2.0089883342895388e-07, + "logits/chosen": -2.806723117828369, + "logits/rejected": -2.939481258392334, + "logps/chosen": -275.45538330078125, + "logps/rejected": -234.10690307617188, + "loss": 0.0738, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2928574085235596, + "rewards/margins": 9.613503456115723, + "rewards/rejected": -8.320646286010742, + "step": 3710 + }, + { + "epoch": 1.92, + "learning_rate": 1.9994262765347102e-07, + "logits/chosen": -2.764084815979004, + "logits/rejected": -2.790250301361084, + "logps/chosen": -277.35833740234375, + "logps/rejected": -220.1525421142578, + "loss": 0.0817, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.888009250164032, + "rewards/margins": 8.029479026794434, + "rewards/rejected": -7.141470432281494, + "step": 3720 + }, + { + "epoch": 1.93, + "learning_rate": 1.9898642187798813e-07, + "logits/chosen": -2.640573501586914, + "logits/rejected": -2.656597137451172, + "logps/chosen": -292.8527526855469, + "logps/rejected": -301.4468688964844, + "loss": 0.0494, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1724305152893066, + "rewards/margins": 11.529372215270996, + "rewards/rejected": -9.356943130493164, + "step": 3730 + }, + { + "epoch": 1.93, + "learning_rate": 1.9803021610250525e-07, + "logits/chosen": -2.7659599781036377, + "logits/rejected": -2.8064372539520264, + "logps/chosen": -246.72659301757812, + "logps/rejected": -294.17572021484375, + "loss": 0.0785, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.036099672317505, + "rewards/margins": 11.232912063598633, + "rewards/rejected": -9.196812629699707, + "step": 3740 + }, + { + "epoch": 1.94, + "learning_rate": 1.9707401032702237e-07, + "logits/chosen": -2.65791916847229, + "logits/rejected": -2.7072339057922363, + "logps/chosen": -232.99755859375, + "logps/rejected": -272.58465576171875, + "loss": 0.0484, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.425924301147461, + "rewards/margins": 10.325573921203613, + "rewards/rejected": -8.899649620056152, + "step": 3750 + }, + { + "epoch": 1.94, + "learning_rate": 1.9611780455153948e-07, + "logits/chosen": -2.799206256866455, + "logits/rejected": -2.832388401031494, + "logps/chosen": -259.90631103515625, + "logps/rejected": -268.10015869140625, + "loss": 0.0846, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.051642894744873, + "rewards/margins": 10.926515579223633, + "rewards/rejected": -8.874872207641602, + "step": 3760 + }, + { + "epoch": 1.95, + "learning_rate": 1.951615987760566e-07, + "logits/chosen": -2.5179781913757324, + "logits/rejected": -2.6008613109588623, + "logps/chosen": -233.42056274414062, + "logps/rejected": -251.9779815673828, + "loss": 0.063, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4882890582084656, + "rewards/margins": 8.425145149230957, + "rewards/rejected": -7.936856269836426, + "step": 3770 + }, + { + "epoch": 1.95, + "learning_rate": 1.942053930005737e-07, + "logits/chosen": -2.9028313159942627, + "logits/rejected": -2.8197319507598877, + "logps/chosen": -275.57598876953125, + "logps/rejected": -298.7084045410156, + "loss": 0.0418, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8865839242935181, + "rewards/margins": 10.174108505249023, + "rewards/rejected": -9.287524223327637, + "step": 3780 + }, + { + "epoch": 1.96, + "learning_rate": 1.9324918722509086e-07, + "logits/chosen": -2.688558578491211, + "logits/rejected": -2.711604356765747, + "logps/chosen": -250.48641967773438, + "logps/rejected": -235.5607147216797, + "loss": 0.0717, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9537031054496765, + "rewards/margins": 8.950319290161133, + "rewards/rejected": -7.996614933013916, + "step": 3790 + }, + { + "epoch": 1.96, + "learning_rate": 1.9229298144960794e-07, + "logits/chosen": -2.6265201568603516, + "logits/rejected": -2.643637180328369, + "logps/chosen": -274.25970458984375, + "logps/rejected": -283.9968566894531, + "loss": 0.0796, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.4979665279388428, + "rewards/margins": 10.394436836242676, + "rewards/rejected": -8.896470069885254, + "step": 3800 + }, + { + "epoch": 1.97, + "learning_rate": 1.9133677567412506e-07, + "logits/chosen": -2.630946159362793, + "logits/rejected": -2.723823070526123, + "logps/chosen": -282.9399108886719, + "logps/rejected": -233.25167846679688, + "loss": 0.1037, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.8352066278457642, + "rewards/margins": 8.478796005249023, + "rewards/rejected": -6.643589973449707, + "step": 3810 + }, + { + "epoch": 1.97, + "learning_rate": 1.9038056989864218e-07, + "logits/chosen": -2.7855098247528076, + "logits/rejected": -2.7989068031311035, + "logps/chosen": -274.2118835449219, + "logps/rejected": -282.32708740234375, + "loss": 0.0916, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.546886920928955, + "rewards/margins": 10.067007064819336, + "rewards/rejected": -7.520121097564697, + "step": 3820 + }, + { + "epoch": 1.98, + "learning_rate": 1.894243641231593e-07, + "logits/chosen": -2.4910149574279785, + "logits/rejected": -2.501765727996826, + "logps/chosen": -235.5742950439453, + "logps/rejected": -227.42919921875, + "loss": 0.0827, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2598899006843567, + "rewards/margins": 7.640264987945557, + "rewards/rejected": -7.380374908447266, + "step": 3830 + }, + { + "epoch": 1.98, + "learning_rate": 1.884681583476764e-07, + "logits/chosen": -2.649984836578369, + "logits/rejected": -2.794214963912964, + "logps/chosen": -231.7446746826172, + "logps/rejected": -270.55938720703125, + "loss": 0.0427, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4616106152534485, + "rewards/margins": 7.806038856506348, + "rewards/rejected": -7.344427585601807, + "step": 3840 + }, + { + "epoch": 1.99, + "learning_rate": 1.8751195257219352e-07, + "logits/chosen": -2.6904454231262207, + "logits/rejected": -2.7609245777130127, + "logps/chosen": -256.99359130859375, + "logps/rejected": -285.15380859375, + "loss": 0.0941, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.14900550246238708, + "rewards/margins": 7.018132209777832, + "rewards/rejected": -6.8691277503967285, + "step": 3850 + }, + { + "epoch": 1.99, + "learning_rate": 1.8655574679671067e-07, + "logits/chosen": -2.773998737335205, + "logits/rejected": -2.675830125808716, + "logps/chosen": -249.72525024414062, + "logps/rejected": -265.43524169921875, + "loss": 0.0859, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5194703340530396, + "rewards/margins": 10.313286781311035, + "rewards/rejected": -8.793817520141602, + "step": 3860 + }, + { + "epoch": 2.0, + "learning_rate": 1.8559954102122778e-07, + "logits/chosen": -2.656508684158325, + "logits/rejected": -2.6907639503479004, + "logps/chosen": -237.22695922851562, + "logps/rejected": -256.4031982421875, + "loss": 0.0951, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.107354760169983, + "rewards/margins": 7.762026786804199, + "rewards/rejected": -6.654671669006348, + "step": 3870 + }, + { + "epoch": 2.0, + "learning_rate": 1.8464333524574487e-07, + "logits/chosen": -2.6225860118865967, + "logits/rejected": -2.629585027694702, + "logps/chosen": -197.79290771484375, + "logps/rejected": -238.13485717773438, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3901655673980713, + "rewards/margins": 8.894407272338867, + "rewards/rejected": -7.504241943359375, + "step": 3880 + }, + { + "epoch": 2.01, + "learning_rate": 1.8368712947026199e-07, + "logits/chosen": -2.7568469047546387, + "logits/rejected": -2.792118549346924, + "logps/chosen": -279.4228820800781, + "logps/rejected": -296.11126708984375, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8633215427398682, + "rewards/margins": 9.97962760925293, + "rewards/rejected": -8.11630630493164, + "step": 3890 + }, + { + "epoch": 2.01, + "learning_rate": 1.827309236947791e-07, + "logits/chosen": -2.508894681930542, + "logits/rejected": -2.627253293991089, + "logps/chosen": -167.35305786132812, + "logps/rejected": -272.1786193847656, + "loss": 0.0206, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3741319179534912, + "rewards/margins": 11.488214492797852, + "rewards/rejected": -10.114082336425781, + "step": 3900 + }, + { + "epoch": 2.02, + "learning_rate": 1.8177471791929622e-07, + "logits/chosen": -2.562891960144043, + "logits/rejected": -2.627436876296997, + "logps/chosen": -219.225830078125, + "logps/rejected": -294.67742919921875, + "loss": 0.0332, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6091563701629639, + "rewards/margins": 11.577367782592773, + "rewards/rejected": -9.968213081359863, + "step": 3910 + }, + { + "epoch": 2.02, + "learning_rate": 1.8081851214381333e-07, + "logits/chosen": -2.6254868507385254, + "logits/rejected": -2.662242889404297, + "logps/chosen": -272.97344970703125, + "logps/rejected": -386.03802490234375, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5487847328186035, + "rewards/margins": 13.016972541809082, + "rewards/rejected": -11.468185424804688, + "step": 3920 + }, + { + "epoch": 2.03, + "learning_rate": 1.7986230636833047e-07, + "logits/chosen": -2.7052996158599854, + "logits/rejected": -2.6561102867126465, + "logps/chosen": -184.27590942382812, + "logps/rejected": -234.31112670898438, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5409653782844543, + "rewards/margins": 8.709564208984375, + "rewards/rejected": -8.168600082397461, + "step": 3930 + }, + { + "epoch": 2.03, + "learning_rate": 1.789061005928476e-07, + "logits/chosen": -2.7472193241119385, + "logits/rejected": -2.7598049640655518, + "logps/chosen": -289.13043212890625, + "logps/rejected": -259.87261962890625, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.779617428779602, + "rewards/margins": 9.105241775512695, + "rewards/rejected": -8.325624465942383, + "step": 3940 + }, + { + "epoch": 2.04, + "learning_rate": 1.7794989481736468e-07, + "logits/chosen": -2.646831512451172, + "logits/rejected": -2.6080241203308105, + "logps/chosen": -234.96249389648438, + "logps/rejected": -258.51947021484375, + "loss": 0.0222, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3543825149536133, + "rewards/margins": 10.9358549118042, + "rewards/rejected": -9.581473350524902, + "step": 3950 + }, + { + "epoch": 2.04, + "learning_rate": 1.769936890418818e-07, + "logits/chosen": -2.6132800579071045, + "logits/rejected": -2.6494579315185547, + "logps/chosen": -269.6546325683594, + "logps/rejected": -271.829833984375, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.320793390274048, + "rewards/margins": 10.98828411102295, + "rewards/rejected": -8.667490005493164, + "step": 3960 + }, + { + "epoch": 2.05, + "learning_rate": 1.760374832663989e-07, + "logits/chosen": -2.6040878295898438, + "logits/rejected": -2.5603089332580566, + "logps/chosen": -298.0629577636719, + "logps/rejected": -297.24237060546875, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.069619655609131, + "rewards/margins": 13.027959823608398, + "rewards/rejected": -9.958338737487793, + "step": 3970 + }, + { + "epoch": 2.05, + "learning_rate": 1.7508127749091603e-07, + "logits/chosen": -2.7057785987854004, + "logits/rejected": -2.7334468364715576, + "logps/chosen": -260.33624267578125, + "logps/rejected": -262.96954345703125, + "loss": 0.0121, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2088730335235596, + "rewards/margins": 10.28221607208252, + "rewards/rejected": -9.073343276977539, + "step": 3980 + }, + { + "epoch": 2.06, + "learning_rate": 1.7412507171543314e-07, + "logits/chosen": -2.7243123054504395, + "logits/rejected": -2.814331293106079, + "logps/chosen": -309.1561584472656, + "logps/rejected": -260.5794677734375, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2261295318603516, + "rewards/margins": 11.2640962600708, + "rewards/rejected": -10.03796672821045, + "step": 3990 + }, + { + "epoch": 2.07, + "learning_rate": 1.7316886593995028e-07, + "logits/chosen": -2.729241371154785, + "logits/rejected": -2.680410861968994, + "logps/chosen": -240.87356567382812, + "logps/rejected": -250.3045654296875, + "loss": 0.0178, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6860274076461792, + "rewards/margins": 9.379659652709961, + "rewards/rejected": -8.693634033203125, + "step": 4000 + }, + { + "epoch": 2.07, + "eval_logits/chosen": -2.7272632122039795, + "eval_logits/rejected": -2.731018304824829, + "eval_logps/chosen": -307.7316589355469, + "eval_logps/rejected": -272.21661376953125, + "eval_loss": 0.6797224283218384, + "eval_rewards/accuracies": 0.761904776096344, + "eval_rewards/chosen": -1.8484528064727783, + "eval_rewards/margins": 3.40644907951355, + "eval_rewards/rejected": -5.254901885986328, + "eval_runtime": 218.0384, + "eval_samples_per_second": 9.173, + "eval_steps_per_second": 0.289, + "step": 4000 + }, + { + "epoch": 2.07, + "learning_rate": 1.722126601644674e-07, + "logits/chosen": -2.733309745788574, + "logits/rejected": -2.81620717048645, + "logps/chosen": -298.2601013183594, + "logps/rejected": -286.43121337890625, + "loss": 0.0125, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6399898529052734, + "rewards/margins": 10.189987182617188, + "rewards/rejected": -9.549996376037598, + "step": 4010 + }, + { + "epoch": 2.08, + "learning_rate": 1.7125645438898452e-07, + "logits/chosen": -2.754019021987915, + "logits/rejected": -2.7992444038391113, + "logps/chosen": -292.0843505859375, + "logps/rejected": -278.53179931640625, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7986736297607422, + "rewards/margins": 11.726593017578125, + "rewards/rejected": -9.927919387817383, + "step": 4020 + }, + { + "epoch": 2.08, + "learning_rate": 1.703002486135016e-07, + "logits/chosen": -2.6090283393859863, + "logits/rejected": -2.6274030208587646, + "logps/chosen": -247.7549591064453, + "logps/rejected": -275.93341064453125, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.168842077255249, + "rewards/margins": 13.396291732788086, + "rewards/rejected": -11.227449417114258, + "step": 4030 + }, + { + "epoch": 2.09, + "learning_rate": 1.6934404283801872e-07, + "logits/chosen": -2.715796947479248, + "logits/rejected": -2.5600688457489014, + "logps/chosen": -269.89971923828125, + "logps/rejected": -321.0479431152344, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.061984062194824, + "rewards/margins": 16.50414276123047, + "rewards/rejected": -13.442158699035645, + "step": 4040 + }, + { + "epoch": 2.09, + "learning_rate": 1.6838783706253584e-07, + "logits/chosen": -2.722234010696411, + "logits/rejected": -2.648040294647217, + "logps/chosen": -201.85948181152344, + "logps/rejected": -215.13235473632812, + "loss": 0.0357, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3728597164154053, + "rewards/margins": 9.69245433807373, + "rewards/rejected": -8.319594383239746, + "step": 4050 + }, + { + "epoch": 2.1, + "learning_rate": 1.6743163128705295e-07, + "logits/chosen": -2.7006664276123047, + "logits/rejected": -2.738062858581543, + "logps/chosen": -271.821533203125, + "logps/rejected": -308.8503723144531, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8506834506988525, + "rewards/margins": 13.487665176391602, + "rewards/rejected": -11.636982917785645, + "step": 4060 + }, + { + "epoch": 2.1, + "learning_rate": 1.664754255115701e-07, + "logits/chosen": -2.6540932655334473, + "logits/rejected": -2.6025888919830322, + "logps/chosen": -205.8916778564453, + "logps/rejected": -249.5850067138672, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.721344470977783, + "rewards/margins": 12.787909507751465, + "rewards/rejected": -10.066566467285156, + "step": 4070 + }, + { + "epoch": 2.11, + "learning_rate": 1.655192197360872e-07, + "logits/chosen": -2.8125226497650146, + "logits/rejected": -2.768751621246338, + "logps/chosen": -287.2244873046875, + "logps/rejected": -293.54669189453125, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.948952853679657, + "rewards/margins": 11.471153259277344, + "rewards/rejected": -10.522199630737305, + "step": 4080 + }, + { + "epoch": 2.11, + "learning_rate": 1.6456301396060433e-07, + "logits/chosen": -2.6349759101867676, + "logits/rejected": -2.7236363887786865, + "logps/chosen": -278.74725341796875, + "logps/rejected": -290.44500732421875, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.579256296157837, + "rewards/margins": 9.917515754699707, + "rewards/rejected": -8.338258743286133, + "step": 4090 + }, + { + "epoch": 2.12, + "learning_rate": 1.6360680818512144e-07, + "logits/chosen": -2.8475401401519775, + "logits/rejected": -2.7553558349609375, + "logps/chosen": -308.8858337402344, + "logps/rejected": -360.74920654296875, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8224369287490845, + "rewards/margins": 11.919973373413086, + "rewards/rejected": -10.097536087036133, + "step": 4100 + }, + { + "epoch": 2.12, + "learning_rate": 1.6265060240963853e-07, + "logits/chosen": -2.769975185394287, + "logits/rejected": -2.719059467315674, + "logps/chosen": -308.78369140625, + "logps/rejected": -271.7587890625, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.12917160987854, + "rewards/margins": 12.400825500488281, + "rewards/rejected": -10.27165412902832, + "step": 4110 + }, + { + "epoch": 2.13, + "learning_rate": 1.6169439663415565e-07, + "logits/chosen": -2.7416560649871826, + "logits/rejected": -2.804804801940918, + "logps/chosen": -285.85491943359375, + "logps/rejected": -307.9002380371094, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2552881240844727, + "rewards/margins": 13.406854629516602, + "rewards/rejected": -11.151565551757812, + "step": 4120 + }, + { + "epoch": 2.13, + "learning_rate": 1.6073819085867276e-07, + "logits/chosen": -2.8296265602111816, + "logits/rejected": -2.805642604827881, + "logps/chosen": -274.7913513183594, + "logps/rejected": -383.2001647949219, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.098189353942871, + "rewards/margins": 12.88170337677002, + "rewards/rejected": -10.783514022827148, + "step": 4130 + }, + { + "epoch": 2.14, + "learning_rate": 1.597819850831899e-07, + "logits/chosen": -2.7971558570861816, + "logits/rejected": -2.7784924507141113, + "logps/chosen": -230.7209930419922, + "logps/rejected": -262.358642578125, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9080710411071777, + "rewards/margins": 13.876028060913086, + "rewards/rejected": -10.96795654296875, + "step": 4140 + }, + { + "epoch": 2.14, + "learning_rate": 1.5882577930770702e-07, + "logits/chosen": -2.7370054721832275, + "logits/rejected": -2.744947671890259, + "logps/chosen": -256.64117431640625, + "logps/rejected": -260.4443664550781, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3169360160827637, + "rewards/margins": 13.783903121948242, + "rewards/rejected": -10.46696662902832, + "step": 4150 + }, + { + "epoch": 2.15, + "learning_rate": 1.5786957353222414e-07, + "logits/chosen": -2.7004518508911133, + "logits/rejected": -2.7562923431396484, + "logps/chosen": -271.4837341308594, + "logps/rejected": -229.91641235351562, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.476590633392334, + "rewards/margins": 10.639229774475098, + "rewards/rejected": -9.162638664245605, + "step": 4160 + }, + { + "epoch": 2.15, + "learning_rate": 1.5691336775674125e-07, + "logits/chosen": -2.6011502742767334, + "logits/rejected": -2.620133399963379, + "logps/chosen": -261.44512939453125, + "logps/rejected": -240.9309844970703, + "loss": 0.0192, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5351409912109375, + "rewards/margins": 11.039804458618164, + "rewards/rejected": -9.504663467407227, + "step": 4170 + }, + { + "epoch": 2.16, + "learning_rate": 1.5595716198125837e-07, + "logits/chosen": -2.848597288131714, + "logits/rejected": -2.875506639480591, + "logps/chosen": -337.51287841796875, + "logps/rejected": -337.42193603515625, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44808346033096313, + "rewards/margins": 11.492895126342773, + "rewards/rejected": -11.044811248779297, + "step": 4180 + }, + { + "epoch": 2.16, + "learning_rate": 1.5500095620577546e-07, + "logits/chosen": -2.825258255004883, + "logits/rejected": -2.8877339363098145, + "logps/chosen": -259.0141296386719, + "logps/rejected": -222.65609741210938, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.203387975692749, + "rewards/margins": 9.037870407104492, + "rewards/rejected": -7.8344831466674805, + "step": 4190 + }, + { + "epoch": 2.17, + "learning_rate": 1.5404475043029257e-07, + "logits/chosen": -2.7203478813171387, + "logits/rejected": -2.6974122524261475, + "logps/chosen": -261.0373840332031, + "logps/rejected": -284.78057861328125, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7423670887947083, + "rewards/margins": 10.860982894897461, + "rewards/rejected": -10.118616104125977, + "step": 4200 + }, + { + "epoch": 2.17, + "learning_rate": 1.5308854465480971e-07, + "logits/chosen": -2.797438144683838, + "logits/rejected": -2.789064407348633, + "logps/chosen": -262.25006103515625, + "logps/rejected": -250.6930694580078, + "loss": 0.015, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2764910459518433, + "rewards/margins": 9.582831382751465, + "rewards/rejected": -8.306341171264648, + "step": 4210 + }, + { + "epoch": 2.18, + "learning_rate": 1.5213233887932683e-07, + "logits/chosen": -2.72556734085083, + "logits/rejected": -2.7338197231292725, + "logps/chosen": -296.2276916503906, + "logps/rejected": -277.2362976074219, + "loss": 0.0081, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.4097917079925537, + "rewards/margins": 10.847259521484375, + "rewards/rejected": -9.437466621398926, + "step": 4220 + }, + { + "epoch": 2.18, + "learning_rate": 1.5117613310384395e-07, + "logits/chosen": -2.7299015522003174, + "logits/rejected": -2.768979549407959, + "logps/chosen": -248.88082885742188, + "logps/rejected": -251.6071319580078, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7584762573242188, + "rewards/margins": 11.731807708740234, + "rewards/rejected": -9.973332405090332, + "step": 4230 + }, + { + "epoch": 2.19, + "learning_rate": 1.5021992732836106e-07, + "logits/chosen": -2.723501205444336, + "logits/rejected": -2.798146963119507, + "logps/chosen": -289.02252197265625, + "logps/rejected": -341.7744445800781, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.187995195388794, + "rewards/margins": 14.743619918823242, + "rewards/rejected": -11.555627822875977, + "step": 4240 + }, + { + "epoch": 2.19, + "learning_rate": 1.4926372155287818e-07, + "logits/chosen": -2.8687949180603027, + "logits/rejected": -2.7464382648468018, + "logps/chosen": -214.82046508789062, + "logps/rejected": -266.95269775390625, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9481528997421265, + "rewards/margins": 11.698265075683594, + "rewards/rejected": -10.750112533569336, + "step": 4250 + }, + { + "epoch": 2.2, + "learning_rate": 1.483075157773953e-07, + "logits/chosen": -2.793741464614868, + "logits/rejected": -2.8271756172180176, + "logps/chosen": -290.243896484375, + "logps/rejected": -278.4471740722656, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8705357313156128, + "rewards/margins": 10.359903335571289, + "rewards/rejected": -9.489367485046387, + "step": 4260 + }, + { + "epoch": 2.2, + "learning_rate": 1.4735131000191238e-07, + "logits/chosen": -2.677849531173706, + "logits/rejected": -2.703662157058716, + "logps/chosen": -260.99310302734375, + "logps/rejected": -251.24911499023438, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6553013324737549, + "rewards/margins": 11.322183609008789, + "rewards/rejected": -9.66688346862793, + "step": 4270 + }, + { + "epoch": 2.21, + "learning_rate": 1.4639510422642952e-07, + "logits/chosen": -2.82415509223938, + "logits/rejected": -2.731492280960083, + "logps/chosen": -256.83941650390625, + "logps/rejected": -281.5820007324219, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9518572092056274, + "rewards/margins": 10.992048263549805, + "rewards/rejected": -10.040190696716309, + "step": 4280 + }, + { + "epoch": 2.21, + "learning_rate": 1.4543889845094664e-07, + "logits/chosen": -2.6741342544555664, + "logits/rejected": -2.625251054763794, + "logps/chosen": -296.27880859375, + "logps/rejected": -359.40643310546875, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.28782057762146, + "rewards/margins": 15.902807235717773, + "rewards/rejected": -13.61498737335205, + "step": 4290 + }, + { + "epoch": 2.22, + "learning_rate": 1.4448269267546376e-07, + "logits/chosen": -2.65960693359375, + "logits/rejected": -2.767089605331421, + "logps/chosen": -283.67694091796875, + "logps/rejected": -276.77972412109375, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.666792869567871, + "rewards/margins": 13.424444198608398, + "rewards/rejected": -10.757650375366211, + "step": 4300 + }, + { + "epoch": 2.23, + "learning_rate": 1.4352648689998087e-07, + "logits/chosen": -2.7458882331848145, + "logits/rejected": -2.788189649581909, + "logps/chosen": -260.59552001953125, + "logps/rejected": -238.883056640625, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0128729343414307, + "rewards/margins": 13.033166885375977, + "rewards/rejected": -11.020292282104492, + "step": 4310 + }, + { + "epoch": 2.23, + "learning_rate": 1.42570281124498e-07, + "logits/chosen": -2.6500084400177, + "logits/rejected": -2.7445132732391357, + "logps/chosen": -318.0313415527344, + "logps/rejected": -330.5144958496094, + "loss": 0.0165, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5414737462997437, + "rewards/margins": 13.6046781539917, + "rewards/rejected": -12.063204765319824, + "step": 4320 + }, + { + "epoch": 2.24, + "learning_rate": 1.416140753490151e-07, + "logits/chosen": -2.6413755416870117, + "logits/rejected": -2.7113699913024902, + "logps/chosen": -301.3822326660156, + "logps/rejected": -330.09271240234375, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6454537510871887, + "rewards/margins": 11.528231620788574, + "rewards/rejected": -10.882777214050293, + "step": 4330 + }, + { + "epoch": 2.24, + "learning_rate": 1.4065786957353222e-07, + "logits/chosen": -2.67690110206604, + "logits/rejected": -2.6537270545959473, + "logps/chosen": -276.92431640625, + "logps/rejected": -315.29595947265625, + "loss": 0.015, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7478466629981995, + "rewards/margins": 11.373164176940918, + "rewards/rejected": -10.625317573547363, + "step": 4340 + }, + { + "epoch": 2.25, + "learning_rate": 1.3970166379804933e-07, + "logits/chosen": -2.64355731010437, + "logits/rejected": -2.569204807281494, + "logps/chosen": -315.84100341796875, + "logps/rejected": -309.93670654296875, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3451144695281982, + "rewards/margins": 12.67324447631836, + "rewards/rejected": -11.32812786102295, + "step": 4350 + }, + { + "epoch": 2.25, + "learning_rate": 1.3874545802256645e-07, + "logits/chosen": -2.6960456371307373, + "logits/rejected": -2.6648309230804443, + "logps/chosen": -263.03717041015625, + "logps/rejected": -332.3580017089844, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8603103756904602, + "rewards/margins": 14.067952156066895, + "rewards/rejected": -13.207639694213867, + "step": 4360 + }, + { + "epoch": 2.26, + "learning_rate": 1.3778925224708357e-07, + "logits/chosen": -2.547229528427124, + "logits/rejected": -2.56522798538208, + "logps/chosen": -233.1015625, + "logps/rejected": -270.6748046875, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7012211084365845, + "rewards/margins": 12.212430953979492, + "rewards/rejected": -10.511209487915039, + "step": 4370 + }, + { + "epoch": 2.26, + "learning_rate": 1.3683304647160068e-07, + "logits/chosen": -2.627701759338379, + "logits/rejected": -2.732863187789917, + "logps/chosen": -249.3077392578125, + "logps/rejected": -332.698486328125, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.220199704170227, + "rewards/margins": 13.386858940124512, + "rewards/rejected": -12.166659355163574, + "step": 4380 + }, + { + "epoch": 2.27, + "learning_rate": 1.358768406961178e-07, + "logits/chosen": -2.6899096965789795, + "logits/rejected": -2.6770434379577637, + "logps/chosen": -243.5663299560547, + "logps/rejected": -309.42755126953125, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4383031129837036, + "rewards/margins": 13.10438346862793, + "rewards/rejected": -11.666081428527832, + "step": 4390 + }, + { + "epoch": 2.27, + "learning_rate": 1.349206349206349e-07, + "logits/chosen": -2.7282745838165283, + "logits/rejected": -2.6892740726470947, + "logps/chosen": -263.52374267578125, + "logps/rejected": -268.36956787109375, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8276527523994446, + "rewards/margins": 9.619959831237793, + "rewards/rejected": -10.447612762451172, + "step": 4400 + }, + { + "epoch": 2.28, + "learning_rate": 1.3396442914515203e-07, + "logits/chosen": -2.598597526550293, + "logits/rejected": -2.6134636402130127, + "logps/chosen": -249.866455078125, + "logps/rejected": -296.13909912109375, + "loss": 0.0139, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.13881632685661316, + "rewards/margins": 12.453330039978027, + "rewards/rejected": -12.31451416015625, + "step": 4410 + }, + { + "epoch": 2.28, + "learning_rate": 1.3300822336966917e-07, + "logits/chosen": -2.62734055519104, + "logits/rejected": -2.482203960418701, + "logps/chosen": -280.33648681640625, + "logps/rejected": -261.93939208984375, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6268467903137207, + "rewards/margins": 15.163488388061523, + "rewards/rejected": -12.536642074584961, + "step": 4420 + }, + { + "epoch": 2.29, + "learning_rate": 1.3205201759418626e-07, + "logits/chosen": -2.5897057056427, + "logits/rejected": -2.541538953781128, + "logps/chosen": -279.7352294921875, + "logps/rejected": -272.1889953613281, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6353164911270142, + "rewards/margins": 11.685983657836914, + "rewards/rejected": -10.050667762756348, + "step": 4430 + }, + { + "epoch": 2.29, + "learning_rate": 1.3109581181870338e-07, + "logits/chosen": -2.6762611865997314, + "logits/rejected": -2.712613821029663, + "logps/chosen": -289.64984130859375, + "logps/rejected": -287.3264465332031, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0438344478607178, + "rewards/margins": 14.380327224731445, + "rewards/rejected": -11.336492538452148, + "step": 4440 + }, + { + "epoch": 2.3, + "learning_rate": 1.301396060432205e-07, + "logits/chosen": -2.6527390480041504, + "logits/rejected": -2.6962897777557373, + "logps/chosen": -310.69927978515625, + "logps/rejected": -291.29052734375, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.072523593902588, + "rewards/margins": 13.862905502319336, + "rewards/rejected": -11.790382385253906, + "step": 4450 + }, + { + "epoch": 2.3, + "learning_rate": 1.291834002677376e-07, + "logits/chosen": -2.692753791809082, + "logits/rejected": -2.6270699501037598, + "logps/chosen": -252.09933471679688, + "logps/rejected": -259.0921630859375, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35815539956092834, + "rewards/margins": 10.08549976348877, + "rewards/rejected": -9.72734546661377, + "step": 4460 + }, + { + "epoch": 2.31, + "learning_rate": 1.2822719449225472e-07, + "logits/chosen": -2.548062801361084, + "logits/rejected": -2.685159206390381, + "logps/chosen": -198.88003540039062, + "logps/rejected": -219.5816192626953, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3538029193878174, + "rewards/margins": 9.92510986328125, + "rewards/rejected": -9.571308135986328, + "step": 4470 + }, + { + "epoch": 2.31, + "learning_rate": 1.2727098871677184e-07, + "logits/chosen": -2.675671100616455, + "logits/rejected": -2.6893503665924072, + "logps/chosen": -323.32763671875, + "logps/rejected": -297.06390380859375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.494044303894043, + "rewards/margins": 12.427671432495117, + "rewards/rejected": -9.933626174926758, + "step": 4480 + }, + { + "epoch": 2.32, + "learning_rate": 1.2631478294128898e-07, + "logits/chosen": -2.7433652877807617, + "logits/rejected": -2.655972719192505, + "logps/chosen": -300.86981201171875, + "logps/rejected": -324.5699768066406, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4431943893432617, + "rewards/margins": 15.235920906066895, + "rewards/rejected": -12.792726516723633, + "step": 4490 + }, + { + "epoch": 2.32, + "learning_rate": 1.253585771658061e-07, + "logits/chosen": -2.6331238746643066, + "logits/rejected": -2.601410150527954, + "logps/chosen": -251.07992553710938, + "logps/rejected": -289.9195251464844, + "loss": 0.0165, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1148226261138916, + "rewards/margins": 11.354089736938477, + "rewards/rejected": -10.239266395568848, + "step": 4500 + }, + { + "epoch": 2.32, + "eval_logits/chosen": -2.700108766555786, + "eval_logits/rejected": -2.700565814971924, + "eval_logps/chosen": -309.5376281738281, + "eval_logps/rejected": -276.1910095214844, + "eval_loss": 0.735942006111145, + "eval_rewards/accuracies": 0.7817460298538208, + "eval_rewards/chosen": -2.209642171859741, + "eval_rewards/margins": 3.8401401042938232, + "eval_rewards/rejected": -6.0497822761535645, + "eval_runtime": 217.7058, + "eval_samples_per_second": 9.187, + "eval_steps_per_second": 0.289, + "step": 4500 + }, + { + "epoch": 2.33, + "learning_rate": 1.2440237139032319e-07, + "logits/chosen": -2.6619417667388916, + "logits/rejected": -2.7171759605407715, + "logps/chosen": -295.917236328125, + "logps/rejected": -280.96728515625, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4065229892730713, + "rewards/margins": 13.129376411437988, + "rewards/rejected": -11.722851753234863, + "step": 4510 + }, + { + "epoch": 2.33, + "learning_rate": 1.234461656148403e-07, + "logits/chosen": -2.6914889812469482, + "logits/rejected": -2.6381001472473145, + "logps/chosen": -299.221923828125, + "logps/rejected": -310.075439453125, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4773738384246826, + "rewards/margins": 13.474810600280762, + "rewards/rejected": -11.9974365234375, + "step": 4520 + }, + { + "epoch": 2.34, + "learning_rate": 1.2248995983935742e-07, + "logits/chosen": -2.6824350357055664, + "logits/rejected": -2.6979966163635254, + "logps/chosen": -300.82366943359375, + "logps/rejected": -299.1722717285156, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3239176273345947, + "rewards/margins": 13.268686294555664, + "rewards/rejected": -10.9447660446167, + "step": 4530 + }, + { + "epoch": 2.34, + "learning_rate": 1.2153375406387456e-07, + "logits/chosen": -2.6138525009155273, + "logits/rejected": -2.7358133792877197, + "logps/chosen": -329.01947021484375, + "logps/rejected": -350.1726379394531, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.126481056213379, + "rewards/margins": 14.977691650390625, + "rewards/rejected": -12.85120964050293, + "step": 4540 + }, + { + "epoch": 2.35, + "learning_rate": 1.2057754828839165e-07, + "logits/chosen": -2.690929889678955, + "logits/rejected": -2.8147425651550293, + "logps/chosen": -267.2896728515625, + "logps/rejected": -314.25030517578125, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8327617645263672, + "rewards/margins": 11.646089553833008, + "rewards/rejected": -9.81332778930664, + "step": 4550 + }, + { + "epoch": 2.35, + "learning_rate": 1.1962134251290876e-07, + "logits/chosen": -2.581512928009033, + "logits/rejected": -2.6514651775360107, + "logps/chosen": -244.9253692626953, + "logps/rejected": -206.70034790039062, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3540361523628235, + "rewards/margins": 10.505193710327148, + "rewards/rejected": -10.85922908782959, + "step": 4560 + }, + { + "epoch": 2.36, + "learning_rate": 1.1866513673742588e-07, + "logits/chosen": -2.720597267150879, + "logits/rejected": -2.6273646354675293, + "logps/chosen": -262.72467041015625, + "logps/rejected": -277.3870544433594, + "loss": 0.0442, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.38703417778015137, + "rewards/margins": 10.342449188232422, + "rewards/rejected": -9.955414772033691, + "step": 4570 + }, + { + "epoch": 2.36, + "learning_rate": 1.1770893096194301e-07, + "logits/chosen": -2.683189868927002, + "logits/rejected": -2.6948843002319336, + "logps/chosen": -320.70184326171875, + "logps/rejected": -330.626953125, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.077538013458252, + "rewards/margins": 15.482243537902832, + "rewards/rejected": -12.404706954956055, + "step": 4580 + }, + { + "epoch": 2.37, + "learning_rate": 1.1675272518646012e-07, + "logits/chosen": -2.7156405448913574, + "logits/rejected": -2.637774705886841, + "logps/chosen": -258.29266357421875, + "logps/rejected": -310.4388732910156, + "loss": 0.0382, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1347460746765137, + "rewards/margins": 14.748605728149414, + "rewards/rejected": -11.613859176635742, + "step": 4590 + }, + { + "epoch": 2.37, + "learning_rate": 1.1579651941097724e-07, + "logits/chosen": -2.6939098834991455, + "logits/rejected": -2.7423393726348877, + "logps/chosen": -299.42889404296875, + "logps/rejected": -245.23208618164062, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.775151789188385, + "rewards/margins": 11.489202499389648, + "rewards/rejected": -10.714052200317383, + "step": 4600 + }, + { + "epoch": 2.38, + "learning_rate": 1.1484031363549436e-07, + "logits/chosen": -2.6191563606262207, + "logits/rejected": -2.6700427532196045, + "logps/chosen": -314.7420349121094, + "logps/rejected": -322.2962951660156, + "loss": 0.0187, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.6653627157211304, + "rewards/margins": 14.713505744934082, + "rewards/rejected": -14.048141479492188, + "step": 4610 + }, + { + "epoch": 2.39, + "learning_rate": 1.1388410786001147e-07, + "logits/chosen": -2.7032155990600586, + "logits/rejected": -2.714507579803467, + "logps/chosen": -241.12741088867188, + "logps/rejected": -308.56304931640625, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.174452543258667, + "rewards/margins": 13.297747611999512, + "rewards/rejected": -12.123295783996582, + "step": 4620 + }, + { + "epoch": 2.39, + "learning_rate": 1.1292790208452859e-07, + "logits/chosen": -2.6625142097473145, + "logits/rejected": -2.6771183013916016, + "logps/chosen": -240.2090301513672, + "logps/rejected": -263.73797607421875, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6283326745033264, + "rewards/margins": 13.485211372375488, + "rewards/rejected": -12.856880187988281, + "step": 4630 + }, + { + "epoch": 2.4, + "learning_rate": 1.119716963090457e-07, + "logits/chosen": -2.555988311767578, + "logits/rejected": -2.58622407913208, + "logps/chosen": -210.8300018310547, + "logps/rejected": -265.80462646484375, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8450849652290344, + "rewards/margins": 11.602185249328613, + "rewards/rejected": -10.757101058959961, + "step": 4640 + }, + { + "epoch": 2.4, + "learning_rate": 1.1101549053356282e-07, + "logits/chosen": -2.7897839546203613, + "logits/rejected": -2.7569496631622314, + "logps/chosen": -279.80267333984375, + "logps/rejected": -290.2506103515625, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9652635455131531, + "rewards/margins": 11.88095760345459, + "rewards/rejected": -10.915693283081055, + "step": 4650 + }, + { + "epoch": 2.41, + "learning_rate": 1.1005928475807993e-07, + "logits/chosen": -2.706782579421997, + "logits/rejected": -2.690147876739502, + "logps/chosen": -263.908935546875, + "logps/rejected": -268.34808349609375, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13803401589393616, + "rewards/margins": 11.163463592529297, + "rewards/rejected": -11.301496505737305, + "step": 4660 + }, + { + "epoch": 2.41, + "learning_rate": 1.0910307898259705e-07, + "logits/chosen": -2.5569040775299072, + "logits/rejected": -2.5663299560546875, + "logps/chosen": -228.5583038330078, + "logps/rejected": -284.5388488769531, + "loss": 0.0164, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3399560451507568, + "rewards/margins": 12.518512725830078, + "rewards/rejected": -11.178556442260742, + "step": 4670 + }, + { + "epoch": 2.42, + "learning_rate": 1.0814687320711418e-07, + "logits/chosen": -2.6800427436828613, + "logits/rejected": -2.5935287475585938, + "logps/chosen": -198.6033477783203, + "logps/rejected": -292.9828796386719, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3410785496234894, + "rewards/margins": 13.070027351379395, + "rewards/rejected": -12.728948593139648, + "step": 4680 + }, + { + "epoch": 2.42, + "learning_rate": 1.0719066743163128e-07, + "logits/chosen": -2.589458465576172, + "logits/rejected": -2.6344993114471436, + "logps/chosen": -291.27984619140625, + "logps/rejected": -293.70477294921875, + "loss": 0.0146, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0233111381530762, + "rewards/margins": 14.37419605255127, + "rewards/rejected": -13.350885391235352, + "step": 4690 + }, + { + "epoch": 2.43, + "learning_rate": 1.062344616561484e-07, + "logits/chosen": -2.659127950668335, + "logits/rejected": -2.737194299697876, + "logps/chosen": -317.55108642578125, + "logps/rejected": -367.94293212890625, + "loss": 0.0109, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 3.5995655059814453, + "rewards/margins": 18.63315200805664, + "rewards/rejected": -15.033584594726562, + "step": 4700 + }, + { + "epoch": 2.43, + "learning_rate": 1.0527825588066551e-07, + "logits/chosen": -2.648740530014038, + "logits/rejected": -2.6491146087646484, + "logps/chosen": -304.24017333984375, + "logps/rejected": -293.5978088378906, + "loss": 0.0087, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.9029955863952637, + "rewards/margins": 16.679758071899414, + "rewards/rejected": -13.776761054992676, + "step": 4710 + }, + { + "epoch": 2.44, + "learning_rate": 1.0432205010518264e-07, + "logits/chosen": -2.699047088623047, + "logits/rejected": -2.7981960773468018, + "logps/chosen": -233.66970825195312, + "logps/rejected": -306.4530944824219, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6058984994888306, + "rewards/margins": 14.446258544921875, + "rewards/rejected": -12.840359687805176, + "step": 4720 + }, + { + "epoch": 2.44, + "learning_rate": 1.0336584432969974e-07, + "logits/chosen": -2.567579746246338, + "logits/rejected": -2.5516488552093506, + "logps/chosen": -252.8605499267578, + "logps/rejected": -278.5452575683594, + "loss": 0.011, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.2912280559539795, + "rewards/margins": 14.698877334594727, + "rewards/rejected": -12.407648086547852, + "step": 4730 + }, + { + "epoch": 2.45, + "learning_rate": 1.0240963855421686e-07, + "logits/chosen": -2.592355728149414, + "logits/rejected": -2.6269283294677734, + "logps/chosen": -322.3033142089844, + "logps/rejected": -310.4656982421875, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6497825384140015, + "rewards/margins": 13.311511039733887, + "rewards/rejected": -11.66172981262207, + "step": 4740 + }, + { + "epoch": 2.45, + "learning_rate": 1.0145343277873399e-07, + "logits/chosen": -2.652545213699341, + "logits/rejected": -2.7091572284698486, + "logps/chosen": -293.6610107421875, + "logps/rejected": -328.0738830566406, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4136769771575928, + "rewards/margins": 13.760273933410645, + "rewards/rejected": -12.346597671508789, + "step": 4750 + }, + { + "epoch": 2.46, + "learning_rate": 1.004972270032511e-07, + "logits/chosen": -2.4025778770446777, + "logits/rejected": -2.369227647781372, + "logps/chosen": -224.7193603515625, + "logps/rejected": -253.95285034179688, + "loss": 0.0146, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.4258928298950195, + "rewards/margins": 13.243026733398438, + "rewards/rejected": -11.817132949829102, + "step": 4760 + }, + { + "epoch": 2.46, + "learning_rate": 9.95410212277682e-08, + "logits/chosen": -2.6297268867492676, + "logits/rejected": -2.6853363513946533, + "logps/chosen": -287.7942199707031, + "logps/rejected": -280.2574768066406, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.281556248664856, + "rewards/margins": 12.799880027770996, + "rewards/rejected": -11.518324851989746, + "step": 4770 + }, + { + "epoch": 2.47, + "learning_rate": 9.858481545228532e-08, + "logits/chosen": -2.6263976097106934, + "logits/rejected": -2.6547935009002686, + "logps/chosen": -253.96615600585938, + "logps/rejected": -236.6396026611328, + "loss": 0.0236, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.941236138343811, + "rewards/margins": 12.551959991455078, + "rewards/rejected": -10.610723495483398, + "step": 4780 + }, + { + "epoch": 2.47, + "learning_rate": 9.762860967680245e-08, + "logits/chosen": -2.648315906524658, + "logits/rejected": -2.6408438682556152, + "logps/chosen": -217.11599731445312, + "logps/rejected": -267.97222900390625, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5781963467597961, + "rewards/margins": 11.57778549194336, + "rewards/rejected": -10.999589920043945, + "step": 4790 + }, + { + "epoch": 2.48, + "learning_rate": 9.667240390131957e-08, + "logits/chosen": -2.711276054382324, + "logits/rejected": -2.798565626144409, + "logps/chosen": -260.7125549316406, + "logps/rejected": -253.3577117919922, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.451697587966919, + "rewards/margins": 14.93236255645752, + "rewards/rejected": -13.480664253234863, + "step": 4800 + }, + { + "epoch": 2.48, + "learning_rate": 9.571619812583667e-08, + "logits/chosen": -2.5804848670959473, + "logits/rejected": -2.550412178039551, + "logps/chosen": -296.70941162109375, + "logps/rejected": -289.5600891113281, + "loss": 0.0135, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.216889500617981, + "rewards/margins": 12.942187309265137, + "rewards/rejected": -11.725297927856445, + "step": 4810 + }, + { + "epoch": 2.49, + "learning_rate": 9.47599923503538e-08, + "logits/chosen": -2.656806230545044, + "logits/rejected": -2.7578437328338623, + "logps/chosen": -273.0238342285156, + "logps/rejected": -310.3895568847656, + "loss": 0.0229, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7180588245391846, + "rewards/margins": 13.919692993164062, + "rewards/rejected": -12.201634407043457, + "step": 4820 + }, + { + "epoch": 2.49, + "learning_rate": 9.380378657487091e-08, + "logits/chosen": -2.611632823944092, + "logits/rejected": -2.606079339981079, + "logps/chosen": -236.2656707763672, + "logps/rejected": -278.8091735839844, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.171788215637207, + "rewards/margins": 15.047994613647461, + "rewards/rejected": -12.876205444335938, + "step": 4830 + }, + { + "epoch": 2.5, + "learning_rate": 9.284758079938803e-08, + "logits/chosen": -2.752904176712036, + "logits/rejected": -2.702975034713745, + "logps/chosen": -300.28704833984375, + "logps/rejected": -323.9290466308594, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8785560131072998, + "rewards/margins": 14.25035572052002, + "rewards/rejected": -12.371798515319824, + "step": 4840 + }, + { + "epoch": 2.5, + "learning_rate": 9.189137502390513e-08, + "logits/chosen": -2.719749927520752, + "logits/rejected": -2.782289981842041, + "logps/chosen": -267.34246826171875, + "logps/rejected": -312.0362854003906, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0956392288208008, + "rewards/margins": 13.80529499053955, + "rewards/rejected": -12.70965576171875, + "step": 4850 + }, + { + "epoch": 2.51, + "learning_rate": 9.093516924842226e-08, + "logits/chosen": -2.7262961864471436, + "logits/rejected": -2.68577241897583, + "logps/chosen": -218.84176635742188, + "logps/rejected": -276.78289794921875, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0724873542785645, + "rewards/margins": 13.172772407531738, + "rewards/rejected": -11.100284576416016, + "step": 4860 + }, + { + "epoch": 2.51, + "learning_rate": 8.997896347293938e-08, + "logits/chosen": -2.718925952911377, + "logits/rejected": -2.708313226699829, + "logps/chosen": -245.9953155517578, + "logps/rejected": -305.19354248046875, + "loss": 0.0149, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8387666940689087, + "rewards/margins": 12.512483596801758, + "rewards/rejected": -10.67371654510498, + "step": 4870 + }, + { + "epoch": 2.52, + "learning_rate": 8.902275769745648e-08, + "logits/chosen": -2.6192898750305176, + "logits/rejected": -2.722072124481201, + "logps/chosen": -224.29110717773438, + "logps/rejected": -262.021240234375, + "loss": 0.0107, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.255924940109253, + "rewards/margins": 12.77586841583252, + "rewards/rejected": -11.519944190979004, + "step": 4880 + }, + { + "epoch": 2.52, + "learning_rate": 8.806655192197361e-08, + "logits/chosen": -2.67940616607666, + "logits/rejected": -2.7205865383148193, + "logps/chosen": -225.5251922607422, + "logps/rejected": -312.525390625, + "loss": 0.0083, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.921847939491272, + "rewards/margins": 16.122520446777344, + "rewards/rejected": -14.20067310333252, + "step": 4890 + }, + { + "epoch": 2.53, + "learning_rate": 8.711034614649072e-08, + "logits/chosen": -2.7915446758270264, + "logits/rejected": -2.7987782955169678, + "logps/chosen": -264.7381896972656, + "logps/rejected": -313.4402770996094, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.471101999282837, + "rewards/margins": 16.768362045288086, + "rewards/rejected": -14.297264099121094, + "step": 4900 + }, + { + "epoch": 2.53, + "learning_rate": 8.615414037100784e-08, + "logits/chosen": -2.678217887878418, + "logits/rejected": -2.6336052417755127, + "logps/chosen": -332.27801513671875, + "logps/rejected": -323.1417541503906, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.799132227897644, + "rewards/margins": 12.401124000549316, + "rewards/rejected": -11.601991653442383, + "step": 4910 + }, + { + "epoch": 2.54, + "learning_rate": 8.519793459552494e-08, + "logits/chosen": -2.5783820152282715, + "logits/rejected": -2.601856231689453, + "logps/chosen": -275.9147033691406, + "logps/rejected": -285.7947082519531, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3343265056610107, + "rewards/margins": 12.40494441986084, + "rewards/rejected": -11.070618629455566, + "step": 4920 + }, + { + "epoch": 2.55, + "learning_rate": 8.424172882004207e-08, + "logits/chosen": -2.7203049659729004, + "logits/rejected": -2.773594856262207, + "logps/chosen": -298.14984130859375, + "logps/rejected": -309.88861083984375, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4647815227508545, + "rewards/margins": 12.672274589538574, + "rewards/rejected": -11.207493782043457, + "step": 4930 + }, + { + "epoch": 2.55, + "learning_rate": 8.328552304455919e-08, + "logits/chosen": -2.603013515472412, + "logits/rejected": -2.6129584312438965, + "logps/chosen": -202.2315216064453, + "logps/rejected": -246.1285400390625, + "loss": 0.0132, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.493094265460968, + "rewards/margins": 10.736700057983398, + "rewards/rejected": -10.243606567382812, + "step": 4940 + }, + { + "epoch": 2.56, + "learning_rate": 8.23293172690763e-08, + "logits/chosen": -2.5959506034851074, + "logits/rejected": -2.761260509490967, + "logps/chosen": -240.5106964111328, + "logps/rejected": -269.2008361816406, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8836824297904968, + "rewards/margins": 13.944262504577637, + "rewards/rejected": -13.060582160949707, + "step": 4950 + }, + { + "epoch": 2.56, + "learning_rate": 8.137311149359343e-08, + "logits/chosen": -2.8082408905029297, + "logits/rejected": -2.8189988136291504, + "logps/chosen": -332.5243225097656, + "logps/rejected": -314.81414794921875, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6978859901428223, + "rewards/margins": 15.818052291870117, + "rewards/rejected": -13.120168685913086, + "step": 4960 + }, + { + "epoch": 2.57, + "learning_rate": 8.041690571811053e-08, + "logits/chosen": -2.731813907623291, + "logits/rejected": -2.7160086631774902, + "logps/chosen": -273.2561950683594, + "logps/rejected": -297.5777587890625, + "loss": 0.0179, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1517640352249146, + "rewards/margins": 15.763650894165039, + "rewards/rejected": -14.611885070800781, + "step": 4970 + }, + { + "epoch": 2.57, + "learning_rate": 7.946069994262765e-08, + "logits/chosen": -2.634385347366333, + "logits/rejected": -2.653733253479004, + "logps/chosen": -249.900634765625, + "logps/rejected": -238.01327514648438, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5609699487686157, + "rewards/margins": 12.663952827453613, + "rewards/rejected": -11.102984428405762, + "step": 4980 + }, + { + "epoch": 2.58, + "learning_rate": 7.850449416714476e-08, + "logits/chosen": -2.6939308643341064, + "logits/rejected": -2.8180503845214844, + "logps/chosen": -339.6492614746094, + "logps/rejected": -327.8633728027344, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.133561134338379, + "rewards/margins": 13.298667907714844, + "rewards/rejected": -11.165107727050781, + "step": 4990 + }, + { + "epoch": 2.58, + "learning_rate": 7.754828839166188e-08, + "logits/chosen": -2.720078229904175, + "logits/rejected": -2.694797992706299, + "logps/chosen": -274.102294921875, + "logps/rejected": -277.6142578125, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9390266537666321, + "rewards/margins": 11.481155395507812, + "rewards/rejected": -10.542128562927246, + "step": 5000 + }, + { + "epoch": 2.58, + "eval_logits/chosen": -2.7196364402770996, + "eval_logits/rejected": -2.718513250350952, + "eval_logps/chosen": -312.903564453125, + "eval_logps/rejected": -280.2130126953125, + "eval_loss": 0.7864260673522949, + "eval_rewards/accuracies": 0.773809552192688, + "eval_rewards/chosen": -2.882828950881958, + "eval_rewards/margins": 3.971348762512207, + "eval_rewards/rejected": -6.854177951812744, + "eval_runtime": 217.5511, + "eval_samples_per_second": 9.193, + "eval_steps_per_second": 0.29, + "step": 5000 + }, + { + "epoch": 2.59, + "learning_rate": 7.6592082616179e-08, + "logits/chosen": -2.6231255531311035, + "logits/rejected": -2.626267910003662, + "logps/chosen": -235.2042236328125, + "logps/rejected": -297.21063232421875, + "loss": 0.0098, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.4571438431739807, + "rewards/margins": 15.142412185668945, + "rewards/rejected": -14.685267448425293, + "step": 5010 + }, + { + "epoch": 2.59, + "learning_rate": 7.563587684069611e-08, + "logits/chosen": -2.5229032039642334, + "logits/rejected": -2.6841416358947754, + "logps/chosen": -258.5490417480469, + "logps/rejected": -363.48162841796875, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.008655309677124, + "rewards/margins": 15.509332656860352, + "rewards/rejected": -13.500676155090332, + "step": 5020 + }, + { + "epoch": 2.6, + "learning_rate": 7.467967106521324e-08, + "logits/chosen": -2.6904513835906982, + "logits/rejected": -2.694772243499756, + "logps/chosen": -229.6025390625, + "logps/rejected": -202.74024963378906, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2432830333709717, + "rewards/margins": 12.049966812133789, + "rewards/rejected": -9.806684494018555, + "step": 5030 + }, + { + "epoch": 2.6, + "learning_rate": 7.372346528973034e-08, + "logits/chosen": -2.68023419380188, + "logits/rejected": -2.7025341987609863, + "logps/chosen": -220.0260772705078, + "logps/rejected": -261.80352783203125, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38927072286605835, + "rewards/margins": 11.34105110168457, + "rewards/rejected": -10.951781272888184, + "step": 5040 + }, + { + "epoch": 2.61, + "learning_rate": 7.276725951424746e-08, + "logits/chosen": -2.738955020904541, + "logits/rejected": -2.659353494644165, + "logps/chosen": -230.02413940429688, + "logps/rejected": -324.76116943359375, + "loss": 0.0135, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.024852514266967773, + "rewards/margins": 12.52303409576416, + "rewards/rejected": -12.547883987426758, + "step": 5050 + }, + { + "epoch": 2.61, + "learning_rate": 7.181105373876457e-08, + "logits/chosen": -2.627469539642334, + "logits/rejected": -2.6676948070526123, + "logps/chosen": -260.1104431152344, + "logps/rejected": -266.2936096191406, + "loss": 0.0137, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9252920150756836, + "rewards/margins": 12.830589294433594, + "rewards/rejected": -11.905298233032227, + "step": 5060 + }, + { + "epoch": 2.62, + "learning_rate": 7.08548479632817e-08, + "logits/chosen": -2.784240484237671, + "logits/rejected": -2.735193967819214, + "logps/chosen": -369.58270263671875, + "logps/rejected": -323.7354736328125, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6716524362564087, + "rewards/margins": 12.856073379516602, + "rewards/rejected": -11.184419631958008, + "step": 5070 + }, + { + "epoch": 2.62, + "learning_rate": 6.98986421877988e-08, + "logits/chosen": -2.740915298461914, + "logits/rejected": -2.7226786613464355, + "logps/chosen": -340.28802490234375, + "logps/rejected": -355.4793395996094, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.994368851184845, + "rewards/margins": 13.248014450073242, + "rewards/rejected": -12.253645896911621, + "step": 5080 + }, + { + "epoch": 2.63, + "learning_rate": 6.894243641231592e-08, + "logits/chosen": -2.6611123085021973, + "logits/rejected": -2.627011299133301, + "logps/chosen": -255.84457397460938, + "logps/rejected": -327.6121826171875, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7034765481948853, + "rewards/margins": 16.250394821166992, + "rewards/rejected": -14.546918869018555, + "step": 5090 + }, + { + "epoch": 2.63, + "learning_rate": 6.798623063683305e-08, + "logits/chosen": -2.6290283203125, + "logits/rejected": -2.695361852645874, + "logps/chosen": -279.5427551269531, + "logps/rejected": -333.14532470703125, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6256012320518494, + "rewards/margins": 16.523670196533203, + "rewards/rejected": -15.89806842803955, + "step": 5100 + }, + { + "epoch": 2.64, + "learning_rate": 6.703002486135017e-08, + "logits/chosen": -2.5708651542663574, + "logits/rejected": -2.6656506061553955, + "logps/chosen": -249.72073364257812, + "logps/rejected": -296.4393005371094, + "loss": 0.0139, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.8271173238754272, + "rewards/margins": 16.879287719726562, + "rewards/rejected": -15.052169799804688, + "step": 5110 + }, + { + "epoch": 2.64, + "learning_rate": 6.607381908586727e-08, + "logits/chosen": -2.7096810340881348, + "logits/rejected": -2.700406074523926, + "logps/chosen": -238.05990600585938, + "logps/rejected": -302.31378173828125, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9860852956771851, + "rewards/margins": 14.121419906616211, + "rewards/rejected": -13.135335922241211, + "step": 5120 + }, + { + "epoch": 2.65, + "learning_rate": 6.511761331038438e-08, + "logits/chosen": -2.774763584136963, + "logits/rejected": -2.795804500579834, + "logps/chosen": -236.27658081054688, + "logps/rejected": -272.2627868652344, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.472693681716919, + "rewards/margins": 9.923027038574219, + "rewards/rejected": -11.395719528198242, + "step": 5130 + }, + { + "epoch": 2.65, + "learning_rate": 6.416140753490151e-08, + "logits/chosen": -2.7146975994110107, + "logits/rejected": -2.7896509170532227, + "logps/chosen": -368.1150817871094, + "logps/rejected": -340.7099609375, + "loss": 0.0099, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3701469898223877, + "rewards/margins": 13.160669326782227, + "rewards/rejected": -11.790521621704102, + "step": 5140 + }, + { + "epoch": 2.66, + "learning_rate": 6.320520175941863e-08, + "logits/chosen": -2.5971240997314453, + "logits/rejected": -2.595491647720337, + "logps/chosen": -248.2359619140625, + "logps/rejected": -278.7591857910156, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.151533603668213, + "rewards/margins": 13.753721237182617, + "rewards/rejected": -12.602187156677246, + "step": 5150 + }, + { + "epoch": 2.66, + "learning_rate": 6.224899598393573e-08, + "logits/chosen": -2.7201809883117676, + "logits/rejected": -2.751149892807007, + "logps/chosen": -292.6256103515625, + "logps/rejected": -320.6335754394531, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5118240714073181, + "rewards/margins": 13.12120532989502, + "rewards/rejected": -12.609381675720215, + "step": 5160 + }, + { + "epoch": 2.67, + "learning_rate": 6.129279020845286e-08, + "logits/chosen": -2.7117671966552734, + "logits/rejected": -2.8060145378112793, + "logps/chosen": -262.6538391113281, + "logps/rejected": -248.8769989013672, + "loss": 0.0102, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5282812118530273, + "rewards/margins": 12.767044067382812, + "rewards/rejected": -11.238761901855469, + "step": 5170 + }, + { + "epoch": 2.67, + "learning_rate": 6.033658443296998e-08, + "logits/chosen": -2.627601146697998, + "logits/rejected": -2.671121120452881, + "logps/chosen": -203.81683349609375, + "logps/rejected": -290.76055908203125, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5020616054534912, + "rewards/margins": 14.89185619354248, + "rewards/rejected": -13.389793395996094, + "step": 5180 + }, + { + "epoch": 2.68, + "learning_rate": 5.9380378657487085e-08, + "logits/chosen": -2.7020039558410645, + "logits/rejected": -2.7345330715179443, + "logps/chosen": -323.268310546875, + "logps/rejected": -310.77606201171875, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3893721103668213, + "rewards/margins": 14.052579879760742, + "rewards/rejected": -12.663207054138184, + "step": 5190 + }, + { + "epoch": 2.68, + "learning_rate": 5.842417288200421e-08, + "logits/chosen": -2.6971147060394287, + "logits/rejected": -2.7128520011901855, + "logps/chosen": -312.43505859375, + "logps/rejected": -313.35479736328125, + "loss": 0.0103, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.1101088523864746, + "rewards/margins": 14.328615188598633, + "rewards/rejected": -12.218506813049316, + "step": 5200 + }, + { + "epoch": 2.69, + "learning_rate": 5.7467967106521317e-08, + "logits/chosen": -2.677502155303955, + "logits/rejected": -2.684664726257324, + "logps/chosen": -221.14669799804688, + "logps/rejected": -336.1509704589844, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3738515377044678, + "rewards/margins": 18.37417221069336, + "rewards/rejected": -16.000322341918945, + "step": 5210 + }, + { + "epoch": 2.69, + "learning_rate": 5.651176133103844e-08, + "logits/chosen": -2.7950339317321777, + "logits/rejected": -2.7664475440979004, + "logps/chosen": -280.81549072265625, + "logps/rejected": -316.634521484375, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.932924509048462, + "rewards/margins": 14.996597290039062, + "rewards/rejected": -13.063672065734863, + "step": 5220 + }, + { + "epoch": 2.7, + "learning_rate": 5.555555555555555e-08, + "logits/chosen": -2.6718757152557373, + "logits/rejected": -2.7057576179504395, + "logps/chosen": -278.4066162109375, + "logps/rejected": -269.1131896972656, + "loss": 0.0068, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.11279463768005371, + "rewards/margins": 12.314489364624023, + "rewards/rejected": -12.201693534851074, + "step": 5230 + }, + { + "epoch": 2.71, + "learning_rate": 5.459934978007267e-08, + "logits/chosen": -2.7089667320251465, + "logits/rejected": -2.8040003776550293, + "logps/chosen": -277.17437744140625, + "logps/rejected": -312.9452209472656, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6362769603729248, + "rewards/margins": 15.224557876586914, + "rewards/rejected": -13.588282585144043, + "step": 5240 + }, + { + "epoch": 2.71, + "learning_rate": 5.3643144004589786e-08, + "logits/chosen": -2.6221039295196533, + "logits/rejected": -2.5532572269439697, + "logps/chosen": -252.72628784179688, + "logps/rejected": -297.381103515625, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.109865665435791, + "rewards/margins": 16.99216079711914, + "rewards/rejected": -13.882293701171875, + "step": 5250 + }, + { + "epoch": 2.72, + "learning_rate": 5.26869382291069e-08, + "logits/chosen": -2.7333858013153076, + "logits/rejected": -2.7665257453918457, + "logps/chosen": -237.46041870117188, + "logps/rejected": -275.907470703125, + "loss": 0.0222, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.37319669127464294, + "rewards/margins": 11.739015579223633, + "rewards/rejected": -11.365819931030273, + "step": 5260 + }, + { + "epoch": 2.72, + "learning_rate": 5.173073245362402e-08, + "logits/chosen": -2.548161745071411, + "logits/rejected": -2.7022764682769775, + "logps/chosen": -240.03506469726562, + "logps/rejected": -316.1889953613281, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8135687112808228, + "rewards/margins": 14.389727592468262, + "rewards/rejected": -13.57615852355957, + "step": 5270 + }, + { + "epoch": 2.73, + "learning_rate": 5.077452667814113e-08, + "logits/chosen": -2.6491031646728516, + "logits/rejected": -2.6999027729034424, + "logps/chosen": -220.13034057617188, + "logps/rejected": -269.79119873046875, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7047170400619507, + "rewards/margins": 13.21327018737793, + "rewards/rejected": -13.917986869812012, + "step": 5280 + }, + { + "epoch": 2.73, + "learning_rate": 4.981832090265825e-08, + "logits/chosen": -2.7604708671569824, + "logits/rejected": -2.7237050533294678, + "logps/chosen": -268.9174499511719, + "logps/rejected": -308.0942687988281, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1735785007476807, + "rewards/margins": 16.48755645751953, + "rewards/rejected": -14.313977241516113, + "step": 5290 + }, + { + "epoch": 2.74, + "learning_rate": 4.8862115127175364e-08, + "logits/chosen": -2.686573028564453, + "logits/rejected": -2.807529926300049, + "logps/chosen": -309.7162170410156, + "logps/rejected": -307.1707458496094, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2362159490585327, + "rewards/margins": 12.712292671203613, + "rewards/rejected": -11.476076126098633, + "step": 5300 + }, + { + "epoch": 2.74, + "learning_rate": 4.790590935169248e-08, + "logits/chosen": -2.7090699672698975, + "logits/rejected": -2.5909218788146973, + "logps/chosen": -244.21267700195312, + "logps/rejected": -422.15533447265625, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.389432430267334, + "rewards/margins": 15.106956481933594, + "rewards/rejected": -14.717523574829102, + "step": 5310 + }, + { + "epoch": 2.75, + "learning_rate": 4.69497035762096e-08, + "logits/chosen": -2.621753454208374, + "logits/rejected": -2.606581211090088, + "logps/chosen": -291.43316650390625, + "logps/rejected": -347.6919860839844, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3517930507659912, + "rewards/margins": 13.509759902954102, + "rewards/rejected": -14.861552238464355, + "step": 5320 + }, + { + "epoch": 2.75, + "learning_rate": 4.599349780072671e-08, + "logits/chosen": -2.6323933601379395, + "logits/rejected": -2.65653133392334, + "logps/chosen": -311.70855712890625, + "logps/rejected": -272.53302001953125, + "loss": 0.0142, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.12222401052713394, + "rewards/margins": 12.896278381347656, + "rewards/rejected": -13.018501281738281, + "step": 5330 + }, + { + "epoch": 2.76, + "learning_rate": 4.5037292025243834e-08, + "logits/chosen": -2.6000657081604004, + "logits/rejected": -2.6205499172210693, + "logps/chosen": -250.4822235107422, + "logps/rejected": -320.0833740234375, + "loss": 0.0072, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5791562795639038, + "rewards/margins": 13.950708389282227, + "rewards/rejected": -13.371553421020508, + "step": 5340 + }, + { + "epoch": 2.76, + "learning_rate": 4.408108624976094e-08, + "logits/chosen": -2.833369016647339, + "logits/rejected": -2.6747612953186035, + "logps/chosen": -281.80267333984375, + "logps/rejected": -340.23828125, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0539871454238892, + "rewards/margins": 16.2853946685791, + "rewards/rejected": -15.231409072875977, + "step": 5350 + }, + { + "epoch": 2.77, + "learning_rate": 4.3124880474278065e-08, + "logits/chosen": -2.8220465183258057, + "logits/rejected": -2.790334701538086, + "logps/chosen": -293.7169494628906, + "logps/rejected": -277.62127685546875, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2055577039718628, + "rewards/margins": 13.435251235961914, + "rewards/rejected": -12.229693412780762, + "step": 5360 + }, + { + "epoch": 2.77, + "learning_rate": 4.2168674698795174e-08, + "logits/chosen": -2.7694520950317383, + "logits/rejected": -2.808863878250122, + "logps/chosen": -226.52627563476562, + "logps/rejected": -304.43389892578125, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23372545838356018, + "rewards/margins": 13.936861991882324, + "rewards/rejected": -13.70313549041748, + "step": 5370 + }, + { + "epoch": 2.78, + "learning_rate": 4.1212468923312296e-08, + "logits/chosen": -2.6227002143859863, + "logits/rejected": -2.589164972305298, + "logps/chosen": -232.10678100585938, + "logps/rejected": -308.0304870605469, + "loss": 0.0116, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9900895357131958, + "rewards/margins": 11.820328712463379, + "rewards/rejected": -12.810419082641602, + "step": 5380 + }, + { + "epoch": 2.78, + "learning_rate": 4.025626314782941e-08, + "logits/chosen": -2.654756784439087, + "logits/rejected": -2.7141530513763428, + "logps/chosen": -248.1104736328125, + "logps/rejected": -272.46295166015625, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5592339038848877, + "rewards/margins": 15.954304695129395, + "rewards/rejected": -13.395071029663086, + "step": 5390 + }, + { + "epoch": 2.79, + "learning_rate": 3.930005737234653e-08, + "logits/chosen": -2.6779532432556152, + "logits/rejected": -2.731102228164673, + "logps/chosen": -228.0155029296875, + "logps/rejected": -250.24398803710938, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5090616345405579, + "rewards/margins": 12.24303150177002, + "rewards/rejected": -11.73397159576416, + "step": 5400 + }, + { + "epoch": 2.79, + "learning_rate": 3.8343851596863644e-08, + "logits/chosen": -2.699240207672119, + "logits/rejected": -2.6759274005889893, + "logps/chosen": -265.9540100097656, + "logps/rejected": -252.60317993164062, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9274028539657593, + "rewards/margins": 13.91094970703125, + "rewards/rejected": -11.983546257019043, + "step": 5410 + }, + { + "epoch": 2.8, + "learning_rate": 3.738764582138076e-08, + "logits/chosen": -2.7132983207702637, + "logits/rejected": -2.699207067489624, + "logps/chosen": -306.46649169921875, + "logps/rejected": -347.5021057128906, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05111388489603996, + "rewards/margins": 13.25873851776123, + "rewards/rejected": -13.207626342773438, + "step": 5420 + }, + { + "epoch": 2.8, + "learning_rate": 3.6431440045897875e-08, + "logits/chosen": -2.632061243057251, + "logits/rejected": -2.628340721130371, + "logps/chosen": -310.02984619140625, + "logps/rejected": -364.7181701660156, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.396155595779419, + "rewards/margins": 15.769102096557617, + "rewards/rejected": -13.372945785522461, + "step": 5430 + }, + { + "epoch": 2.81, + "learning_rate": 3.547523427041499e-08, + "logits/chosen": -2.7925541400909424, + "logits/rejected": -2.7636916637420654, + "logps/chosen": -307.3076171875, + "logps/rejected": -317.149169921875, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.418766975402832, + "rewards/margins": 15.514017105102539, + "rewards/rejected": -13.095250129699707, + "step": 5440 + }, + { + "epoch": 2.81, + "learning_rate": 3.4519028494932106e-08, + "logits/chosen": -2.6972720623016357, + "logits/rejected": -2.6307532787323, + "logps/chosen": -246.3926239013672, + "logps/rejected": -326.93157958984375, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45250099897384644, + "rewards/margins": 13.748095512390137, + "rewards/rejected": -13.29559326171875, + "step": 5450 + }, + { + "epoch": 2.82, + "learning_rate": 3.356282271944923e-08, + "logits/chosen": -2.688779354095459, + "logits/rejected": -2.712975263595581, + "logps/chosen": -234.59335327148438, + "logps/rejected": -302.8453674316406, + "loss": 0.0163, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.1471259593963623, + "rewards/margins": 15.453851699829102, + "rewards/rejected": -13.306724548339844, + "step": 5460 + }, + { + "epoch": 2.82, + "learning_rate": 3.260661694396634e-08, + "logits/chosen": -2.756401538848877, + "logits/rejected": -2.809408664703369, + "logps/chosen": -336.09332275390625, + "logps/rejected": -258.19744873046875, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9356826543807983, + "rewards/margins": 14.006448745727539, + "rewards/rejected": -13.070767402648926, + "step": 5470 + }, + { + "epoch": 2.83, + "learning_rate": 3.165041116848346e-08, + "logits/chosen": -2.7278525829315186, + "logits/rejected": -2.6752920150756836, + "logps/chosen": -236.9536590576172, + "logps/rejected": -334.90704345703125, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1287994384765625, + "rewards/margins": 14.35100269317627, + "rewards/rejected": -13.222201347351074, + "step": 5480 + }, + { + "epoch": 2.83, + "learning_rate": 3.0694205393000576e-08, + "logits/chosen": -2.6900107860565186, + "logits/rejected": -2.6366961002349854, + "logps/chosen": -231.33364868164062, + "logps/rejected": -267.84808349609375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.4923148155212402, + "rewards/margins": 15.316784858703613, + "rewards/rejected": -11.824469566345215, + "step": 5490 + }, + { + "epoch": 2.84, + "learning_rate": 2.9737999617517688e-08, + "logits/chosen": -2.707181692123413, + "logits/rejected": -2.603635787963867, + "logps/chosen": -254.90615844726562, + "logps/rejected": -268.971435546875, + "loss": 0.0094, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9463617205619812, + "rewards/margins": 13.233505249023438, + "rewards/rejected": -12.287142753601074, + "step": 5500 + }, + { + "epoch": 2.84, + "eval_logits/chosen": -2.7012245655059814, + "eval_logits/rejected": -2.698698043823242, + "eval_logps/chosen": -314.4378356933594, + "eval_logps/rejected": -282.4463806152344, + "eval_loss": 0.7952748537063599, + "eval_rewards/accuracies": 0.7579365372657776, + "eval_rewards/chosen": -3.189682722091675, + "eval_rewards/margins": 4.111170768737793, + "eval_rewards/rejected": -7.3008527755737305, + "eval_runtime": 217.7034, + "eval_samples_per_second": 9.187, + "eval_steps_per_second": 0.289, + "step": 5500 + }, + { + "epoch": 2.84, + "learning_rate": 2.8781793842034804e-08, + "logits/chosen": -2.605278491973877, + "logits/rejected": -2.5629143714904785, + "logps/chosen": -250.08090209960938, + "logps/rejected": -277.3458557128906, + "loss": 0.0147, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.06431760638952255, + "rewards/margins": 13.18347454071045, + "rewards/rejected": -13.247793197631836, + "step": 5510 + }, + { + "epoch": 2.85, + "learning_rate": 2.782558806655192e-08, + "logits/chosen": -2.6917879581451416, + "logits/rejected": -2.7281289100646973, + "logps/chosen": -289.0796203613281, + "logps/rejected": -350.89227294921875, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8091856241226196, + "rewards/margins": 14.719850540161133, + "rewards/rejected": -12.910664558410645, + "step": 5520 + }, + { + "epoch": 2.85, + "learning_rate": 2.6869382291069035e-08, + "logits/chosen": -2.7241485118865967, + "logits/rejected": -2.7581992149353027, + "logps/chosen": -277.55157470703125, + "logps/rejected": -296.58648681640625, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.336979478597641, + "rewards/margins": 13.269563674926758, + "rewards/rejected": -12.932583808898926, + "step": 5530 + }, + { + "epoch": 2.86, + "learning_rate": 2.591317651558615e-08, + "logits/chosen": -2.6184306144714355, + "logits/rejected": -2.6479744911193848, + "logps/chosen": -243.2447509765625, + "logps/rejected": -317.91619873046875, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5495041608810425, + "rewards/margins": 13.560381889343262, + "rewards/rejected": -13.01087760925293, + "step": 5540 + }, + { + "epoch": 2.87, + "learning_rate": 2.4956970740103267e-08, + "logits/chosen": -2.707151412963867, + "logits/rejected": -2.621060609817505, + "logps/chosen": -282.298095703125, + "logps/rejected": -377.27313232421875, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2465739250183105, + "rewards/margins": 17.916969299316406, + "rewards/rejected": -15.67039680480957, + "step": 5550 + }, + { + "epoch": 2.87, + "learning_rate": 2.4000764964620386e-08, + "logits/chosen": -2.7057225704193115, + "logits/rejected": -2.7142810821533203, + "logps/chosen": -342.3331298828125, + "logps/rejected": -351.43023681640625, + "loss": 0.0102, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6923713684082031, + "rewards/margins": 11.715002059936523, + "rewards/rejected": -11.022631645202637, + "step": 5560 + }, + { + "epoch": 2.88, + "learning_rate": 2.30445591891375e-08, + "logits/chosen": -2.7297680377960205, + "logits/rejected": -2.671332836151123, + "logps/chosen": -309.3148498535156, + "logps/rejected": -334.22381591796875, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.074737071990967, + "rewards/margins": 16.295886993408203, + "rewards/rejected": -14.221150398254395, + "step": 5570 + }, + { + "epoch": 2.88, + "learning_rate": 2.2088353413654617e-08, + "logits/chosen": -2.514273166656494, + "logits/rejected": -2.5561318397521973, + "logps/chosen": -222.4387969970703, + "logps/rejected": -260.14617919921875, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7343811988830566, + "rewards/margins": 16.719032287597656, + "rewards/rejected": -13.984651565551758, + "step": 5580 + }, + { + "epoch": 2.89, + "learning_rate": 2.1132147638171733e-08, + "logits/chosen": -2.562824249267578, + "logits/rejected": -2.6268982887268066, + "logps/chosen": -324.80877685546875, + "logps/rejected": -313.646484375, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.58955717086792, + "rewards/margins": 17.370824813842773, + "rewards/rejected": -13.781267166137695, + "step": 5590 + }, + { + "epoch": 2.89, + "learning_rate": 2.0175941862688848e-08, + "logits/chosen": -2.6945369243621826, + "logits/rejected": -2.6687333583831787, + "logps/chosen": -249.3484344482422, + "logps/rejected": -313.9985656738281, + "loss": 0.0538, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.41767793893814087, + "rewards/margins": 13.5536527633667, + "rewards/rejected": -13.13597583770752, + "step": 5600 + }, + { + "epoch": 2.9, + "learning_rate": 1.9219736087205964e-08, + "logits/chosen": -2.642052173614502, + "logits/rejected": -2.716784715652466, + "logps/chosen": -241.4844970703125, + "logps/rejected": -301.21759033203125, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.055972099304199, + "rewards/margins": 14.421625137329102, + "rewards/rejected": -12.365653038024902, + "step": 5610 + }, + { + "epoch": 2.9, + "learning_rate": 1.826353031172308e-08, + "logits/chosen": -2.6660943031311035, + "logits/rejected": -2.667335033416748, + "logps/chosen": -356.5085754394531, + "logps/rejected": -324.35858154296875, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6099711060523987, + "rewards/margins": 12.738626480102539, + "rewards/rejected": -12.128656387329102, + "step": 5620 + }, + { + "epoch": 2.91, + "learning_rate": 1.73073245362402e-08, + "logits/chosen": -2.624074935913086, + "logits/rejected": -2.5837934017181396, + "logps/chosen": -310.8951721191406, + "logps/rejected": -266.6996154785156, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0079091787338257, + "rewards/margins": 14.171859741210938, + "rewards/rejected": -13.163949966430664, + "step": 5630 + }, + { + "epoch": 2.91, + "learning_rate": 1.6351118760757314e-08, + "logits/chosen": -2.663119077682495, + "logits/rejected": -2.6472928524017334, + "logps/chosen": -273.03973388671875, + "logps/rejected": -245.8977813720703, + "loss": 0.0165, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0619621276855469, + "rewards/margins": 12.637825012207031, + "rewards/rejected": -11.575862884521484, + "step": 5640 + }, + { + "epoch": 2.92, + "learning_rate": 1.539491298527443e-08, + "logits/chosen": -2.67164945602417, + "logits/rejected": -2.636287212371826, + "logps/chosen": -229.2053680419922, + "logps/rejected": -299.718994140625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.481825590133667, + "rewards/margins": 13.588674545288086, + "rewards/rejected": -13.106849670410156, + "step": 5650 + }, + { + "epoch": 2.92, + "learning_rate": 1.4438707209791546e-08, + "logits/chosen": -2.6152384281158447, + "logits/rejected": -2.8120269775390625, + "logps/chosen": -359.27978515625, + "logps/rejected": -286.0852966308594, + "loss": 0.0174, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.058980107307434, + "rewards/margins": 12.662736892700195, + "rewards/rejected": -11.60375690460205, + "step": 5660 + }, + { + "epoch": 2.93, + "learning_rate": 1.3482501434308661e-08, + "logits/chosen": -2.5586142539978027, + "logits/rejected": -2.6513123512268066, + "logps/chosen": -300.5698547363281, + "logps/rejected": -245.2171173095703, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.214424967765808, + "rewards/margins": 12.673861503601074, + "rewards/rejected": -11.459436416625977, + "step": 5670 + }, + { + "epoch": 2.93, + "learning_rate": 1.2526295658825777e-08, + "logits/chosen": -2.7828521728515625, + "logits/rejected": -2.752152919769287, + "logps/chosen": -265.4630126953125, + "logps/rejected": -351.28350830078125, + "loss": 0.0176, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.451908826828003, + "rewards/margins": 15.984410285949707, + "rewards/rejected": -14.532501220703125, + "step": 5680 + }, + { + "epoch": 2.94, + "learning_rate": 1.1570089883342895e-08, + "logits/chosen": -2.6207876205444336, + "logits/rejected": -2.6226754188537598, + "logps/chosen": -309.70770263671875, + "logps/rejected": -365.6626892089844, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7145103216171265, + "rewards/margins": 16.66482925415039, + "rewards/rejected": -14.950319290161133, + "step": 5690 + }, + { + "epoch": 2.94, + "learning_rate": 1.061388410786001e-08, + "logits/chosen": -2.6478333473205566, + "logits/rejected": -2.686779022216797, + "logps/chosen": -259.3996887207031, + "logps/rejected": -246.5650634765625, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.815895438194275, + "rewards/margins": 12.701074600219727, + "rewards/rejected": -10.88517951965332, + "step": 5700 + }, + { + "epoch": 2.95, + "learning_rate": 9.657678332377126e-09, + "logits/chosen": -2.6393017768859863, + "logits/rejected": -2.7247872352600098, + "logps/chosen": -262.43060302734375, + "logps/rejected": -258.43292236328125, + "loss": 0.011, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.0494627952575684, + "rewards/margins": 14.03632640838623, + "rewards/rejected": -11.98686408996582, + "step": 5710 + }, + { + "epoch": 2.95, + "learning_rate": 8.701472556894243e-09, + "logits/chosen": -2.6285250186920166, + "logits/rejected": -2.681663990020752, + "logps/chosen": -244.53811645507812, + "logps/rejected": -280.70391845703125, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3365360200405121, + "rewards/margins": 13.008634567260742, + "rewards/rejected": -13.345170974731445, + "step": 5720 + }, + { + "epoch": 2.96, + "learning_rate": 7.745266781411359e-09, + "logits/chosen": -2.69207501411438, + "logits/rejected": -2.71191668510437, + "logps/chosen": -271.1983947753906, + "logps/rejected": -309.11407470703125, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11351821571588516, + "rewards/margins": 13.056490898132324, + "rewards/rejected": -13.170010566711426, + "step": 5730 + }, + { + "epoch": 2.96, + "learning_rate": 6.7890610059284754e-09, + "logits/chosen": -2.6312122344970703, + "logits/rejected": -2.6347126960754395, + "logps/chosen": -217.0697784423828, + "logps/rejected": -255.55484008789062, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3219164609909058, + "rewards/margins": 12.84302043914795, + "rewards/rejected": -11.52110481262207, + "step": 5740 + }, + { + "epoch": 2.97, + "learning_rate": 5.832855230445592e-09, + "logits/chosen": -2.6520233154296875, + "logits/rejected": -2.6504294872283936, + "logps/chosen": -268.7332763671875, + "logps/rejected": -300.84686279296875, + "loss": 0.0121, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.144339084625244, + "rewards/margins": 15.850810050964355, + "rewards/rejected": -13.706469535827637, + "step": 5750 + }, + { + "epoch": 2.97, + "learning_rate": 4.8766494549627085e-09, + "logits/chosen": -2.6992509365081787, + "logits/rejected": -2.681377410888672, + "logps/chosen": -273.0556335449219, + "logps/rejected": -297.84661865234375, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6503357887268066, + "rewards/margins": 17.637630462646484, + "rewards/rejected": -13.987295150756836, + "step": 5760 + }, + { + "epoch": 2.98, + "learning_rate": 3.920443679479824e-09, + "logits/chosen": -2.778294086456299, + "logits/rejected": -2.667935609817505, + "logps/chosen": -249.29043579101562, + "logps/rejected": -267.514404296875, + "loss": 0.0183, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5016997456550598, + "rewards/margins": 12.95671272277832, + "rewards/rejected": -12.455013275146484, + "step": 5770 + }, + { + "epoch": 2.98, + "learning_rate": 2.96423790399694e-09, + "logits/chosen": -2.723689317703247, + "logits/rejected": -2.811107635498047, + "logps/chosen": -278.9635925292969, + "logps/rejected": -307.65924072265625, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.774082362651825, + "rewards/margins": 12.066235542297363, + "rewards/rejected": -11.292154312133789, + "step": 5780 + }, + { + "epoch": 2.99, + "learning_rate": 2.008032128514056e-09, + "logits/chosen": -2.688815116882324, + "logits/rejected": -2.7489705085754395, + "logps/chosen": -284.84881591796875, + "logps/rejected": -345.14422607421875, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01807079277932644, + "rewards/margins": 12.609796524047852, + "rewards/rejected": -12.591726303100586, + "step": 5790 + }, + { + "epoch": 2.99, + "learning_rate": 1.0518263530311723e-09, + "logits/chosen": -2.7407386302948, + "logits/rejected": -2.794053554534912, + "logps/chosen": -225.2655029296875, + "logps/rejected": -304.1062316894531, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2420426607131958, + "rewards/margins": 12.693281173706055, + "rewards/rejected": -11.451238632202148, + "step": 5800 + }, + { + "epoch": 3.0, + "learning_rate": 9.562057754828839e-11, + "logits/chosen": -2.6969985961914062, + "logits/rejected": -2.7415974140167236, + "logps/chosen": -257.86944580078125, + "logps/rejected": -356.72320556640625, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3919858932495117, + "rewards/margins": 12.642386436462402, + "rewards/rejected": -11.250399589538574, + "step": 5810 + }, + { + "epoch": 3.0, + "step": 5811, + "total_flos": 0.0, + "train_loss": 0.22832600153418584, + "train_runtime": 42683.4293, + "train_samples_per_second": 4.355, + "train_steps_per_second": 0.136 + } + ], + "logging_steps": 10, + "max_steps": 5811, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}