diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9244 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997333028536595, + "eval_steps": 500, + "global_step": 3280, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.5243902439024392e-07, + "logits/chosen": -1.6215482950210571, + "logits/rejected": -1.4746919870376587, + "logps/chosen": -188.31854248046875, + "logps/rejected": -214.3458709716797, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 7.621951219512196e-07, + "logits/chosen": -1.6072877645492554, + "logits/rejected": -1.5261101722717285, + "logps/chosen": -266.4974365234375, + "logps/rejected": -276.2115478515625, + "loss": 0.6935, + "rewards/accuracies": 0.3203125, + "rewards/chosen": -0.0012238634517416358, + "rewards/margins": 0.0034746606834232807, + "rewards/rejected": -0.004698523320257664, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 1.5243902439024391e-06, + "logits/chosen": -1.5626871585845947, + "logits/rejected": -1.499194860458374, + "logps/chosen": -200.6118621826172, + "logps/rejected": -229.2737579345703, + "loss": 0.6934, + "rewards/accuracies": 0.3687500059604645, + "rewards/chosen": -0.001540867961011827, + "rewards/margins": 0.0018269469728693366, + "rewards/rejected": -0.003367815865203738, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 2.2865853658536584e-06, + "logits/chosen": -1.528067708015442, + "logits/rejected": -1.425481915473938, + "logps/chosen": -230.9717254638672, + "logps/rejected": -243.3507537841797, + "loss": 0.6893, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.019894156605005264, + "rewards/margins": 0.011482590809464455, + "rewards/rejected": -0.03137674927711487, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 3.0487804878048782e-06, + "logits/chosen": -1.5903682708740234, + "logits/rejected": -1.508452296257019, + "logps/chosen": -220.4499053955078, + "logps/rejected": -250.4200897216797, + "loss": 0.6848, + "rewards/accuracies": 0.41874998807907104, + "rewards/chosen": -0.04624359309673309, + "rewards/margins": 0.019922306761145592, + "rewards/rejected": -0.06616590172052383, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 3.8109756097560976e-06, + "logits/chosen": -1.542128324508667, + "logits/rejected": -1.4916765689849854, + "logps/chosen": -214.9829864501953, + "logps/rejected": -228.2103271484375, + "loss": 0.6784, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -0.07907415926456451, + "rewards/margins": 0.03622515872120857, + "rewards/rejected": -0.11529930680990219, + "step": 25 + }, + { + "epoch": 0.01, + "learning_rate": 4.573170731707317e-06, + "logits/chosen": -1.5964621305465698, + "logits/rejected": -1.4607049226760864, + "logps/chosen": -226.9277801513672, + "logps/rejected": -208.7239227294922, + "loss": 0.6677, + "rewards/accuracies": 0.41874998807907104, + "rewards/chosen": -0.15189151465892792, + "rewards/margins": 0.0342349037528038, + "rewards/rejected": -0.18612642586231232, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 5.335365853658537e-06, + "logits/chosen": -1.608441948890686, + "logits/rejected": -1.538400411605835, + "logps/chosen": -241.2967071533203, + "logps/rejected": -258.8771057128906, + "loss": 0.6601, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.2652124762535095, + "rewards/margins": 0.08378251641988754, + "rewards/rejected": -0.3489949703216553, + "step": 35 + }, + { + "epoch": 0.01, + "learning_rate": 6.0975609756097564e-06, + "logits/chosen": -1.5152992010116577, + "logits/rejected": -1.3842694759368896, + "logps/chosen": -265.1772155761719, + "logps/rejected": -267.61614990234375, + "loss": 0.6425, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.3918093740940094, + "rewards/margins": 0.12125066667795181, + "rewards/rejected": -0.5130600333213806, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 6.859756097560977e-06, + "logits/chosen": -1.4685131311416626, + "logits/rejected": -1.374135136604309, + "logps/chosen": -221.75424194335938, + "logps/rejected": -250.12954711914062, + "loss": 0.6139, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.4650735855102539, + "rewards/margins": 0.23447296023368835, + "rewards/rejected": -0.6995465755462646, + "step": 45 + }, + { + "epoch": 0.02, + "learning_rate": 7.621951219512195e-06, + "logits/chosen": -1.4883078336715698, + "logits/rejected": -1.3383334875106812, + "logps/chosen": -247.33468627929688, + "logps/rejected": -261.386962890625, + "loss": 0.63, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.6402769088745117, + "rewards/margins": 0.2748766839504242, + "rewards/rejected": -0.9151536822319031, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 8.384146341463415e-06, + "logits/chosen": -1.5034300088882446, + "logits/rejected": -1.4040600061416626, + "logps/chosen": -233.43331909179688, + "logps/rejected": -245.99453735351562, + "loss": 0.5996, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.7301857471466064, + "rewards/margins": 0.21929316222667694, + "rewards/rejected": -0.949478805065155, + "step": 55 + }, + { + "epoch": 0.02, + "learning_rate": 9.146341463414634e-06, + "logits/chosen": -1.438262701034546, + "logits/rejected": -1.3356047868728638, + "logps/chosen": -258.9648742675781, + "logps/rejected": -276.9036560058594, + "loss": 0.5881, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.9311250448226929, + "rewards/margins": 0.41171926259994507, + "rewards/rejected": -1.3428443670272827, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 9.908536585365854e-06, + "logits/chosen": -1.5152195692062378, + "logits/rejected": -1.4204599857330322, + "logps/chosen": -232.12216186523438, + "logps/rejected": -245.5828094482422, + "loss": 0.5892, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.841234028339386, + "rewards/margins": 0.3476036489009857, + "rewards/rejected": -1.1888377666473389, + "step": 65 + }, + { + "epoch": 0.02, + "learning_rate": 1.0670731707317074e-05, + "logits/chosen": -1.4773343801498413, + "logits/rejected": -1.371441125869751, + "logps/chosen": -264.8576965332031, + "logps/rejected": -297.8639831542969, + "loss": 0.5599, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.033233880996704, + "rewards/margins": 0.5808243155479431, + "rewards/rejected": -1.6140581369400024, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 1.1432926829268294e-05, + "logits/chosen": -1.5124752521514893, + "logits/rejected": -1.4127274751663208, + "logps/chosen": -265.6245422363281, + "logps/rejected": -263.59515380859375, + "loss": 0.5978, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9992305040359497, + "rewards/margins": 0.321955144405365, + "rewards/rejected": -1.32118558883667, + "step": 75 + }, + { + "epoch": 0.02, + "learning_rate": 1.2195121951219513e-05, + "logits/chosen": -1.3964924812316895, + "logits/rejected": -1.356090784072876, + "logps/chosen": -246.69619750976562, + "logps/rejected": -292.822265625, + "loss": 0.5646, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9433499574661255, + "rewards/margins": 0.6655504703521729, + "rewards/rejected": -1.6089003086090088, + "step": 80 + }, + { + "epoch": 0.03, + "learning_rate": 1.2957317073170733e-05, + "logits/chosen": -1.4542206525802612, + "logits/rejected": -1.3648021221160889, + "logps/chosen": -261.1116638183594, + "logps/rejected": -290.764404296875, + "loss": 0.5359, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.813259482383728, + "rewards/margins": 0.595789909362793, + "rewards/rejected": -1.4090495109558105, + "step": 85 + }, + { + "epoch": 0.03, + "learning_rate": 1.3719512195121953e-05, + "logits/chosen": -1.5060111284255981, + "logits/rejected": -1.4409078359603882, + "logps/chosen": -243.53173828125, + "logps/rejected": -266.01361083984375, + "loss": 0.5356, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7168170809745789, + "rewards/margins": 0.6658238172531128, + "rewards/rejected": -1.3826408386230469, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 1.448170731707317e-05, + "logits/chosen": -1.377535104751587, + "logits/rejected": -1.3736878633499146, + "logps/chosen": -227.3628387451172, + "logps/rejected": -285.86334228515625, + "loss": 0.4957, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6789393424987793, + "rewards/margins": 0.7634402513504028, + "rewards/rejected": -1.4423797130584717, + "step": 95 + }, + { + "epoch": 0.03, + "learning_rate": 1.524390243902439e-05, + "logits/chosen": -1.4626977443695068, + "logits/rejected": -1.3379249572753906, + "logps/chosen": -254.5047607421875, + "logps/rejected": -249.72500610351562, + "loss": 0.4935, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8015538454055786, + "rewards/margins": 0.6757813692092896, + "rewards/rejected": -1.4773352146148682, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 1.600609756097561e-05, + "logits/chosen": -1.5257985591888428, + "logits/rejected": -1.380997657775879, + "logps/chosen": -232.9523162841797, + "logps/rejected": -260.36492919921875, + "loss": 0.4978, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6900753974914551, + "rewards/margins": 0.927207350730896, + "rewards/rejected": -1.6172831058502197, + "step": 105 + }, + { + "epoch": 0.03, + "learning_rate": 1.676829268292683e-05, + "logits/chosen": -1.4730074405670166, + "logits/rejected": -1.47112238407135, + "logps/chosen": -212.0960235595703, + "logps/rejected": -271.15191650390625, + "loss": 0.483, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.7050459980964661, + "rewards/margins": 0.937295138835907, + "rewards/rejected": -1.6423410177230835, + "step": 110 + }, + { + "epoch": 0.04, + "learning_rate": 1.7530487804878047e-05, + "logits/chosen": -1.4949665069580078, + "logits/rejected": -1.5023237466812134, + "logps/chosen": -231.5373992919922, + "logps/rejected": -283.80023193359375, + "loss": 0.4846, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.8734248280525208, + "rewards/margins": 1.1210492849349976, + "rewards/rejected": -1.994474172592163, + "step": 115 + }, + { + "epoch": 0.04, + "learning_rate": 1.8292682926829268e-05, + "logits/chosen": -1.4580038785934448, + "logits/rejected": -1.3469959497451782, + "logps/chosen": -220.4951934814453, + "logps/rejected": -260.50958251953125, + "loss": 0.4338, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.8012296557426453, + "rewards/margins": 1.2397785186767578, + "rewards/rejected": -2.041008234024048, + "step": 120 + }, + { + "epoch": 0.04, + "learning_rate": 1.9054878048780488e-05, + "logits/chosen": -1.4429595470428467, + "logits/rejected": -1.3944337368011475, + "logps/chosen": -272.3973388671875, + "logps/rejected": -293.87725830078125, + "loss": 0.4562, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9875411987304688, + "rewards/margins": 1.1865813732147217, + "rewards/rejected": -2.1741225719451904, + "step": 125 + }, + { + "epoch": 0.04, + "learning_rate": 1.9817073170731708e-05, + "logits/chosen": -1.5181801319122314, + "logits/rejected": -1.4099066257476807, + "logps/chosen": -211.18594360351562, + "logps/rejected": -239.5623321533203, + "loss": 0.4578, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5671035051345825, + "rewards/margins": 1.078687310218811, + "rewards/rejected": -1.6457910537719727, + "step": 130 + }, + { + "epoch": 0.04, + "learning_rate": 2.0579268292682928e-05, + "logits/chosen": -1.4463402032852173, + "logits/rejected": -1.33974289894104, + "logps/chosen": -241.71322631835938, + "logps/rejected": -276.1986389160156, + "loss": 0.4639, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8435440063476562, + "rewards/margins": 1.1942956447601318, + "rewards/rejected": -2.037839651107788, + "step": 135 + }, + { + "epoch": 0.04, + "learning_rate": 2.134146341463415e-05, + "logits/chosen": -1.4488505125045776, + "logits/rejected": -1.2843577861785889, + "logps/chosen": -256.20452880859375, + "logps/rejected": -275.42095947265625, + "loss": 0.4569, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6826750636100769, + "rewards/margins": 1.3818552494049072, + "rewards/rejected": -2.06453013420105, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 2.210365853658537e-05, + "logits/chosen": -1.471332311630249, + "logits/rejected": -1.4009536504745483, + "logps/chosen": -260.52374267578125, + "logps/rejected": -292.8216247558594, + "loss": 0.4397, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5928641557693481, + "rewards/margins": 1.5231386423110962, + "rewards/rejected": -2.1160027980804443, + "step": 145 + }, + { + "epoch": 0.05, + "learning_rate": 2.286585365853659e-05, + "logits/chosen": -1.5144442319869995, + "logits/rejected": -1.4480407238006592, + "logps/chosen": -251.3614959716797, + "logps/rejected": -273.5283508300781, + "loss": 0.4372, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3241608738899231, + "rewards/margins": 1.3152064085006714, + "rewards/rejected": -1.6393673419952393, + "step": 150 + }, + { + "epoch": 0.05, + "learning_rate": 2.3628048780487806e-05, + "logits/chosen": -1.4405564069747925, + "logits/rejected": -1.388474941253662, + "logps/chosen": -214.82778930664062, + "logps/rejected": -269.74468994140625, + "loss": 0.4043, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.21781139075756073, + "rewards/margins": 1.6215749979019165, + "rewards/rejected": -1.8393863439559937, + "step": 155 + }, + { + "epoch": 0.05, + "learning_rate": 2.4390243902439026e-05, + "logits/chosen": -1.5152744054794312, + "logits/rejected": -1.3840240240097046, + "logps/chosen": -260.8744201660156, + "logps/rejected": -284.11175537109375, + "loss": 0.4132, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3614785969257355, + "rewards/margins": 1.4981237649917603, + "rewards/rejected": -1.8596023321151733, + "step": 160 + }, + { + "epoch": 0.05, + "learning_rate": 2.5152439024390246e-05, + "logits/chosen": -1.4719091653823853, + "logits/rejected": -1.3672441244125366, + "logps/chosen": -236.2136993408203, + "logps/rejected": -263.2491455078125, + "loss": 0.4243, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6757379770278931, + "rewards/margins": 1.6337473392486572, + "rewards/rejected": -2.3094851970672607, + "step": 165 + }, + { + "epoch": 0.05, + "learning_rate": 2.5914634146341466e-05, + "logits/chosen": -1.3289225101470947, + "logits/rejected": -1.2671074867248535, + "logps/chosen": -257.1202392578125, + "logps/rejected": -296.96026611328125, + "loss": 0.4002, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0305023193359375, + "rewards/margins": 1.9576082229614258, + "rewards/rejected": -2.988110303878784, + "step": 170 + }, + { + "epoch": 0.05, + "learning_rate": 2.6676829268292686e-05, + "logits/chosen": -1.4129236936569214, + "logits/rejected": -1.2899045944213867, + "logps/chosen": -241.03207397460938, + "logps/rejected": -279.87408447265625, + "loss": 0.4107, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7252627611160278, + "rewards/margins": 2.208683967590332, + "rewards/rejected": -2.9339470863342285, + "step": 175 + }, + { + "epoch": 0.05, + "learning_rate": 2.7439024390243906e-05, + "logits/chosen": -1.4490526914596558, + "logits/rejected": -1.3659610748291016, + "logps/chosen": -224.96658325195312, + "logps/rejected": -268.126953125, + "loss": 0.3825, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39545050263404846, + "rewards/margins": 1.7132034301757812, + "rewards/rejected": -2.108654022216797, + "step": 180 + }, + { + "epoch": 0.06, + "learning_rate": 2.820121951219512e-05, + "logits/chosen": -1.4639991521835327, + "logits/rejected": -1.351359248161316, + "logps/chosen": -263.8490295410156, + "logps/rejected": -305.0282287597656, + "loss": 0.4509, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6279351115226746, + "rewards/margins": 1.842530608177185, + "rewards/rejected": -2.470465898513794, + "step": 185 + }, + { + "epoch": 0.06, + "learning_rate": 2.896341463414634e-05, + "logits/chosen": -1.437174677848816, + "logits/rejected": -1.3693865537643433, + "logps/chosen": -212.9979248046875, + "logps/rejected": -263.60546875, + "loss": 0.4206, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.675702691078186, + "rewards/margins": 1.771080732345581, + "rewards/rejected": -2.4467835426330566, + "step": 190 + }, + { + "epoch": 0.06, + "learning_rate": 2.972560975609756e-05, + "logits/chosen": -1.4669255018234253, + "logits/rejected": -1.3856886625289917, + "logps/chosen": -258.96734619140625, + "logps/rejected": -292.31195068359375, + "loss": 0.3976, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.35433077812194824, + "rewards/margins": 1.9425218105316162, + "rewards/rejected": -2.2968528270721436, + "step": 195 + }, + { + "epoch": 0.06, + "learning_rate": 3.048780487804878e-05, + "logits/chosen": -1.5489139556884766, + "logits/rejected": -1.4419410228729248, + "logps/chosen": -247.7865447998047, + "logps/rejected": -272.13140869140625, + "loss": 0.4203, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44836997985839844, + "rewards/margins": 1.8278976678848267, + "rewards/rejected": -2.2762677669525146, + "step": 200 + }, + { + "epoch": 0.06, + "learning_rate": 3.125e-05, + "logits/chosen": -1.5211713314056396, + "logits/rejected": -1.5042860507965088, + "logps/chosen": -234.41732788085938, + "logps/rejected": -296.26666259765625, + "loss": 0.3832, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4585009217262268, + "rewards/margins": 1.8909355401992798, + "rewards/rejected": -2.3494365215301514, + "step": 205 + }, + { + "epoch": 0.06, + "learning_rate": 3.201219512195122e-05, + "logits/chosen": -1.4364079236984253, + "logits/rejected": -1.3636162281036377, + "logps/chosen": -252.7544403076172, + "logps/rejected": -289.1746826171875, + "loss": 0.3609, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3245674967765808, + "rewards/margins": 2.125056028366089, + "rewards/rejected": -2.4496235847473145, + "step": 210 + }, + { + "epoch": 0.07, + "learning_rate": 3.277439024390244e-05, + "logits/chosen": -1.4644180536270142, + "logits/rejected": -1.3949480056762695, + "logps/chosen": -234.929443359375, + "logps/rejected": -279.88995361328125, + "loss": 0.4598, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.40384259819984436, + "rewards/margins": 1.5408689975738525, + "rewards/rejected": -1.944711685180664, + "step": 215 + }, + { + "epoch": 0.07, + "learning_rate": 3.353658536585366e-05, + "logits/chosen": -1.4723870754241943, + "logits/rejected": -1.353324294090271, + "logps/chosen": -233.2500762939453, + "logps/rejected": -264.16302490234375, + "loss": 0.3909, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6612171530723572, + "rewards/margins": 2.2186388969421387, + "rewards/rejected": -2.8798558712005615, + "step": 220 + }, + { + "epoch": 0.07, + "learning_rate": 3.429878048780488e-05, + "logits/chosen": -1.5066630840301514, + "logits/rejected": -1.442996621131897, + "logps/chosen": -242.844970703125, + "logps/rejected": -285.40301513671875, + "loss": 0.4212, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8180769681930542, + "rewards/margins": 1.7058541774749756, + "rewards/rejected": -2.5239310264587402, + "step": 225 + }, + { + "epoch": 0.07, + "learning_rate": 3.5060975609756095e-05, + "logits/chosen": -1.397632122039795, + "logits/rejected": -1.309533715248108, + "logps/chosen": -271.36676025390625, + "logps/rejected": -294.4072265625, + "loss": 0.395, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6913538575172424, + "rewards/margins": 1.8501994609832764, + "rewards/rejected": -2.5415537357330322, + "step": 230 + }, + { + "epoch": 0.07, + "learning_rate": 3.5823170731707315e-05, + "logits/chosen": -1.3661539554595947, + "logits/rejected": -1.2781219482421875, + "logps/chosen": -234.1933135986328, + "logps/rejected": -266.4363708496094, + "loss": 0.3969, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8205755949020386, + "rewards/margins": 2.130988597869873, + "rewards/rejected": -2.951564311981201, + "step": 235 + }, + { + "epoch": 0.07, + "learning_rate": 3.6585365853658535e-05, + "logits/chosen": -1.429700493812561, + "logits/rejected": -1.3226317167282104, + "logps/chosen": -260.207763671875, + "logps/rejected": -288.5872802734375, + "loss": 0.3711, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6312899589538574, + "rewards/margins": 2.1224799156188965, + "rewards/rejected": -2.7537693977355957, + "step": 240 + }, + { + "epoch": 0.07, + "learning_rate": 3.7347560975609755e-05, + "logits/chosen": -1.414186954498291, + "logits/rejected": -1.2793656587600708, + "logps/chosen": -269.228515625, + "logps/rejected": -310.0638732910156, + "loss": 0.3871, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8959183692932129, + "rewards/margins": 2.4896857738494873, + "rewards/rejected": -3.3856041431427, + "step": 245 + }, + { + "epoch": 0.08, + "learning_rate": 3.8109756097560976e-05, + "logits/chosen": -1.3947970867156982, + "logits/rejected": -1.3410694599151611, + "logps/chosen": -225.885009765625, + "logps/rejected": -293.60003662109375, + "loss": 0.4103, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6115488409996033, + "rewards/margins": 2.3446762561798096, + "rewards/rejected": -2.9562251567840576, + "step": 250 + }, + { + "epoch": 0.08, + "learning_rate": 3.8871951219512196e-05, + "logits/chosen": -1.428045392036438, + "logits/rejected": -1.3080122470855713, + "logps/chosen": -253.37646484375, + "logps/rejected": -262.73150634765625, + "loss": 0.4013, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43573275208473206, + "rewards/margins": 2.107551097869873, + "rewards/rejected": -2.5432839393615723, + "step": 255 + }, + { + "epoch": 0.08, + "learning_rate": 3.9634146341463416e-05, + "logits/chosen": -1.5183364152908325, + "logits/rejected": -1.4998283386230469, + "logps/chosen": -224.19851684570312, + "logps/rejected": -268.945556640625, + "loss": 0.4347, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.19424612820148468, + "rewards/margins": 1.7188571691513062, + "rewards/rejected": -1.913103461265564, + "step": 260 + }, + { + "epoch": 0.08, + "learning_rate": 4.0396341463414636e-05, + "logits/chosen": -1.447766900062561, + "logits/rejected": -1.3362300395965576, + "logps/chosen": -202.533935546875, + "logps/rejected": -243.1204071044922, + "loss": 0.3827, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.07845243811607361, + "rewards/margins": 1.7323980331420898, + "rewards/rejected": -1.8108505010604858, + "step": 265 + }, + { + "epoch": 0.08, + "learning_rate": 4.1158536585365856e-05, + "logits/chosen": -1.4217723608016968, + "logits/rejected": -1.3448355197906494, + "logps/chosen": -207.6378631591797, + "logps/rejected": -260.37933349609375, + "loss": 0.4343, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5095584988594055, + "rewards/margins": 1.8481013774871826, + "rewards/rejected": -2.3576598167419434, + "step": 270 + }, + { + "epoch": 0.08, + "learning_rate": 4.1920731707317077e-05, + "logits/chosen": -1.3837717771530151, + "logits/rejected": -1.3595670461654663, + "logps/chosen": -227.80239868164062, + "logps/rejected": -270.43463134765625, + "loss": 0.4198, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.9875295758247375, + "rewards/margins": 2.1480934619903564, + "rewards/rejected": -3.135622978210449, + "step": 275 + }, + { + "epoch": 0.09, + "learning_rate": 4.26829268292683e-05, + "logits/chosen": -1.3282934427261353, + "logits/rejected": -1.2439639568328857, + "logps/chosen": -259.5870056152344, + "logps/rejected": -298.34307861328125, + "loss": 0.3466, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8122395277023315, + "rewards/margins": 2.514617443084717, + "rewards/rejected": -3.326857089996338, + "step": 280 + }, + { + "epoch": 0.09, + "learning_rate": 4.344512195121952e-05, + "logits/chosen": -1.3736869096755981, + "logits/rejected": -1.3249460458755493, + "logps/chosen": -213.5878143310547, + "logps/rejected": -260.0492248535156, + "loss": 0.4065, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6954141855239868, + "rewards/margins": 2.317218065261841, + "rewards/rejected": -3.012632369995117, + "step": 285 + }, + { + "epoch": 0.09, + "learning_rate": 4.420731707317074e-05, + "logits/chosen": -1.3154847621917725, + "logits/rejected": -1.2118194103240967, + "logps/chosen": -266.9904479980469, + "logps/rejected": -295.3359375, + "loss": 0.416, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.930508017539978, + "rewards/margins": 2.233632802963257, + "rewards/rejected": -3.1641409397125244, + "step": 290 + }, + { + "epoch": 0.09, + "learning_rate": 4.496951219512196e-05, + "logits/chosen": -1.4980214834213257, + "logits/rejected": -1.3795270919799805, + "logps/chosen": -224.4019317626953, + "logps/rejected": -240.95443725585938, + "loss": 0.419, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.718636691570282, + "rewards/margins": 1.8446115255355835, + "rewards/rejected": -2.5632483959198, + "step": 295 + }, + { + "epoch": 0.09, + "learning_rate": 4.573170731707318e-05, + "logits/chosen": -1.388089895248413, + "logits/rejected": -1.3248611688613892, + "logps/chosen": -223.11434936523438, + "logps/rejected": -272.0300598144531, + "loss": 0.4197, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6024666428565979, + "rewards/margins": 2.1911685466766357, + "rewards/rejected": -2.793635129928589, + "step": 300 + }, + { + "epoch": 0.09, + "learning_rate": 4.64939024390244e-05, + "logits/chosen": -1.4117351770401, + "logits/rejected": -1.3508259057998657, + "logps/chosen": -225.8909912109375, + "logps/rejected": -276.47314453125, + "loss": 0.44, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6590641140937805, + "rewards/margins": 1.815882921218872, + "rewards/rejected": -2.474947452545166, + "step": 305 + }, + { + "epoch": 0.09, + "learning_rate": 4.725609756097561e-05, + "logits/chosen": -1.4158817529678345, + "logits/rejected": -1.3631826639175415, + "logps/chosen": -241.3859405517578, + "logps/rejected": -298.5724182128906, + "loss": 0.4038, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1089242696762085, + "rewards/margins": 2.054959535598755, + "rewards/rejected": -3.163883924484253, + "step": 310 + }, + { + "epoch": 0.1, + "learning_rate": 4.801829268292683e-05, + "logits/chosen": -1.3394591808319092, + "logits/rejected": -1.2597063779830933, + "logps/chosen": -236.72244262695312, + "logps/rejected": -318.73687744140625, + "loss": 0.3443, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.473099708557129, + "rewards/margins": 2.826732873916626, + "rewards/rejected": -4.299832344055176, + "step": 315 + }, + { + "epoch": 0.1, + "learning_rate": 4.878048780487805e-05, + "logits/chosen": -1.2992823123931885, + "logits/rejected": -1.1666510105133057, + "logps/chosen": -272.9997863769531, + "logps/rejected": -309.48681640625, + "loss": 0.509, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.7227243185043335, + "rewards/margins": 2.872715473175049, + "rewards/rejected": -4.595439910888672, + "step": 320 + }, + { + "epoch": 0.1, + "learning_rate": 4.954268292682927e-05, + "logits/chosen": -1.4183080196380615, + "logits/rejected": -1.3151204586029053, + "logps/chosen": -260.7613220214844, + "logps/rejected": -307.1131286621094, + "loss": 0.412, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.0861679315567017, + "rewards/margins": 2.3004274368286133, + "rewards/rejected": -3.3865954875946045, + "step": 325 + }, + { + "epoch": 0.1, + "learning_rate": 4.9999943371262496e-05, + "logits/chosen": -1.5749884843826294, + "logits/rejected": -1.4654959440231323, + "logps/chosen": -234.9315643310547, + "logps/rejected": -249.64013671875, + "loss": 0.3938, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.14985708892345428, + "rewards/margins": 1.9599332809448242, + "rewards/rejected": -2.109790325164795, + "step": 330 + }, + { + "epoch": 0.1, + "learning_rate": 4.9999306300911826e-05, + "logits/chosen": -1.4392597675323486, + "logits/rejected": -1.3621281385421753, + "logps/chosen": -214.2711181640625, + "logps/rejected": -261.72552490234375, + "loss": 0.4055, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.43718934059143066, + "rewards/margins": 1.943355917930603, + "rewards/rejected": -2.380545139312744, + "step": 335 + }, + { + "epoch": 0.1, + "learning_rate": 4.999796139238694e-05, + "logits/chosen": -1.518296480178833, + "logits/rejected": -1.4602665901184082, + "logps/chosen": -242.5746307373047, + "logps/rejected": -296.28204345703125, + "loss": 0.5931, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0693089962005615, + "rewards/margins": 2.0596704483032227, + "rewards/rejected": -3.128979444503784, + "step": 340 + }, + { + "epoch": 0.11, + "learning_rate": 4.9995908683767986e-05, + "logits/chosen": -1.4445441961288452, + "logits/rejected": -1.3583850860595703, + "logps/chosen": -238.28549194335938, + "logps/rejected": -276.1322021484375, + "loss": 0.418, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.0093810558319092, + "rewards/margins": 2.0990090370178223, + "rewards/rejected": -3.1083903312683105, + "step": 345 + }, + { + "epoch": 0.11, + "learning_rate": 4.999314823317602e-05, + "logits/chosen": -1.4120949506759644, + "logits/rejected": -1.3273041248321533, + "logps/chosen": -234.6574249267578, + "logps/rejected": -289.5172424316406, + "loss": 0.4384, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5543441772460938, + "rewards/margins": 2.0190649032592773, + "rewards/rejected": -2.573408842086792, + "step": 350 + }, + { + "epoch": 0.11, + "learning_rate": 4.9989680118771284e-05, + "logits/chosen": -1.4652230739593506, + "logits/rejected": -1.295188069343567, + "logps/chosen": -271.06207275390625, + "logps/rejected": -289.2547302246094, + "loss": 0.3962, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5600930452346802, + "rewards/margins": 2.303740978240967, + "rewards/rejected": -2.8638339042663574, + "step": 355 + }, + { + "epoch": 0.11, + "learning_rate": 4.9985504438751075e-05, + "logits/chosen": -1.5906970500946045, + "logits/rejected": -1.433538556098938, + "logps/chosen": -252.5865936279297, + "logps/rejected": -284.74139404296875, + "loss": 0.365, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3220791220664978, + "rewards/margins": 1.9737313985824585, + "rewards/rejected": -2.2958106994628906, + "step": 360 + }, + { + "epoch": 0.11, + "learning_rate": 4.998062131134687e-05, + "logits/chosen": -1.4737342596054077, + "logits/rejected": -1.3808215856552124, + "logps/chosen": -225.4361114501953, + "logps/rejected": -269.8510437011719, + "loss": 0.4805, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5886635184288025, + "rewards/margins": 2.1054704189300537, + "rewards/rejected": -2.694133996963501, + "step": 365 + }, + { + "epoch": 0.11, + "learning_rate": 4.99750308748211e-05, + "logits/chosen": -1.3672006130218506, + "logits/rejected": -1.2418177127838135, + "logps/chosen": -252.7670135498047, + "logps/rejected": -298.70159912109375, + "loss": 0.429, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6679830551147461, + "rewards/margins": 2.3387298583984375, + "rewards/rejected": -3.0067131519317627, + "step": 370 + }, + { + "epoch": 0.11, + "learning_rate": 4.996873328746311e-05, + "logits/chosen": -1.444612741470337, + "logits/rejected": -1.3134263753890991, + "logps/chosen": -258.3940734863281, + "logps/rejected": -270.0694885253906, + "loss": 0.4651, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.7545816898345947, + "rewards/margins": 2.0772323608398438, + "rewards/rejected": -2.8318140506744385, + "step": 375 + }, + { + "epoch": 0.12, + "learning_rate": 4.9961728727584764e-05, + "logits/chosen": -1.4437062740325928, + "logits/rejected": -1.3258285522460938, + "logps/chosen": -248.73953247070312, + "logps/rejected": -284.25653076171875, + "loss": 0.385, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.1430978775024414, + "rewards/margins": 2.3336520195007324, + "rewards/rejected": -2.4767496585845947, + "step": 380 + }, + { + "epoch": 0.12, + "learning_rate": 4.995401739351536e-05, + "logits/chosen": -1.5048315525054932, + "logits/rejected": -1.4178255796432495, + "logps/chosen": -204.1002655029297, + "logps/rejected": -228.6428680419922, + "loss": 0.4414, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21167969703674316, + "rewards/margins": 1.6109260320663452, + "rewards/rejected": -1.822605848312378, + "step": 385 + }, + { + "epoch": 0.12, + "learning_rate": 4.994559950359603e-05, + "logits/chosen": -1.521078109741211, + "logits/rejected": -1.431056022644043, + "logps/chosen": -211.77392578125, + "logps/rejected": -271.99530029296875, + "loss": 0.4005, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.32148247957229614, + "rewards/margins": 2.0455245971679688, + "rewards/rejected": -2.36700701713562, + "step": 390 + }, + { + "epoch": 0.12, + "learning_rate": 4.9936475296173524e-05, + "logits/chosen": -1.4915629625320435, + "logits/rejected": -1.4468661546707153, + "logps/chosen": -220.66238403320312, + "logps/rejected": -274.36212158203125, + "loss": 0.398, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4649580121040344, + "rewards/margins": 2.0459415912628174, + "rewards/rejected": -2.510899543762207, + "step": 395 + }, + { + "epoch": 0.12, + "learning_rate": 4.992664502959351e-05, + "logits/chosen": -1.5382647514343262, + "logits/rejected": -1.4424117803573608, + "logps/chosen": -237.43264770507812, + "logps/rejected": -265.6887512207031, + "loss": 0.4285, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.7469789981842041, + "rewards/margins": 2.0618503093719482, + "rewards/rejected": -2.8088290691375732, + "step": 400 + }, + { + "epoch": 0.12, + "learning_rate": 4.9916108982193246e-05, + "logits/chosen": -1.4920802116394043, + "logits/rejected": -1.3715362548828125, + "logps/chosen": -248.9379425048828, + "logps/rejected": -275.0431213378906, + "loss": 0.3805, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.266801118850708, + "rewards/margins": 1.8889585733413696, + "rewards/rejected": -2.155759811401367, + "step": 405 + }, + { + "epoch": 0.12, + "learning_rate": 4.990486745229364e-05, + "logits/chosen": -1.4824012517929077, + "logits/rejected": -1.41977858543396, + "logps/chosen": -220.0709991455078, + "logps/rejected": -247.99560546875, + "loss": 0.4792, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.18558254837989807, + "rewards/margins": 1.6866681575775146, + "rewards/rejected": -1.501085638999939, + "step": 410 + }, + { + "epoch": 0.13, + "learning_rate": 4.9892920758190907e-05, + "logits/chosen": -1.4227807521820068, + "logits/rejected": -1.3182239532470703, + "logps/chosen": -237.0270233154297, + "logps/rejected": -266.37872314453125, + "loss": 0.4349, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0361756794154644, + "rewards/margins": 1.9962981939315796, + "rewards/rejected": -2.0324740409851074, + "step": 415 + }, + { + "epoch": 0.13, + "learning_rate": 4.988026923814748e-05, + "logits/chosen": -1.5704119205474854, + "logits/rejected": -1.472022294998169, + "logps/chosen": -251.1329803466797, + "logps/rejected": -289.251953125, + "loss": 0.3614, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.0848822221159935, + "rewards/margins": 2.3444581031799316, + "rewards/rejected": -2.429340362548828, + "step": 420 + }, + { + "epoch": 0.13, + "learning_rate": 4.986691325038244e-05, + "logits/chosen": -1.5436654090881348, + "logits/rejected": -1.4181368350982666, + "logps/chosen": -235.8417205810547, + "logps/rejected": -269.031005859375, + "loss": 0.4396, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.306237131357193, + "rewards/margins": 2.005009889602661, + "rewards/rejected": -2.3112473487854004, + "step": 425 + }, + { + "epoch": 0.13, + "learning_rate": 4.985285317306141e-05, + "logits/chosen": -1.4109728336334229, + "logits/rejected": -1.3263506889343262, + "logps/chosen": -242.7462615966797, + "logps/rejected": -278.8143005371094, + "loss": 0.3797, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6337249279022217, + "rewards/margins": 2.4727721214294434, + "rewards/rejected": -3.106497049331665, + "step": 430 + }, + { + "epoch": 0.13, + "learning_rate": 4.9838089404285807e-05, + "logits/chosen": -1.4374382495880127, + "logits/rejected": -1.3346731662750244, + "logps/chosen": -229.8583984375, + "logps/rejected": -258.11395263671875, + "loss": 0.5004, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.9803594350814819, + "rewards/margins": 2.253356695175171, + "rewards/rejected": -3.2337162494659424, + "step": 435 + }, + { + "epoch": 0.13, + "learning_rate": 4.9822622362081594e-05, + "logits/chosen": -1.449986219406128, + "logits/rejected": -1.3739886283874512, + "logps/chosen": -240.986328125, + "logps/rejected": -296.00531005859375, + "loss": 0.4265, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.41646942496299744, + "rewards/margins": 2.698214292526245, + "rewards/rejected": -3.1146836280822754, + "step": 440 + }, + { + "epoch": 0.14, + "learning_rate": 4.980645248438745e-05, + "logits/chosen": -1.5218524932861328, + "logits/rejected": -1.4302794933319092, + "logps/chosen": -214.82852172851562, + "logps/rejected": -265.3507995605469, + "loss": 0.3939, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.3738623261451721, + "rewards/margins": 2.2887752056121826, + "rewards/rejected": -1.9149129390716553, + "step": 445 + }, + { + "epoch": 0.14, + "learning_rate": 4.978958022904235e-05, + "logits/chosen": -1.5862996578216553, + "logits/rejected": -1.4788892269134521, + "logps/chosen": -234.18478393554688, + "logps/rejected": -263.1192932128906, + "loss": 0.4216, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.3776335120201111, + "rewards/margins": 2.2993741035461426, + "rewards/rejected": -1.9217407703399658, + "step": 450 + }, + { + "epoch": 0.14, + "learning_rate": 4.977200607377259e-05, + "logits/chosen": -1.5885207653045654, + "logits/rejected": -1.5190343856811523, + "logps/chosen": -228.0282745361328, + "logps/rejected": -270.5809326171875, + "loss": 0.4147, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.047803785651922226, + "rewards/margins": 1.674283742904663, + "rewards/rejected": -1.7220878601074219, + "step": 455 + }, + { + "epoch": 0.14, + "learning_rate": 4.9753730516178313e-05, + "logits/chosen": -1.5095998048782349, + "logits/rejected": -1.4479546546936035, + "logps/chosen": -242.82656860351562, + "logps/rejected": -279.6665344238281, + "loss": 0.3911, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3711318373680115, + "rewards/margins": 2.2145276069641113, + "rewards/rejected": -2.5856597423553467, + "step": 460 + }, + { + "epoch": 0.14, + "learning_rate": 4.9734754073719355e-05, + "logits/chosen": -1.5498822927474976, + "logits/rejected": -1.4521539211273193, + "logps/chosen": -237.3514404296875, + "logps/rejected": -279.55035400390625, + "loss": 0.4096, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.7321179509162903, + "rewards/margins": 2.069248676300049, + "rewards/rejected": -2.8013663291931152, + "step": 465 + }, + { + "epoch": 0.14, + "learning_rate": 4.971507728370066e-05, + "logits/chosen": -1.4880825281143188, + "logits/rejected": -1.414366602897644, + "logps/chosen": -244.0042266845703, + "logps/rejected": -303.25506591796875, + "loss": 0.3607, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.46353158354759216, + "rewards/margins": 2.542686939239502, + "rewards/rejected": -3.006218671798706, + "step": 470 + }, + { + "epoch": 0.14, + "learning_rate": 4.969470070325699e-05, + "logits/chosen": -1.546096682548523, + "logits/rejected": -1.4253944158554077, + "logps/chosen": -225.0137481689453, + "logps/rejected": -271.5694885253906, + "loss": 0.4059, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3330061137676239, + "rewards/margins": 2.4641575813293457, + "rewards/rejected": -2.797163724899292, + "step": 475 + }, + { + "epoch": 0.15, + "learning_rate": 4.967362490933723e-05, + "logits/chosen": -1.37833571434021, + "logits/rejected": -1.2442013025283813, + "logps/chosen": -227.9774627685547, + "logps/rejected": -260.406982421875, + "loss": 0.3492, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4235529899597168, + "rewards/margins": 2.632197618484497, + "rewards/rejected": -3.055750608444214, + "step": 480 + }, + { + "epoch": 0.15, + "learning_rate": 4.9651850498688e-05, + "logits/chosen": -1.5022382736206055, + "logits/rejected": -1.3960180282592773, + "logps/chosen": -244.43344116210938, + "logps/rejected": -302.40570068359375, + "loss": 0.4167, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.21162764728069305, + "rewards/margins": 2.4925358295440674, + "rewards/rejected": -2.7041635513305664, + "step": 485 + }, + { + "epoch": 0.15, + "learning_rate": 4.962937808783675e-05, + "logits/chosen": -1.4933425188064575, + "logits/rejected": -1.441125512123108, + "logps/chosen": -241.4817352294922, + "logps/rejected": -303.1959533691406, + "loss": 0.3826, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2536941468715668, + "rewards/margins": 2.397275447845459, + "rewards/rejected": -2.6509695053100586, + "step": 490 + }, + { + "epoch": 0.15, + "learning_rate": 4.960620831307436e-05, + "logits/chosen": -1.4081984758377075, + "logits/rejected": -1.3692537546157837, + "logps/chosen": -205.82504272460938, + "logps/rejected": -246.38015747070312, + "loss": 0.4207, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.6011780500411987, + "rewards/margins": 1.9714361429214478, + "rewards/rejected": -2.5726141929626465, + "step": 495 + }, + { + "epoch": 0.15, + "learning_rate": 4.9582341830437085e-05, + "logits/chosen": -1.4795446395874023, + "logits/rejected": -1.3015785217285156, + "logps/chosen": -238.45947265625, + "logps/rejected": -233.8590850830078, + "loss": 0.436, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.386214941740036, + "rewards/margins": 2.216794490814209, + "rewards/rejected": -2.6030097007751465, + "step": 500 + }, + { + "epoch": 0.15, + "learning_rate": 4.955777931568797e-05, + "logits/chosen": -1.3686350584030151, + "logits/rejected": -1.3235373497009277, + "logps/chosen": -217.8348388671875, + "logps/rejected": -280.8824768066406, + "loss": 0.4044, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014941399917006493, + "rewards/margins": 2.3758702278137207, + "rewards/rejected": -2.360928773880005, + "step": 505 + }, + { + "epoch": 0.16, + "learning_rate": 4.953252146429772e-05, + "logits/chosen": -1.51080322265625, + "logits/rejected": -1.4593368768692017, + "logps/chosen": -200.56521606445312, + "logps/rejected": -246.88388061523438, + "loss": 0.4536, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.2722179889678955, + "rewards/margins": 1.7458912134170532, + "rewards/rejected": -1.4736731052398682, + "step": 510 + }, + { + "epoch": 0.16, + "learning_rate": 4.9506568991425065e-05, + "logits/chosen": -1.4120771884918213, + "logits/rejected": -1.3655294179916382, + "logps/chosen": -221.00479125976562, + "logps/rejected": -252.9807586669922, + "loss": 0.4058, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.4606713652610779, + "rewards/margins": 1.9229342937469482, + "rewards/rejected": -1.462262749671936, + "step": 515 + }, + { + "epoch": 0.16, + "learning_rate": 4.9479922631896405e-05, + "logits/chosen": -1.523662805557251, + "logits/rejected": -1.4615800380706787, + "logps/chosen": -216.89169311523438, + "logps/rejected": -274.2598571777344, + "loss": 0.3706, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.594303548336029, + "rewards/margins": 2.0781185626983643, + "rewards/rejected": -1.4838149547576904, + "step": 520 + }, + { + "epoch": 0.16, + "learning_rate": 4.945258314018511e-05, + "logits/chosen": -1.523301124572754, + "logits/rejected": -1.373157262802124, + "logps/chosen": -245.4419403076172, + "logps/rejected": -268.5808410644531, + "loss": 0.3833, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.17819438874721527, + "rewards/margins": 2.2116293907165527, + "rewards/rejected": -2.033435106277466, + "step": 525 + }, + { + "epoch": 0.16, + "learning_rate": 4.942455129039011e-05, + "logits/chosen": -1.455971360206604, + "logits/rejected": -1.3837201595306396, + "logps/chosen": -233.8610382080078, + "logps/rejected": -282.94891357421875, + "loss": 0.3602, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3101358711719513, + "rewards/margins": 2.8269975185394287, + "rewards/rejected": -3.1371333599090576, + "step": 530 + }, + { + "epoch": 0.16, + "learning_rate": 4.9395827876213936e-05, + "logits/chosen": -1.4602159261703491, + "logits/rejected": -1.357772707939148, + "logps/chosen": -238.6614227294922, + "logps/rejected": -281.5059814453125, + "loss": 0.4362, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.42020535469055176, + "rewards/margins": 2.5654282569885254, + "rewards/rejected": -2.9856338500976562, + "step": 535 + }, + { + "epoch": 0.16, + "learning_rate": 4.936641371094033e-05, + "logits/chosen": -1.5019209384918213, + "logits/rejected": -1.5190550088882446, + "logps/chosen": -197.41287231445312, + "logps/rejected": -252.25137329101562, + "loss": 0.4469, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.11766906827688217, + "rewards/margins": 2.03245210647583, + "rewards/rejected": -2.1501212120056152, + "step": 540 + }, + { + "epoch": 0.17, + "learning_rate": 4.9336309627411163e-05, + "logits/chosen": -1.423473834991455, + "logits/rejected": -1.4443773031234741, + "logps/chosen": -217.05453491210938, + "logps/rejected": -282.62164306640625, + "loss": 0.3817, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.06701436638832092, + "rewards/margins": 2.2761709690093994, + "rewards/rejected": -2.3431851863861084, + "step": 545 + }, + { + "epoch": 0.17, + "learning_rate": 4.9305516478002865e-05, + "logits/chosen": -1.4173814058303833, + "logits/rejected": -1.3098132610321045, + "logps/chosen": -249.5691375732422, + "logps/rejected": -292.90435791015625, + "loss": 0.4061, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5416631102561951, + "rewards/margins": 2.6680707931518555, + "rewards/rejected": -3.2097339630126953, + "step": 550 + }, + { + "epoch": 0.17, + "learning_rate": 4.92740351346023e-05, + "logits/chosen": -1.3292713165283203, + "logits/rejected": -1.2327873706817627, + "logps/chosen": -229.86007690429688, + "logps/rejected": -264.585693359375, + "loss": 0.3875, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7183946371078491, + "rewards/margins": 2.537214517593384, + "rewards/rejected": -3.2556090354919434, + "step": 555 + }, + { + "epoch": 0.17, + "learning_rate": 4.924186648858207e-05, + "logits/chosen": -1.3974854946136475, + "logits/rejected": -1.2770755290985107, + "logps/chosen": -228.25625610351562, + "logps/rejected": -271.54052734375, + "loss": 0.398, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6100394129753113, + "rewards/margins": 2.393568992614746, + "rewards/rejected": -3.003608226776123, + "step": 560 + }, + { + "epoch": 0.17, + "learning_rate": 4.920901145077527e-05, + "logits/chosen": -1.5996006727218628, + "logits/rejected": -1.5182517766952515, + "logps/chosen": -213.2860565185547, + "logps/rejected": -258.7254943847656, + "loss": 0.4096, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.1525319218635559, + "rewards/margins": 1.8957157135009766, + "rewards/rejected": -2.0482475757598877, + "step": 565 + }, + { + "epoch": 0.17, + "learning_rate": 4.917547095144971e-05, + "logits/chosen": -1.492539644241333, + "logits/rejected": -1.4047850370407104, + "logps/chosen": -239.7179412841797, + "logps/rejected": -278.13507080078125, + "loss": 0.3826, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.23858800530433655, + "rewards/margins": 2.1294798851013184, + "rewards/rejected": -2.368067979812622, + "step": 570 + }, + { + "epoch": 0.18, + "learning_rate": 4.914124594028157e-05, + "logits/chosen": -1.4673938751220703, + "logits/rejected": -1.3367671966552734, + "logps/chosen": -265.41009521484375, + "logps/rejected": -313.9954528808594, + "loss": 0.4158, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5570913553237915, + "rewards/margins": 2.5719971656799316, + "rewards/rejected": -3.1290886402130127, + "step": 575 + }, + { + "epoch": 0.18, + "learning_rate": 4.9106337386328524e-05, + "logits/chosen": -1.4329808950424194, + "logits/rejected": -1.3196234703063965, + "logps/chosen": -249.49081420898438, + "logps/rejected": -286.67352294921875, + "loss": 0.4113, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6757558584213257, + "rewards/margins": 2.547755241394043, + "rewards/rejected": -3.2235107421875, + "step": 580 + }, + { + "epoch": 0.18, + "learning_rate": 4.907074627800229e-05, + "logits/chosen": -1.5212651491165161, + "logits/rejected": -1.376366376876831, + "logps/chosen": -263.5170593261719, + "logps/rejected": -291.48876953125, + "loss": 0.4057, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6802183389663696, + "rewards/margins": 2.613145112991333, + "rewards/rejected": -3.293363094329834, + "step": 585 + }, + { + "epoch": 0.18, + "learning_rate": 4.903447362304061e-05, + "logits/chosen": -1.5662963390350342, + "logits/rejected": -1.4853650331497192, + "logps/chosen": -226.19937133789062, + "logps/rejected": -273.17620849609375, + "loss": 0.3983, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4168701767921448, + "rewards/margins": 2.3373947143554688, + "rewards/rejected": -2.7542648315429688, + "step": 590 + }, + { + "epoch": 0.18, + "learning_rate": 4.899752044847881e-05, + "logits/chosen": -1.5506370067596436, + "logits/rejected": -1.4166381359100342, + "logps/chosen": -239.8184814453125, + "logps/rejected": -272.5811462402344, + "loss": 0.4064, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8294523358345032, + "rewards/margins": 2.456678867340088, + "rewards/rejected": -3.2861316204071045, + "step": 595 + }, + { + "epoch": 0.18, + "learning_rate": 4.895988780062059e-05, + "logits/chosen": -1.3921419382095337, + "logits/rejected": -1.3240123987197876, + "logps/chosen": -232.83157348632812, + "logps/rejected": -274.8085021972656, + "loss": 0.4231, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0233221054077148, + "rewards/margins": 2.417581081390381, + "rewards/rejected": -3.440903425216675, + "step": 600 + }, + { + "epoch": 0.18, + "learning_rate": 4.8921576745008544e-05, + "logits/chosen": -1.551561713218689, + "logits/rejected": -1.423801302909851, + "logps/chosen": -262.9552917480469, + "logps/rejected": -295.07269287109375, + "loss": 0.4166, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0641061067581177, + "rewards/margins": 2.3552744388580322, + "rewards/rejected": -3.4193801879882812, + "step": 605 + }, + { + "epoch": 0.19, + "learning_rate": 4.888258836639386e-05, + "logits/chosen": -1.3881410360336304, + "logits/rejected": -1.3344438076019287, + "logps/chosen": -239.0188751220703, + "logps/rejected": -303.75640869140625, + "loss": 0.4113, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2905619144439697, + "rewards/margins": 2.735004186630249, + "rewards/rejected": -4.025566577911377, + "step": 610 + }, + { + "epoch": 0.19, + "learning_rate": 4.884292376870567e-05, + "logits/chosen": -1.3135260343551636, + "logits/rejected": -1.2955373525619507, + "logps/chosen": -240.2941436767578, + "logps/rejected": -308.3316345214844, + "loss": 0.5103, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.0934088230133057, + "rewards/margins": 2.280285358428955, + "rewards/rejected": -3.3736941814422607, + "step": 615 + }, + { + "epoch": 0.19, + "learning_rate": 4.880258407501982e-05, + "logits/chosen": -1.4538220167160034, + "logits/rejected": -1.3662500381469727, + "logps/chosen": -253.21591186523438, + "logps/rejected": -297.54193115234375, + "loss": 0.4718, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6241488456726074, + "rewards/margins": 2.2601757049560547, + "rewards/rejected": -2.884324550628662, + "step": 620 + }, + { + "epoch": 0.19, + "learning_rate": 4.8761570427526973e-05, + "logits/chosen": -1.5741875171661377, + "logits/rejected": -1.4919278621673584, + "logps/chosen": -232.79373168945312, + "logps/rejected": -269.60089111328125, + "loss": 0.4284, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.1897115260362625, + "rewards/margins": 1.9414294958114624, + "rewards/rejected": -1.7517179250717163, + "step": 625 + }, + { + "epoch": 0.19, + "learning_rate": 4.87198839875004e-05, + "logits/chosen": -1.5609136819839478, + "logits/rejected": -1.45805025100708, + "logps/chosen": -208.7440643310547, + "logps/rejected": -234.986083984375, + "loss": 0.4536, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.25724244117736816, + "rewards/margins": 1.7011772394180298, + "rewards/rejected": -1.443934679031372, + "step": 630 + }, + { + "epoch": 0.19, + "learning_rate": 4.867752593526297e-05, + "logits/chosen": -1.5343798398971558, + "logits/rejected": -1.4163181781768799, + "logps/chosen": -227.30960083007812, + "logps/rejected": -285.6361083984375, + "loss": 0.3958, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.07786116749048233, + "rewards/margins": 2.143432855606079, + "rewards/rejected": -2.0655717849731445, + "step": 635 + }, + { + "epoch": 0.2, + "learning_rate": 4.863449747015384e-05, + "logits/chosen": -1.4224778413772583, + "logits/rejected": -1.3595422506332397, + "logps/chosen": -240.3905029296875, + "logps/rejected": -296.36712646484375, + "loss": 0.4179, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4491243362426758, + "rewards/margins": 2.387755870819092, + "rewards/rejected": -2.8368804454803467, + "step": 640 + }, + { + "epoch": 0.2, + "learning_rate": 4.8590799810494405e-05, + "logits/chosen": -1.4686813354492188, + "logits/rejected": -1.373623013496399, + "logps/chosen": -196.8067626953125, + "logps/rejected": -231.6805877685547, + "loss": 0.3975, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.22348129749298096, + "rewards/margins": 2.002708911895752, + "rewards/rejected": -2.2261900901794434, + "step": 645 + }, + { + "epoch": 0.2, + "learning_rate": 4.854643419355387e-05, + "logits/chosen": -1.3826911449432373, + "logits/rejected": -1.2899580001831055, + "logps/chosen": -208.8357391357422, + "logps/rejected": -274.06304931640625, + "loss": 0.3487, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2312304526567459, + "rewards/margins": 2.2438297271728516, + "rewards/rejected": -2.475059986114502, + "step": 650 + }, + { + "epoch": 0.2, + "learning_rate": 4.850140187551417e-05, + "logits/chosen": -1.4895564317703247, + "logits/rejected": -1.4108827114105225, + "logps/chosen": -220.452392578125, + "logps/rejected": -257.15142822265625, + "loss": 0.3977, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.1533854752779007, + "rewards/margins": 2.6028971672058105, + "rewards/rejected": -2.756282329559326, + "step": 655 + }, + { + "epoch": 0.2, + "learning_rate": 4.8455704131434463e-05, + "logits/chosen": -1.402146339416504, + "logits/rejected": -1.3426183462142944, + "logps/chosen": -210.43310546875, + "logps/rejected": -261.40826416015625, + "loss": 0.4083, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7025829553604126, + "rewards/margins": 2.593606472015381, + "rewards/rejected": -3.296189069747925, + "step": 660 + }, + { + "epoch": 0.2, + "learning_rate": 4.840934225521495e-05, + "logits/chosen": -1.3444569110870361, + "logits/rejected": -1.2977235317230225, + "logps/chosen": -234.8567657470703, + "logps/rejected": -276.48529052734375, + "loss": 0.3982, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.500916600227356, + "rewards/margins": 2.0213747024536133, + "rewards/rejected": -2.522291421890259, + "step": 665 + }, + { + "epoch": 0.2, + "learning_rate": 4.8362317559560274e-05, + "logits/chosen": -1.4623371362686157, + "logits/rejected": -1.3292256593704224, + "logps/chosen": -227.4473876953125, + "logps/rejected": -257.6618957519531, + "loss": 0.3551, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.2117508351802826, + "rewards/margins": 2.53009033203125, + "rewards/rejected": -2.7418415546417236, + "step": 670 + }, + { + "epoch": 0.21, + "learning_rate": 4.8314631375942385e-05, + "logits/chosen": -1.495482087135315, + "logits/rejected": -1.4167420864105225, + "logps/chosen": -230.7625274658203, + "logps/rejected": -274.5197448730469, + "loss": 0.3984, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.09574685990810394, + "rewards/margins": 2.2370810508728027, + "rewards/rejected": -2.3328278064727783, + "step": 675 + }, + { + "epoch": 0.21, + "learning_rate": 4.8266285054562794e-05, + "logits/chosen": -1.5286659002304077, + "logits/rejected": -1.4208118915557861, + "logps/chosen": -238.05770874023438, + "logps/rejected": -278.955078125, + "loss": 0.3855, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17491035163402557, + "rewards/margins": 2.2506463527679443, + "rewards/rejected": -2.0757360458374023, + "step": 680 + }, + { + "epoch": 0.21, + "learning_rate": 4.821727996431435e-05, + "logits/chosen": -1.4394538402557373, + "logits/rejected": -1.3951141834259033, + "logps/chosen": -227.53298950195312, + "logps/rejected": -279.4712219238281, + "loss": 0.3826, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.11384101212024689, + "rewards/margins": 2.3539175987243652, + "rewards/rejected": -2.2400765419006348, + "step": 685 + }, + { + "epoch": 0.21, + "learning_rate": 4.816761749274251e-05, + "logits/chosen": -1.4274007081985474, + "logits/rejected": -1.4132310152053833, + "logps/chosen": -217.4455108642578, + "logps/rejected": -278.5160827636719, + "loss": 0.3496, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09646536409854889, + "rewards/margins": 2.620297908782959, + "rewards/rejected": -2.7167630195617676, + "step": 690 + }, + { + "epoch": 0.21, + "learning_rate": 4.8117299046006e-05, + "logits/chosen": -1.5871320962905884, + "logits/rejected": -1.4668903350830078, + "logps/chosen": -237.6433868408203, + "logps/rejected": -277.51123046875, + "loss": 0.3578, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2201867550611496, + "rewards/margins": 2.6030189990997314, + "rewards/rejected": -2.8232059478759766, + "step": 695 + }, + { + "epoch": 0.21, + "learning_rate": 4.806632604883708e-05, + "logits/chosen": -1.492653489112854, + "logits/rejected": -1.3919525146484375, + "logps/chosen": -248.25741577148438, + "logps/rejected": -313.47393798828125, + "loss": 0.3474, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.24769659340381622, + "rewards/margins": 2.886350154876709, + "rewards/rejected": -3.134047031402588, + "step": 700 + }, + { + "epoch": 0.21, + "learning_rate": 4.801469994450111e-05, + "logits/chosen": -1.5104761123657227, + "logits/rejected": -1.3608448505401611, + "logps/chosen": -250.60995483398438, + "logps/rejected": -254.9477996826172, + "loss": 0.4383, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.2116759717464447, + "rewards/margins": 2.409428596496582, + "rewards/rejected": -2.6211047172546387, + "step": 705 + }, + { + "epoch": 0.22, + "learning_rate": 4.796242219475575e-05, + "logits/chosen": -1.4535516500473022, + "logits/rejected": -1.391486644744873, + "logps/chosen": -223.2052764892578, + "logps/rejected": -280.236083984375, + "loss": 0.3887, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.05361563712358475, + "rewards/margins": 2.583587646484375, + "rewards/rejected": -2.6372032165527344, + "step": 710 + }, + { + "epoch": 0.22, + "learning_rate": 4.790949427980956e-05, + "logits/chosen": -1.4059816598892212, + "logits/rejected": -1.3338401317596436, + "logps/chosen": -249.96279907226562, + "logps/rejected": -290.481689453125, + "loss": 0.4033, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4107217788696289, + "rewards/margins": 2.9012093544006348, + "rewards/rejected": -3.3119311332702637, + "step": 715 + }, + { + "epoch": 0.22, + "learning_rate": 4.7855917698280054e-05, + "logits/chosen": -1.4610540866851807, + "logits/rejected": -1.317604660987854, + "logps/chosen": -248.26956176757812, + "logps/rejected": -262.8702697753906, + "loss": 0.4896, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.37552323937416077, + "rewards/margins": 2.372670888900757, + "rewards/rejected": -2.7481942176818848, + "step": 720 + }, + { + "epoch": 0.22, + "learning_rate": 4.780169396715133e-05, + "logits/chosen": -1.5573104619979858, + "logits/rejected": -1.4791498184204102, + "logps/chosen": -219.789794921875, + "logps/rejected": -264.08685302734375, + "loss": 0.4026, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.15070626139640808, + "rewards/margins": 2.185694932937622, + "rewards/rejected": -2.0349888801574707, + "step": 725 + }, + { + "epoch": 0.22, + "learning_rate": 4.774682462173105e-05, + "logits/chosen": -1.551232099533081, + "logits/rejected": -1.407405138015747, + "logps/chosen": -245.1878662109375, + "logps/rejected": -269.4394226074219, + "loss": 0.389, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.07436565309762955, + "rewards/margins": 2.136859178543091, + "rewards/rejected": -2.2112247943878174, + "step": 730 + }, + { + "epoch": 0.22, + "learning_rate": 4.769131121560701e-05, + "logits/chosen": -1.5128840208053589, + "logits/rejected": -1.4270654916763306, + "logps/chosen": -246.69100952148438, + "logps/rejected": -283.3271484375, + "loss": 0.4394, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.672562837600708, + "rewards/margins": 2.451646327972412, + "rewards/rejected": -3.1242096424102783, + "step": 735 + }, + { + "epoch": 0.23, + "learning_rate": 4.763515532060316e-05, + "logits/chosen": -1.4596669673919678, + "logits/rejected": -1.420090913772583, + "logps/chosen": -211.2493133544922, + "logps/rejected": -274.11322021484375, + "loss": 0.4276, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5362947583198547, + "rewards/margins": 2.3526790142059326, + "rewards/rejected": -2.8889739513397217, + "step": 740 + }, + { + "epoch": 0.23, + "learning_rate": 4.7578358526735065e-05, + "logits/chosen": -1.5720094442367554, + "logits/rejected": -1.4286072254180908, + "logps/chosen": -254.8545379638672, + "logps/rejected": -267.1537170410156, + "loss": 0.4106, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.07587162405252457, + "rewards/margins": 2.3346734046936035, + "rewards/rejected": -2.4105448722839355, + "step": 745 + }, + { + "epoch": 0.23, + "learning_rate": 4.7520922442164894e-05, + "logits/chosen": -1.5144745111465454, + "logits/rejected": -1.4029021263122559, + "logps/chosen": -212.46533203125, + "logps/rejected": -231.5972900390625, + "loss": 0.4307, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.12235695123672485, + "rewards/margins": 1.6954295635223389, + "rewards/rejected": -1.5730727910995483, + "step": 750 + }, + { + "epoch": 0.23, + "learning_rate": 4.74628486931559e-05, + "logits/chosen": -1.5446897745132446, + "logits/rejected": -1.4459664821624756, + "logps/chosen": -241.0093536376953, + "logps/rejected": -268.1150207519531, + "loss": 0.3903, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.28674596548080444, + "rewards/margins": 2.2632548809051514, + "rewards/rejected": -1.9765087366104126, + "step": 755 + }, + { + "epoch": 0.23, + "learning_rate": 4.740413892402639e-05, + "logits/chosen": -1.4135468006134033, + "logits/rejected": -1.3602290153503418, + "logps/chosen": -253.1114044189453, + "logps/rejected": -305.5167541503906, + "loss": 0.4179, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.12774869799613953, + "rewards/margins": 2.793480634689331, + "rewards/rejected": -2.6657321453094482, + "step": 760 + }, + { + "epoch": 0.23, + "learning_rate": 4.734479479710311e-05, + "logits/chosen": -1.5195525884628296, + "logits/rejected": -1.446173906326294, + "logps/chosen": -244.97616577148438, + "logps/rejected": -279.899658203125, + "loss": 0.3906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19458039104938507, + "rewards/margins": 2.4705350399017334, + "rewards/rejected": -2.6651155948638916, + "step": 765 + }, + { + "epoch": 0.23, + "learning_rate": 4.728481799267421e-05, + "logits/chosen": -1.4518150091171265, + "logits/rejected": -1.3336080312728882, + "logps/chosen": -266.017822265625, + "logps/rejected": -296.8194274902344, + "loss": 0.3854, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.503818154335022, + "rewards/margins": 2.678640365600586, + "rewards/rejected": -3.1824586391448975, + "step": 770 + }, + { + "epoch": 0.24, + "learning_rate": 4.722421020894169e-05, + "logits/chosen": -1.4531335830688477, + "logits/rejected": -1.3481992483139038, + "logps/chosen": -247.59585571289062, + "logps/rejected": -294.21722412109375, + "loss": 0.3923, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7091779708862305, + "rewards/margins": 3.397832155227661, + "rewards/rejected": -4.107010364532471, + "step": 775 + }, + { + "epoch": 0.24, + "learning_rate": 4.71629731619733e-05, + "logits/chosen": -1.4069632291793823, + "logits/rejected": -1.3233740329742432, + "logps/chosen": -251.7724151611328, + "logps/rejected": -308.6446838378906, + "loss": 0.387, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1772078275680542, + "rewards/margins": 2.8983986377716064, + "rewards/rejected": -4.075606346130371, + "step": 780 + }, + { + "epoch": 0.24, + "learning_rate": 4.7101108585653905e-05, + "logits/chosen": -1.4547842741012573, + "logits/rejected": -1.313291311264038, + "logps/chosen": -258.6347961425781, + "logps/rejected": -288.6201171875, + "loss": 0.3736, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9564958810806274, + "rewards/margins": 2.8389906883239746, + "rewards/rejected": -3.7954864501953125, + "step": 785 + }, + { + "epoch": 0.24, + "learning_rate": 4.703861823163649e-05, + "logits/chosen": -1.5221706628799438, + "logits/rejected": -1.4411219358444214, + "logps/chosen": -226.8802032470703, + "logps/rejected": -279.59564208984375, + "loss": 0.4248, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6723430752754211, + "rewards/margins": 2.194626569747925, + "rewards/rejected": -2.866969585418701, + "step": 790 + }, + { + "epoch": 0.24, + "learning_rate": 4.697550386929246e-05, + "logits/chosen": -1.3913832902908325, + "logits/rejected": -1.274837613105774, + "logps/chosen": -244.55783081054688, + "logps/rejected": -282.39422607421875, + "loss": 0.3963, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.37080082297325134, + "rewards/margins": 2.579662561416626, + "rewards/rejected": -2.9504635334014893, + "step": 795 + }, + { + "epoch": 0.24, + "learning_rate": 4.691176728566159e-05, + "logits/chosen": -1.4640603065490723, + "logits/rejected": -1.405020833015442, + "logps/chosen": -220.28701782226562, + "logps/rejected": -271.06011962890625, + "loss": 0.3549, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5138343572616577, + "rewards/margins": 2.453996181488037, + "rewards/rejected": -2.9678304195404053, + "step": 800 + }, + { + "epoch": 0.25, + "learning_rate": 4.684741028540146e-05, + "logits/chosen": -1.3809168338775635, + "logits/rejected": -1.29449462890625, + "logps/chosen": -220.06051635742188, + "logps/rejected": -277.65325927734375, + "loss": 0.361, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5629986524581909, + "rewards/margins": 2.7898635864257812, + "rewards/rejected": -3.3528621196746826, + "step": 805 + }, + { + "epoch": 0.25, + "learning_rate": 4.6782434690736274e-05, + "logits/chosen": -1.455427885055542, + "logits/rejected": -1.315850019454956, + "logps/chosen": -265.70391845703125, + "logps/rejected": -298.28924560546875, + "loss": 0.4675, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.49216946959495544, + "rewards/margins": 3.0039374828338623, + "rewards/rejected": -3.4961071014404297, + "step": 810 + }, + { + "epoch": 0.25, + "learning_rate": 4.671684234140535e-05, + "logits/chosen": -1.4259642362594604, + "logits/rejected": -1.296662449836731, + "logps/chosen": -237.97695922851562, + "logps/rejected": -260.7139892578125, + "loss": 0.4043, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.25738030672073364, + "rewards/margins": 2.880896806716919, + "rewards/rejected": -3.1382765769958496, + "step": 815 + }, + { + "epoch": 0.25, + "learning_rate": 4.665063509461097e-05, + "logits/chosen": -1.4097397327423096, + "logits/rejected": -1.3623135089874268, + "logps/chosen": -224.6775665283203, + "logps/rejected": -264.73846435546875, + "loss": 0.4093, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.17778576910495758, + "rewards/margins": 2.099287986755371, + "rewards/rejected": -1.9215021133422852, + "step": 820 + }, + { + "epoch": 0.25, + "learning_rate": 4.6583814824965805e-05, + "logits/chosen": -1.5525894165039062, + "logits/rejected": -1.4566829204559326, + "logps/chosen": -222.94430541992188, + "logps/rejected": -271.26556396484375, + "loss": 0.3995, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.313865065574646, + "rewards/margins": 2.3322434425354004, + "rewards/rejected": -2.018378496170044, + "step": 825 + }, + { + "epoch": 0.25, + "learning_rate": 4.651638342443987e-05, + "logits/chosen": -1.5715656280517578, + "logits/rejected": -1.554890751838684, + "logps/chosen": -224.2897186279297, + "logps/rejected": -272.0341796875, + "loss": 0.4459, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.030287206172943115, + "rewards/margins": 1.9702775478363037, + "rewards/rejected": -1.9399904012680054, + "step": 830 + }, + { + "epoch": 0.25, + "learning_rate": 4.644834280230692e-05, + "logits/chosen": -1.5804816484451294, + "logits/rejected": -1.4860570430755615, + "logps/chosen": -200.5336456298828, + "logps/rejected": -246.2007598876953, + "loss": 0.4214, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.011333741247653961, + "rewards/margins": 2.026169776916504, + "rewards/rejected": -2.014835834503174, + "step": 835 + }, + { + "epoch": 0.26, + "learning_rate": 4.6379694885090405e-05, + "logits/chosen": -1.4862781763076782, + "logits/rejected": -1.4114696979522705, + "logps/chosen": -252.9210205078125, + "logps/rejected": -303.67584228515625, + "loss": 0.3977, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4330156445503235, + "rewards/margins": 2.5075771808624268, + "rewards/rejected": -2.9405925273895264, + "step": 840 + }, + { + "epoch": 0.26, + "learning_rate": 4.6310441616508914e-05, + "logits/chosen": -1.3494175672531128, + "logits/rejected": -1.2918002605438232, + "logps/chosen": -236.6602783203125, + "logps/rejected": -294.8357238769531, + "loss": 0.3647, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5964530110359192, + "rewards/margins": 2.777496814727783, + "rewards/rejected": -3.3739497661590576, + "step": 845 + }, + { + "epoch": 0.26, + "learning_rate": 4.624058495742114e-05, + "logits/chosen": -1.4458904266357422, + "logits/rejected": -1.3650354146957397, + "logps/chosen": -261.0570983886719, + "logps/rejected": -323.8190002441406, + "loss": 0.3724, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.723595917224884, + "rewards/margins": 2.936446189880371, + "rewards/rejected": -3.6600422859191895, + "step": 850 + }, + { + "epoch": 0.26, + "learning_rate": 4.617012688577036e-05, + "logits/chosen": -1.4270175695419312, + "logits/rejected": -1.3395029306411743, + "logps/chosen": -234.1733856201172, + "logps/rejected": -280.4658203125, + "loss": 0.3788, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4625687599182129, + "rewards/margins": 2.767164707183838, + "rewards/rejected": -3.2297332286834717, + "step": 855 + }, + { + "epoch": 0.26, + "learning_rate": 4.609906939652846e-05, + "logits/chosen": -1.4226279258728027, + "logits/rejected": -1.3623218536376953, + "logps/chosen": -195.22999572753906, + "logps/rejected": -245.10183715820312, + "loss": 0.3799, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3807176947593689, + "rewards/margins": 2.3278114795684814, + "rewards/rejected": -2.708528995513916, + "step": 860 + }, + { + "epoch": 0.26, + "learning_rate": 4.60274145016394e-05, + "logits/chosen": -1.4433257579803467, + "logits/rejected": -1.3707187175750732, + "logps/chosen": -241.7848358154297, + "logps/rejected": -267.00213623046875, + "loss": 0.3711, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17411458492279053, + "rewards/margins": 2.5103354454040527, + "rewards/rejected": -2.6844499111175537, + "step": 865 + }, + { + "epoch": 0.27, + "learning_rate": 4.595516422996227e-05, + "logits/chosen": -1.4536000490188599, + "logits/rejected": -1.3923813104629517, + "logps/chosen": -204.7315216064453, + "logps/rejected": -269.3832092285156, + "loss": 0.3762, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.10245015472173691, + "rewards/margins": 3.06905198097229, + "rewards/rejected": -2.966601848602295, + "step": 870 + }, + { + "epoch": 0.27, + "learning_rate": 4.588232062721385e-05, + "logits/chosen": -1.506850004196167, + "logits/rejected": -1.417551875114441, + "logps/chosen": -226.9280242919922, + "logps/rejected": -282.6661071777344, + "loss": 0.3891, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.23108553886413574, + "rewards/margins": 2.9151499271392822, + "rewards/rejected": -3.1462349891662598, + "step": 875 + }, + { + "epoch": 0.27, + "learning_rate": 4.580888575591068e-05, + "logits/chosen": -1.432558298110962, + "logits/rejected": -1.3910208940505981, + "logps/chosen": -224.8462677001953, + "logps/rejected": -275.9529724121094, + "loss": 0.4098, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4546372890472412, + "rewards/margins": 2.7860777378082275, + "rewards/rejected": -3.2407150268554688, + "step": 880 + }, + { + "epoch": 0.27, + "learning_rate": 4.573486169531068e-05, + "logits/chosen": -1.3392664194107056, + "logits/rejected": -1.2887176275253296, + "logps/chosen": -228.43896484375, + "logps/rejected": -280.51556396484375, + "loss": 0.3702, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.29709941148757935, + "rewards/margins": 3.137378692626953, + "rewards/rejected": -3.434478282928467, + "step": 885 + }, + { + "epoch": 0.27, + "learning_rate": 4.5660250541354224e-05, + "logits/chosen": -1.484899878501892, + "logits/rejected": -1.381151556968689, + "logps/chosen": -244.162841796875, + "logps/rejected": -285.1504821777344, + "loss": 0.3823, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2389202117919922, + "rewards/margins": 3.0065085887908936, + "rewards/rejected": -3.245429277420044, + "step": 890 + }, + { + "epoch": 0.27, + "learning_rate": 4.5585054406604864e-05, + "logits/chosen": -1.5870790481567383, + "logits/rejected": -1.535390019416809, + "logps/chosen": -222.4707489013672, + "logps/rejected": -274.00555419921875, + "loss": 0.4039, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.4108337461948395, + "rewards/margins": 2.4919819831848145, + "rewards/rejected": -2.902815580368042, + "step": 895 + }, + { + "epoch": 0.27, + "learning_rate": 4.550927542018947e-05, + "logits/chosen": -1.3818638324737549, + "logits/rejected": -1.3315644264221191, + "logps/chosen": -225.2039337158203, + "logps/rejected": -255.9619140625, + "loss": 0.4276, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.7841989398002625, + "rewards/margins": 2.3433711528778076, + "rewards/rejected": -3.127570152282715, + "step": 900 + }, + { + "epoch": 0.28, + "learning_rate": 4.5432915727737936e-05, + "logits/chosen": -1.424290418624878, + "logits/rejected": -1.3178008794784546, + "logps/chosen": -248.19631958007812, + "logps/rejected": -290.7337341308594, + "loss": 0.4093, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.01188063621521, + "rewards/margins": 2.751819610595703, + "rewards/rejected": -3.763700008392334, + "step": 905 + }, + { + "epoch": 0.28, + "learning_rate": 4.5355977491322485e-05, + "logits/chosen": -1.4348728656768799, + "logits/rejected": -1.380027413368225, + "logps/chosen": -258.47039794921875, + "logps/rejected": -330.335693359375, + "loss": 0.3708, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1714378595352173, + "rewards/margins": 3.1479644775390625, + "rewards/rejected": -4.31940221786499, + "step": 910 + }, + { + "epoch": 0.28, + "learning_rate": 4.527846288939639e-05, + "logits/chosen": -1.5116336345672607, + "logits/rejected": -1.37888503074646, + "logps/chosen": -240.5450439453125, + "logps/rejected": -272.766357421875, + "loss": 0.4085, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6231032013893127, + "rewards/margins": 2.247542381286621, + "rewards/rejected": -2.870645761489868, + "step": 915 + }, + { + "epoch": 0.28, + "learning_rate": 4.5200374116732325e-05, + "logits/chosen": -1.4633252620697021, + "logits/rejected": -1.3567806482315063, + "logps/chosen": -251.53158569335938, + "logps/rejected": -293.6622009277344, + "loss": 0.3777, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4417695105075836, + "rewards/margins": 2.9817652702331543, + "rewards/rejected": -3.423534870147705, + "step": 920 + }, + { + "epoch": 0.28, + "learning_rate": 4.5121713384360215e-05, + "logits/chosen": -1.4623820781707764, + "logits/rejected": -1.3391244411468506, + "logps/chosen": -227.37808227539062, + "logps/rejected": -264.84442138671875, + "loss": 0.3827, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28840094804763794, + "rewards/margins": 2.5161187648773193, + "rewards/rejected": -2.8045194149017334, + "step": 925 + }, + { + "epoch": 0.28, + "learning_rate": 4.504248291950462e-05, + "logits/chosen": -1.540131688117981, + "logits/rejected": -1.4444448947906494, + "logps/chosen": -199.7274932861328, + "logps/rejected": -243.74893188476562, + "loss": 0.3956, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.33068814873695374, + "rewards/margins": 2.3045554161071777, + "rewards/rejected": -2.6352434158325195, + "step": 930 + }, + { + "epoch": 0.28, + "learning_rate": 4.4962684965521695e-05, + "logits/chosen": -1.4449470043182373, + "logits/rejected": -1.335399866104126, + "logps/chosen": -231.919677734375, + "logps/rejected": -284.73236083984375, + "loss": 0.3636, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4564400315284729, + "rewards/margins": 3.0215699672698975, + "rewards/rejected": -3.4780101776123047, + "step": 935 + }, + { + "epoch": 0.29, + "learning_rate": 4.488232178183567e-05, + "logits/chosen": -1.284208059310913, + "logits/rejected": -1.2242339849472046, + "logps/chosen": -247.7313232421875, + "logps/rejected": -297.8804931640625, + "loss": 0.3918, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9177427291870117, + "rewards/margins": 3.033815383911133, + "rewards/rejected": -3.9515578746795654, + "step": 940 + }, + { + "epoch": 0.29, + "learning_rate": 4.480139564387482e-05, + "logits/chosen": -1.3877151012420654, + "logits/rejected": -1.3023748397827148, + "logps/chosen": -224.7671356201172, + "logps/rejected": -260.46551513671875, + "loss": 0.3879, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9650894403457642, + "rewards/margins": 2.5788512229919434, + "rewards/rejected": -3.543941020965576, + "step": 945 + }, + { + "epoch": 0.29, + "learning_rate": 4.471990884300715e-05, + "logits/chosen": -1.4054934978485107, + "logits/rejected": -1.3035714626312256, + "logps/chosen": -245.2154541015625, + "logps/rejected": -302.49969482421875, + "loss": 0.3624, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.588658332824707, + "rewards/margins": 3.1204638481140137, + "rewards/rejected": -3.7091221809387207, + "step": 950 + }, + { + "epoch": 0.29, + "learning_rate": 4.46378636864754e-05, + "logits/chosen": -1.3097150325775146, + "logits/rejected": -1.226994276046753, + "logps/chosen": -236.5945281982422, + "logps/rejected": -296.509765625, + "loss": 0.4001, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7584825158119202, + "rewards/margins": 3.3817715644836426, + "rewards/rejected": -4.140254020690918, + "step": 955 + }, + { + "epoch": 0.29, + "learning_rate": 4.455526249733178e-05, + "logits/chosen": -1.4240782260894775, + "logits/rejected": -1.3734047412872314, + "logps/chosen": -230.6123046875, + "logps/rejected": -291.6025390625, + "loss": 0.4944, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2551751732826233, + "rewards/margins": 2.7103066444396973, + "rewards/rejected": -2.965481996536255, + "step": 960 + }, + { + "epoch": 0.29, + "learning_rate": 4.447210761437219e-05, + "logits/chosen": -1.5501660108566284, + "logits/rejected": -1.4900107383728027, + "logps/chosen": -229.24978637695312, + "logps/rejected": -278.9122314453125, + "loss": 0.4433, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.038763850927352905, + "rewards/margins": 2.4731106758117676, + "rewards/rejected": -2.5118744373321533, + "step": 965 + }, + { + "epoch": 0.3, + "learning_rate": 4.4388401392069975e-05, + "logits/chosen": -1.5285327434539795, + "logits/rejected": -1.370157241821289, + "logps/chosen": -245.3329315185547, + "logps/rejected": -271.7296447753906, + "loss": 0.3852, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16751372814178467, + "rewards/margins": 2.6887688636779785, + "rewards/rejected": -2.5212550163269043, + "step": 970 + }, + { + "epoch": 0.3, + "learning_rate": 4.430414620050929e-05, + "logits/chosen": -1.5238043069839478, + "logits/rejected": -1.4335300922393799, + "logps/chosen": -219.3772430419922, + "logps/rejected": -284.52667236328125, + "loss": 0.3898, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.2622632384300232, + "rewards/margins": 2.474923610687256, + "rewards/rejected": -2.212660551071167, + "step": 975 + }, + { + "epoch": 0.3, + "learning_rate": 4.421934442531796e-05, + "logits/chosen": -1.4353379011154175, + "logits/rejected": -1.4253833293914795, + "logps/chosen": -226.8037567138672, + "logps/rejected": -284.70269775390625, + "loss": 0.3631, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3952658772468567, + "rewards/margins": 2.801473617553711, + "rewards/rejected": -3.196739673614502, + "step": 980 + }, + { + "epoch": 0.3, + "learning_rate": 4.413399846759998e-05, + "logits/chosen": -1.4747555255889893, + "logits/rejected": -1.3409960269927979, + "logps/chosen": -261.8158264160156, + "logps/rejected": -304.100830078125, + "loss": 0.4125, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8937687873840332, + "rewards/margins": 3.569911241531372, + "rewards/rejected": -4.463679790496826, + "step": 985 + }, + { + "epoch": 0.3, + "learning_rate": 4.4048110743867455e-05, + "logits/chosen": -1.4302071332931519, + "logits/rejected": -1.298626184463501, + "logps/chosen": -241.70767211914062, + "logps/rejected": -302.16937255859375, + "loss": 0.3641, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8468769788742065, + "rewards/margins": 2.884181499481201, + "rewards/rejected": -3.7310585975646973, + "step": 990 + }, + { + "epoch": 0.3, + "learning_rate": 4.396168368597226e-05, + "logits/chosen": -1.410563588142395, + "logits/rejected": -1.296134352684021, + "logps/chosen": -247.36376953125, + "logps/rejected": -289.80047607421875, + "loss": 0.3609, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.35360056161880493, + "rewards/margins": 2.7350914478302, + "rewards/rejected": -3.0886917114257812, + "step": 995 + }, + { + "epoch": 0.3, + "learning_rate": 4.387471974103713e-05, + "logits/chosen": -1.5295279026031494, + "logits/rejected": -1.3939071893692017, + "logps/chosen": -225.24356079101562, + "logps/rejected": -259.98565673828125, + "loss": 0.4028, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06099366396665573, + "rewards/margins": 2.6358046531677246, + "rewards/rejected": -2.696798086166382, + "step": 1000 + }, + { + "epoch": 0.31, + "learning_rate": 4.3787221371386384e-05, + "logits/chosen": -1.5613595247268677, + "logits/rejected": -1.516898274421692, + "logps/chosen": -248.61074829101562, + "logps/rejected": -296.6068115234375, + "loss": 0.4068, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.038737304508686066, + "rewards/margins": 2.5149483680725098, + "rewards/rejected": -2.476210832595825, + "step": 1005 + }, + { + "epoch": 0.31, + "learning_rate": 4.369919105447622e-05, + "logits/chosen": -1.5018450021743774, + "logits/rejected": -1.4464499950408936, + "logps/chosen": -215.8905029296875, + "logps/rejected": -250.96121215820312, + "loss": 0.4027, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.006156214978545904, + "rewards/margins": 2.1807312965393066, + "rewards/rejected": -2.174575090408325, + "step": 1010 + }, + { + "epoch": 0.31, + "learning_rate": 4.3610631282824556e-05, + "logits/chosen": -1.487079381942749, + "logits/rejected": -1.3799813985824585, + "logps/chosen": -220.0702362060547, + "logps/rejected": -256.46270751953125, + "loss": 0.4356, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.10034330934286118, + "rewards/margins": 2.6790289878845215, + "rewards/rejected": -2.779372453689575, + "step": 1015 + }, + { + "epoch": 0.31, + "learning_rate": 4.352154456394045e-05, + "logits/chosen": -1.4811842441558838, + "logits/rejected": -1.3606802225112915, + "logps/chosen": -235.79672241210938, + "logps/rejected": -265.3165588378906, + "loss": 0.3825, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.42521244287490845, + "rewards/margins": 2.6016688346862793, + "rewards/rejected": -3.026881456375122, + "step": 1020 + }, + { + "epoch": 0.31, + "learning_rate": 4.34319334202531e-05, + "logits/chosen": -1.490431547164917, + "logits/rejected": -1.380516767501831, + "logps/chosen": -245.6602325439453, + "logps/rejected": -304.4634094238281, + "loss": 0.3552, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8394023776054382, + "rewards/margins": 2.91302490234375, + "rewards/rejected": -3.752427339553833, + "step": 1025 + }, + { + "epoch": 0.31, + "learning_rate": 4.334180038904046e-05, + "logits/chosen": -1.3724013566970825, + "logits/rejected": -1.2893320322036743, + "logps/chosen": -224.7459259033203, + "logps/rejected": -275.4010314941406, + "loss": 0.3907, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.885975182056427, + "rewards/margins": 2.789304733276367, + "rewards/rejected": -3.6752796173095703, + "step": 1030 + }, + { + "epoch": 0.32, + "learning_rate": 4.3251148022357355e-05, + "logits/chosen": -1.4460914134979248, + "logits/rejected": -1.3820542097091675, + "logps/chosen": -262.4001159667969, + "logps/rejected": -319.7551574707031, + "loss": 0.4385, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1028560400009155, + "rewards/margins": 2.362874746322632, + "rewards/rejected": -3.465731143951416, + "step": 1035 + }, + { + "epoch": 0.32, + "learning_rate": 4.3159978886963226e-05, + "logits/chosen": -1.4794714450836182, + "logits/rejected": -1.3599398136138916, + "logps/chosen": -277.8358459472656, + "logps/rejected": -306.3834533691406, + "loss": 0.4237, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0677882432937622, + "rewards/margins": 3.120405673980713, + "rewards/rejected": -4.188194274902344, + "step": 1040 + }, + { + "epoch": 0.32, + "learning_rate": 4.306829556424948e-05, + "logits/chosen": -1.4093915224075317, + "logits/rejected": -1.271278977394104, + "logps/chosen": -287.8832702636719, + "logps/rejected": -332.91485595703125, + "loss": 0.3677, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1396197080612183, + "rewards/margins": 3.1220901012420654, + "rewards/rejected": -4.261710166931152, + "step": 1045 + }, + { + "epoch": 0.32, + "learning_rate": 4.2976100650166387e-05, + "logits/chosen": -1.4229646921157837, + "logits/rejected": -1.39849853515625, + "logps/chosen": -228.4208984375, + "logps/rejected": -291.3751525878906, + "loss": 0.3707, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9911971092224121, + "rewards/margins": 2.7032597064971924, + "rewards/rejected": -3.6944565773010254, + "step": 1050 + }, + { + "epoch": 0.32, + "learning_rate": 4.288339675514954e-05, + "logits/chosen": -1.3454296588897705, + "logits/rejected": -1.302362322807312, + "logps/chosen": -255.00906372070312, + "logps/rejected": -313.9950256347656, + "loss": 0.4064, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0980554819107056, + "rewards/margins": 3.0446298122406006, + "rewards/rejected": -4.142685890197754, + "step": 1055 + }, + { + "epoch": 0.32, + "learning_rate": 4.279018650404604e-05, + "logits/chosen": -1.3721091747283936, + "logits/rejected": -1.36617112159729, + "logps/chosen": -248.34432983398438, + "logps/rejected": -322.83502197265625, + "loss": 0.3819, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2138582468032837, + "rewards/margins": 2.7490146160125732, + "rewards/rejected": -3.9628729820251465, + "step": 1060 + }, + { + "epoch": 0.32, + "learning_rate": 4.2696472536040054e-05, + "logits/chosen": -1.3144346475601196, + "logits/rejected": -1.199225664138794, + "logps/chosen": -266.83746337890625, + "logps/rejected": -312.2145080566406, + "loss": 0.3602, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.659379005432129, + "rewards/margins": 3.416776180267334, + "rewards/rejected": -5.076155185699463, + "step": 1065 + }, + { + "epoch": 0.33, + "learning_rate": 4.260225750457818e-05, + "logits/chosen": -1.4334145784378052, + "logits/rejected": -1.3207252025604248, + "logps/chosen": -258.37725830078125, + "logps/rejected": -299.75164794921875, + "loss": 0.34, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7483505010604858, + "rewards/margins": 2.753929615020752, + "rewards/rejected": -4.502279758453369, + "step": 1070 + }, + { + "epoch": 0.33, + "learning_rate": 4.250754407729428e-05, + "logits/chosen": -1.327194333076477, + "logits/rejected": -1.2377169132232666, + "logps/chosen": -279.93280029296875, + "logps/rejected": -334.41094970703125, + "loss": 0.3752, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0792298316955566, + "rewards/margins": 3.3054840564727783, + "rewards/rejected": -5.384713649749756, + "step": 1075 + }, + { + "epoch": 0.33, + "learning_rate": 4.241233493593393e-05, + "logits/chosen": -1.2953674793243408, + "logits/rejected": -1.263270378112793, + "logps/chosen": -241.9004364013672, + "logps/rejected": -313.4391784667969, + "loss": 0.4028, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5718278884887695, + "rewards/margins": 3.0327353477478027, + "rewards/rejected": -4.6045637130737305, + "step": 1080 + }, + { + "epoch": 0.33, + "learning_rate": 4.2316632776278525e-05, + "logits/chosen": -1.3943222761154175, + "logits/rejected": -1.2810288667678833, + "logps/chosen": -232.25814819335938, + "logps/rejected": -280.0733947753906, + "loss": 0.3978, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7831779718399048, + "rewards/margins": 2.5116419792175293, + "rewards/rejected": -3.2948200702667236, + "step": 1085 + }, + { + "epoch": 0.33, + "learning_rate": 4.222044030806894e-05, + "logits/chosen": -1.3264403343200684, + "logits/rejected": -1.3240474462509155, + "logps/chosen": -215.8976287841797, + "logps/rejected": -287.00128173828125, + "loss": 0.3795, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7215701341629028, + "rewards/margins": 2.7273683547973633, + "rewards/rejected": -3.4489383697509766, + "step": 1090 + }, + { + "epoch": 0.33, + "learning_rate": 4.21237602549288e-05, + "logits/chosen": -1.434257984161377, + "logits/rejected": -1.4035327434539795, + "logps/chosen": -201.57369995117188, + "logps/rejected": -253.14108276367188, + "loss": 0.3946, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4502865672111511, + "rewards/margins": 2.529646396636963, + "rewards/rejected": -2.979933023452759, + "step": 1095 + }, + { + "epoch": 0.34, + "learning_rate": 4.2026595354287334e-05, + "logits/chosen": -1.3879592418670654, + "logits/rejected": -1.3350013494491577, + "logps/chosen": -241.10311889648438, + "logps/rejected": -302.80865478515625, + "loss": 0.3825, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7047330737113953, + "rewards/margins": 2.6905667781829834, + "rewards/rejected": -3.3953003883361816, + "step": 1100 + }, + { + "epoch": 0.34, + "learning_rate": 4.192894835730193e-05, + "logits/chosen": -1.3509743213653564, + "logits/rejected": -1.248357892036438, + "logps/chosen": -253.964111328125, + "logps/rejected": -293.91632080078125, + "loss": 0.3919, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8855097889900208, + "rewards/margins": 2.823207139968872, + "rewards/rejected": -3.7087173461914062, + "step": 1105 + }, + { + "epoch": 0.34, + "learning_rate": 4.1830822028780194e-05, + "logits/chosen": -1.447584867477417, + "logits/rejected": -1.3725135326385498, + "logps/chosen": -240.1465606689453, + "logps/rejected": -290.83099365234375, + "loss": 0.4142, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9225603342056274, + "rewards/margins": 2.983272075653076, + "rewards/rejected": -3.905831813812256, + "step": 1110 + }, + { + "epoch": 0.34, + "learning_rate": 4.173221914710165e-05, + "logits/chosen": -1.404601812362671, + "logits/rejected": -1.2846992015838623, + "logps/chosen": -221.6542510986328, + "logps/rejected": -249.8398895263672, + "loss": 0.4451, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8987483978271484, + "rewards/margins": 2.412379503250122, + "rewards/rejected": -3.3111279010772705, + "step": 1115 + }, + { + "epoch": 0.34, + "learning_rate": 4.163314250413913e-05, + "logits/chosen": -1.4802556037902832, + "logits/rejected": -1.3953096866607666, + "logps/chosen": -230.58865356445312, + "logps/rejected": -278.18951416015625, + "loss": 0.384, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.9633780717849731, + "rewards/margins": 2.581531047821045, + "rewards/rejected": -3.5449092388153076, + "step": 1120 + }, + { + "epoch": 0.34, + "learning_rate": 4.153359490517969e-05, + "logits/chosen": -1.4657261371612549, + "logits/rejected": -1.377966284751892, + "logps/chosen": -228.0840606689453, + "logps/rejected": -270.4330749511719, + "loss": 0.378, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8458096385002136, + "rewards/margins": 2.5918195247650146, + "rewards/rejected": -3.437628984451294, + "step": 1125 + }, + { + "epoch": 0.34, + "learning_rate": 4.143357916884514e-05, + "logits/chosen": -1.4898326396942139, + "logits/rejected": -1.3859083652496338, + "logps/chosen": -256.3155212402344, + "logps/rejected": -299.3856506347656, + "loss": 0.4363, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0702875852584839, + "rewards/margins": 2.819854497909546, + "rewards/rejected": -3.8901419639587402, + "step": 1130 + }, + { + "epoch": 0.35, + "learning_rate": 4.1333098127012326e-05, + "logits/chosen": -1.491857886314392, + "logits/rejected": -1.4556844234466553, + "logps/chosen": -254.2758331298828, + "logps/rejected": -287.3238220214844, + "loss": 0.4276, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9167013168334961, + "rewards/margins": 2.7353272438049316, + "rewards/rejected": -3.6520285606384277, + "step": 1135 + }, + { + "epoch": 0.35, + "learning_rate": 4.123215462473287e-05, + "logits/chosen": -1.4471662044525146, + "logits/rejected": -1.3652303218841553, + "logps/chosen": -257.15521240234375, + "logps/rejected": -321.16632080078125, + "loss": 0.3993, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5140289068222046, + "rewards/margins": 3.106577157974243, + "rewards/rejected": -3.620605945587158, + "step": 1140 + }, + { + "epoch": 0.35, + "learning_rate": 4.113075152015267e-05, + "logits/chosen": -1.4940803050994873, + "logits/rejected": -1.4113094806671143, + "logps/chosen": -231.76272583007812, + "logps/rejected": -276.2952575683594, + "loss": 0.3857, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4637375473976135, + "rewards/margins": 2.7834160327911377, + "rewards/rejected": -3.2471535205841064, + "step": 1145 + }, + { + "epoch": 0.35, + "learning_rate": 4.102889168443091e-05, + "logits/chosen": -1.4232820272445679, + "logits/rejected": -1.3385612964630127, + "logps/chosen": -220.4342041015625, + "logps/rejected": -273.684326171875, + "loss": 0.3638, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5296335220336914, + "rewards/margins": 2.962373733520508, + "rewards/rejected": -3.49200701713562, + "step": 1150 + }, + { + "epoch": 0.35, + "learning_rate": 4.092657800165883e-05, + "logits/chosen": -1.3860952854156494, + "logits/rejected": -1.2572487592697144, + "logps/chosen": -238.58279418945312, + "logps/rejected": -299.2005310058594, + "loss": 0.3959, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.492671400308609, + "rewards/margins": 3.1469099521636963, + "rewards/rejected": -3.6395816802978516, + "step": 1155 + }, + { + "epoch": 0.35, + "learning_rate": 4.082381336877805e-05, + "logits/chosen": -1.4538966417312622, + "logits/rejected": -1.3704473972320557, + "logps/chosen": -232.91757202148438, + "logps/rejected": -289.62396240234375, + "loss": 0.4583, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3003008961677551, + "rewards/margins": 3.076662540435791, + "rewards/rejected": -3.3769633769989014, + "step": 1160 + }, + { + "epoch": 0.36, + "learning_rate": 4.0720600695498486e-05, + "logits/chosen": -1.5047948360443115, + "logits/rejected": -1.3999977111816406, + "logps/chosen": -216.4929962158203, + "logps/rejected": -261.029296875, + "loss": 0.3941, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.28260135650634766, + "rewards/margins": 2.698638677597046, + "rewards/rejected": -2.981240749359131, + "step": 1165 + }, + { + "epoch": 0.36, + "learning_rate": 4.061694290421604e-05, + "logits/chosen": -1.5519943237304688, + "logits/rejected": -1.4196147918701172, + "logps/chosen": -237.73605346679688, + "logps/rejected": -281.0528259277344, + "loss": 0.3755, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3319626450538635, + "rewards/margins": 2.4680209159851074, + "rewards/rejected": -2.799983501434326, + "step": 1170 + }, + { + "epoch": 0.36, + "learning_rate": 4.051284292992984e-05, + "logits/chosen": -1.3771086931228638, + "logits/rejected": -1.3074411153793335, + "logps/chosen": -251.0067596435547, + "logps/rejected": -294.7237243652344, + "loss": 0.3995, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2037571370601654, + "rewards/margins": 3.2541725635528564, + "rewards/rejected": -3.4579296112060547, + "step": 1175 + }, + { + "epoch": 0.36, + "learning_rate": 4.040830372015909e-05, + "logits/chosen": -1.475381851196289, + "logits/rejected": -1.3613998889923096, + "logps/chosen": -246.23281860351562, + "logps/rejected": -300.03497314453125, + "loss": 0.3781, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10555452108383179, + "rewards/margins": 3.167620897293091, + "rewards/rejected": -3.273175001144409, + "step": 1180 + }, + { + "epoch": 0.36, + "learning_rate": 4.0303328234859665e-05, + "logits/chosen": -1.4284617900848389, + "logits/rejected": -1.2940706014633179, + "logps/chosen": -268.23077392578125, + "logps/rejected": -295.5655517578125, + "loss": 0.3795, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.13254739344120026, + "rewards/margins": 3.2897098064422607, + "rewards/rejected": -3.4222571849823, + "step": 1185 + }, + { + "epoch": 0.36, + "learning_rate": 4.019791944634027e-05, + "logits/chosen": -1.4546287059783936, + "logits/rejected": -1.4235341548919678, + "logps/chosen": -225.8767547607422, + "logps/rejected": -300.689697265625, + "loss": 0.3781, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.15003997087478638, + "rewards/margins": 2.844740390777588, + "rewards/rejected": -2.9947803020477295, + "step": 1190 + }, + { + "epoch": 0.36, + "learning_rate": 4.00920803391783e-05, + "logits/chosen": -1.4976985454559326, + "logits/rejected": -1.436232089996338, + "logps/chosen": -216.7958526611328, + "logps/rejected": -249.0604705810547, + "loss": 0.38, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.038081564009189606, + "rewards/margins": 2.4354381561279297, + "rewards/rejected": -2.473519802093506, + "step": 1195 + }, + { + "epoch": 0.37, + "learning_rate": 3.9985813910135304e-05, + "logits/chosen": -1.488646149635315, + "logits/rejected": -1.4349124431610107, + "logps/chosen": -235.86972045898438, + "logps/rejected": -304.09039306640625, + "loss": 0.3782, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.26524874567985535, + "rewards/margins": 3.0398926734924316, + "rewards/rejected": -3.305140972137451, + "step": 1200 + }, + { + "epoch": 0.37, + "learning_rate": 3.9879123168072206e-05, + "logits/chosen": -1.4791837930679321, + "logits/rejected": -1.4347190856933594, + "logps/chosen": -243.2008819580078, + "logps/rejected": -321.9309997558594, + "loss": 0.4218, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.11531716585159302, + "rewards/margins": 2.624474048614502, + "rewards/rejected": -2.73979115486145, + "step": 1205 + }, + { + "epoch": 0.37, + "learning_rate": 3.977201113386402e-05, + "logits/chosen": -1.5107253789901733, + "logits/rejected": -1.3714876174926758, + "logps/chosen": -253.50497436523438, + "logps/rejected": -301.078125, + "loss": 0.3595, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07889306545257568, + "rewards/margins": 2.69875431060791, + "rewards/rejected": -2.7776474952697754, + "step": 1210 + }, + { + "epoch": 0.37, + "learning_rate": 3.966448084031437e-05, + "logits/chosen": -1.3860998153686523, + "logits/rejected": -1.3501628637313843, + "logps/chosen": -210.1219940185547, + "logps/rejected": -260.87872314453125, + "loss": 0.3633, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.19702157378196716, + "rewards/margins": 2.5177106857299805, + "rewards/rejected": -2.7147319316864014, + "step": 1215 + }, + { + "epoch": 0.37, + "learning_rate": 3.955653533206959e-05, + "logits/chosen": -1.4096229076385498, + "logits/rejected": -1.2972562313079834, + "logps/chosen": -219.9523162841797, + "logps/rejected": -269.67333984375, + "loss": 0.3453, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.26071828603744507, + "rewards/margins": 3.1086888313293457, + "rewards/rejected": -3.3694069385528564, + "step": 1220 + }, + { + "epoch": 0.37, + "learning_rate": 3.9448177665532574e-05, + "logits/chosen": -1.469242811203003, + "logits/rejected": -1.3070530891418457, + "logps/chosen": -230.3858184814453, + "logps/rejected": -264.9944763183594, + "loss": 0.3727, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5432360172271729, + "rewards/margins": 2.8007569313049316, + "rewards/rejected": -3.3439929485321045, + "step": 1225 + }, + { + "epoch": 0.37, + "learning_rate": 3.933941090877615e-05, + "logits/chosen": -1.4690440893173218, + "logits/rejected": -1.3958766460418701, + "logps/chosen": -232.1632843017578, + "logps/rejected": -284.0280456542969, + "loss": 0.382, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4447177052497864, + "rewards/margins": 2.9213266372680664, + "rewards/rejected": -3.3660449981689453, + "step": 1230 + }, + { + "epoch": 0.38, + "learning_rate": 3.923023814145629e-05, + "logits/chosen": -1.5301823616027832, + "logits/rejected": -1.376138687133789, + "logps/chosen": -245.1982879638672, + "logps/rejected": -270.8854675292969, + "loss": 0.4028, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6281201243400574, + "rewards/margins": 2.786634922027588, + "rewards/rejected": -3.414755344390869, + "step": 1235 + }, + { + "epoch": 0.38, + "learning_rate": 3.9120662454724836e-05, + "logits/chosen": -1.4721466302871704, + "logits/rejected": -1.4076852798461914, + "logps/chosen": -237.6669158935547, + "logps/rejected": -292.71478271484375, + "loss": 0.3709, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0528624057769775, + "rewards/margins": 3.1795761585235596, + "rewards/rejected": -4.232438087463379, + "step": 1240 + }, + { + "epoch": 0.38, + "learning_rate": 3.901068695114206e-05, + "logits/chosen": -1.4836117029190063, + "logits/rejected": -1.366645097732544, + "logps/chosen": -242.7480010986328, + "logps/rejected": -301.66607666015625, + "loss": 0.4359, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.254749059677124, + "rewards/margins": 3.5557892322540283, + "rewards/rejected": -4.810537815093994, + "step": 1245 + }, + { + "epoch": 0.38, + "learning_rate": 3.890031474458874e-05, + "logits/chosen": -1.4514219760894775, + "logits/rejected": -1.3249971866607666, + "logps/chosen": -275.0703430175781, + "logps/rejected": -320.9544372558594, + "loss": 0.4112, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.782173752784729, + "rewards/margins": 3.3930366039276123, + "rewards/rejected": -5.175210475921631, + "step": 1250 + }, + { + "epoch": 0.38, + "learning_rate": 3.878954896017804e-05, + "logits/chosen": -1.3831149339675903, + "logits/rejected": -1.261541724205017, + "logps/chosen": -270.8629455566406, + "logps/rejected": -328.1438903808594, + "loss": 0.3622, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6734645366668701, + "rewards/margins": 3.397109270095825, + "rewards/rejected": -5.070573806762695, + "step": 1255 + }, + { + "epoch": 0.38, + "learning_rate": 3.867839273416701e-05, + "logits/chosen": -1.375957727432251, + "logits/rejected": -1.244816541671753, + "logps/chosen": -237.8169403076172, + "logps/rejected": -267.6765441894531, + "loss": 0.3726, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1279919147491455, + "rewards/margins": 3.1958394050598145, + "rewards/rejected": -4.323831081390381, + "step": 1260 + }, + { + "epoch": 0.39, + "learning_rate": 3.8566849213867795e-05, + "logits/chosen": -1.3469189405441284, + "logits/rejected": -1.3233760595321655, + "logps/chosen": -215.3714141845703, + "logps/rejected": -297.8151550292969, + "loss": 0.3699, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6543620824813843, + "rewards/margins": 2.7842297554016113, + "rewards/rejected": -4.438591957092285, + "step": 1265 + }, + { + "epoch": 0.39, + "learning_rate": 3.8454921557558476e-05, + "logits/chosen": -1.4500279426574707, + "logits/rejected": -1.343481183052063, + "logps/chosen": -246.06689453125, + "logps/rejected": -302.48260498046875, + "loss": 0.3783, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4052373170852661, + "rewards/margins": 3.2777016162872314, + "rewards/rejected": -4.6829400062561035, + "step": 1270 + }, + { + "epoch": 0.39, + "learning_rate": 3.834261293439374e-05, + "logits/chosen": -1.2790629863739014, + "logits/rejected": -1.1767776012420654, + "logps/chosen": -233.415771484375, + "logps/rejected": -286.8774719238281, + "loss": 0.4056, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6317148208618164, + "rewards/margins": 2.9100582599639893, + "rewards/rejected": -4.541773319244385, + "step": 1275 + }, + { + "epoch": 0.39, + "learning_rate": 3.8229926524315016e-05, + "logits/chosen": -1.4536702632904053, + "logits/rejected": -1.3482431173324585, + "logps/chosen": -238.40676879882812, + "logps/rejected": -278.3370666503906, + "loss": 0.3956, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.226927638053894, + "rewards/margins": 3.155203342437744, + "rewards/rejected": -4.3821306228637695, + "step": 1280 + }, + { + "epoch": 0.39, + "learning_rate": 3.8116865517960585e-05, + "logits/chosen": -1.4348114728927612, + "logits/rejected": -1.318174123764038, + "logps/chosen": -228.39523315429688, + "logps/rejected": -275.8654479980469, + "loss": 0.4036, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2049471139907837, + "rewards/margins": 3.1574530601501465, + "rewards/rejected": -4.362399578094482, + "step": 1285 + }, + { + "epoch": 0.39, + "learning_rate": 3.800343311657509e-05, + "logits/chosen": -1.4712860584259033, + "logits/rejected": -1.3852875232696533, + "logps/chosen": -236.8831024169922, + "logps/rejected": -282.43572998046875, + "loss": 0.4184, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1031112670898438, + "rewards/margins": 3.026742458343506, + "rewards/rejected": -4.12985372543335, + "step": 1290 + }, + { + "epoch": 0.39, + "learning_rate": 3.788963253191905e-05, + "logits/chosen": -1.4327385425567627, + "logits/rejected": -1.3884919881820679, + "logps/chosen": -256.63848876953125, + "logps/rejected": -309.84759521484375, + "loss": 0.4386, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7931209206581116, + "rewards/margins": 2.4091200828552246, + "rewards/rejected": -3.2022411823272705, + "step": 1295 + }, + { + "epoch": 0.4, + "learning_rate": 3.777546698617776e-05, + "logits/chosen": -1.4254684448242188, + "logits/rejected": -1.3687108755111694, + "logps/chosen": -224.0829315185547, + "logps/rejected": -275.72247314453125, + "loss": 0.3798, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6869279146194458, + "rewards/margins": 2.486288547515869, + "rewards/rejected": -3.1732163429260254, + "step": 1300 + }, + { + "epoch": 0.4, + "learning_rate": 3.766093971187019e-05, + "logits/chosen": -1.452755331993103, + "logits/rejected": -1.3947112560272217, + "logps/chosen": -239.74575805664062, + "logps/rejected": -277.12017822265625, + "loss": 0.3977, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5879044532775879, + "rewards/margins": 2.501594066619873, + "rewards/rejected": -3.089498519897461, + "step": 1305 + }, + { + "epoch": 0.4, + "learning_rate": 3.75460539517574e-05, + "logits/chosen": -1.4273698329925537, + "logits/rejected": -1.3977384567260742, + "logps/chosen": -231.1275634765625, + "logps/rejected": -284.3058776855469, + "loss": 0.3925, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.20015409588813782, + "rewards/margins": 2.5172126293182373, + "rewards/rejected": -2.7173666954040527, + "step": 1310 + }, + { + "epoch": 0.4, + "learning_rate": 3.743081295875069e-05, + "logits/chosen": -1.5619311332702637, + "logits/rejected": -1.4386647939682007, + "logps/chosen": -235.97891235351562, + "logps/rejected": -289.92694091796875, + "loss": 0.3512, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.166568323969841, + "rewards/margins": 2.67271089553833, + "rewards/rejected": -2.8392791748046875, + "step": 1315 + }, + { + "epoch": 0.4, + "learning_rate": 3.7315219995819594e-05, + "logits/chosen": -1.5064969062805176, + "logits/rejected": -1.3810298442840576, + "logps/chosen": -266.1444091796875, + "logps/rejected": -311.23199462890625, + "loss": 0.3587, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7214463353157043, + "rewards/margins": 3.029715061187744, + "rewards/rejected": -3.7511610984802246, + "step": 1320 + }, + { + "epoch": 0.4, + "learning_rate": 3.719927833589939e-05, + "logits/chosen": -1.358946442604065, + "logits/rejected": -1.2690280675888062, + "logps/chosen": -235.3449249267578, + "logps/rejected": -287.78924560546875, + "loss": 0.3899, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8808122873306274, + "rewards/margins": 2.6421303749084473, + "rewards/rejected": -3.5229427814483643, + "step": 1325 + }, + { + "epoch": 0.41, + "learning_rate": 3.708299126179847e-05, + "logits/chosen": -1.4998195171356201, + "logits/rejected": -1.3532516956329346, + "logps/chosen": -251.736572265625, + "logps/rejected": -285.5415954589844, + "loss": 0.3396, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8132502436637878, + "rewards/margins": 3.2861030101776123, + "rewards/rejected": -4.099352836608887, + "step": 1330 + }, + { + "epoch": 0.41, + "learning_rate": 3.6966362066105435e-05, + "logits/chosen": -1.4474642276763916, + "logits/rejected": -1.3161416053771973, + "logps/chosen": -244.8996124267578, + "logps/rejected": -285.0702209472656, + "loss": 0.3514, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6026067733764648, + "rewards/margins": 3.359797954559326, + "rewards/rejected": -3.96240496635437, + "step": 1335 + }, + { + "epoch": 0.41, + "learning_rate": 3.684939405109577e-05, + "logits/chosen": -1.4846141338348389, + "logits/rejected": -1.3937715291976929, + "logps/chosen": -234.49697875976562, + "logps/rejected": -277.59503173828125, + "loss": 0.3958, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3563997745513916, + "rewards/margins": 2.4923202991485596, + "rewards/rejected": -2.848719835281372, + "step": 1340 + }, + { + "epoch": 0.41, + "learning_rate": 3.673209052863843e-05, + "logits/chosen": -1.51144540309906, + "logits/rejected": -1.347617268562317, + "logps/chosen": -239.48681640625, + "logps/rejected": -275.97760009765625, + "loss": 0.3446, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.024043012410402298, + "rewards/margins": 2.835794687271118, + "rewards/rejected": -2.8117516040802, + "step": 1345 + }, + { + "epoch": 0.41, + "learning_rate": 3.6614454820102017e-05, + "logits/chosen": -1.47091543674469, + "logits/rejected": -1.385925531387329, + "logps/chosen": -257.21575927734375, + "logps/rejected": -294.6009521484375, + "loss": 0.3907, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.17883895337581635, + "rewards/margins": 2.582345485687256, + "rewards/rejected": -2.7611842155456543, + "step": 1350 + }, + { + "epoch": 0.41, + "learning_rate": 3.6496490256260777e-05, + "logits/chosen": -1.5150299072265625, + "logits/rejected": -1.4086599349975586, + "logps/chosen": -237.181640625, + "logps/rejected": -292.00518798828125, + "loss": 0.3889, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.24234585464000702, + "rewards/margins": 2.8969500064849854, + "rewards/rejected": -3.1392955780029297, + "step": 1355 + }, + { + "epoch": 0.41, + "learning_rate": 3.6378200177200224e-05, + "logits/chosen": -1.4277657270431519, + "logits/rejected": -1.350029706954956, + "logps/chosen": -230.1826629638672, + "logps/rejected": -300.61090087890625, + "loss": 0.3423, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.406125545501709, + "rewards/margins": 3.0230190753936768, + "rewards/rejected": -3.4291443824768066, + "step": 1360 + }, + { + "epoch": 0.42, + "learning_rate": 3.625958793222265e-05, + "logits/chosen": -1.4115116596221924, + "logits/rejected": -1.2951580286026, + "logps/chosen": -203.2592315673828, + "logps/rejected": -257.6768798828125, + "loss": 0.4226, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.45247992873191833, + "rewards/margins": 2.8937525749206543, + "rewards/rejected": -3.3462326526641846, + "step": 1365 + }, + { + "epoch": 0.42, + "learning_rate": 3.614065687975225e-05, + "logits/chosen": -1.3729918003082275, + "logits/rejected": -1.274886131286621, + "logps/chosen": -236.5909423828125, + "logps/rejected": -300.1371154785156, + "loss": 0.3832, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6886480450630188, + "rewards/margins": 3.3041484355926514, + "rewards/rejected": -3.9927964210510254, + "step": 1370 + }, + { + "epoch": 0.42, + "learning_rate": 3.602141038724001e-05, + "logits/chosen": -1.445521593093872, + "logits/rejected": -1.3185946941375732, + "logps/chosen": -251.41104125976562, + "logps/rejected": -301.2436218261719, + "loss": 0.3619, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5627816319465637, + "rewards/margins": 3.646247386932373, + "rewards/rejected": -4.209029197692871, + "step": 1375 + }, + { + "epoch": 0.42, + "learning_rate": 3.590185183106842e-05, + "logits/chosen": -1.4172786474227905, + "logits/rejected": -1.3537501096725464, + "logps/chosen": -225.7064666748047, + "logps/rejected": -291.16998291015625, + "loss": 0.3889, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.118973508477211, + "rewards/margins": 2.9702212810516357, + "rewards/rejected": -3.0891947746276855, + "step": 1380 + }, + { + "epoch": 0.42, + "learning_rate": 3.578198459645579e-05, + "logits/chosen": -1.4852367639541626, + "logits/rejected": -1.3799657821655273, + "logps/chosen": -253.7847442626953, + "logps/rejected": -293.2589111328125, + "loss": 0.3236, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0412837415933609, + "rewards/margins": 3.353567123413086, + "rewards/rejected": -3.394850969314575, + "step": 1385 + }, + { + "epoch": 0.42, + "learning_rate": 3.56618120773605e-05, + "logits/chosen": -1.4363138675689697, + "logits/rejected": -1.2875852584838867, + "logps/chosen": -235.5326385498047, + "logps/rejected": -260.58050537109375, + "loss": 0.4108, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.2815939784049988, + "rewards/margins": 2.755204677581787, + "rewards/rejected": -3.0367987155914307, + "step": 1390 + }, + { + "epoch": 0.43, + "learning_rate": 3.55413376763848e-05, + "logits/chosen": -1.434983491897583, + "logits/rejected": -1.3754985332489014, + "logps/chosen": -251.4075164794922, + "logps/rejected": -303.62640380859375, + "loss": 0.3982, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3561423420906067, + "rewards/margins": 2.8273160457611084, + "rewards/rejected": -3.1834583282470703, + "step": 1395 + }, + { + "epoch": 0.43, + "learning_rate": 3.542056480467858e-05, + "logits/chosen": -1.3716362714767456, + "logits/rejected": -1.308511734008789, + "logps/chosen": -207.52523803710938, + "logps/rejected": -277.71246337890625, + "loss": 0.3433, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.49458226561546326, + "rewards/margins": 3.2204792499542236, + "rewards/rejected": -3.7150611877441406, + "step": 1400 + }, + { + "epoch": 0.43, + "learning_rate": 3.529949688184265e-05, + "logits/chosen": -1.357021450996399, + "logits/rejected": -1.2644864320755005, + "logps/chosen": -252.494384765625, + "logps/rejected": -286.4941101074219, + "loss": 0.3908, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7067984342575073, + "rewards/margins": 2.705132484436035, + "rewards/rejected": -3.411930799484253, + "step": 1405 + }, + { + "epoch": 0.43, + "learning_rate": 3.5178137335832045e-05, + "logits/chosen": -1.4006474018096924, + "logits/rejected": -1.3148066997528076, + "logps/chosen": -220.7507781982422, + "logps/rejected": -300.52197265625, + "loss": 0.4377, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8004969358444214, + "rewards/margins": 2.8967666625976562, + "rewards/rejected": -3.697263240814209, + "step": 1410 + }, + { + "epoch": 0.43, + "learning_rate": 3.50564896028589e-05, + "logits/chosen": -1.4328795671463013, + "logits/rejected": -1.2834830284118652, + "logps/chosen": -248.32144165039062, + "logps/rejected": -283.7814025878906, + "loss": 0.3956, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0089048147201538, + "rewards/margins": 3.0403881072998047, + "rewards/rejected": -4.049293041229248, + "step": 1415 + }, + { + "epoch": 0.43, + "learning_rate": 3.493455712729514e-05, + "logits/chosen": -1.4717390537261963, + "logits/rejected": -1.4287965297698975, + "logps/chosen": -233.8995819091797, + "logps/rejected": -281.0123596191406, + "loss": 0.403, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6191913485527039, + "rewards/margins": 2.783186435699463, + "rewards/rejected": -3.4023776054382324, + "step": 1420 + }, + { + "epoch": 0.43, + "learning_rate": 3.4812343361575e-05, + "logits/chosen": -1.4601266384124756, + "logits/rejected": -1.4116663932800293, + "logps/chosen": -239.9668731689453, + "logps/rejected": -297.328125, + "loss": 0.3886, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.38425880670547485, + "rewards/margins": 2.980128049850464, + "rewards/rejected": -3.364386796951294, + "step": 1425 + }, + { + "epoch": 0.44, + "learning_rate": 3.468985176609726e-05, + "logits/chosen": -1.425545334815979, + "logits/rejected": -1.3163349628448486, + "logps/chosen": -251.3730010986328, + "logps/rejected": -292.0272216796875, + "loss": 0.321, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.007286679930984974, + "rewards/margins": 2.743786573410034, + "rewards/rejected": -2.7510733604431152, + "step": 1430 + }, + { + "epoch": 0.44, + "learning_rate": 3.456708580912725e-05, + "logits/chosen": -1.448166012763977, + "logits/rejected": -1.3342589139938354, + "logps/chosen": -245.2729034423828, + "logps/rejected": -287.38189697265625, + "loss": 0.361, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.09365560114383698, + "rewards/margins": 2.8862528800964355, + "rewards/rejected": -2.7925972938537598, + "step": 1435 + }, + { + "epoch": 0.44, + "learning_rate": 3.444404896669865e-05, + "logits/chosen": -1.4818215370178223, + "logits/rejected": -1.3631136417388916, + "logps/chosen": -257.03533935546875, + "logps/rejected": -275.7957763671875, + "loss": 0.3723, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19895866513252258, + "rewards/margins": 2.7059268951416016, + "rewards/rejected": -2.9048852920532227, + "step": 1440 + }, + { + "epoch": 0.44, + "learning_rate": 3.432074472251508e-05, + "logits/chosen": -1.3858647346496582, + "logits/rejected": -1.305906057357788, + "logps/chosen": -243.6377410888672, + "logps/rejected": -286.6610107421875, + "loss": 0.3676, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.13256962597370148, + "rewards/margins": 3.50431752204895, + "rewards/rejected": -3.6368870735168457, + "step": 1445 + }, + { + "epoch": 0.44, + "learning_rate": 3.419717656785146e-05, + "logits/chosen": -1.3872106075286865, + "logits/rejected": -1.2487151622772217, + "logps/chosen": -209.8594512939453, + "logps/rejected": -236.38931274414062, + "loss": 0.3655, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3276270925998688, + "rewards/margins": 2.7453348636627197, + "rewards/rejected": -3.0729620456695557, + "step": 1450 + }, + { + "epoch": 0.44, + "learning_rate": 3.4073348001455164e-05, + "logits/chosen": -1.4358813762664795, + "logits/rejected": -1.3491919040679932, + "logps/chosen": -253.41952514648438, + "logps/rejected": -297.0237731933594, + "loss": 0.3833, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3080201745033264, + "rewards/margins": 3.321065902709961, + "rewards/rejected": -3.6290860176086426, + "step": 1455 + }, + { + "epoch": 0.45, + "learning_rate": 3.3949262529446915e-05, + "logits/chosen": -1.394351601600647, + "logits/rejected": -1.3554754257202148, + "logps/chosen": -228.14852905273438, + "logps/rejected": -290.87591552734375, + "loss": 0.3869, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.8205677270889282, + "rewards/margins": 2.6521365642547607, + "rewards/rejected": -3.4727044105529785, + "step": 1460 + }, + { + "epoch": 0.45, + "learning_rate": 3.382492366522158e-05, + "logits/chosen": -1.4379384517669678, + "logits/rejected": -1.292317509651184, + "logps/chosen": -235.61386108398438, + "logps/rejected": -263.2154541015625, + "loss": 0.3953, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6492370367050171, + "rewards/margins": 2.9269096851348877, + "rewards/rejected": -3.5761466026306152, + "step": 1465 + }, + { + "epoch": 0.45, + "learning_rate": 3.370033492934862e-05, + "logits/chosen": -1.366807222366333, + "logits/rejected": -1.2599581480026245, + "logps/chosen": -271.9466857910156, + "logps/rejected": -321.1268005371094, + "loss": 0.332, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7314456105232239, + "rewards/margins": 3.1656384468078613, + "rewards/rejected": -3.8970837593078613, + "step": 1470 + }, + { + "epoch": 0.45, + "learning_rate": 3.357549984947246e-05, + "logits/chosen": -1.392762303352356, + "logits/rejected": -1.2771762609481812, + "logps/chosen": -245.3613739013672, + "logps/rejected": -282.22467041015625, + "loss": 0.3886, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8663581013679504, + "rewards/margins": 2.6135199069976807, + "rewards/rejected": -3.4798779487609863, + "step": 1475 + }, + { + "epoch": 0.45, + "learning_rate": 3.3450421960212566e-05, + "logits/chosen": -1.4894797801971436, + "logits/rejected": -1.3859410285949707, + "logps/chosen": -243.93490600585938, + "logps/rejected": -270.7559814453125, + "loss": 0.3777, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4715906083583832, + "rewards/margins": 2.5874104499816895, + "rewards/rejected": -3.0590012073516846, + "step": 1480 + }, + { + "epoch": 0.45, + "learning_rate": 3.332510480306342e-05, + "logits/chosen": -1.4027369022369385, + "logits/rejected": -1.281185269355774, + "logps/chosen": -239.8651580810547, + "logps/rejected": -269.84600830078125, + "loss": 0.4071, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.3220774233341217, + "rewards/margins": 2.7993996143341064, + "rewards/rejected": -3.121476650238037, + "step": 1485 + }, + { + "epoch": 0.45, + "learning_rate": 3.319955192629417e-05, + "logits/chosen": -1.4315681457519531, + "logits/rejected": -1.3057044744491577, + "logps/chosen": -248.1969757080078, + "logps/rejected": -286.1350402832031, + "loss": 0.3744, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.40552300214767456, + "rewards/margins": 2.5983104705810547, + "rewards/rejected": -3.003833293914795, + "step": 1490 + }, + { + "epoch": 0.46, + "learning_rate": 3.3073766884848234e-05, + "logits/chosen": -1.3912522792816162, + "logits/rejected": -1.3030383586883545, + "logps/chosen": -223.78085327148438, + "logps/rejected": -273.02410888671875, + "loss": 0.3343, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.3259705901145935, + "rewards/margins": 2.883284091949463, + "rewards/rejected": -3.2092552185058594, + "step": 1495 + }, + { + "epoch": 0.46, + "learning_rate": 3.294775324024259e-05, + "logits/chosen": -1.4088590145111084, + "logits/rejected": -1.3673789501190186, + "logps/chosen": -222.629150390625, + "logps/rejected": -287.8419189453125, + "loss": 0.3654, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3555578589439392, + "rewards/margins": 3.0522375106811523, + "rewards/rejected": -3.4077954292297363, + "step": 1500 + }, + { + "epoch": 0.46, + "learning_rate": 3.2821514560466965e-05, + "logits/chosen": -1.3416802883148193, + "logits/rejected": -1.2942748069763184, + "logps/chosen": -256.21875, + "logps/rejected": -308.81402587890625, + "loss": 0.4083, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5183025002479553, + "rewards/margins": 3.0418663024902344, + "rewards/rejected": -3.560168743133545, + "step": 1505 + }, + { + "epoch": 0.46, + "learning_rate": 3.269505441988281e-05, + "logits/chosen": -1.408935308456421, + "logits/rejected": -1.2729170322418213, + "logps/chosen": -265.832275390625, + "logps/rejected": -287.1328430175781, + "loss": 0.354, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.49197083711624146, + "rewards/margins": 3.098872661590576, + "rewards/rejected": -3.5908432006835938, + "step": 1510 + }, + { + "epoch": 0.46, + "learning_rate": 3.256837639912208e-05, + "logits/chosen": -1.4301097393035889, + "logits/rejected": -1.3893522024154663, + "logps/chosen": -226.5152587890625, + "logps/rejected": -269.6661682128906, + "loss": 0.3423, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.36981138586997986, + "rewards/margins": 2.82721209526062, + "rewards/rejected": -3.197023391723633, + "step": 1515 + }, + { + "epoch": 0.46, + "learning_rate": 3.2441484084985865e-05, + "logits/chosen": -1.408756971359253, + "logits/rejected": -1.3597663640975952, + "logps/chosen": -246.8629913330078, + "logps/rejected": -291.6884765625, + "loss": 0.4077, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5306534767150879, + "rewards/margins": 2.8845298290252686, + "rewards/rejected": -3.4151833057403564, + "step": 1520 + }, + { + "epoch": 0.46, + "learning_rate": 3.231438107034281e-05, + "logits/chosen": -1.457080602645874, + "logits/rejected": -1.37287437915802, + "logps/chosen": -251.12301635742188, + "logps/rejected": -291.7841796875, + "loss": 0.4085, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5804392099380493, + "rewards/margins": 2.6093997955322266, + "rewards/rejected": -3.1898388862609863, + "step": 1525 + }, + { + "epoch": 0.47, + "learning_rate": 3.218707095402741e-05, + "logits/chosen": -1.4033076763153076, + "logits/rejected": -1.305397868156433, + "logps/chosen": -225.1595916748047, + "logps/rejected": -280.4457702636719, + "loss": 0.3649, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.493985116481781, + "rewards/margins": 2.9174463748931885, + "rewards/rejected": -3.4114317893981934, + "step": 1530 + }, + { + "epoch": 0.47, + "learning_rate": 3.20595573407381e-05, + "logits/chosen": -1.4854360818862915, + "logits/rejected": -1.3654184341430664, + "logps/chosen": -251.3857879638672, + "logps/rejected": -296.96356201171875, + "loss": 0.3963, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7684920430183411, + "rewards/margins": 2.8625669479370117, + "rewards/rejected": -3.631059169769287, + "step": 1535 + }, + { + "epoch": 0.47, + "learning_rate": 3.19318438409352e-05, + "logits/chosen": -1.4596775770187378, + "logits/rejected": -1.3616211414337158, + "logps/chosen": -219.89013671875, + "logps/rejected": -271.64605712890625, + "loss": 0.4047, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1834189891815186, + "rewards/margins": 2.471865653991699, + "rewards/rejected": -3.655284881591797, + "step": 1540 + }, + { + "epoch": 0.47, + "learning_rate": 3.180393407073866e-05, + "logits/chosen": -1.3478964567184448, + "logits/rejected": -1.266242265701294, + "logps/chosen": -259.29766845703125, + "logps/rejected": -302.2203369140625, + "loss": 0.4261, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5068633556365967, + "rewards/margins": 2.906827449798584, + "rewards/rejected": -4.413690567016602, + "step": 1545 + }, + { + "epoch": 0.47, + "learning_rate": 3.1675831651825704e-05, + "logits/chosen": -1.2607046365737915, + "logits/rejected": -1.12994384765625, + "logps/chosen": -280.3153381347656, + "logps/rejected": -326.66900634765625, + "loss": 0.3582, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.910593032836914, + "rewards/margins": 3.522291898727417, + "rewards/rejected": -5.43288516998291, + "step": 1550 + }, + { + "epoch": 0.47, + "learning_rate": 3.154754021132827e-05, + "logits/chosen": -1.4051783084869385, + "logits/rejected": -1.296360969543457, + "logps/chosen": -274.32733154296875, + "logps/rejected": -321.9216003417969, + "loss": 0.403, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.8257957696914673, + "rewards/margins": 3.0792431831359863, + "rewards/rejected": -4.905039310455322, + "step": 1555 + }, + { + "epoch": 0.48, + "learning_rate": 3.1419063381730317e-05, + "logits/chosen": -1.3628051280975342, + "logits/rejected": -1.2559598684310913, + "logps/chosen": -234.3201141357422, + "logps/rejected": -262.8561706542969, + "loss": 0.4669, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.6719917058944702, + "rewards/margins": 2.5399348735809326, + "rewards/rejected": -4.211926460266113, + "step": 1560 + }, + { + "epoch": 0.48, + "learning_rate": 3.129040480076496e-05, + "logits/chosen": -1.427811861038208, + "logits/rejected": -1.2909691333770752, + "logps/chosen": -259.02105712890625, + "logps/rejected": -307.50006103515625, + "loss": 0.3602, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5834705829620361, + "rewards/margins": 3.394528865814209, + "rewards/rejected": -4.977999687194824, + "step": 1565 + }, + { + "epoch": 0.48, + "learning_rate": 3.116156811131148e-05, + "logits/chosen": -1.404813528060913, + "logits/rejected": -1.2708321809768677, + "logps/chosen": -235.78720092773438, + "logps/rejected": -253.49172973632812, + "loss": 0.3823, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3320645093917847, + "rewards/margins": 2.766334056854248, + "rewards/rejected": -4.098398685455322, + "step": 1570 + }, + { + "epoch": 0.48, + "learning_rate": 3.1032556961292194e-05, + "logits/chosen": -1.457380771636963, + "logits/rejected": -1.3725563287734985, + "logps/chosen": -235.90377807617188, + "logps/rejected": -280.8343200683594, + "loss": 0.3649, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9432939291000366, + "rewards/margins": 2.720813512802124, + "rewards/rejected": -3.664107084274292, + "step": 1575 + }, + { + "epoch": 0.48, + "learning_rate": 3.0903375003569124e-05, + "logits/chosen": -1.3895059823989868, + "logits/rejected": -1.3300002813339233, + "logps/chosen": -250.08218383789062, + "logps/rejected": -312.3929748535156, + "loss": 0.3683, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0983926057815552, + "rewards/margins": 3.0330488681793213, + "rewards/rejected": -4.131441116333008, + "step": 1580 + }, + { + "epoch": 0.48, + "learning_rate": 3.077402589584061e-05, + "logits/chosen": -1.4692285060882568, + "logits/rejected": -1.3498972654342651, + "logps/chosen": -269.5802917480469, + "logps/rejected": -317.4629211425781, + "loss": 0.3919, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9154409170150757, + "rewards/margins": 3.027033805847168, + "rewards/rejected": -3.942474842071533, + "step": 1585 + }, + { + "epoch": 0.48, + "learning_rate": 3.064451330053773e-05, + "logits/chosen": -1.358955979347229, + "logits/rejected": -1.2798420190811157, + "logps/chosen": -221.613525390625, + "logps/rejected": -278.844970703125, + "loss": 0.354, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8618852496147156, + "rewards/margins": 3.097717761993408, + "rewards/rejected": -3.9596030712127686, + "step": 1590 + }, + { + "epoch": 0.49, + "learning_rate": 3.0514840884720598e-05, + "logits/chosen": -1.5032036304473877, + "logits/rejected": -1.3345158100128174, + "logps/chosen": -291.01507568359375, + "logps/rejected": -325.32574462890625, + "loss": 0.409, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3084170818328857, + "rewards/margins": 2.8849997520446777, + "rewards/rejected": -4.193417549133301, + "step": 1595 + }, + { + "epoch": 0.49, + "learning_rate": 3.0385012319974537e-05, + "logits/chosen": -1.4529359340667725, + "logits/rejected": -1.3749693632125854, + "logps/chosen": -238.7993927001953, + "logps/rejected": -302.42303466796875, + "loss": 0.3772, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1509284973144531, + "rewards/margins": 3.0289180278778076, + "rewards/rejected": -4.17984676361084, + "step": 1600 + }, + { + "epoch": 0.49, + "learning_rate": 3.0255031282306106e-05, + "logits/chosen": -1.3734673261642456, + "logits/rejected": -1.2939527034759521, + "logps/chosen": -236.4968719482422, + "logps/rejected": -294.1763000488281, + "loss": 0.3329, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4785823822021484, + "rewards/margins": 3.626276731491089, + "rewards/rejected": -5.104858875274658, + "step": 1605 + }, + { + "epoch": 0.49, + "learning_rate": 3.012490145203906e-05, + "logits/chosen": -1.396791696548462, + "logits/rejected": -1.3902806043624878, + "logps/chosen": -227.6318817138672, + "logps/rejected": -299.10784912109375, + "loss": 0.3666, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.4431589841842651, + "rewards/margins": 3.2994017601013184, + "rewards/rejected": -4.742560863494873, + "step": 1610 + }, + { + "epoch": 0.49, + "learning_rate": 2.9994626513710084e-05, + "logits/chosen": -1.3043615818023682, + "logits/rejected": -1.1662665605545044, + "logps/chosen": -263.1172180175781, + "logps/rejected": -311.5401916503906, + "loss": 0.3786, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4172508716583252, + "rewards/margins": 4.293739318847656, + "rewards/rejected": -5.710989952087402, + "step": 1615 + }, + { + "epoch": 0.49, + "learning_rate": 2.9864210155964507e-05, + "logits/chosen": -1.3513799905776978, + "logits/rejected": -1.226161241531372, + "logps/chosen": -235.1254119873047, + "logps/rejected": -298.6861267089844, + "loss": 0.3383, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1513350009918213, + "rewards/margins": 3.736593723297119, + "rewards/rejected": -4.887928485870361, + "step": 1620 + }, + { + "epoch": 0.5, + "learning_rate": 2.9733656071451867e-05, + "logits/chosen": -1.3315715789794922, + "logits/rejected": -1.280723214149475, + "logps/chosen": -248.2561492919922, + "logps/rejected": -322.32025146484375, + "loss": 0.3294, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2765705585479736, + "rewards/margins": 3.39424204826355, + "rewards/rejected": -4.670812606811523, + "step": 1625 + }, + { + "epoch": 0.5, + "learning_rate": 2.9602967956721316e-05, + "logits/chosen": -1.3882957696914673, + "logits/rejected": -1.284002661705017, + "logps/chosen": -276.1745910644531, + "logps/rejected": -322.44061279296875, + "loss": 0.4215, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0892524719238281, + "rewards/margins": 3.2803215980529785, + "rewards/rejected": -4.369574069976807, + "step": 1630 + }, + { + "epoch": 0.5, + "learning_rate": 2.947214951211701e-05, + "logits/chosen": -1.4538739919662476, + "logits/rejected": -1.3563892841339111, + "logps/chosen": -234.4571533203125, + "logps/rejected": -294.8569030761719, + "loss": 0.3709, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6589905619621277, + "rewards/margins": 3.601499080657959, + "rewards/rejected": -4.260489463806152, + "step": 1635 + }, + { + "epoch": 0.5, + "learning_rate": 2.9341204441673266e-05, + "logits/chosen": -1.4792182445526123, + "logits/rejected": -1.377165675163269, + "logps/chosen": -239.96060180664062, + "logps/rejected": -283.60491943359375, + "loss": 0.3906, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.825871467590332, + "rewards/margins": 3.0817301273345947, + "rewards/rejected": -3.9076011180877686, + "step": 1640 + }, + { + "epoch": 0.5, + "learning_rate": 2.921013645300975e-05, + "logits/chosen": -1.4524424076080322, + "logits/rejected": -1.401745080947876, + "logps/chosen": -234.1028289794922, + "logps/rejected": -295.9302673339844, + "loss": 0.4504, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8630698919296265, + "rewards/margins": 2.638474702835083, + "rewards/rejected": -3.50154447555542, + "step": 1645 + }, + { + "epoch": 0.5, + "learning_rate": 2.907894925722648e-05, + "logits/chosen": -1.3984206914901733, + "logits/rejected": -1.3711296319961548, + "logps/chosen": -251.3262481689453, + "logps/rejected": -302.9779968261719, + "loss": 0.422, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.8784204721450806, + "rewards/margins": 2.59228515625, + "rewards/rejected": -3.470705509185791, + "step": 1650 + }, + { + "epoch": 0.5, + "learning_rate": 2.894764656879873e-05, + "logits/chosen": -1.5556986331939697, + "logits/rejected": -1.48770010471344, + "logps/chosen": -241.54751586914062, + "logps/rejected": -301.2930603027344, + "loss": 0.374, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6194090247154236, + "rewards/margins": 2.9210126399993896, + "rewards/rejected": -3.540421962738037, + "step": 1655 + }, + { + "epoch": 0.51, + "learning_rate": 2.8816232105471863e-05, + "logits/chosen": -1.5585861206054688, + "logits/rejected": -1.4013144969940186, + "logps/chosen": -277.42864990234375, + "logps/rejected": -293.63787841796875, + "loss": 0.3893, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6244452595710754, + "rewards/margins": 2.798342227935791, + "rewards/rejected": -3.4227874279022217, + "step": 1660 + }, + { + "epoch": 0.51, + "learning_rate": 2.8684709588156085e-05, + "logits/chosen": -1.4837238788604736, + "logits/rejected": -1.3633246421813965, + "logps/chosen": -267.8808898925781, + "logps/rejected": -308.5547180175781, + "loss": 0.3612, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.40743985772132874, + "rewards/margins": 3.1635591983795166, + "rewards/rejected": -3.5709991455078125, + "step": 1665 + }, + { + "epoch": 0.51, + "learning_rate": 2.8553082740821057e-05, + "logits/chosen": -1.5174936056137085, + "logits/rejected": -1.3714519739151, + "logps/chosen": -262.1058349609375, + "logps/rejected": -278.0890197753906, + "loss": 0.4465, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.555295467376709, + "rewards/margins": 2.108398675918579, + "rewards/rejected": -2.663693904876709, + "step": 1670 + }, + { + "epoch": 0.51, + "learning_rate": 2.8421355290390506e-05, + "logits/chosen": -1.4402861595153809, + "logits/rejected": -1.3555018901824951, + "logps/chosen": -259.9664001464844, + "logps/rejected": -304.7403259277344, + "loss": 0.3857, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.537945568561554, + "rewards/margins": 2.5592586994171143, + "rewards/rejected": -3.0972039699554443, + "step": 1675 + }, + { + "epoch": 0.51, + "learning_rate": 2.8289530966636625e-05, + "logits/chosen": -1.4750789403915405, + "logits/rejected": -1.4176933765411377, + "logps/chosen": -247.0699920654297, + "logps/rejected": -291.6591796875, + "loss": 0.4013, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.47404319047927856, + "rewards/margins": 2.721088171005249, + "rewards/rejected": -3.195131301879883, + "step": 1680 + }, + { + "epoch": 0.51, + "learning_rate": 2.8157613502074543e-05, + "logits/chosen": -1.3425180912017822, + "logits/rejected": -1.2664659023284912, + "logps/chosen": -229.84848022460938, + "logps/rejected": -275.348388671875, + "loss": 0.386, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.45674949884414673, + "rewards/margins": 2.6647157669067383, + "rewards/rejected": -3.1214652061462402, + "step": 1685 + }, + { + "epoch": 0.52, + "learning_rate": 2.8025606631856578e-05, + "logits/chosen": -1.4296760559082031, + "logits/rejected": -1.3889728784561157, + "logps/chosen": -240.1858673095703, + "logps/rejected": -287.5098571777344, + "loss": 0.3997, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3761466145515442, + "rewards/margins": 2.935863971710205, + "rewards/rejected": -3.3120105266571045, + "step": 1690 + }, + { + "epoch": 0.52, + "learning_rate": 2.7893514093666538e-05, + "logits/chosen": -1.5364658832550049, + "logits/rejected": -1.4733527898788452, + "logps/chosen": -221.28564453125, + "logps/rejected": -300.366943359375, + "loss": 0.3334, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.08093585073947906, + "rewards/margins": 3.043505907058716, + "rewards/rejected": -2.9625699520111084, + "step": 1695 + }, + { + "epoch": 0.52, + "learning_rate": 2.7761339627613848e-05, + "logits/chosen": -1.5357977151870728, + "logits/rejected": -1.3645578622817993, + "logps/chosen": -251.7772979736328, + "logps/rejected": -281.92376708984375, + "loss": 0.364, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3491308391094208, + "rewards/margins": 2.4752745628356934, + "rewards/rejected": -2.8244051933288574, + "step": 1700 + }, + { + "epoch": 0.52, + "learning_rate": 2.762908697612765e-05, + "logits/chosen": -1.5145528316497803, + "logits/rejected": -1.4855618476867676, + "logps/chosen": -197.80935668945312, + "logps/rejected": -249.3860321044922, + "loss": 0.3845, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19136330485343933, + "rewards/margins": 2.256312608718872, + "rewards/rejected": -2.447675943374634, + "step": 1705 + }, + { + "epoch": 0.52, + "learning_rate": 2.749675988385087e-05, + "logits/chosen": -1.5334855318069458, + "logits/rejected": -1.4892711639404297, + "logps/chosen": -212.96484375, + "logps/rejected": -259.4864196777344, + "loss": 0.3969, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.04548867046833038, + "rewards/margins": 2.449012279510498, + "rewards/rejected": -2.4945008754730225, + "step": 1710 + }, + { + "epoch": 0.52, + "learning_rate": 2.7364362097534165e-05, + "logits/chosen": -1.3821698427200317, + "logits/rejected": -1.2841250896453857, + "logps/chosen": -239.5942840576172, + "logps/rejected": -286.0035095214844, + "loss": 0.3596, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.20363537967205048, + "rewards/margins": 3.127345561981201, + "rewards/rejected": -3.3309807777404785, + "step": 1715 + }, + { + "epoch": 0.52, + "learning_rate": 2.723189736592986e-05, + "logits/chosen": -1.4247050285339355, + "logits/rejected": -1.3530040979385376, + "logps/chosen": -229.03988647460938, + "logps/rejected": -294.17706298828125, + "loss": 0.3672, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.4444514811038971, + "rewards/margins": 2.9286928176879883, + "rewards/rejected": -3.3731446266174316, + "step": 1720 + }, + { + "epoch": 0.53, + "learning_rate": 2.709936943968577e-05, + "logits/chosen": -1.4718440771102905, + "logits/rejected": -1.3845016956329346, + "logps/chosen": -238.27865600585938, + "logps/rejected": -283.81939697265625, + "loss": 0.411, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5444830656051636, + "rewards/margins": 2.8488547801971436, + "rewards/rejected": -3.3933379650115967, + "step": 1725 + }, + { + "epoch": 0.53, + "learning_rate": 2.6966782071239027e-05, + "logits/chosen": -1.4738774299621582, + "logits/rejected": -1.3779428005218506, + "logps/chosen": -231.0215606689453, + "logps/rejected": -272.0970764160156, + "loss": 0.3721, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.35090774297714233, + "rewards/margins": 2.7793362140655518, + "rewards/rejected": -3.1302435398101807, + "step": 1730 + }, + { + "epoch": 0.53, + "learning_rate": 2.6834139014709843e-05, + "logits/chosen": -1.37410569190979, + "logits/rejected": -1.3108699321746826, + "logps/chosen": -239.4388885498047, + "logps/rejected": -282.64971923828125, + "loss": 0.3667, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3109089732170105, + "rewards/margins": 3.148824691772461, + "rewards/rejected": -3.459733486175537, + "step": 1735 + }, + { + "epoch": 0.53, + "learning_rate": 2.670144402579518e-05, + "logits/chosen": -1.4222691059112549, + "logits/rejected": -1.3661749362945557, + "logps/chosen": -251.8944091796875, + "logps/rejected": -308.817138671875, + "loss": 0.3653, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.46417436003685, + "rewards/margins": 2.9680099487304688, + "rewards/rejected": -3.4321842193603516, + "step": 1740 + }, + { + "epoch": 0.53, + "learning_rate": 2.6568700861662445e-05, + "logits/chosen": -1.4371557235717773, + "logits/rejected": -1.395452857017517, + "logps/chosen": -233.2667236328125, + "logps/rejected": -296.88897705078125, + "loss": 0.3752, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.42657342553138733, + "rewards/margins": 2.4918549060821533, + "rewards/rejected": -2.9184281826019287, + "step": 1745 + }, + { + "epoch": 0.53, + "learning_rate": 2.643591328084309e-05, + "logits/chosen": -1.3927974700927734, + "logits/rejected": -1.3720002174377441, + "logps/chosen": -216.7613983154297, + "logps/rejected": -285.96746826171875, + "loss": 0.3812, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2734035849571228, + "rewards/margins": 2.81980562210083, + "rewards/rejected": -3.093209743499756, + "step": 1750 + }, + { + "epoch": 0.53, + "learning_rate": 2.6303085043126176e-05, + "logits/chosen": -1.4500024318695068, + "logits/rejected": -1.35175359249115, + "logps/chosen": -269.2599182128906, + "logps/rejected": -313.74639892578125, + "loss": 0.3278, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4534524977207184, + "rewards/margins": 3.014010190963745, + "rewards/rejected": -3.467463254928589, + "step": 1755 + }, + { + "epoch": 0.54, + "learning_rate": 2.617021990945197e-05, + "logits/chosen": -1.627383828163147, + "logits/rejected": -1.5457924604415894, + "logps/chosen": -228.83285522460938, + "logps/rejected": -268.08148193359375, + "loss": 0.4364, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.3291838467121124, + "rewards/margins": 2.480336904525757, + "rewards/rejected": -2.809520721435547, + "step": 1760 + }, + { + "epoch": 0.54, + "learning_rate": 2.603732164180539e-05, + "logits/chosen": -1.3406190872192383, + "logits/rejected": -1.321590542793274, + "logps/chosen": -204.06773376464844, + "logps/rejected": -272.1907653808594, + "loss": 0.3289, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5110599398612976, + "rewards/margins": 2.9445903301239014, + "rewards/rejected": -3.4556503295898438, + "step": 1765 + }, + { + "epoch": 0.54, + "learning_rate": 2.5904394003109555e-05, + "logits/chosen": -1.407454252243042, + "logits/rejected": -1.3605538606643677, + "logps/chosen": -241.2417755126953, + "logps/rejected": -289.2109680175781, + "loss": 0.3726, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7598351836204529, + "rewards/margins": 2.538835287094116, + "rewards/rejected": -3.2986702919006348, + "step": 1770 + }, + { + "epoch": 0.54, + "learning_rate": 2.5771440757119165e-05, + "logits/chosen": -1.3968393802642822, + "logits/rejected": -1.3065111637115479, + "logps/chosen": -258.99139404296875, + "logps/rejected": -323.99920654296875, + "loss": 0.3912, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9822736978530884, + "rewards/margins": 3.3598105907440186, + "rewards/rejected": -4.3420844078063965, + "step": 1775 + }, + { + "epoch": 0.54, + "learning_rate": 2.5638465668314006e-05, + "logits/chosen": -1.40274178981781, + "logits/rejected": -1.3181835412979126, + "logps/chosen": -232.64627075195312, + "logps/rejected": -276.2817077636719, + "loss": 0.3353, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7937625646591187, + "rewards/margins": 2.890428066253662, + "rewards/rejected": -3.684190273284912, + "step": 1780 + }, + { + "epoch": 0.54, + "learning_rate": 2.5505472501792298e-05, + "logits/chosen": -1.41157066822052, + "logits/rejected": -1.3127758502960205, + "logps/chosen": -240.01473999023438, + "logps/rejected": -264.02313232421875, + "loss": 0.4051, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.821036159992218, + "rewards/margins": 2.6046383380889893, + "rewards/rejected": -3.4256744384765625, + "step": 1785 + }, + { + "epoch": 0.55, + "learning_rate": 2.5372465023164148e-05, + "logits/chosen": -1.5105726718902588, + "logits/rejected": -1.3530454635620117, + "logps/chosen": -232.69873046875, + "logps/rejected": -260.64453125, + "loss": 0.4211, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6090893745422363, + "rewards/margins": 2.594888210296631, + "rewards/rejected": -3.2039780616760254, + "step": 1790 + }, + { + "epoch": 0.55, + "learning_rate": 2.5239446998444898e-05, + "logits/chosen": -1.4365472793579102, + "logits/rejected": -1.3337024450302124, + "logps/chosen": -217.64816284179688, + "logps/rejected": -258.85650634765625, + "loss": 0.3962, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6081379652023315, + "rewards/margins": 2.89850115776062, + "rewards/rejected": -3.5066394805908203, + "step": 1795 + }, + { + "epoch": 0.55, + "learning_rate": 2.510642219394847e-05, + "logits/chosen": -1.3360542058944702, + "logits/rejected": -1.2949392795562744, + "logps/chosen": -221.05899047851562, + "logps/rejected": -275.3384704589844, + "loss": 0.4155, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.44868984818458557, + "rewards/margins": 3.002562999725342, + "rewards/rejected": -3.4512531757354736, + "step": 1800 + }, + { + "epoch": 0.55, + "learning_rate": 2.4973394376180773e-05, + "logits/chosen": -1.4750487804412842, + "logits/rejected": -1.392292857170105, + "logps/chosen": -215.1530303955078, + "logps/rejected": -286.1130065917969, + "loss": 0.3386, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04450890049338341, + "rewards/margins": 3.0319771766662598, + "rewards/rejected": -3.076486110687256, + "step": 1805 + }, + { + "epoch": 0.55, + "learning_rate": 2.4840367311733024e-05, + "logits/chosen": -1.4934360980987549, + "logits/rejected": -1.4029854536056519, + "logps/chosen": -238.88021850585938, + "logps/rejected": -284.1180114746094, + "loss": 0.3991, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19598393142223358, + "rewards/margins": 2.753159999847412, + "rewards/rejected": -2.949143886566162, + "step": 1810 + }, + { + "epoch": 0.55, + "learning_rate": 2.4707344767175118e-05, + "logits/chosen": -1.4552268981933594, + "logits/rejected": -1.3600109815597534, + "logps/chosen": -244.9551239013672, + "logps/rejected": -293.6857604980469, + "loss": 0.3665, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.16339322924613953, + "rewards/margins": 2.704094409942627, + "rewards/rejected": -2.867487668991089, + "step": 1815 + }, + { + "epoch": 0.55, + "learning_rate": 2.457433050894896e-05, + "logits/chosen": -1.4229071140289307, + "logits/rejected": -1.3520857095718384, + "logps/chosen": -234.4111785888672, + "logps/rejected": -274.68487548828125, + "loss": 0.4018, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.19696488976478577, + "rewards/margins": 3.0029873847961426, + "rewards/rejected": -3.1999526023864746, + "step": 1820 + }, + { + "epoch": 0.56, + "learning_rate": 2.4441328303261867e-05, + "logits/chosen": -1.365638017654419, + "logits/rejected": -1.2584137916564941, + "logps/chosen": -267.7275085449219, + "logps/rejected": -299.4195251464844, + "loss": 0.3461, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6673690676689148, + "rewards/margins": 2.8428685665130615, + "rewards/rejected": -3.5102379322052, + "step": 1825 + }, + { + "epoch": 0.56, + "learning_rate": 2.4308341915979838e-05, + "logits/chosen": -1.501835823059082, + "logits/rejected": -1.4059853553771973, + "logps/chosen": -221.58859252929688, + "logps/rejected": -267.6972961425781, + "loss": 0.3689, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6144279837608337, + "rewards/margins": 2.702439785003662, + "rewards/rejected": -3.3168678283691406, + "step": 1830 + }, + { + "epoch": 0.56, + "learning_rate": 2.417537511252105e-05, + "logits/chosen": -1.4447147846221924, + "logits/rejected": -1.3833913803100586, + "logps/chosen": -236.3400115966797, + "logps/rejected": -290.0586853027344, + "loss": 0.3494, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6023090481758118, + "rewards/margins": 3.214966297149658, + "rewards/rejected": -3.817275285720825, + "step": 1835 + }, + { + "epoch": 0.56, + "learning_rate": 2.4042431657749117e-05, + "logits/chosen": -1.4315189123153687, + "logits/rejected": -1.3628207445144653, + "logps/chosen": -207.71719360351562, + "logps/rejected": -248.83102416992188, + "loss": 0.3966, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6046200394630432, + "rewards/margins": 2.3360109329223633, + "rewards/rejected": -2.9406309127807617, + "step": 1840 + }, + { + "epoch": 0.56, + "learning_rate": 2.3909515315866605e-05, + "logits/chosen": -1.333467960357666, + "logits/rejected": -1.236061692237854, + "logps/chosen": -257.92413330078125, + "logps/rejected": -294.17901611328125, + "loss": 0.3582, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8103699684143066, + "rewards/margins": 3.270451784133911, + "rewards/rejected": -4.0808210372924805, + "step": 1845 + }, + { + "epoch": 0.56, + "learning_rate": 2.3776629850308354e-05, + "logits/chosen": -1.3392812013626099, + "logits/rejected": -1.299647331237793, + "logps/chosen": -239.8131103515625, + "logps/rejected": -303.4067077636719, + "loss": 0.3613, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6897146105766296, + "rewards/margins": 3.181128978729248, + "rewards/rejected": -3.8708438873291016, + "step": 1850 + }, + { + "epoch": 0.57, + "learning_rate": 2.364377902363499e-05, + "logits/chosen": -1.4630589485168457, + "logits/rejected": -1.3817940950393677, + "logps/chosen": -224.0995635986328, + "logps/rejected": -267.346435546875, + "loss": 0.4036, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.8380252122879028, + "rewards/margins": 2.4955790042877197, + "rewards/rejected": -3.333604097366333, + "step": 1855 + }, + { + "epoch": 0.57, + "learning_rate": 2.3510966597426354e-05, + "logits/chosen": -1.4201809167861938, + "logits/rejected": -1.3353426456451416, + "logps/chosen": -267.3838806152344, + "logps/rejected": -314.8360290527344, + "loss": 0.4379, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7916916608810425, + "rewards/margins": 3.1964616775512695, + "rewards/rejected": -3.9881534576416016, + "step": 1860 + }, + { + "epoch": 0.57, + "learning_rate": 2.3378196332174993e-05, + "logits/chosen": -1.4213106632232666, + "logits/rejected": -1.3859220743179321, + "logps/chosen": -195.9226837158203, + "logps/rejected": -256.7573547363281, + "loss": 0.3855, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.37132301926612854, + "rewards/margins": 2.312356472015381, + "rewards/rejected": -2.6836793422698975, + "step": 1865 + }, + { + "epoch": 0.57, + "learning_rate": 2.324547198717972e-05, + "logits/chosen": -1.4690866470336914, + "logits/rejected": -1.4080109596252441, + "logps/chosen": -241.17703247070312, + "logps/rejected": -302.3997802734375, + "loss": 0.363, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6016757488250732, + "rewards/margins": 2.820457696914673, + "rewards/rejected": -3.422133207321167, + "step": 1870 + }, + { + "epoch": 0.57, + "learning_rate": 2.311279732043912e-05, + "logits/chosen": -1.4260601997375488, + "logits/rejected": -1.3536970615386963, + "logps/chosen": -200.6278533935547, + "logps/rejected": -253.57763671875, + "loss": 0.4365, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.11818097531795502, + "rewards/margins": 2.7506301403045654, + "rewards/rejected": -2.8688108921051025, + "step": 1875 + }, + { + "epoch": 0.57, + "learning_rate": 2.2980176088545197e-05, + "logits/chosen": -1.4411252737045288, + "logits/rejected": -1.3734889030456543, + "logps/chosen": -226.0155029296875, + "logps/rejected": -277.5158996582031, + "loss": 0.3744, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.14310994744300842, + "rewards/margins": 3.070277452468872, + "rewards/rejected": -3.2133877277374268, + "step": 1880 + }, + { + "epoch": 0.57, + "learning_rate": 2.284761204657696e-05, + "logits/chosen": -1.4405572414398193, + "logits/rejected": -1.3739019632339478, + "logps/chosen": -213.6031494140625, + "logps/rejected": -278.64849853515625, + "loss": 0.3599, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1836298406124115, + "rewards/margins": 3.2234256267547607, + "rewards/rejected": -3.407055377960205, + "step": 1885 + }, + { + "epoch": 0.58, + "learning_rate": 2.2715108947994152e-05, + "logits/chosen": -1.4480597972869873, + "logits/rejected": -1.3346760272979736, + "logps/chosen": -242.326904296875, + "logps/rejected": -276.35223388671875, + "loss": 0.3833, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.040028151124715805, + "rewards/margins": 2.910196542739868, + "rewards/rejected": -2.9502246379852295, + "step": 1890 + }, + { + "epoch": 0.58, + "learning_rate": 2.258267054453091e-05, + "logits/chosen": -1.4914884567260742, + "logits/rejected": -1.3812105655670166, + "logps/chosen": -255.26089477539062, + "logps/rejected": -296.7649230957031, + "loss": 0.3651, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.10911808162927628, + "rewards/margins": 2.631997585296631, + "rewards/rejected": -2.7411160469055176, + "step": 1895 + }, + { + "epoch": 0.58, + "learning_rate": 2.2450300586089622e-05, + "logits/chosen": -1.5325438976287842, + "logits/rejected": -1.4326366186141968, + "logps/chosen": -233.3433837890625, + "logps/rejected": -272.3608703613281, + "loss": 0.378, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.07660797983407974, + "rewards/margins": 3.0263311862945557, + "rewards/rejected": -2.9497230052948, + "step": 1900 + }, + { + "epoch": 0.58, + "learning_rate": 2.2318002820634648e-05, + "logits/chosen": -1.5223352909088135, + "logits/rejected": -1.4486531019210815, + "logps/chosen": -251.2206268310547, + "logps/rejected": -311.48101806640625, + "loss": 0.3898, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.14458785951137543, + "rewards/margins": 2.646322250366211, + "rewards/rejected": -2.790910005569458, + "step": 1905 + }, + { + "epoch": 0.58, + "learning_rate": 2.218578099408631e-05, + "logits/chosen": -1.4956655502319336, + "logits/rejected": -1.4093170166015625, + "logps/chosen": -220.2262420654297, + "logps/rejected": -277.7985534667969, + "loss": 0.3647, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.09166286140680313, + "rewards/margins": 2.5863606929779053, + "rewards/rejected": -2.67802357673645, + "step": 1910 + }, + { + "epoch": 0.58, + "learning_rate": 2.2053638850214704e-05, + "logits/chosen": -1.4817699193954468, + "logits/rejected": -1.3774340152740479, + "logps/chosen": -248.29226684570312, + "logps/rejected": -301.8695068359375, + "loss": 0.3206, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.18610629439353943, + "rewards/margins": 2.9152894020080566, + "rewards/rejected": -3.101395606994629, + "step": 1915 + }, + { + "epoch": 0.59, + "learning_rate": 2.1921580130533827e-05, + "logits/chosen": -1.4281337261199951, + "logits/rejected": -1.3430382013320923, + "logps/chosen": -248.2852020263672, + "logps/rejected": -295.68255615234375, + "loss": 0.417, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4364057183265686, + "rewards/margins": 2.8607912063598633, + "rewards/rejected": -3.297196865081787, + "step": 1920 + }, + { + "epoch": 0.59, + "learning_rate": 2.178960857419556e-05, + "logits/chosen": -1.4959535598754883, + "logits/rejected": -1.3279250860214233, + "logps/chosen": -235.4945831298828, + "logps/rejected": -272.96612548828125, + "loss": 0.3668, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.3301815986633301, + "rewards/margins": 3.2808384895324707, + "rewards/rejected": -3.6110198497772217, + "step": 1925 + }, + { + "epoch": 0.59, + "learning_rate": 2.165772791788379e-05, + "logits/chosen": -1.3634750843048096, + "logits/rejected": -1.356945276260376, + "logps/chosen": -209.04238891601562, + "logps/rejected": -281.57305908203125, + "loss": 0.3819, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.26547056436538696, + "rewards/margins": 2.8412766456604004, + "rewards/rejected": -3.1067471504211426, + "step": 1930 + }, + { + "epoch": 0.59, + "learning_rate": 2.1525941895708663e-05, + "logits/chosen": -1.3758561611175537, + "logits/rejected": -1.3101266622543335, + "logps/chosen": -225.871826171875, + "logps/rejected": -287.9697265625, + "loss": 0.3345, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.41031938791275024, + "rewards/margins": 3.1809744834899902, + "rewards/rejected": -3.591294050216675, + "step": 1935 + }, + { + "epoch": 0.59, + "learning_rate": 2.1394254239100803e-05, + "logits/chosen": -1.4200907945632935, + "logits/rejected": -1.3337624073028564, + "logps/chosen": -238.2420196533203, + "logps/rejected": -286.61334228515625, + "loss": 0.3804, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5707160234451294, + "rewards/margins": 2.8512752056121826, + "rewards/rejected": -3.4219913482666016, + "step": 1940 + }, + { + "epoch": 0.59, + "learning_rate": 2.1262668676705695e-05, + "logits/chosen": -1.4637157917022705, + "logits/rejected": -1.3421003818511963, + "logps/chosen": -253.0985565185547, + "logps/rejected": -297.3470153808594, + "loss": 0.3698, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.4105305075645447, + "rewards/margins": 3.30517578125, + "rewards/rejected": -3.7157065868377686, + "step": 1945 + }, + { + "epoch": 0.59, + "learning_rate": 2.113118893427809e-05, + "logits/chosen": -1.4735605716705322, + "logits/rejected": -1.3585256338119507, + "logps/chosen": -256.94464111328125, + "logps/rejected": -304.25311279296875, + "loss": 0.3457, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.34792160987854004, + "rewards/margins": 3.4550399780273438, + "rewards/rejected": -3.8029613494873047, + "step": 1950 + }, + { + "epoch": 0.6, + "learning_rate": 2.0999818734576517e-05, + "logits/chosen": -1.5473016500473022, + "logits/rejected": -1.4243013858795166, + "logps/chosen": -231.41488647460938, + "logps/rejected": -275.14227294921875, + "loss": 0.3553, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.2229432314634323, + "rewards/margins": 3.035780191421509, + "rewards/rejected": -3.258723497390747, + "step": 1955 + }, + { + "epoch": 0.6, + "learning_rate": 2.0868561797257878e-05, + "logits/chosen": -1.3970296382904053, + "logits/rejected": -1.3482105731964111, + "logps/chosen": -229.5034942626953, + "logps/rejected": -291.2673645019531, + "loss": 0.351, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.32970067858695984, + "rewards/margins": 2.6836564540863037, + "rewards/rejected": -3.013357162475586, + "step": 1960 + }, + { + "epoch": 0.6, + "learning_rate": 2.0737421838772146e-05, + "logits/chosen": -1.3854676485061646, + "logits/rejected": -1.286738395690918, + "logps/chosen": -234.7396240234375, + "logps/rejected": -284.1507568359375, + "loss": 0.3544, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.48179665207862854, + "rewards/margins": 2.779966354370117, + "rewards/rejected": -3.261763095855713, + "step": 1965 + }, + { + "epoch": 0.6, + "learning_rate": 2.0606402572257084e-05, + "logits/chosen": -1.4137897491455078, + "logits/rejected": -1.313356637954712, + "logps/chosen": -248.8478546142578, + "logps/rejected": -297.3784484863281, + "loss": 0.402, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7029843330383301, + "rewards/margins": 2.836820602416992, + "rewards/rejected": -3.5398049354553223, + "step": 1970 + }, + { + "epoch": 0.6, + "learning_rate": 2.047550770743318e-05, + "logits/chosen": -1.4211134910583496, + "logits/rejected": -1.3550993204116821, + "logps/chosen": -244.33279418945312, + "logps/rejected": -308.0687561035156, + "loss": 0.3867, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.746593713760376, + "rewards/margins": 3.4636435508728027, + "rewards/rejected": -4.2102370262146, + "step": 1975 + }, + { + "epoch": 0.6, + "learning_rate": 2.034474095049855e-05, + "logits/chosen": -1.4394162893295288, + "logits/rejected": -1.337714433670044, + "logps/chosen": -249.66024780273438, + "logps/rejected": -289.6098937988281, + "loss": 0.3449, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6490699648857117, + "rewards/margins": 3.0966174602508545, + "rewards/rejected": -3.745687961578369, + "step": 1980 + }, + { + "epoch": 0.61, + "learning_rate": 2.021410600402404e-05, + "logits/chosen": -1.3536403179168701, + "logits/rejected": -1.2263944149017334, + "logps/chosen": -229.769775390625, + "logps/rejected": -286.17205810546875, + "loss": 0.3394, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8614643812179565, + "rewards/margins": 3.6408305168151855, + "rewards/rejected": -4.50229549407959, + "step": 1985 + }, + { + "epoch": 0.61, + "learning_rate": 2.008360656684837e-05, + "logits/chosen": -1.3974249362945557, + "logits/rejected": -1.3158893585205078, + "logps/chosen": -243.1596221923828, + "logps/rejected": -301.7734375, + "loss": 0.3899, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6392949819564819, + "rewards/margins": 3.155815362930298, + "rewards/rejected": -3.7951102256774902, + "step": 1990 + }, + { + "epoch": 0.61, + "learning_rate": 1.995324633397338e-05, + "logits/chosen": -1.5212452411651611, + "logits/rejected": -1.4078960418701172, + "logps/chosen": -231.8392333984375, + "logps/rejected": -280.28375244140625, + "loss": 0.4023, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.47973960638046265, + "rewards/margins": 3.4426467418670654, + "rewards/rejected": -3.922386884689331, + "step": 1995 + }, + { + "epoch": 0.61, + "learning_rate": 1.9823028996459486e-05, + "logits/chosen": -1.3417774438858032, + "logits/rejected": -1.255110502243042, + "logps/chosen": -240.9922332763672, + "logps/rejected": -289.61871337890625, + "loss": 0.3945, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3862563371658325, + "rewards/margins": 3.139626979827881, + "rewards/rejected": -3.525883436203003, + "step": 2000 + }, + { + "epoch": 0.61, + "learning_rate": 1.969295824132107e-05, + "logits/chosen": -1.3991576433181763, + "logits/rejected": -1.2910772562026978, + "logps/chosen": -210.6538848876953, + "logps/rejected": -255.54403686523438, + "loss": 0.3442, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.38305678963661194, + "rewards/margins": 2.7472267150878906, + "rewards/rejected": -3.1302833557128906, + "step": 2005 + }, + { + "epoch": 0.61, + "learning_rate": 1.956303775142217e-05, + "logits/chosen": -1.458589792251587, + "logits/rejected": -1.3729736804962158, + "logps/chosen": -236.9148406982422, + "logps/rejected": -294.9259338378906, + "loss": 0.3655, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.33419889211654663, + "rewards/margins": 3.009550094604492, + "rewards/rejected": -3.3437488079071045, + "step": 2010 + }, + { + "epoch": 0.61, + "learning_rate": 1.943327120537215e-05, + "logits/chosen": -1.4556093215942383, + "logits/rejected": -1.4194831848144531, + "logps/chosen": -230.87850952148438, + "logps/rejected": -290.84881591796875, + "loss": 0.3859, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5414482951164246, + "rewards/margins": 2.6852874755859375, + "rewards/rejected": -3.226736068725586, + "step": 2015 + }, + { + "epoch": 0.62, + "learning_rate": 1.9303662277421568e-05, + "logits/chosen": -1.408242106437683, + "logits/rejected": -1.3377116918563843, + "logps/chosen": -214.5132293701172, + "logps/rejected": -254.10140991210938, + "loss": 0.3929, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.29236260056495667, + "rewards/margins": 2.5391948223114014, + "rewards/rejected": -2.831557512283325, + "step": 2020 + }, + { + "epoch": 0.62, + "learning_rate": 1.9174214637358122e-05, + "logits/chosen": -1.4432224035263062, + "logits/rejected": -1.4354238510131836, + "logps/chosen": -205.9859619140625, + "logps/rejected": -270.5035705566406, + "loss": 0.4391, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5980446338653564, + "rewards/margins": 2.451856851577759, + "rewards/rejected": -3.0499014854431152, + "step": 2025 + }, + { + "epoch": 0.62, + "learning_rate": 1.9044931950402774e-05, + "logits/chosen": -1.4746092557907104, + "logits/rejected": -1.400431513786316, + "logps/chosen": -238.9920196533203, + "logps/rejected": -290.6305236816406, + "loss": 0.3919, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5437467098236084, + "rewards/margins": 2.831157684326172, + "rewards/rejected": -3.374904155731201, + "step": 2030 + }, + { + "epoch": 0.62, + "learning_rate": 1.8915817877105926e-05, + "logits/chosen": -1.523667335510254, + "logits/rejected": -1.418398380279541, + "logps/chosen": -237.66549682617188, + "logps/rejected": -272.54052734375, + "loss": 0.3978, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.42985543608665466, + "rewards/margins": 2.5595247745513916, + "rewards/rejected": -2.989380121231079, + "step": 2035 + }, + { + "epoch": 0.62, + "learning_rate": 1.878687607324382e-05, + "logits/chosen": -1.4313665628433228, + "logits/rejected": -1.3328959941864014, + "logps/chosen": -254.4265594482422, + "logps/rejected": -286.2435607910156, + "loss": 0.3886, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.38601452112197876, + "rewards/margins": 2.6175618171691895, + "rewards/rejected": -3.0035765171051025, + "step": 2040 + }, + { + "epoch": 0.62, + "learning_rate": 1.865811018971502e-05, + "logits/chosen": -1.5035260915756226, + "logits/rejected": -1.4406765699386597, + "logps/chosen": -235.6429443359375, + "logps/rejected": -284.11651611328125, + "loss": 0.3842, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.5510352849960327, + "rewards/margins": 2.69769024848938, + "rewards/rejected": -3.248725175857544, + "step": 2045 + }, + { + "epoch": 0.62, + "learning_rate": 1.852952387243698e-05, + "logits/chosen": -1.4961767196655273, + "logits/rejected": -1.431730031967163, + "logps/chosen": -216.1791229248047, + "logps/rejected": -258.23095703125, + "loss": 0.3778, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3444279730319977, + "rewards/margins": 2.5474395751953125, + "rewards/rejected": -2.8918673992156982, + "step": 2050 + }, + { + "epoch": 0.63, + "learning_rate": 1.840112076224291e-05, + "logits/chosen": -1.382947564125061, + "logits/rejected": -1.3435488939285278, + "logps/chosen": -227.3728485107422, + "logps/rejected": -301.82159423828125, + "loss": 0.3834, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5828854441642761, + "rewards/margins": 3.2627499103546143, + "rewards/rejected": -3.845635175704956, + "step": 2055 + }, + { + "epoch": 0.63, + "learning_rate": 1.8272904494778602e-05, + "logits/chosen": -1.416355013847351, + "logits/rejected": -1.408719539642334, + "logps/chosen": -234.11172485351562, + "logps/rejected": -281.99505615234375, + "loss": 0.4241, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5792425870895386, + "rewards/margins": 2.3325040340423584, + "rewards/rejected": -2.9117465019226074, + "step": 2060 + }, + { + "epoch": 0.63, + "learning_rate": 1.814487870039955e-05, + "logits/chosen": -1.419982671737671, + "logits/rejected": -1.3562277555465698, + "logps/chosen": -230.45144653320312, + "logps/rejected": -270.77203369140625, + "loss": 0.3769, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.5390575528144836, + "rewards/margins": 2.4156157970428467, + "rewards/rejected": -2.9546732902526855, + "step": 2065 + }, + { + "epoch": 0.63, + "learning_rate": 1.8017047004068105e-05, + "logits/chosen": -1.4043166637420654, + "logits/rejected": -1.3527730703353882, + "logps/chosen": -235.4857177734375, + "logps/rejected": -319.1287841796875, + "loss": 0.3844, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5465725660324097, + "rewards/margins": 3.312981367111206, + "rewards/rejected": -3.859553575515747, + "step": 2070 + }, + { + "epoch": 0.63, + "learning_rate": 1.7889413025250897e-05, + "logits/chosen": -1.4501091241836548, + "logits/rejected": -1.3656995296478271, + "logps/chosen": -248.06228637695312, + "logps/rejected": -298.1331787109375, + "loss": 0.3948, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6459044218063354, + "rewards/margins": 2.820582151412964, + "rewards/rejected": -3.466486692428589, + "step": 2075 + }, + { + "epoch": 0.63, + "learning_rate": 1.7761980377816287e-05, + "logits/chosen": -1.446747064590454, + "logits/rejected": -1.331923007965088, + "logps/chosen": -254.3787841796875, + "logps/rejected": -298.6913146972656, + "loss": 0.3788, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.58036208152771, + "rewards/margins": 3.1082186698913574, + "rewards/rejected": -3.688580274581909, + "step": 2080 + }, + { + "epoch": 0.64, + "learning_rate": 1.7634752669932115e-05, + "logits/chosen": -1.425175428390503, + "logits/rejected": -1.3348580598831177, + "logps/chosen": -231.65908813476562, + "logps/rejected": -274.4278869628906, + "loss": 0.4255, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.701018214225769, + "rewards/margins": 2.68603515625, + "rewards/rejected": -3.3870530128479004, + "step": 2085 + }, + { + "epoch": 0.64, + "learning_rate": 1.7507733503963457e-05, + "logits/chosen": -1.4499051570892334, + "logits/rejected": -1.3007951974868774, + "logps/chosen": -242.14804077148438, + "logps/rejected": -265.29937744140625, + "loss": 0.3743, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5920419096946716, + "rewards/margins": 2.760340690612793, + "rewards/rejected": -3.3523826599121094, + "step": 2090 + }, + { + "epoch": 0.64, + "learning_rate": 1.7380926476370702e-05, + "logits/chosen": -1.4862868785858154, + "logits/rejected": -1.3940832614898682, + "logps/chosen": -213.6471405029297, + "logps/rejected": -256.01214599609375, + "loss": 0.3692, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.5724049806594849, + "rewards/margins": 2.784367561340332, + "rewards/rejected": -3.3567726612091064, + "step": 2095 + }, + { + "epoch": 0.64, + "learning_rate": 1.725433517760768e-05, + "logits/chosen": -1.4317893981933594, + "logits/rejected": -1.2953369617462158, + "logps/chosen": -274.6922607421875, + "logps/rejected": -308.44354248046875, + "loss": 0.3591, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6146590113639832, + "rewards/margins": 3.500483751296997, + "rewards/rejected": -4.115141868591309, + "step": 2100 + }, + { + "epoch": 0.64, + "learning_rate": 1.7127963192019975e-05, + "logits/chosen": -1.3676774501800537, + "logits/rejected": -1.3125852346420288, + "logps/chosen": -214.765625, + "logps/rejected": -290.7088928222656, + "loss": 0.3849, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5840885043144226, + "rewards/margins": 3.0269782543182373, + "rewards/rejected": -3.6110668182373047, + "step": 2105 + }, + { + "epoch": 0.64, + "learning_rate": 1.7001814097743528e-05, + "logits/chosen": -1.4912971258163452, + "logits/rejected": -1.3937398195266724, + "logps/chosen": -243.7738037109375, + "logps/rejected": -289.00592041015625, + "loss": 0.3838, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7238431572914124, + "rewards/margins": 2.812650203704834, + "rewards/rejected": -3.5364933013916016, + "step": 2110 + }, + { + "epoch": 0.64, + "learning_rate": 1.6875891466603204e-05, + "logits/chosen": -1.4120972156524658, + "logits/rejected": -1.3030786514282227, + "logps/chosen": -245.34042358398438, + "logps/rejected": -291.8741149902344, + "loss": 0.3802, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5131228566169739, + "rewards/margins": 3.2208220958709717, + "rewards/rejected": -3.73394513130188, + "step": 2115 + }, + { + "epoch": 0.65, + "learning_rate": 1.675019886401177e-05, + "logits/chosen": -1.441261649131775, + "logits/rejected": -1.3036904335021973, + "logps/chosen": -252.41976928710938, + "logps/rejected": -283.91204833984375, + "loss": 0.3985, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.39861178398132324, + "rewards/margins": 2.945107936859131, + "rewards/rejected": -3.343719959259033, + "step": 2120 + }, + { + "epoch": 0.65, + "learning_rate": 1.6624739848868854e-05, + "logits/chosen": -1.4387789964675903, + "logits/rejected": -1.3071900606155396, + "logps/chosen": -241.0995635986328, + "logps/rejected": -269.37548828125, + "loss": 0.366, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.20480592548847198, + "rewards/margins": 3.1126081943511963, + "rewards/rejected": -3.3174140453338623, + "step": 2125 + }, + { + "epoch": 0.65, + "learning_rate": 1.6499517973460245e-05, + "logits/chosen": -1.3849633932113647, + "logits/rejected": -1.394295334815979, + "logps/chosen": -244.7119903564453, + "logps/rejected": -331.77960205078125, + "loss": 0.3654, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4295539855957031, + "rewards/margins": 3.0043957233428955, + "rewards/rejected": -3.4339497089385986, + "step": 2130 + }, + { + "epoch": 0.65, + "learning_rate": 1.6374536783357268e-05, + "logits/chosen": -1.3435999155044556, + "logits/rejected": -1.3118559122085571, + "logps/chosen": -228.97640991210938, + "logps/rejected": -318.62689208984375, + "loss": 0.3505, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2641984820365906, + "rewards/margins": 3.0577046871185303, + "rewards/rejected": -3.3219032287597656, + "step": 2135 + }, + { + "epoch": 0.65, + "learning_rate": 1.6249799817316415e-05, + "logits/chosen": -1.3465222120285034, + "logits/rejected": -1.2002493143081665, + "logps/chosen": -260.20477294921875, + "logps/rejected": -297.1739501953125, + "loss": 0.3054, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.2534390985965729, + "rewards/margins": 3.1337084770202637, + "rewards/rejected": -3.3871474266052246, + "step": 2140 + }, + { + "epoch": 0.65, + "learning_rate": 1.6125310607179133e-05, + "logits/chosen": -1.4271605014801025, + "logits/rejected": -1.4029309749603271, + "logps/chosen": -217.9933319091797, + "logps/rejected": -285.71282958984375, + "loss": 0.3688, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.2108803689479828, + "rewards/margins": 2.7702507972717285, + "rewards/rejected": -2.9811313152313232, + "step": 2145 + }, + { + "epoch": 0.66, + "learning_rate": 1.6001072677771843e-05, + "logits/chosen": -1.434211015701294, + "logits/rejected": -1.3352999687194824, + "logps/chosen": -256.20953369140625, + "logps/rejected": -302.0452575683594, + "loss": 0.3685, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.3069499433040619, + "rewards/margins": 3.477191925048828, + "rewards/rejected": -3.784142017364502, + "step": 2150 + }, + { + "epoch": 0.66, + "learning_rate": 1.5877089546806125e-05, + "logits/chosen": -1.3840197324752808, + "logits/rejected": -1.2724934816360474, + "logps/chosen": -230.0450439453125, + "logps/rejected": -274.7979431152344, + "loss": 0.3782, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3171940743923187, + "rewards/margins": 3.197904109954834, + "rewards/rejected": -3.5150985717773438, + "step": 2155 + }, + { + "epoch": 0.66, + "learning_rate": 1.5753364724779092e-05, + "logits/chosen": -1.4506622552871704, + "logits/rejected": -1.3692567348480225, + "logps/chosen": -231.21963500976562, + "logps/rejected": -269.173095703125, + "loss": 0.3128, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.2219165563583374, + "rewards/margins": 3.0431320667266846, + "rewards/rejected": -3.2650482654571533, + "step": 2160 + }, + { + "epoch": 0.66, + "learning_rate": 1.5629901714874056e-05, + "logits/chosen": -1.4132072925567627, + "logits/rejected": -1.301841378211975, + "logps/chosen": -215.6963348388672, + "logps/rejected": -276.54949951171875, + "loss": 0.4004, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4771305024623871, + "rewards/margins": 2.786186933517456, + "rewards/rejected": -3.263317584991455, + "step": 2165 + }, + { + "epoch": 0.66, + "learning_rate": 1.5506704012861256e-05, + "logits/chosen": -1.4663952589035034, + "logits/rejected": -1.4031254053115845, + "logps/chosen": -215.5591278076172, + "logps/rejected": -278.62933349609375, + "loss": 0.4011, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3410421907901764, + "rewards/margins": 2.8404176235198975, + "rewards/rejected": -3.181459903717041, + "step": 2170 + }, + { + "epoch": 0.66, + "learning_rate": 1.5383775106998976e-05, + "logits/chosen": -1.3863542079925537, + "logits/rejected": -1.353366494178772, + "logps/chosen": -219.19192504882812, + "logps/rejected": -289.38018798828125, + "loss": 0.4094, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5598834156990051, + "rewards/margins": 2.8867409229278564, + "rewards/rejected": -3.4466240406036377, + "step": 2175 + }, + { + "epoch": 0.66, + "learning_rate": 1.5261118477934645e-05, + "logits/chosen": -1.3812984228134155, + "logits/rejected": -1.2817041873931885, + "logps/chosen": -208.2887725830078, + "logps/rejected": -252.01492309570312, + "loss": 0.3853, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.492426335811615, + "rewards/margins": 2.9170594215393066, + "rewards/rejected": -3.4094855785369873, + "step": 2180 + }, + { + "epoch": 0.67, + "learning_rate": 1.5138737598606448e-05, + "logits/chosen": -1.4833437204360962, + "logits/rejected": -1.3331998586654663, + "logps/chosen": -275.10308837890625, + "logps/rejected": -283.90399169921875, + "loss": 0.4228, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.29744550585746765, + "rewards/margins": 2.7555899620056152, + "rewards/rejected": -3.0530357360839844, + "step": 2185 + }, + { + "epoch": 0.67, + "learning_rate": 1.5016635934144824e-05, + "logits/chosen": -1.4359524250030518, + "logits/rejected": -1.3323842287063599, + "logps/chosen": -227.65158081054688, + "logps/rejected": -264.79144287109375, + "loss": 0.3748, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.48231664299964905, + "rewards/margins": 2.5819931030273438, + "rewards/rejected": -3.06430983543396, + "step": 2190 + }, + { + "epoch": 0.67, + "learning_rate": 1.4894816941774508e-05, + "logits/chosen": -1.4607924222946167, + "logits/rejected": -1.3200831413269043, + "logps/chosen": -224.31387329101562, + "logps/rejected": -267.02325439453125, + "loss": 0.4454, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3082335591316223, + "rewards/margins": 2.6731209754943848, + "rewards/rejected": -2.9813544750213623, + "step": 2195 + }, + { + "epoch": 0.67, + "learning_rate": 1.4773284070716503e-05, + "logits/chosen": -1.5401244163513184, + "logits/rejected": -1.4368436336517334, + "logps/chosen": -232.74081420898438, + "logps/rejected": -272.65423583984375, + "loss": 0.4142, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3758106231689453, + "rewards/margins": 2.910980701446533, + "rewards/rejected": -3.2867913246154785, + "step": 2200 + }, + { + "epoch": 0.67, + "learning_rate": 1.4652040762090541e-05, + "logits/chosen": -1.4225276708602905, + "logits/rejected": -1.3527642488479614, + "logps/chosen": -230.27249145507812, + "logps/rejected": -273.2910461425781, + "loss": 0.3982, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.14205607771873474, + "rewards/margins": 2.690882682800293, + "rewards/rejected": -2.8329386711120605, + "step": 2205 + }, + { + "epoch": 0.67, + "learning_rate": 1.4531090448817558e-05, + "logits/chosen": -1.47641921043396, + "logits/rejected": -1.4135901927947998, + "logps/chosen": -233.48486328125, + "logps/rejected": -287.6616516113281, + "loss": 0.3689, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.20316210389137268, + "rewards/margins": 3.0185022354125977, + "rewards/rejected": -3.2216639518737793, + "step": 2210 + }, + { + "epoch": 0.68, + "learning_rate": 1.4410436555522522e-05, + "logits/chosen": -1.4257131814956665, + "logits/rejected": -1.275315761566162, + "logps/chosen": -252.2233123779297, + "logps/rejected": -284.6685791015625, + "loss": 0.2998, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.012933698482811451, + "rewards/margins": 3.136654853820801, + "rewards/rejected": -3.1495883464813232, + "step": 2215 + }, + { + "epoch": 0.68, + "learning_rate": 1.4290082498437515e-05, + "logits/chosen": -1.466830849647522, + "logits/rejected": -1.326336145401001, + "logps/chosen": -245.9455108642578, + "logps/rejected": -305.823486328125, + "loss": 0.3603, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.14640632271766663, + "rewards/margins": 3.019836902618408, + "rewards/rejected": -3.166242837905884, + "step": 2220 + }, + { + "epoch": 0.68, + "learning_rate": 1.4170031685304913e-05, + "logits/chosen": -1.408921241760254, + "logits/rejected": -1.3455395698547363, + "logps/chosen": -232.0191192626953, + "logps/rejected": -277.630859375, + "loss": 0.3774, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4036007821559906, + "rewards/margins": 2.6838150024414062, + "rewards/rejected": -3.0874156951904297, + "step": 2225 + }, + { + "epoch": 0.68, + "learning_rate": 1.405028751528099e-05, + "logits/chosen": -1.4571171998977661, + "logits/rejected": -1.3236163854599, + "logps/chosen": -253.90609741210938, + "logps/rejected": -288.3222961425781, + "loss": 0.3519, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.03257422894239426, + "rewards/margins": 3.569223403930664, + "rewards/rejected": -3.601797580718994, + "step": 2230 + }, + { + "epoch": 0.68, + "learning_rate": 1.3930853378839603e-05, + "logits/chosen": -1.4818923473358154, + "logits/rejected": -1.4186928272247314, + "logps/chosen": -258.0881652832031, + "logps/rejected": -309.25738525390625, + "loss": 0.3378, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2946815490722656, + "rewards/margins": 3.1481051445007324, + "rewards/rejected": -3.4427871704101562, + "step": 2235 + }, + { + "epoch": 0.68, + "learning_rate": 1.381173265767623e-05, + "logits/chosen": -1.3591539859771729, + "logits/rejected": -1.2734744548797607, + "logps/chosen": -231.13619995117188, + "logps/rejected": -265.5144958496094, + "loss": 0.4036, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3812045454978943, + "rewards/margins": 3.0088858604431152, + "rewards/rejected": -3.3900904655456543, + "step": 2240 + }, + { + "epoch": 0.68, + "learning_rate": 1.3692928724612203e-05, + "logits/chosen": -1.4619818925857544, + "logits/rejected": -1.4295190572738647, + "logps/chosen": -249.37570190429688, + "logps/rejected": -324.696533203125, + "loss": 0.3229, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.17562466859817505, + "rewards/margins": 3.0519189834594727, + "rewards/rejected": -3.227544069290161, + "step": 2245 + }, + { + "epoch": 0.69, + "learning_rate": 1.357444494349924e-05, + "logits/chosen": -1.4158533811569214, + "logits/rejected": -1.3473224639892578, + "logps/chosen": -236.3145751953125, + "logps/rejected": -283.5877380371094, + "loss": 0.3509, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.2872604727745056, + "rewards/margins": 2.891709089279175, + "rewards/rejected": -3.178969383239746, + "step": 2250 + }, + { + "epoch": 0.69, + "learning_rate": 1.3456284669124158e-05, + "logits/chosen": -1.439143419265747, + "logits/rejected": -1.357162356376648, + "logps/chosen": -251.6182098388672, + "logps/rejected": -325.0864562988281, + "loss": 0.388, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.33754006028175354, + "rewards/margins": 3.1689419746398926, + "rewards/rejected": -3.5064823627471924, + "step": 2255 + }, + { + "epoch": 0.69, + "learning_rate": 1.3338451247113897e-05, + "logits/chosen": -1.3981112241744995, + "logits/rejected": -1.311858892440796, + "logps/chosen": -210.9602508544922, + "logps/rejected": -260.03875732421875, + "loss": 0.3967, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.4606190621852875, + "rewards/margins": 2.981788396835327, + "rewards/rejected": -3.4424071311950684, + "step": 2260 + }, + { + "epoch": 0.69, + "learning_rate": 1.3220948013840808e-05, + "logits/chosen": -1.3882707357406616, + "logits/rejected": -1.3152581453323364, + "logps/chosen": -204.07455444335938, + "logps/rejected": -260.4723205566406, + "loss": 0.3922, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5220751762390137, + "rewards/margins": 2.884295701980591, + "rewards/rejected": -3.4063706398010254, + "step": 2265 + }, + { + "epoch": 0.69, + "learning_rate": 1.310377829632818e-05, + "logits/chosen": -1.4722161293029785, + "logits/rejected": -1.382716178894043, + "logps/chosen": -255.38790893554688, + "logps/rejected": -315.30548095703125, + "loss": 0.3716, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.427694708108902, + "rewards/margins": 3.0956382751464844, + "rewards/rejected": -3.5233330726623535, + "step": 2270 + }, + { + "epoch": 0.69, + "learning_rate": 1.2986945412156038e-05, + "logits/chosen": -1.4732109308242798, + "logits/rejected": -1.3891632556915283, + "logps/chosen": -267.89892578125, + "logps/rejected": -326.42291259765625, + "loss": 0.3559, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5320366621017456, + "rewards/margins": 3.3837790489196777, + "rewards/rejected": -3.915815830230713, + "step": 2275 + }, + { + "epoch": 0.69, + "learning_rate": 1.2870452669367155e-05, + "logits/chosen": -1.3830268383026123, + "logits/rejected": -1.2808607816696167, + "logps/chosen": -247.59646606445312, + "logps/rejected": -309.2234802246094, + "loss": 0.3803, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5355652570724487, + "rewards/margins": 2.929332733154297, + "rewards/rejected": -3.4648983478546143, + "step": 2280 + }, + { + "epoch": 0.7, + "learning_rate": 1.2754303366373504e-05, + "logits/chosen": -1.5191096067428589, + "logits/rejected": -1.3283154964447021, + "logps/chosen": -246.34066772460938, + "logps/rejected": -265.673828125, + "loss": 0.3481, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.40870028734207153, + "rewards/margins": 3.1531982421875, + "rewards/rejected": -3.5618984699249268, + "step": 2285 + }, + { + "epoch": 0.7, + "learning_rate": 1.263850079186274e-05, + "logits/chosen": -1.4764816761016846, + "logits/rejected": -1.430755853652954, + "logps/chosen": -212.9180145263672, + "logps/rejected": -274.1111145019531, + "loss": 0.3783, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.45547690987586975, + "rewards/margins": 2.7248294353485107, + "rewards/rejected": -3.1803061962127686, + "step": 2290 + }, + { + "epoch": 0.7, + "learning_rate": 1.2523048224705186e-05, + "logits/chosen": -1.4630491733551025, + "logits/rejected": -1.3331501483917236, + "logps/chosen": -270.1141052246094, + "logps/rejected": -308.35821533203125, + "loss": 0.3684, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.3026786148548126, + "rewards/margins": 3.3436999320983887, + "rewards/rejected": -3.646378755569458, + "step": 2295 + }, + { + "epoch": 0.7, + "learning_rate": 1.2407948933860892e-05, + "logits/chosen": -1.496614694595337, + "logits/rejected": -1.4215686321258545, + "logps/chosen": -235.4622039794922, + "logps/rejected": -290.6049499511719, + "loss": 0.4294, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6504599452018738, + "rewards/margins": 2.79489803314209, + "rewards/rejected": -3.4453582763671875, + "step": 2300 + }, + { + "epoch": 0.7, + "learning_rate": 1.2293206178287184e-05, + "logits/chosen": -1.3421580791473389, + "logits/rejected": -1.217355489730835, + "logps/chosen": -213.7441864013672, + "logps/rejected": -239.513671875, + "loss": 0.3678, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2297484427690506, + "rewards/margins": 2.904219150543213, + "rewards/rejected": -3.133967399597168, + "step": 2305 + }, + { + "epoch": 0.7, + "learning_rate": 1.2178823206846302e-05, + "logits/chosen": -1.4291033744812012, + "logits/rejected": -1.324205994606018, + "logps/chosen": -240.22793579101562, + "logps/rejected": -290.51068115234375, + "loss": 0.4002, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.3026602566242218, + "rewards/margins": 3.2725844383239746, + "rewards/rejected": -3.575244903564453, + "step": 2310 + }, + { + "epoch": 0.71, + "learning_rate": 1.2064803258213476e-05, + "logits/chosen": -1.4654659032821655, + "logits/rejected": -1.4113140106201172, + "logps/chosen": -231.51815795898438, + "logps/rejected": -281.7957458496094, + "loss": 0.4204, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.29831621050834656, + "rewards/margins": 2.721684694290161, + "rewards/rejected": -3.02000093460083, + "step": 2315 + }, + { + "epoch": 0.71, + "learning_rate": 1.1951149560785167e-05, + "logits/chosen": -1.4634226560592651, + "logits/rejected": -1.3689024448394775, + "logps/chosen": -229.8677215576172, + "logps/rejected": -284.0724792480469, + "loss": 0.3761, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.2706076502799988, + "rewards/margins": 3.476905345916748, + "rewards/rejected": -3.7475128173828125, + "step": 2320 + }, + { + "epoch": 0.71, + "learning_rate": 1.18378653325877e-05, + "logits/chosen": -1.4571824073791504, + "logits/rejected": -1.3364005088806152, + "logps/chosen": -261.83563232421875, + "logps/rejected": -289.01141357421875, + "loss": 0.3223, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2980685830116272, + "rewards/margins": 3.3560166358947754, + "rewards/rejected": -3.654085159301758, + "step": 2325 + }, + { + "epoch": 0.71, + "learning_rate": 1.1724953781186116e-05, + "logits/chosen": -1.459835410118103, + "logits/rejected": -1.3399070501327515, + "logps/chosen": -265.9317626953125, + "logps/rejected": -301.6705017089844, + "loss": 0.3793, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3601319193840027, + "rewards/margins": 2.9270853996276855, + "rewards/rejected": -3.287217378616333, + "step": 2330 + }, + { + "epoch": 0.71, + "learning_rate": 1.16124181035934e-05, + "logits/chosen": -1.4324496984481812, + "logits/rejected": -1.3354243040084839, + "logps/chosen": -213.58486938476562, + "logps/rejected": -264.10089111328125, + "loss": 0.3614, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3313239514827728, + "rewards/margins": 3.0802438259124756, + "rewards/rejected": -3.4115676879882812, + "step": 2335 + }, + { + "epoch": 0.71, + "learning_rate": 1.15002614861799e-05, + "logits/chosen": -1.4642277956008911, + "logits/rejected": -1.3924884796142578, + "logps/chosen": -223.7937774658203, + "logps/rejected": -284.98785400390625, + "loss": 0.3665, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4326489567756653, + "rewards/margins": 3.1457138061523438, + "rewards/rejected": -3.578362226486206, + "step": 2340 + }, + { + "epoch": 0.71, + "learning_rate": 1.138848710458314e-05, + "logits/chosen": -1.5061982870101929, + "logits/rejected": -1.4258203506469727, + "logps/chosen": -210.1097869873047, + "logps/rejected": -278.6955261230469, + "loss": 0.3572, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.2931816875934601, + "rewards/margins": 3.354383945465088, + "rewards/rejected": -3.6475658416748047, + "step": 2345 + }, + { + "epoch": 0.72, + "learning_rate": 1.1277098123617922e-05, + "logits/chosen": -1.4186168909072876, + "logits/rejected": -1.2714909315109253, + "logps/chosen": -264.96453857421875, + "logps/rejected": -298.23052978515625, + "loss": 0.382, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5977450013160706, + "rewards/margins": 2.8557708263397217, + "rewards/rejected": -3.4535155296325684, + "step": 2350 + }, + { + "epoch": 0.72, + "learning_rate": 1.1166097697186654e-05, + "logits/chosen": -1.3840543031692505, + "logits/rejected": -1.3661469221115112, + "logps/chosen": -230.8970947265625, + "logps/rejected": -278.3932800292969, + "loss": 0.3896, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.44915327429771423, + "rewards/margins": 2.8046319484710693, + "rewards/rejected": -3.2537853717803955, + "step": 2355 + }, + { + "epoch": 0.72, + "learning_rate": 1.1055488968190145e-05, + "logits/chosen": -1.436781644821167, + "logits/rejected": -1.30691397190094, + "logps/chosen": -255.84805297851562, + "logps/rejected": -299.69842529296875, + "loss": 0.3509, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.25631824135780334, + "rewards/margins": 3.2734789848327637, + "rewards/rejected": -3.529797315597534, + "step": 2360 + }, + { + "epoch": 0.72, + "learning_rate": 1.094527506843849e-05, + "logits/chosen": -1.5180524587631226, + "logits/rejected": -1.3637323379516602, + "logps/chosen": -235.41012573242188, + "logps/rejected": -272.5983581542969, + "loss": 0.3703, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5200671553611755, + "rewards/margins": 2.5616183280944824, + "rewards/rejected": -3.0816853046417236, + "step": 2365 + }, + { + "epoch": 0.72, + "learning_rate": 1.083545911856253e-05, + "logits/chosen": -1.410651683807373, + "logits/rejected": -1.3152965307235718, + "logps/chosen": -250.3278045654297, + "logps/rejected": -304.0352783203125, + "loss": 0.3922, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4448067545890808, + "rewards/margins": 2.8111159801483154, + "rewards/rejected": -3.255922794342041, + "step": 2370 + }, + { + "epoch": 0.72, + "learning_rate": 1.0726044227925381e-05, + "logits/chosen": -1.3654406070709229, + "logits/rejected": -1.3672550916671753, + "logps/chosen": -222.97714233398438, + "logps/rejected": -298.3196105957031, + "loss": 0.3789, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3055039942264557, + "rewards/margins": 3.0886244773864746, + "rewards/rejected": -3.3941283226013184, + "step": 2375 + }, + { + "epoch": 0.73, + "learning_rate": 1.0617033494534486e-05, + "logits/chosen": -1.4127416610717773, + "logits/rejected": -1.365252137184143, + "logps/chosen": -248.2038116455078, + "logps/rejected": -304.67987060546875, + "loss": 0.399, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.549986720085144, + "rewards/margins": 2.9103312492370605, + "rewards/rejected": -3.460318088531494, + "step": 2380 + }, + { + "epoch": 0.73, + "learning_rate": 1.0508430004953821e-05, + "logits/chosen": -1.4619853496551514, + "logits/rejected": -1.3127577304840088, + "logps/chosen": -258.4478454589844, + "logps/rejected": -284.666748046875, + "loss": 0.3749, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4515061378479004, + "rewards/margins": 3.1675283908843994, + "rewards/rejected": -3.6190345287323, + "step": 2385 + }, + { + "epoch": 0.73, + "learning_rate": 1.0400236834216528e-05, + "logits/chosen": -1.4262062311172485, + "logits/rejected": -1.3755922317504883, + "logps/chosen": -217.9772186279297, + "logps/rejected": -275.2790222167969, + "loss": 0.3543, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.38236337900161743, + "rewards/margins": 2.9082367420196533, + "rewards/rejected": -3.290599822998047, + "step": 2390 + }, + { + "epoch": 0.73, + "learning_rate": 1.0292457045737895e-05, + "logits/chosen": -1.4641517400741577, + "logits/rejected": -1.3777649402618408, + "logps/chosen": -244.06765747070312, + "logps/rejected": -297.1828308105469, + "loss": 0.3964, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5129659175872803, + "rewards/margins": 2.698361873626709, + "rewards/rejected": -3.2113280296325684, + "step": 2395 + }, + { + "epoch": 0.73, + "learning_rate": 1.0185093691228534e-05, + "logits/chosen": -1.4218064546585083, + "logits/rejected": -1.3444823026657104, + "logps/chosen": -255.1289520263672, + "logps/rejected": -323.5635986328125, + "loss": 0.3172, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.28663378953933716, + "rewards/margins": 3.552114963531494, + "rewards/rejected": -3.8387484550476074, + "step": 2400 + }, + { + "epoch": 0.73, + "learning_rate": 1.0078149810608028e-05, + "logits/chosen": -1.3803118467330933, + "logits/rejected": -1.3066353797912598, + "logps/chosen": -256.8506164550781, + "logps/rejected": -305.9026794433594, + "loss": 0.4005, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5916983485221863, + "rewards/margins": 3.0665206909179688, + "rewards/rejected": -3.6582188606262207, + "step": 2405 + }, + { + "epoch": 0.73, + "learning_rate": 9.971628431918845e-06, + "logits/chosen": -1.4089895486831665, + "logits/rejected": -1.327468752861023, + "logps/chosen": -245.14462280273438, + "logps/rejected": -303.9306945800781, + "loss": 0.3771, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.32407158613204956, + "rewards/margins": 3.180250883102417, + "rewards/rejected": -3.5043225288391113, + "step": 2410 + }, + { + "epoch": 0.74, + "learning_rate": 9.865532571240615e-06, + "logits/chosen": -1.3610029220581055, + "logits/rejected": -1.289146900177002, + "logps/chosen": -259.354248046875, + "logps/rejected": -309.7987060546875, + "loss": 0.326, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.3622656464576721, + "rewards/margins": 3.3705601692199707, + "rewards/rejected": -3.732825756072998, + "step": 2415 + }, + { + "epoch": 0.74, + "learning_rate": 9.759865232604692e-06, + "logits/chosen": -1.433051347732544, + "logits/rejected": -1.3162428140640259, + "logps/chosen": -236.21365356445312, + "logps/rejected": -268.08502197265625, + "loss": 0.389, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.32003462314605713, + "rewards/margins": 3.065013885498047, + "rewards/rejected": -3.3850486278533936, + "step": 2420 + }, + { + "epoch": 0.74, + "learning_rate": 9.654629407909163e-06, + "logits/chosen": -1.4370791912078857, + "logits/rejected": -1.3018367290496826, + "logps/chosen": -244.87490844726562, + "logps/rejected": -300.98974609375, + "loss": 0.3731, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.700079083442688, + "rewards/margins": 2.8450570106506348, + "rewards/rejected": -3.5451362133026123, + "step": 2425 + }, + { + "epoch": 0.74, + "learning_rate": 9.549828076834033e-06, + "logits/chosen": -1.373586893081665, + "logits/rejected": -1.263668179512024, + "logps/chosen": -251.5384063720703, + "logps/rejected": -301.0669250488281, + "loss": 0.3579, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4735511839389801, + "rewards/margins": 3.3861241340637207, + "rewards/rejected": -3.859675168991089, + "step": 2430 + }, + { + "epoch": 0.74, + "learning_rate": 9.44546420675698e-06, + "logits/chosen": -1.3819353580474854, + "logits/rejected": -1.2966265678405762, + "logps/chosen": -228.5697021484375, + "logps/rejected": -270.13189697265625, + "loss": 0.3729, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4449302554130554, + "rewards/margins": 2.756574869155884, + "rewards/rejected": -3.201505184173584, + "step": 2435 + }, + { + "epoch": 0.74, + "learning_rate": 9.341540752669235e-06, + "logits/chosen": -1.4071505069732666, + "logits/rejected": -1.3509398698806763, + "logps/chosen": -209.3730926513672, + "logps/rejected": -246.5635223388672, + "loss": 0.3756, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5815329551696777, + "rewards/margins": 2.4732906818389893, + "rewards/rejected": -3.054823398590088, + "step": 2440 + }, + { + "epoch": 0.75, + "learning_rate": 9.238060657091988e-06, + "logits/chosen": -1.337214708328247, + "logits/rejected": -1.3006173372268677, + "logps/chosen": -230.82327270507812, + "logps/rejected": -289.1628112792969, + "loss": 0.3277, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.547378420829773, + "rewards/margins": 3.1115758419036865, + "rewards/rejected": -3.658954620361328, + "step": 2445 + }, + { + "epoch": 0.75, + "learning_rate": 9.135026849992984e-06, + "logits/chosen": -1.4462355375289917, + "logits/rejected": -1.3543756008148193, + "logps/chosen": -229.1552276611328, + "logps/rejected": -260.8125915527344, + "loss": 0.3562, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.49785342812538147, + "rewards/margins": 2.897026538848877, + "rewards/rejected": -3.3948798179626465, + "step": 2450 + }, + { + "epoch": 0.75, + "learning_rate": 9.032442248703666e-06, + "logits/chosen": -1.508615493774414, + "logits/rejected": -1.3623030185699463, + "logps/chosen": -238.5448760986328, + "logps/rejected": -265.12078857421875, + "loss": 0.3402, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.407143771648407, + "rewards/margins": 3.078805446624756, + "rewards/rejected": -3.485949754714966, + "step": 2455 + }, + { + "epoch": 0.75, + "learning_rate": 8.930309757836517e-06, + "logits/chosen": -1.4162170886993408, + "logits/rejected": -1.3703190088272095, + "logps/chosen": -212.07846069335938, + "logps/rejected": -300.2138671875, + "loss": 0.3221, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.48511773347854614, + "rewards/margins": 3.757521867752075, + "rewards/rejected": -4.242639064788818, + "step": 2460 + }, + { + "epoch": 0.75, + "learning_rate": 8.828632269202803e-06, + "logits/chosen": -1.5403014421463013, + "logits/rejected": -1.3916822671890259, + "logps/chosen": -260.9053649902344, + "logps/rejected": -289.03033447265625, + "loss": 0.3271, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4829959273338318, + "rewards/margins": 2.945845365524292, + "rewards/rejected": -3.4288413524627686, + "step": 2465 + }, + { + "epoch": 0.75, + "learning_rate": 8.727412661730724e-06, + "logits/chosen": -1.4161503314971924, + "logits/rejected": -1.3081133365631104, + "logps/chosen": -225.17373657226562, + "logps/rejected": -309.2181091308594, + "loss": 0.3697, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.393091082572937, + "rewards/margins": 3.462285280227661, + "rewards/rejected": -3.8553764820098877, + "step": 2470 + }, + { + "epoch": 0.75, + "learning_rate": 8.626653801383885e-06, + "logits/chosen": -1.3759651184082031, + "logits/rejected": -1.3239902257919312, + "logps/chosen": -208.36666870117188, + "logps/rejected": -270.0406799316406, + "loss": 0.3714, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4686359763145447, + "rewards/margins": 3.1771905422210693, + "rewards/rejected": -3.645826816558838, + "step": 2475 + }, + { + "epoch": 0.76, + "learning_rate": 8.526358541080173e-06, + "logits/chosen": -1.3998143672943115, + "logits/rejected": -1.286664366722107, + "logps/chosen": -270.0042419433594, + "logps/rejected": -293.59649658203125, + "loss": 0.376, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6333631873130798, + "rewards/margins": 2.6090807914733887, + "rewards/rejected": -3.242443799972534, + "step": 2480 + }, + { + "epoch": 0.76, + "learning_rate": 8.426529720610934e-06, + "logits/chosen": -1.4069288969039917, + "logits/rejected": -1.3146326541900635, + "logps/chosen": -269.93084716796875, + "logps/rejected": -314.04022216796875, + "loss": 0.375, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5928846597671509, + "rewards/margins": 3.069899797439575, + "rewards/rejected": -3.6627845764160156, + "step": 2485 + }, + { + "epoch": 0.76, + "learning_rate": 8.327170166560605e-06, + "logits/chosen": -1.5114113092422485, + "logits/rejected": -1.3770763874053955, + "logps/chosen": -245.75100708007812, + "logps/rejected": -278.27679443359375, + "loss": 0.4151, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5059057474136353, + "rewards/margins": 2.6279234886169434, + "rewards/rejected": -3.1338295936584473, + "step": 2490 + }, + { + "epoch": 0.76, + "learning_rate": 8.228282692226652e-06, + "logits/chosen": -1.4327267408370972, + "logits/rejected": -1.385161280632019, + "logps/chosen": -255.29782104492188, + "logps/rejected": -335.4405212402344, + "loss": 0.3429, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.5459723472595215, + "rewards/margins": 3.3763973712921143, + "rewards/rejected": -3.9223697185516357, + "step": 2495 + }, + { + "epoch": 0.76, + "learning_rate": 8.129870097539951e-06, + "logits/chosen": -1.361555814743042, + "logits/rejected": -1.3569921255111694, + "logps/chosen": -235.18258666992188, + "logps/rejected": -307.37359619140625, + "loss": 0.3515, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5130782127380371, + "rewards/margins": 3.2528796195983887, + "rewards/rejected": -3.7659575939178467, + "step": 2500 + }, + { + "epoch": 0.76, + "learning_rate": 8.03193516898547e-06, + "logits/chosen": -1.401824712753296, + "logits/rejected": -1.3137303590774536, + "logps/chosen": -265.5319519042969, + "logps/rejected": -333.13201904296875, + "loss": 0.3844, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.23906204104423523, + "rewards/margins": 3.3118507862091064, + "rewards/rejected": -3.550912380218506, + "step": 2505 + }, + { + "epoch": 0.77, + "learning_rate": 7.934480679523395e-06, + "logits/chosen": -1.4119746685028076, + "logits/rejected": -1.2916350364685059, + "logps/chosen": -229.8092041015625, + "logps/rejected": -273.4209899902344, + "loss": 0.3693, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.36957868933677673, + "rewards/margins": 3.204549789428711, + "rewards/rejected": -3.5741286277770996, + "step": 2510 + }, + { + "epoch": 0.77, + "learning_rate": 7.837509388510611e-06, + "logits/chosen": -1.4083707332611084, + "logits/rejected": -1.2871843576431274, + "logps/chosen": -256.2283935546875, + "logps/rejected": -289.46746826171875, + "loss": 0.3276, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.42285409569740295, + "rewards/margins": 3.3278889656066895, + "rewards/rejected": -3.7507431507110596, + "step": 2515 + }, + { + "epoch": 0.77, + "learning_rate": 7.741024041622557e-06, + "logits/chosen": -1.4926209449768066, + "logits/rejected": -1.3721725940704346, + "logps/chosen": -227.1091766357422, + "logps/rejected": -244.8163299560547, + "loss": 0.3926, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2877279818058014, + "rewards/margins": 2.691335439682007, + "rewards/rejected": -2.9790635108947754, + "step": 2520 + }, + { + "epoch": 0.77, + "learning_rate": 7.645027370775526e-06, + "logits/chosen": -1.4789535999298096, + "logits/rejected": -1.3919769525527954, + "logps/chosen": -239.02194213867188, + "logps/rejected": -276.45062255859375, + "loss": 0.4188, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7315130233764648, + "rewards/margins": 2.4713168144226074, + "rewards/rejected": -3.2028298377990723, + "step": 2525 + }, + { + "epoch": 0.77, + "learning_rate": 7.54952209404926e-06, + "logits/chosen": -1.4852955341339111, + "logits/rejected": -1.3242584466934204, + "logps/chosen": -252.9635467529297, + "logps/rejected": -278.1438293457031, + "loss": 0.3786, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.439079612493515, + "rewards/margins": 3.1143195629119873, + "rewards/rejected": -3.5533993244171143, + "step": 2530 + }, + { + "epoch": 0.77, + "learning_rate": 7.454510915610019e-06, + "logits/chosen": -1.455822229385376, + "logits/rejected": -1.3178565502166748, + "logps/chosen": -249.8934326171875, + "logps/rejected": -284.191650390625, + "loss": 0.3985, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.43874090909957886, + "rewards/margins": 3.0221173763275146, + "rewards/rejected": -3.460858106613159, + "step": 2535 + }, + { + "epoch": 0.77, + "learning_rate": 7.359996525634011e-06, + "logits/chosen": -1.4768798351287842, + "logits/rejected": -1.3438224792480469, + "logps/chosen": -249.227294921875, + "logps/rejected": -284.577392578125, + "loss": 0.4065, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.46533918380737305, + "rewards/margins": 2.8477797508239746, + "rewards/rejected": -3.3131186962127686, + "step": 2540 + }, + { + "epoch": 0.78, + "learning_rate": 7.265981600231234e-06, + "logits/chosen": -1.5325887203216553, + "logits/rejected": -1.3816581964492798, + "logps/chosen": -246.61160278320312, + "logps/rejected": -293.80322265625, + "loss": 0.3348, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4386494755744934, + "rewards/margins": 3.20097279548645, + "rewards/rejected": -3.639622211456299, + "step": 2545 + }, + { + "epoch": 0.78, + "learning_rate": 7.172468801369669e-06, + "logits/chosen": -1.5481603145599365, + "logits/rejected": -1.4134643077850342, + "logps/chosen": -220.052490234375, + "logps/rejected": -261.2530822753906, + "loss": 0.3884, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.42949026823043823, + "rewards/margins": 2.8239688873291016, + "rewards/rejected": -3.2534592151641846, + "step": 2550 + }, + { + "epoch": 0.78, + "learning_rate": 7.07946077679994e-06, + "logits/chosen": -1.5553944110870361, + "logits/rejected": -1.4415475130081177, + "logps/chosen": -230.6360626220703, + "logps/rejected": -284.0323181152344, + "loss": 0.355, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.41859936714172363, + "rewards/margins": 2.953322410583496, + "rewards/rejected": -3.3719215393066406, + "step": 2555 + }, + { + "epoch": 0.78, + "learning_rate": 6.986960159980327e-06, + "logits/chosen": -1.4129726886749268, + "logits/rejected": -1.3629024028778076, + "logps/chosen": -221.9413604736328, + "logps/rejected": -286.68896484375, + "loss": 0.4049, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4513567388057709, + "rewards/margins": 3.1536812782287598, + "rewards/rejected": -3.6050381660461426, + "step": 2560 + }, + { + "epoch": 0.78, + "learning_rate": 6.894969570002225e-06, + "logits/chosen": -1.4404191970825195, + "logits/rejected": -1.3535890579223633, + "logps/chosen": -240.3525848388672, + "logps/rejected": -292.9701843261719, + "loss": 0.4055, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.42974838614463806, + "rewards/margins": 2.7643489837646484, + "rewards/rejected": -3.1940975189208984, + "step": 2565 + }, + { + "epoch": 0.78, + "learning_rate": 6.80349161151595e-06, + "logits/chosen": -1.4715224504470825, + "logits/rejected": -1.40964674949646, + "logps/chosen": -262.740234375, + "logps/rejected": -326.66021728515625, + "loss": 0.337, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.17057926952838898, + "rewards/margins": 3.535477876663208, + "rewards/rejected": -3.706057071685791, + "step": 2570 + }, + { + "epoch": 0.78, + "learning_rate": 6.712528874657012e-06, + "logits/chosen": -1.4969851970672607, + "logits/rejected": -1.3666260242462158, + "logps/chosen": -256.3636169433594, + "logps/rejected": -294.6187744140625, + "loss": 0.4133, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5324915647506714, + "rewards/margins": 2.963696002960205, + "rewards/rejected": -3.496187686920166, + "step": 2575 + }, + { + "epoch": 0.79, + "learning_rate": 6.6220839349727945e-06, + "logits/chosen": -1.3161433935165405, + "logits/rejected": -1.2051467895507812, + "logps/chosen": -269.2625732421875, + "logps/rejected": -301.7975769042969, + "loss": 0.3369, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5251234769821167, + "rewards/margins": 3.5199074745178223, + "rewards/rejected": -4.0450310707092285, + "step": 2580 + }, + { + "epoch": 0.79, + "learning_rate": 6.532159353349582e-06, + "logits/chosen": -1.4186230897903442, + "logits/rejected": -1.3696180582046509, + "logps/chosen": -225.25747680664062, + "logps/rejected": -318.4682922363281, + "loss": 0.3668, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3813226819038391, + "rewards/margins": 2.9579646587371826, + "rewards/rejected": -3.339287519454956, + "step": 2585 + }, + { + "epoch": 0.79, + "learning_rate": 6.442757675940109e-06, + "logits/chosen": -1.3839209079742432, + "logits/rejected": -1.3220348358154297, + "logps/chosen": -251.20068359375, + "logps/rejected": -307.3045349121094, + "loss": 0.3602, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.4376640319824219, + "rewards/margins": 3.113976240158081, + "rewards/rejected": -3.551640272140503, + "step": 2590 + }, + { + "epoch": 0.79, + "learning_rate": 6.353881434091405e-06, + "logits/chosen": -1.382564902305603, + "logits/rejected": -1.3075412511825562, + "logps/chosen": -229.39492797851562, + "logps/rejected": -264.2771911621094, + "loss": 0.3838, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4839390814304352, + "rewards/margins": 2.7121593952178955, + "rewards/rejected": -3.196098804473877, + "step": 2595 + }, + { + "epoch": 0.79, + "learning_rate": 6.265533144273175e-06, + "logits/chosen": -1.4169657230377197, + "logits/rejected": -1.3276934623718262, + "logps/chosen": -230.9569854736328, + "logps/rejected": -289.4246520996094, + "loss": 0.3691, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.4061087965965271, + "rewards/margins": 3.205444812774658, + "rewards/rejected": -3.611553192138672, + "step": 2600 + }, + { + "epoch": 0.79, + "learning_rate": 6.177715308006505e-06, + "logits/chosen": -1.3769886493682861, + "logits/rejected": -1.3074685335159302, + "logps/chosen": -232.6650848388672, + "logps/rejected": -278.31683349609375, + "loss": 0.3099, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.3842293322086334, + "rewards/margins": 3.2725253105163574, + "rewards/rejected": -3.656754732131958, + "step": 2605 + }, + { + "epoch": 0.8, + "learning_rate": 6.0904304117930825e-06, + "logits/chosen": -1.4004212617874146, + "logits/rejected": -1.2935984134674072, + "logps/chosen": -230.5190887451172, + "logps/rejected": -285.92791748046875, + "loss": 0.3489, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3556326627731323, + "rewards/margins": 3.4840025901794434, + "rewards/rejected": -3.839635133743286, + "step": 2610 + }, + { + "epoch": 0.8, + "learning_rate": 6.003680927044738e-06, + "logits/chosen": -1.403623342514038, + "logits/rejected": -1.350563645362854, + "logps/chosen": -250.65713500976562, + "logps/rejected": -304.454833984375, + "loss": 0.3803, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.49298906326293945, + "rewards/margins": 2.6846089363098145, + "rewards/rejected": -3.177597761154175, + "step": 2615 + }, + { + "epoch": 0.8, + "learning_rate": 5.91746931001351e-06, + "logits/chosen": -1.3959264755249023, + "logits/rejected": -1.354936957359314, + "logps/chosen": -221.90097045898438, + "logps/rejected": -291.7824401855469, + "loss": 0.3779, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4702683389186859, + "rewards/margins": 3.11871600151062, + "rewards/rejected": -3.588984251022339, + "step": 2620 + }, + { + "epoch": 0.8, + "learning_rate": 5.831798001722058e-06, + "logits/chosen": -1.4298603534698486, + "logits/rejected": -1.303333044052124, + "logps/chosen": -237.63467407226562, + "logps/rejected": -282.2469177246094, + "loss": 0.3906, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.33407384157180786, + "rewards/margins": 3.3796210289001465, + "rewards/rejected": -3.7136950492858887, + "step": 2625 + }, + { + "epoch": 0.8, + "learning_rate": 5.7466694278946046e-06, + "logits/chosen": -1.4614206552505493, + "logits/rejected": -1.3570278882980347, + "logps/chosen": -243.57266235351562, + "logps/rejected": -296.99383544921875, + "loss": 0.3934, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.53616863489151, + "rewards/margins": 2.9343109130859375, + "rewards/rejected": -3.4704794883728027, + "step": 2630 + }, + { + "epoch": 0.8, + "learning_rate": 5.662085998888214e-06, + "logits/chosen": -1.428770661354065, + "logits/rejected": -1.3504103422164917, + "logps/chosen": -203.597900390625, + "logps/rejected": -243.631591796875, + "loss": 0.3839, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4517253041267395, + "rewards/margins": 2.85153865814209, + "rewards/rejected": -3.3032639026641846, + "step": 2635 + }, + { + "epoch": 0.8, + "learning_rate": 5.578050109624511e-06, + "logits/chosen": -1.4417340755462646, + "logits/rejected": -1.3601127862930298, + "logps/chosen": -213.7624969482422, + "logps/rejected": -274.27764892578125, + "loss": 0.3752, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5715819597244263, + "rewards/margins": 2.8983328342437744, + "rewards/rejected": -3.469914674758911, + "step": 2640 + }, + { + "epoch": 0.81, + "learning_rate": 5.494564139521957e-06, + "logits/chosen": -1.4128557443618774, + "logits/rejected": -1.3376586437225342, + "logps/chosen": -198.64208984375, + "logps/rejected": -243.85733032226562, + "loss": 0.388, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4724113345146179, + "rewards/margins": 2.727562427520752, + "rewards/rejected": -3.1999735832214355, + "step": 2645 + }, + { + "epoch": 0.81, + "learning_rate": 5.411630452428395e-06, + "logits/chosen": -1.4205583333969116, + "logits/rejected": -1.3981122970581055, + "logps/chosen": -249.38552856445312, + "logps/rejected": -318.7808837890625, + "loss": 0.398, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6618324518203735, + "rewards/margins": 3.0907139778137207, + "rewards/rejected": -3.7525463104248047, + "step": 2650 + }, + { + "epoch": 0.81, + "learning_rate": 5.329251396554186e-06, + "logits/chosen": -1.445494532585144, + "logits/rejected": -1.3906385898590088, + "logps/chosen": -210.5917510986328, + "logps/rejected": -269.33074951171875, + "loss": 0.4151, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5111071467399597, + "rewards/margins": 2.8808178901672363, + "rewards/rejected": -3.391925096511841, + "step": 2655 + }, + { + "epoch": 0.81, + "learning_rate": 5.247429304405663e-06, + "logits/chosen": -1.371177315711975, + "logits/rejected": -1.2566999197006226, + "logps/chosen": -267.86383056640625, + "logps/rejected": -303.1114807128906, + "loss": 0.327, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.42511287331581116, + "rewards/margins": 3.470594882965088, + "rewards/rejected": -3.895707607269287, + "step": 2660 + }, + { + "epoch": 0.81, + "learning_rate": 5.166166492719124e-06, + "logits/chosen": -1.4190130233764648, + "logits/rejected": -1.3397761583328247, + "logps/chosen": -231.8660125732422, + "logps/rejected": -248.5600128173828, + "loss": 0.4795, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5424979329109192, + "rewards/margins": 2.146846294403076, + "rewards/rejected": -2.6893444061279297, + "step": 2665 + }, + { + "epoch": 0.81, + "learning_rate": 5.08546526239522e-06, + "logits/chosen": -1.3313415050506592, + "logits/rejected": -1.2856453657150269, + "logps/chosen": -240.83554077148438, + "logps/rejected": -311.7351379394531, + "loss": 0.3312, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.3891278803348541, + "rewards/margins": 3.937739133834839, + "rewards/rejected": -4.326866149902344, + "step": 2670 + }, + { + "epoch": 0.82, + "learning_rate": 5.005327898433826e-06, + "logits/chosen": -1.4672349691390991, + "logits/rejected": -1.4020793437957764, + "logps/chosen": -232.9001007080078, + "logps/rejected": -289.7752990722656, + "loss": 0.3673, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4757001996040344, + "rewards/margins": 2.6827566623687744, + "rewards/rejected": -3.158457040786743, + "step": 2675 + }, + { + "epoch": 0.82, + "learning_rate": 4.925756669869314e-06, + "logits/chosen": -1.439378023147583, + "logits/rejected": -1.3500677347183228, + "logps/chosen": -211.7573699951172, + "logps/rejected": -264.590087890625, + "loss": 0.403, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5147947072982788, + "rewards/margins": 2.793654203414917, + "rewards/rejected": -3.3084492683410645, + "step": 2680 + }, + { + "epoch": 0.82, + "learning_rate": 4.846753829706321e-06, + "logits/chosen": -1.4199397563934326, + "logits/rejected": -1.3846681118011475, + "logps/chosen": -228.0806427001953, + "logps/rejected": -308.934814453125, + "loss": 0.368, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5116747617721558, + "rewards/margins": 3.1513829231262207, + "rewards/rejected": -3.663057804107666, + "step": 2685 + }, + { + "epoch": 0.82, + "learning_rate": 4.768321614855972e-06, + "logits/chosen": -1.3238885402679443, + "logits/rejected": -1.2452843189239502, + "logps/chosen": -251.7180938720703, + "logps/rejected": -303.04449462890625, + "loss": 0.3569, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6033166646957397, + "rewards/margins": 3.017152786254883, + "rewards/rejected": -3.620469331741333, + "step": 2690 + }, + { + "epoch": 0.82, + "learning_rate": 4.690462246072516e-06, + "logits/chosen": -1.3670374155044556, + "logits/rejected": -1.2038557529449463, + "logps/chosen": -240.1848602294922, + "logps/rejected": -265.8099060058594, + "loss": 0.3334, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3051196336746216, + "rewards/margins": 3.5024948120117188, + "rewards/rejected": -3.8076140880584717, + "step": 2695 + }, + { + "epoch": 0.82, + "learning_rate": 4.6131779278904606e-06, + "logits/chosen": -1.3956595659255981, + "logits/rejected": -1.2981932163238525, + "logps/chosen": -211.70315551757812, + "logps/rejected": -269.17413330078125, + "loss": 0.35, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5337416529655457, + "rewards/margins": 3.1707746982574463, + "rewards/rejected": -3.704516649246216, + "step": 2700 + }, + { + "epoch": 0.82, + "learning_rate": 4.536470848562143e-06, + "logits/chosen": -1.3783342838287354, + "logits/rejected": -1.293217420578003, + "logps/chosen": -238.942138671875, + "logps/rejected": -291.1051330566406, + "loss": 0.374, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6116763949394226, + "rewards/margins": 2.8823437690734863, + "rewards/rejected": -3.4940199851989746, + "step": 2705 + }, + { + "epoch": 0.83, + "learning_rate": 4.460343179995807e-06, + "logits/chosen": -1.5143723487854004, + "logits/rejected": -1.4304159879684448, + "logps/chosen": -204.8470916748047, + "logps/rejected": -232.28128051757812, + "loss": 0.3996, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.5758059024810791, + "rewards/margins": 2.4611544609069824, + "rewards/rejected": -3.0369603633880615, + "step": 2710 + }, + { + "epoch": 0.83, + "learning_rate": 4.384797077694042e-06, + "logits/chosen": -1.4032760858535767, + "logits/rejected": -1.2930408716201782, + "logps/chosen": -235.9485626220703, + "logps/rejected": -260.80706787109375, + "loss": 0.3845, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4756258428096771, + "rewards/margins": 3.060300827026367, + "rewards/rejected": -3.5359268188476562, + "step": 2715 + }, + { + "epoch": 0.83, + "learning_rate": 4.309834680692832e-06, + "logits/chosen": -1.4975941181182861, + "logits/rejected": -1.3984637260437012, + "logps/chosen": -238.25595092773438, + "logps/rejected": -294.8500061035156, + "loss": 0.3362, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5681991577148438, + "rewards/margins": 3.3456757068634033, + "rewards/rejected": -3.9138755798339844, + "step": 2720 + }, + { + "epoch": 0.83, + "learning_rate": 4.235458111500889e-06, + "logits/chosen": -1.347448706626892, + "logits/rejected": -1.2785007953643799, + "logps/chosen": -220.80105590820312, + "logps/rejected": -285.6271667480469, + "loss": 0.3648, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.49286168813705444, + "rewards/margins": 3.1875343322753906, + "rewards/rejected": -3.680396556854248, + "step": 2725 + }, + { + "epoch": 0.83, + "learning_rate": 4.16166947603967e-06, + "logits/chosen": -1.5035779476165771, + "logits/rejected": -1.4337027072906494, + "logps/chosen": -248.08816528320312, + "logps/rejected": -293.638427734375, + "loss": 0.3728, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5657131671905518, + "rewards/margins": 3.0048439502716064, + "rewards/rejected": -3.570557117462158, + "step": 2730 + }, + { + "epoch": 0.83, + "learning_rate": 4.088470863583655e-06, + "logits/chosen": -1.4668022394180298, + "logits/rejected": -1.3386309146881104, + "logps/chosen": -230.1422882080078, + "logps/rejected": -268.1689147949219, + "loss": 0.3495, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4433426260948181, + "rewards/margins": 2.858656883239746, + "rewards/rejected": -3.30199933052063, + "step": 2735 + }, + { + "epoch": 0.84, + "learning_rate": 4.015864346701251e-06, + "logits/chosen": -1.4615294933319092, + "logits/rejected": -1.3883612155914307, + "logps/chosen": -255.8359375, + "logps/rejected": -298.8800048828125, + "loss": 0.364, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.34084415435791016, + "rewards/margins": 3.069605588912964, + "rewards/rejected": -3.410449504852295, + "step": 2740 + }, + { + "epoch": 0.84, + "learning_rate": 3.943851981196073e-06, + "logits/chosen": -1.4294850826263428, + "logits/rejected": -1.3411905765533447, + "logps/chosen": -243.9447784423828, + "logps/rejected": -310.84173583984375, + "loss": 0.343, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6324904561042786, + "rewards/margins": 3.342142105102539, + "rewards/rejected": -3.9746322631835938, + "step": 2745 + }, + { + "epoch": 0.84, + "learning_rate": 3.872435806048743e-06, + "logits/chosen": -1.4718683958053589, + "logits/rejected": -1.4558926820755005, + "logps/chosen": -223.1630859375, + "logps/rejected": -303.9229736328125, + "loss": 0.3797, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7409528493881226, + "rewards/margins": 2.7985827922821045, + "rewards/rejected": -3.5395359992980957, + "step": 2750 + }, + { + "epoch": 0.84, + "learning_rate": 3.801617843359187e-06, + "logits/chosen": -1.3796002864837646, + "logits/rejected": -1.282780647277832, + "logps/chosen": -236.2412109375, + "logps/rejected": -280.6457214355469, + "loss": 0.3928, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7115376591682434, + "rewards/margins": 2.9424045085906982, + "rewards/rejected": -3.653942108154297, + "step": 2755 + }, + { + "epoch": 0.84, + "learning_rate": 3.731400098289331e-06, + "logits/chosen": -1.4562902450561523, + "logits/rejected": -1.3559983968734741, + "logps/chosen": -235.83462524414062, + "logps/rejected": -280.1587829589844, + "loss": 0.3715, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6619512438774109, + "rewards/margins": 2.6791372299194336, + "rewards/rejected": -3.3410885334014893, + "step": 2760 + }, + { + "epoch": 0.84, + "learning_rate": 3.661784559006362e-06, + "logits/chosen": -1.3674700260162354, + "logits/rejected": -1.295462727546692, + "logps/chosen": -223.27389526367188, + "logps/rejected": -271.55645751953125, + "loss": 0.4534, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6604770421981812, + "rewards/margins": 2.8196301460266113, + "rewards/rejected": -3.480107069015503, + "step": 2765 + }, + { + "epoch": 0.84, + "learning_rate": 3.592773196626417e-06, + "logits/chosen": -1.3982799053192139, + "logits/rejected": -1.3020669221878052, + "logps/chosen": -238.637939453125, + "logps/rejected": -287.42523193359375, + "loss": 0.382, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5564891695976257, + "rewards/margins": 3.2080490589141846, + "rewards/rejected": -3.764538526535034, + "step": 2770 + }, + { + "epoch": 0.85, + "learning_rate": 3.524367965158798e-06, + "logits/chosen": -1.359069585800171, + "logits/rejected": -1.2711738348007202, + "logps/chosen": -229.8795166015625, + "logps/rejected": -286.6170654296875, + "loss": 0.3735, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.3981967866420746, + "rewards/margins": 3.4926648139953613, + "rewards/rejected": -3.8908615112304688, + "step": 2775 + }, + { + "epoch": 0.85, + "learning_rate": 3.4565708014506066e-06, + "logits/chosen": -1.3851430416107178, + "logits/rejected": -1.2968069314956665, + "logps/chosen": -230.38229370117188, + "logps/rejected": -283.18670654296875, + "loss": 0.3322, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.47342461347579956, + "rewards/margins": 3.4595096111297607, + "rewards/rejected": -3.932934522628784, + "step": 2780 + }, + { + "epoch": 0.85, + "learning_rate": 3.3893836251319422e-06, + "logits/chosen": -1.4177032709121704, + "logits/rejected": -1.3064008951187134, + "logps/chosen": -258.6094665527344, + "logps/rejected": -312.0318908691406, + "loss": 0.326, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5791738629341125, + "rewards/margins": 2.927536725997925, + "rewards/rejected": -3.5067107677459717, + "step": 2785 + }, + { + "epoch": 0.85, + "learning_rate": 3.3228083385615004e-06, + "logits/chosen": -1.3896998167037964, + "logits/rejected": -1.2878631353378296, + "logps/chosen": -228.61428833007812, + "logps/rejected": -269.18634033203125, + "loss": 0.4131, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3808293342590332, + "rewards/margins": 3.2980189323425293, + "rewards/rejected": -3.6788482666015625, + "step": 2790 + }, + { + "epoch": 0.85, + "learning_rate": 3.2568468267727775e-06, + "logits/chosen": -1.3433361053466797, + "logits/rejected": -1.2808220386505127, + "logps/chosen": -236.6678924560547, + "logps/rejected": -310.6268310546875, + "loss": 0.3305, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.4466523230075836, + "rewards/margins": 3.5550129413604736, + "rewards/rejected": -4.0016655921936035, + "step": 2795 + }, + { + "epoch": 0.85, + "learning_rate": 3.1915009574206262e-06, + "logits/chosen": -1.4498833417892456, + "logits/rejected": -1.3102750778198242, + "logps/chosen": -275.7056579589844, + "logps/rejected": -311.9723815917969, + "loss": 0.4167, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5615810751914978, + "rewards/margins": 2.9476184844970703, + "rewards/rejected": -3.509199619293213, + "step": 2800 + }, + { + "epoch": 0.85, + "learning_rate": 3.126772580728432e-06, + "logits/chosen": -1.3818706274032593, + "logits/rejected": -1.3033558130264282, + "logps/chosen": -221.0478057861328, + "logps/rejected": -258.5665588378906, + "loss": 0.4022, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3416077196598053, + "rewards/margins": 3.1456706523895264, + "rewards/rejected": -3.487278461456299, + "step": 2805 + }, + { + "epoch": 0.86, + "learning_rate": 3.062663529435686e-06, + "logits/chosen": -1.4404270648956299, + "logits/rejected": -1.342151403427124, + "logps/chosen": -265.16644287109375, + "logps/rejected": -311.3492736816406, + "loss": 0.3744, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.4786798059940338, + "rewards/margins": 3.2908883094787598, + "rewards/rejected": -3.7695682048797607, + "step": 2810 + }, + { + "epoch": 0.86, + "learning_rate": 2.9991756187461e-06, + "logits/chosen": -1.493826150894165, + "logits/rejected": -1.424478530883789, + "logps/chosen": -224.368408203125, + "logps/rejected": -282.0445556640625, + "loss": 0.3955, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5058669447898865, + "rewards/margins": 3.138854503631592, + "rewards/rejected": -3.644721508026123, + "step": 2815 + }, + { + "epoch": 0.86, + "learning_rate": 2.9363106462762386e-06, + "logits/chosen": -1.5206154584884644, + "logits/rejected": -1.3797047138214111, + "logps/chosen": -255.96420288085938, + "logps/rejected": -288.0357666015625, + "loss": 0.4149, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5714098215103149, + "rewards/margins": 3.050893783569336, + "rewards/rejected": -3.6223034858703613, + "step": 2820 + }, + { + "epoch": 0.86, + "learning_rate": 2.87407039200458e-06, + "logits/chosen": -1.4571731090545654, + "logits/rejected": -1.4377485513687134, + "logps/chosen": -203.18972778320312, + "logps/rejected": -278.02984619140625, + "loss": 0.4169, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.32160016894340515, + "rewards/margins": 2.9917640686035156, + "rewards/rejected": -3.313364028930664, + "step": 2825 + }, + { + "epoch": 0.86, + "learning_rate": 2.812456618221143e-06, + "logits/chosen": -1.4474319219589233, + "logits/rejected": -1.3620309829711914, + "logps/chosen": -237.1829376220703, + "logps/rejected": -291.8993225097656, + "loss": 0.3575, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.646533727645874, + "rewards/margins": 2.8670644760131836, + "rewards/rejected": -3.5135979652404785, + "step": 2830 + }, + { + "epoch": 0.86, + "learning_rate": 2.7514710694775735e-06, + "logits/chosen": -1.3768714666366577, + "logits/rejected": -1.283097743988037, + "logps/chosen": -229.52603149414062, + "logps/rejected": -275.2613220214844, + "loss": 0.3919, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4789879322052002, + "rewards/margins": 2.905561923980713, + "rewards/rejected": -3.384549617767334, + "step": 2835 + }, + { + "epoch": 0.87, + "learning_rate": 2.691115472537778e-06, + "logits/chosen": -1.3752249479293823, + "logits/rejected": -1.2862586975097656, + "logps/chosen": -259.10321044921875, + "logps/rejected": -305.44573974609375, + "loss": 0.3621, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5128841996192932, + "rewards/margins": 3.505547285079956, + "rewards/rejected": -4.018431186676025, + "step": 2840 + }, + { + "epoch": 0.87, + "learning_rate": 2.631391536328992e-06, + "logits/chosen": -1.507230281829834, + "logits/rejected": -1.3867247104644775, + "logps/chosen": -262.347900390625, + "logps/rejected": -312.17578125, + "loss": 0.3645, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6277137994766235, + "rewards/margins": 3.4604015350341797, + "rewards/rejected": -4.088115692138672, + "step": 2845 + }, + { + "epoch": 0.87, + "learning_rate": 2.5723009518934136e-06, + "logits/chosen": -1.4776110649108887, + "logits/rejected": -1.3628318309783936, + "logps/chosen": -243.9078369140625, + "logps/rejected": -274.2095031738281, + "loss": 0.4023, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.5240500569343567, + "rewards/margins": 2.7778477668762207, + "rewards/rejected": -3.3018977642059326, + "step": 2850 + }, + { + "epoch": 0.87, + "learning_rate": 2.513845392340322e-06, + "logits/chosen": -1.328366756439209, + "logits/rejected": -1.2408344745635986, + "logps/chosen": -244.29013061523438, + "logps/rejected": -292.7612609863281, + "loss": 0.3595, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.4736739993095398, + "rewards/margins": 3.4074509143829346, + "rewards/rejected": -3.8811252117156982, + "step": 2855 + }, + { + "epoch": 0.87, + "learning_rate": 2.4560265127987147e-06, + "logits/chosen": -1.4597951173782349, + "logits/rejected": -1.3627371788024902, + "logps/chosen": -258.7566833496094, + "logps/rejected": -299.85479736328125, + "loss": 0.3925, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.48914337158203125, + "rewards/margins": 2.6332907676696777, + "rewards/rejected": -3.122434377670288, + "step": 2860 + }, + { + "epoch": 0.87, + "learning_rate": 2.3988459503704154e-06, + "logits/chosen": -1.3771252632141113, + "logits/rejected": -1.3485709428787231, + "logps/chosen": -247.46517944335938, + "logps/rejected": -313.8783264160156, + "loss": 0.3822, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.637241542339325, + "rewards/margins": 2.9835617542266846, + "rewards/rejected": -3.620803117752075, + "step": 2865 + }, + { + "epoch": 0.87, + "learning_rate": 2.3423053240837515e-06, + "logits/chosen": -1.4605720043182373, + "logits/rejected": -1.3656359910964966, + "logps/chosen": -230.97634887695312, + "logps/rejected": -291.53668212890625, + "loss": 0.3689, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.45648589730262756, + "rewards/margins": 3.1279473304748535, + "rewards/rejected": -3.584432601928711, + "step": 2870 + }, + { + "epoch": 0.88, + "learning_rate": 2.2864062348476905e-06, + "logits/chosen": -1.4365028142929077, + "logits/rejected": -1.3548933267593384, + "logps/chosen": -243.0567169189453, + "logps/rejected": -294.0565490722656, + "loss": 0.3536, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3898884356021881, + "rewards/margins": 3.077261209487915, + "rewards/rejected": -3.4671497344970703, + "step": 2875 + }, + { + "epoch": 0.88, + "learning_rate": 2.231150265406512e-06, + "logits/chosen": -1.4013705253601074, + "logits/rejected": -1.2795777320861816, + "logps/chosen": -262.1343078613281, + "logps/rejected": -303.52899169921875, + "loss": 0.3715, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3743430972099304, + "rewards/margins": 3.6337814331054688, + "rewards/rejected": -4.008124351501465, + "step": 2880 + }, + { + "epoch": 0.88, + "learning_rate": 2.176538980295023e-06, + "logits/chosen": -1.3362782001495361, + "logits/rejected": -1.2446343898773193, + "logps/chosen": -208.1090087890625, + "logps/rejected": -275.521240234375, + "loss": 0.3464, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.22351208329200745, + "rewards/margins": 3.3608238697052, + "rewards/rejected": -3.5843353271484375, + "step": 2885 + }, + { + "epoch": 0.88, + "learning_rate": 2.122573925794219e-06, + "logits/chosen": -1.3743422031402588, + "logits/rejected": -1.3110918998718262, + "logps/chosen": -226.4436798095703, + "logps/rejected": -284.766845703125, + "loss": 0.4132, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.42287105321884155, + "rewards/margins": 3.1263039112091064, + "rewards/rejected": -3.5491747856140137, + "step": 2890 + }, + { + "epoch": 0.88, + "learning_rate": 2.0692566298875198e-06, + "logits/chosen": -1.3773993253707886, + "logits/rejected": -1.255491852760315, + "logps/chosen": -242.8449249267578, + "logps/rejected": -293.3464660644531, + "loss": 0.323, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6184684038162231, + "rewards/margins": 3.38958740234375, + "rewards/rejected": -4.008055686950684, + "step": 2895 + }, + { + "epoch": 0.88, + "learning_rate": 2.016588602217512e-06, + "logits/chosen": -1.4417493343353271, + "logits/rejected": -1.3363769054412842, + "logps/chosen": -220.3513641357422, + "logps/rejected": -261.87921142578125, + "loss": 0.3987, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4997718930244446, + "rewards/margins": 2.572354316711426, + "rewards/rejected": -3.0721261501312256, + "step": 2900 + }, + { + "epoch": 0.89, + "learning_rate": 1.9645713340431997e-06, + "logits/chosen": -1.3745633363723755, + "logits/rejected": -1.3230297565460205, + "logps/chosen": -232.82363891601562, + "logps/rejected": -307.91510009765625, + "loss": 0.3933, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6310940980911255, + "rewards/margins": 3.0805411338806152, + "rewards/rejected": -3.711635112762451, + "step": 2905 + }, + { + "epoch": 0.89, + "learning_rate": 1.9132062981977783e-06, + "logits/chosen": -1.3700783252716064, + "logits/rejected": -1.2972946166992188, + "logps/chosen": -250.9999237060547, + "logps/rejected": -316.5565185546875, + "loss": 0.3469, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.36076945066452026, + "rewards/margins": 3.416379451751709, + "rewards/rejected": -3.777149200439453, + "step": 2910 + }, + { + "epoch": 0.89, + "learning_rate": 1.8624949490469252e-06, + "logits/chosen": -1.3321553468704224, + "logits/rejected": -1.3144387006759644, + "logps/chosen": -226.41629028320312, + "logps/rejected": -283.4264221191406, + "loss": 0.3523, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5463991165161133, + "rewards/margins": 2.817983388900757, + "rewards/rejected": -3.36438250541687, + "step": 2915 + }, + { + "epoch": 0.89, + "learning_rate": 1.8124387224476347e-06, + "logits/chosen": -1.3554986715316772, + "logits/rejected": -1.248975396156311, + "logps/chosen": -236.5032196044922, + "logps/rejected": -301.41937255859375, + "loss": 0.3758, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5690854787826538, + "rewards/margins": 3.2041728496551514, + "rewards/rejected": -3.773258686065674, + "step": 2920 + }, + { + "epoch": 0.89, + "learning_rate": 1.763039035707556e-06, + "logits/chosen": -1.445534586906433, + "logits/rejected": -1.3765289783477783, + "logps/chosen": -218.98098754882812, + "logps/rejected": -282.2618408203125, + "loss": 0.3595, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.42225924134254456, + "rewards/margins": 3.1719846725463867, + "rewards/rejected": -3.5942440032958984, + "step": 2925 + }, + { + "epoch": 0.89, + "learning_rate": 1.714297287544872e-06, + "logits/chosen": -1.4084084033966064, + "logits/rejected": -1.3347828388214111, + "logps/chosen": -228.32858276367188, + "logps/rejected": -264.45947265625, + "loss": 0.3807, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.733644425868988, + "rewards/margins": 2.8378963470458984, + "rewards/rejected": -3.5715413093566895, + "step": 2930 + }, + { + "epoch": 0.89, + "learning_rate": 1.6662148580486702e-06, + "logits/chosen": -1.4024819135665894, + "logits/rejected": -1.2530525922775269, + "logps/chosen": -274.0736999511719, + "logps/rejected": -300.11798095703125, + "loss": 0.4, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44952473044395447, + "rewards/margins": 3.0304102897644043, + "rewards/rejected": -3.4799346923828125, + "step": 2935 + }, + { + "epoch": 0.9, + "learning_rate": 1.6187931086398932e-06, + "logits/chosen": -1.4648611545562744, + "logits/rejected": -1.3795498609542847, + "logps/chosen": -244.27194213867188, + "logps/rejected": -287.3001403808594, + "loss": 0.3857, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5816971659660339, + "rewards/margins": 2.8270397186279297, + "rewards/rejected": -3.4087371826171875, + "step": 2940 + }, + { + "epoch": 0.9, + "learning_rate": 1.5720333820327782e-06, + "logits/chosen": -1.437745451927185, + "logits/rejected": -1.368643879890442, + "logps/chosen": -230.3911590576172, + "logps/rejected": -294.9569091796875, + "loss": 0.4097, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6143894195556641, + "rewards/margins": 2.9358458518981934, + "rewards/rejected": -3.5502357482910156, + "step": 2945 + }, + { + "epoch": 0.9, + "learning_rate": 1.525937002196845e-06, + "logits/chosen": -1.3672488927841187, + "logits/rejected": -1.3020068407058716, + "logps/chosen": -227.96572875976562, + "logps/rejected": -288.87579345703125, + "loss": 0.3754, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.48889145255088806, + "rewards/margins": 3.2010180950164795, + "rewards/rejected": -3.6899094581604004, + "step": 2950 + }, + { + "epoch": 0.9, + "learning_rate": 1.4805052743194048e-06, + "logits/chosen": -1.4185682535171509, + "logits/rejected": -1.3656527996063232, + "logps/chosen": -221.6512451171875, + "logps/rejected": -275.33544921875, + "loss": 0.4033, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4141221046447754, + "rewards/margins": 2.6198203563690186, + "rewards/rejected": -3.033942461013794, + "step": 2955 + }, + { + "epoch": 0.9, + "learning_rate": 1.435739484768603e-06, + "logits/chosen": -1.3836723566055298, + "logits/rejected": -1.3007224798202515, + "logps/chosen": -233.81982421875, + "logps/rejected": -294.37762451171875, + "loss": 0.3667, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6608399152755737, + "rewards/margins": 3.0005805492401123, + "rewards/rejected": -3.6614208221435547, + "step": 2960 + }, + { + "epoch": 0.9, + "learning_rate": 1.3916409010569926e-06, + "logits/chosen": -1.4528993368148804, + "logits/rejected": -1.3190717697143555, + "logps/chosen": -251.28640747070312, + "logps/rejected": -297.99755859375, + "loss": 0.4178, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6956207156181335, + "rewards/margins": 3.153359889984131, + "rewards/rejected": -3.848980665206909, + "step": 2965 + }, + { + "epoch": 0.91, + "learning_rate": 1.348210771805672e-06, + "logits/chosen": -1.4740675687789917, + "logits/rejected": -1.3801523447036743, + "logps/chosen": -266.302490234375, + "logps/rejected": -314.86102294921875, + "loss": 0.3896, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6047788858413696, + "rewards/margins": 3.4077250957489014, + "rewards/rejected": -4.012503623962402, + "step": 2970 + }, + { + "epoch": 0.91, + "learning_rate": 1.305450326708893e-06, + "logits/chosen": -1.5181069374084473, + "logits/rejected": -1.4141440391540527, + "logps/chosen": -232.6529998779297, + "logps/rejected": -269.83807373046875, + "loss": 0.3822, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.34738487005233765, + "rewards/margins": 2.8622207641601562, + "rewards/rejected": -3.2096054553985596, + "step": 2975 + }, + { + "epoch": 0.91, + "learning_rate": 1.2633607764992671e-06, + "logits/chosen": -1.4072999954223633, + "logits/rejected": -1.311030387878418, + "logps/chosen": -240.6744384765625, + "logps/rejected": -303.5523986816406, + "loss": 0.3021, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.32643240690231323, + "rewards/margins": 3.427372694015503, + "rewards/rejected": -3.753805160522461, + "step": 2980 + }, + { + "epoch": 0.91, + "learning_rate": 1.2219433129134733e-06, + "logits/chosen": -1.4545339345932007, + "logits/rejected": -1.372537612915039, + "logps/chosen": -249.2332000732422, + "logps/rejected": -285.46868896484375, + "loss": 0.3641, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5586282014846802, + "rewards/margins": 2.956866502761841, + "rewards/rejected": -3.5154948234558105, + "step": 2985 + }, + { + "epoch": 0.91, + "learning_rate": 1.1811991086585261e-06, + "logits/chosen": -1.421443223953247, + "logits/rejected": -1.3431203365325928, + "logps/chosen": -231.0830078125, + "logps/rejected": -286.212646484375, + "loss": 0.3906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3323056101799011, + "rewards/margins": 3.385119915008545, + "rewards/rejected": -3.717425584793091, + "step": 2990 + }, + { + "epoch": 0.91, + "learning_rate": 1.1411293173785726e-06, + "logits/chosen": -1.5069319009780884, + "logits/rejected": -1.420163869857788, + "logps/chosen": -239.2240753173828, + "logps/rejected": -298.16607666015625, + "loss": 0.402, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.31917136907577515, + "rewards/margins": 3.190476894378662, + "rewards/rejected": -3.509648084640503, + "step": 2995 + }, + { + "epoch": 0.91, + "learning_rate": 1.1017350736221925e-06, + "logits/chosen": -1.418235421180725, + "logits/rejected": -1.3685309886932373, + "logps/chosen": -191.44699096679688, + "logps/rejected": -228.60757446289062, + "loss": 0.4359, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.48290008306503296, + "rewards/margins": 2.216280460357666, + "rewards/rejected": -2.6991806030273438, + "step": 3000 + }, + { + "epoch": 0.92, + "learning_rate": 1.0630174928103337e-06, + "logits/chosen": -1.4737344980239868, + "logits/rejected": -1.3487292528152466, + "logps/chosen": -258.76336669921875, + "logps/rejected": -320.27337646484375, + "loss": 0.4227, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5768004059791565, + "rewards/margins": 3.205977201461792, + "rewards/rejected": -3.7827765941619873, + "step": 3005 + }, + { + "epoch": 0.92, + "learning_rate": 1.0249776712046744e-06, + "logits/chosen": -1.4237945079803467, + "logits/rejected": -1.2867224216461182, + "logps/chosen": -246.64108276367188, + "logps/rejected": -275.6163024902344, + "loss": 0.3447, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.47526636719703674, + "rewards/margins": 3.0651516914367676, + "rewards/rejected": -3.5404179096221924, + "step": 3010 + }, + { + "epoch": 0.92, + "learning_rate": 9.876166858766244e-07, + "logits/chosen": -1.48415207862854, + "logits/rejected": -1.344327688217163, + "logps/chosen": -262.2154235839844, + "logps/rejected": -287.91632080078125, + "loss": 0.3705, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5292236804962158, + "rewards/margins": 2.700580596923828, + "rewards/rejected": -3.2298038005828857, + "step": 3015 + }, + { + "epoch": 0.92, + "learning_rate": 9.509355946767995e-07, + "logits/chosen": -1.4063690900802612, + "logits/rejected": -1.3571805953979492, + "logps/chosen": -265.2889404296875, + "logps/rejected": -301.1048889160156, + "loss": 0.4011, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5893287658691406, + "rewards/margins": 3.2209103107452393, + "rewards/rejected": -3.810239315032959, + "step": 3020 + }, + { + "epoch": 0.92, + "learning_rate": 9.149354362050805e-07, + "logits/chosen": -1.3574830293655396, + "logits/rejected": -1.2446348667144775, + "logps/chosen": -249.58804321289062, + "logps/rejected": -300.3077697753906, + "loss": 0.3128, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.2982921898365021, + "rewards/margins": 3.3077235221862793, + "rewards/rejected": -3.606015682220459, + "step": 3025 + }, + { + "epoch": 0.92, + "learning_rate": 8.7961722978121e-07, + "logits/chosen": -1.4134116172790527, + "logits/rejected": -1.3037294149398804, + "logps/chosen": -263.79638671875, + "logps/rejected": -328.1442565917969, + "loss": 0.3121, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.4436197280883789, + "rewards/margins": 3.141730546951294, + "rewards/rejected": -3.5853500366210938, + "step": 3030 + }, + { + "epoch": 0.93, + "learning_rate": 8.449819754159316e-07, + "logits/chosen": -1.401808738708496, + "logits/rejected": -1.296014666557312, + "logps/chosen": -275.23028564453125, + "logps/rejected": -319.0748291015625, + "loss": 0.3704, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7753769159317017, + "rewards/margins": 2.9746193885803223, + "rewards/rejected": -3.749995708465576, + "step": 3035 + }, + { + "epoch": 0.93, + "learning_rate": 8.110306537826601e-07, + "logits/chosen": -1.4505449533462524, + "logits/rejected": -1.3536399602890015, + "logps/chosen": -254.4628448486328, + "logps/rejected": -301.4260559082031, + "loss": 0.3979, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5519896745681763, + "rewards/margins": 2.8951056003570557, + "rewards/rejected": -3.4470953941345215, + "step": 3040 + }, + { + "epoch": 0.93, + "learning_rate": 7.777642261897311e-07, + "logits/chosen": -1.3979469537734985, + "logits/rejected": -1.2560176849365234, + "logps/chosen": -241.8521270751953, + "logps/rejected": -274.6918640136719, + "loss": 0.3321, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.40183010697364807, + "rewards/margins": 3.511676788330078, + "rewards/rejected": -3.913506269454956, + "step": 3045 + }, + { + "epoch": 0.93, + "learning_rate": 7.451836345531787e-07, + "logits/chosen": -1.4664833545684814, + "logits/rejected": -1.3185244798660278, + "logps/chosen": -237.0376739501953, + "logps/rejected": -264.0262145996094, + "loss": 0.358, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3076043426990509, + "rewards/margins": 3.1689975261688232, + "rewards/rejected": -3.476602077484131, + "step": 3050 + }, + { + "epoch": 0.93, + "learning_rate": 7.13289801370054e-07, + "logits/chosen": -1.3925743103027344, + "logits/rejected": -1.3529897928237915, + "logps/chosen": -203.59584045410156, + "logps/rejected": -267.44140625, + "loss": 0.363, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5152291059494019, + "rewards/margins": 3.0308661460876465, + "rewards/rejected": -3.546095371246338, + "step": 3055 + }, + { + "epoch": 0.93, + "learning_rate": 6.820836296923316e-07, + "logits/chosen": -1.4813798666000366, + "logits/rejected": -1.4033396244049072, + "logps/chosen": -240.37704467773438, + "logps/rejected": -286.06890869140625, + "loss": 0.3794, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5136979222297668, + "rewards/margins": 2.7911887168884277, + "rewards/rejected": -3.30488657951355, + "step": 3060 + }, + { + "epoch": 0.93, + "learning_rate": 6.515660031013004e-07, + "logits/chosen": -1.5199018716812134, + "logits/rejected": -1.45121169090271, + "logps/chosen": -250.4469757080078, + "logps/rejected": -301.24481201171875, + "loss": 0.3259, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4252336025238037, + "rewards/margins": 3.3114101886749268, + "rewards/rejected": -3.7366433143615723, + "step": 3065 + }, + { + "epoch": 0.94, + "learning_rate": 6.217377856825885e-07, + "logits/chosen": -1.3836722373962402, + "logits/rejected": -1.2654017210006714, + "logps/chosen": -234.94223022460938, + "logps/rejected": -289.21533203125, + "loss": 0.3508, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.46201711893081665, + "rewards/margins": 3.144443988800049, + "rewards/rejected": -3.6064610481262207, + "step": 3070 + }, + { + "epoch": 0.94, + "learning_rate": 5.925998220016659e-07, + "logits/chosen": -1.3546682596206665, + "logits/rejected": -1.281021237373352, + "logps/chosen": -223.7041473388672, + "logps/rejected": -256.8625183105469, + "loss": 0.4022, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.43491753935813904, + "rewards/margins": 2.5875766277313232, + "rewards/rejected": -3.0224945545196533, + "step": 3075 + }, + { + "epoch": 0.94, + "learning_rate": 5.64152937079948e-07, + "logits/chosen": -1.498255968093872, + "logits/rejected": -1.425018310546875, + "logps/chosen": -248.11636352539062, + "logps/rejected": -293.47760009765625, + "loss": 0.4036, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.7960094213485718, + "rewards/margins": 2.6311516761779785, + "rewards/rejected": -3.4271609783172607, + "step": 3080 + }, + { + "epoch": 0.94, + "learning_rate": 5.363979363714245e-07, + "logits/chosen": -1.3672258853912354, + "logits/rejected": -1.3166520595550537, + "logps/chosen": -257.9898681640625, + "logps/rejected": -312.44708251953125, + "loss": 0.3768, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6454734206199646, + "rewards/margins": 3.113892078399658, + "rewards/rejected": -3.7593655586242676, + "step": 3085 + }, + { + "epoch": 0.94, + "learning_rate": 5.093356057398663e-07, + "logits/chosen": -1.509857416152954, + "logits/rejected": -1.3743062019348145, + "logps/chosen": -264.7552185058594, + "logps/rejected": -293.62603759765625, + "loss": 0.4094, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4879273474216461, + "rewards/margins": 3.007258892059326, + "rewards/rejected": -3.4951863288879395, + "step": 3090 + }, + { + "epoch": 0.94, + "learning_rate": 4.82966711436561e-07, + "logits/chosen": -1.3606380224227905, + "logits/rejected": -1.2969977855682373, + "logps/chosen": -246.0964813232422, + "logps/rejected": -316.4306945800781, + "loss": 0.3085, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5718385577201843, + "rewards/margins": 3.2876274585723877, + "rewards/rejected": -3.8594658374786377, + "step": 3095 + }, + { + "epoch": 0.94, + "learning_rate": 4.5729200007862683e-07, + "logits/chosen": -1.3616141080856323, + "logits/rejected": -1.3222945928573608, + "logps/chosen": -234.003662109375, + "logps/rejected": -307.3812561035156, + "loss": 0.3466, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5820930004119873, + "rewards/margins": 3.154688835144043, + "rewards/rejected": -3.7367820739746094, + "step": 3100 + }, + { + "epoch": 0.95, + "learning_rate": 4.323121986278683e-07, + "logits/chosen": -1.446487545967102, + "logits/rejected": -1.3170316219329834, + "logps/chosen": -249.99685668945312, + "logps/rejected": -275.61956787109375, + "loss": 0.3646, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.3564623296260834, + "rewards/margins": 3.4546267986297607, + "rewards/rejected": -3.811089038848877, + "step": 3105 + }, + { + "epoch": 0.95, + "learning_rate": 4.0802801437019033e-07, + "logits/chosen": -1.475476861000061, + "logits/rejected": -1.3680169582366943, + "logps/chosen": -246.2252197265625, + "logps/rejected": -293.28692626953125, + "loss": 0.3239, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6484922766685486, + "rewards/margins": 2.8538057804107666, + "rewards/rejected": -3.5022976398468018, + "step": 3110 + }, + { + "epoch": 0.95, + "learning_rate": 3.8444013489558337e-07, + "logits/chosen": -1.3692257404327393, + "logits/rejected": -1.2947582006454468, + "logps/chosen": -247.55606079101562, + "logps/rejected": -299.1629943847656, + "loss": 0.3442, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.43855223059654236, + "rewards/margins": 3.4032680988311768, + "rewards/rejected": -3.841820478439331, + "step": 3115 + }, + { + "epoch": 0.95, + "learning_rate": 3.6154922807863643e-07, + "logits/chosen": -1.4646461009979248, + "logits/rejected": -1.2760677337646484, + "logps/chosen": -279.49127197265625, + "logps/rejected": -303.4764709472656, + "loss": 0.3364, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5851499438285828, + "rewards/margins": 3.501290798187256, + "rewards/rejected": -4.086440563201904, + "step": 3120 + }, + { + "epoch": 0.95, + "learning_rate": 3.393559420596437e-07, + "logits/chosen": -1.4126708507537842, + "logits/rejected": -1.3040239810943604, + "logps/chosen": -244.0888214111328, + "logps/rejected": -290.2459411621094, + "loss": 0.3495, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5026829838752747, + "rewards/margins": 3.261399507522583, + "rewards/rejected": -3.764082431793213, + "step": 3125 + }, + { + "epoch": 0.95, + "learning_rate": 3.1786090522624156e-07, + "logits/chosen": -1.4109928607940674, + "logits/rejected": -1.358865737915039, + "logps/chosen": -219.6774444580078, + "logps/rejected": -284.9616394042969, + "loss": 0.3981, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.25217631459236145, + "rewards/margins": 2.7937846183776855, + "rewards/rejected": -3.0459611415863037, + "step": 3130 + }, + { + "epoch": 0.96, + "learning_rate": 2.970647261956255e-07, + "logits/chosen": -1.4728444814682007, + "logits/rejected": -1.3215397596359253, + "logps/chosen": -228.3172149658203, + "logps/rejected": -245.55490112304688, + "loss": 0.3836, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.45830535888671875, + "rewards/margins": 2.752584218978882, + "rewards/rejected": -3.2108893394470215, + "step": 3135 + }, + { + "epoch": 0.96, + "learning_rate": 2.769679937973085e-07, + "logits/chosen": -1.4454705715179443, + "logits/rejected": -1.3466382026672363, + "logps/chosen": -237.8345489501953, + "logps/rejected": -294.45111083984375, + "loss": 0.3784, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5216543078422546, + "rewards/margins": 2.708383321762085, + "rewards/rejected": -3.230037212371826, + "step": 3140 + }, + { + "epoch": 0.96, + "learning_rate": 2.575712770564592e-07, + "logits/chosen": -1.518226981163025, + "logits/rejected": -1.4551050662994385, + "logps/chosen": -221.72201538085938, + "logps/rejected": -278.15484619140625, + "loss": 0.3569, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3757239282131195, + "rewards/margins": 3.098101854324341, + "rewards/rejected": -3.473825454711914, + "step": 3145 + }, + { + "epoch": 0.96, + "learning_rate": 2.3887512517777324e-07, + "logits/chosen": -1.3695826530456543, + "logits/rejected": -1.2446963787078857, + "logps/chosen": -251.01058959960938, + "logps/rejected": -304.1330871582031, + "loss": 0.3612, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4472174644470215, + "rewards/margins": 3.318666934967041, + "rewards/rejected": -3.7658848762512207, + "step": 3150 + }, + { + "epoch": 0.96, + "learning_rate": 2.2088006752994384e-07, + "logits/chosen": -1.4280850887298584, + "logits/rejected": -1.3461120128631592, + "logps/chosen": -250.29443359375, + "logps/rejected": -296.2013244628906, + "loss": 0.3697, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6222056150436401, + "rewards/margins": 2.967963695526123, + "rewards/rejected": -3.5901694297790527, + "step": 3155 + }, + { + "epoch": 0.96, + "learning_rate": 2.0358661363065746e-07, + "logits/chosen": -1.5051259994506836, + "logits/rejected": -1.4378012418746948, + "logps/chosen": -215.5379180908203, + "logps/rejected": -270.934814453125, + "loss": 0.4377, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.33570045232772827, + "rewards/margins": 2.7214572429656982, + "rewards/rejected": -3.0571579933166504, + "step": 3160 + }, + { + "epoch": 0.96, + "learning_rate": 1.8699525313217447e-07, + "logits/chosen": -1.451647162437439, + "logits/rejected": -1.3913623094558716, + "logps/chosen": -228.0519561767578, + "logps/rejected": -294.03021240234375, + "loss": 0.3211, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4577842652797699, + "rewards/margins": 3.234616756439209, + "rewards/rejected": -3.6924006938934326, + "step": 3165 + }, + { + "epoch": 0.97, + "learning_rate": 1.7110645580746264e-07, + "logits/chosen": -1.564744234085083, + "logits/rejected": -1.4929113388061523, + "logps/chosen": -236.1385498046875, + "logps/rejected": -284.2821044921875, + "loss": 0.3587, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3766639530658722, + "rewards/margins": 3.0007431507110596, + "rewards/rejected": -3.3774070739746094, + "step": 3170 + }, + { + "epoch": 0.97, + "learning_rate": 1.559206715368966e-07, + "logits/chosen": -1.4131274223327637, + "logits/rejected": -1.3240829706192017, + "logps/chosen": -235.37606811523438, + "logps/rejected": -287.25518798828125, + "loss": 0.3694, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.46932369470596313, + "rewards/margins": 3.396801710128784, + "rewards/rejected": -3.8661255836486816, + "step": 3175 + }, + { + "epoch": 0.97, + "learning_rate": 1.4143833029552355e-07, + "logits/chosen": -1.378590703010559, + "logits/rejected": -1.3106247186660767, + "logps/chosen": -237.6977996826172, + "logps/rejected": -297.2106018066406, + "loss": 0.3736, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6389200091362, + "rewards/margins": 2.9030025005340576, + "rewards/rejected": -3.5419223308563232, + "step": 3180 + }, + { + "epoch": 0.97, + "learning_rate": 1.276598421408759e-07, + "logits/chosen": -1.4562031030654907, + "logits/rejected": -1.3900493383407593, + "logps/chosen": -219.63406372070312, + "logps/rejected": -253.91976928710938, + "loss": 0.392, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.5725888013839722, + "rewards/margins": 2.4471755027770996, + "rewards/rejected": -3.0197644233703613, + "step": 3185 + }, + { + "epoch": 0.97, + "learning_rate": 1.1458559720137762e-07, + "logits/chosen": -1.4543843269348145, + "logits/rejected": -1.3984777927398682, + "logps/chosen": -240.25765991210938, + "logps/rejected": -306.05010986328125, + "loss": 0.4252, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5501489639282227, + "rewards/margins": 3.2002170085906982, + "rewards/rejected": -3.7503662109375, + "step": 3190 + }, + { + "epoch": 0.97, + "learning_rate": 1.0221596566528657e-07, + "logits/chosen": -1.426992654800415, + "logits/rejected": -1.35605788230896, + "logps/chosen": -263.15740966796875, + "logps/rejected": -306.79266357421875, + "loss": 0.3263, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5317724347114563, + "rewards/margins": 3.079737901687622, + "rewards/rejected": -3.6115100383758545, + "step": 3195 + }, + { + "epoch": 0.98, + "learning_rate": 9.055129777021665e-08, + "logits/chosen": -1.473975658416748, + "logits/rejected": -1.3327196836471558, + "logps/chosen": -261.496337890625, + "logps/rejected": -293.1902160644531, + "loss": 0.3396, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5715179443359375, + "rewards/margins": 3.3157131671905518, + "rewards/rejected": -3.8872311115264893, + "step": 3200 + }, + { + "epoch": 0.98, + "learning_rate": 7.959192379322077e-08, + "logits/chosen": -1.4569588899612427, + "logits/rejected": -1.3854528665542603, + "logps/chosen": -247.93746948242188, + "logps/rejected": -321.257080078125, + "loss": 0.3443, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5199737548828125, + "rewards/margins": 3.264164686203003, + "rewards/rejected": -3.7841384410858154, + "step": 3205 + }, + { + "epoch": 0.98, + "learning_rate": 6.933815404144561e-08, + "logits/chosen": -1.5174143314361572, + "logits/rejected": -1.4582128524780273, + "logps/chosen": -221.869384765625, + "logps/rejected": -289.20001220703125, + "loss": 0.3538, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4151820242404938, + "rewards/margins": 2.987104892730713, + "rewards/rejected": -3.402287006378174, + "step": 3210 + }, + { + "epoch": 0.98, + "learning_rate": 5.979027884332744e-08, + "logits/chosen": -1.4034459590911865, + "logits/rejected": -1.2570650577545166, + "logps/chosen": -261.0184631347656, + "logps/rejected": -288.6986999511719, + "loss": 0.3289, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.40868058800697327, + "rewards/margins": 3.1276040077209473, + "rewards/rejected": -3.5362846851348877, + "step": 3215 + }, + { + "epoch": 0.98, + "learning_rate": 5.094856854039043e-08, + "logits/chosen": -1.525322675704956, + "logits/rejected": -1.3608064651489258, + "logps/chosen": -253.0553436279297, + "logps/rejected": -286.7688903808594, + "loss": 0.3748, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5386615991592407, + "rewards/margins": 3.1457836627960205, + "rewards/rejected": -3.6844451427459717, + "step": 3220 + }, + { + "epoch": 0.98, + "learning_rate": 4.281327347958608e-08, + "logits/chosen": -1.3911330699920654, + "logits/rejected": -1.3263561725616455, + "logps/chosen": -251.658447265625, + "logps/rejected": -292.4714660644531, + "loss": 0.3871, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.48612624406814575, + "rewards/margins": 3.019296884536743, + "rewards/rejected": -3.505422592163086, + "step": 3225 + }, + { + "epoch": 0.98, + "learning_rate": 3.5384624006201686e-08, + "logits/chosen": -1.4651706218719482, + "logits/rejected": -1.3821120262145996, + "logps/chosen": -231.82302856445312, + "logps/rejected": -290.69891357421875, + "loss": 0.365, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6108437776565552, + "rewards/margins": 3.042966365814209, + "rewards/rejected": -3.6538097858428955, + "step": 3230 + }, + { + "epoch": 0.99, + "learning_rate": 2.866283045734053e-08, + "logits/chosen": -1.4114316701889038, + "logits/rejected": -1.3266972303390503, + "logps/chosen": -236.2187957763672, + "logps/rejected": -279.8185119628906, + "loss": 0.4135, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6854463815689087, + "rewards/margins": 2.7222111225128174, + "rewards/rejected": -3.4076576232910156, + "step": 3235 + }, + { + "epoch": 0.99, + "learning_rate": 2.264808315596556e-08, + "logits/chosen": -1.443969964981079, + "logits/rejected": -1.3723801374435425, + "logps/chosen": -222.7816925048828, + "logps/rejected": -301.02911376953125, + "loss": 0.3736, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5945747494697571, + "rewards/margins": 3.3723983764648438, + "rewards/rejected": -3.966973066329956, + "step": 3240 + }, + { + "epoch": 0.99, + "learning_rate": 1.73405524055148e-08, + "logits/chosen": -1.4318532943725586, + "logits/rejected": -1.289945125579834, + "logps/chosen": -234.6119384765625, + "logps/rejected": -275.46038818359375, + "loss": 0.3873, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6027265191078186, + "rewards/margins": 2.830317258834839, + "rewards/rejected": -3.4330437183380127, + "step": 3245 + }, + { + "epoch": 0.99, + "learning_rate": 1.2740388485071863e-08, + "logits/chosen": -1.328491449356079, + "logits/rejected": -1.2337000370025635, + "logps/chosen": -250.76171875, + "logps/rejected": -305.5037841796875, + "loss": 0.3748, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.47596779465675354, + "rewards/margins": 3.1194262504577637, + "rewards/rejected": -3.595393657684326, + "step": 3250 + }, + { + "epoch": 0.99, + "learning_rate": 8.847721645116603e-09, + "logits/chosen": -1.3881988525390625, + "logits/rejected": -1.2779829502105713, + "logps/chosen": -247.0287322998047, + "logps/rejected": -300.8529052734375, + "loss": 0.3475, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5738905668258667, + "rewards/margins": 3.4033915996551514, + "rewards/rejected": -3.9772822856903076, + "step": 3255 + }, + { + "epoch": 0.99, + "learning_rate": 5.662662103833594e-09, + "logits/chosen": -1.4306684732437134, + "logits/rejected": -1.3507243394851685, + "logps/chosen": -231.3989715576172, + "logps/rejected": -270.3944396972656, + "loss": 0.3358, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.49925675988197327, + "rewards/margins": 3.0653929710388184, + "rewards/rejected": -3.5646495819091797, + "step": 3260 + }, + { + "epoch": 1.0, + "learning_rate": 3.1853000439951987e-09, + "logits/chosen": -1.4770541191101074, + "logits/rejected": -1.361383318901062, + "logps/chosen": -235.0412139892578, + "logps/rejected": -268.224609375, + "loss": 0.3657, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5067777633666992, + "rewards/margins": 2.797917127609253, + "rewards/rejected": -3.304694652557373, + "step": 3265 + }, + { + "epoch": 1.0, + "learning_rate": 1.4157056104052713e-09, + "logits/chosen": -1.4269344806671143, + "logits/rejected": -1.369800329208374, + "logps/chosen": -228.0611114501953, + "logps/rejected": -286.09051513671875, + "loss": 0.3807, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6630217432975769, + "rewards/margins": 3.0376479625701904, + "rewards/rejected": -3.700669765472412, + "step": 3270 + }, + { + "epoch": 1.0, + "learning_rate": 3.5392890791463574e-10, + "logits/chosen": -1.4762697219848633, + "logits/rejected": -1.370078444480896, + "logps/chosen": -224.59619140625, + "logps/rejected": -275.50189208984375, + "loss": 0.3731, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5560105443000793, + "rewards/margins": 3.0750603675842285, + "rewards/rejected": -3.631071090698242, + "step": 3275 + }, + { + "epoch": 1.0, + "learning_rate": 0.0, + "logits/chosen": -1.5287964344024658, + "logits/rejected": -1.4150562286376953, + "logps/chosen": -244.9498291015625, + "logps/rejected": -282.9684753417969, + "loss": 0.3593, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5622932314872742, + "rewards/margins": 2.9687438011169434, + "rewards/rejected": -3.531036853790283, + "step": 3280 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -0.7773212790489197, + "eval_logits/rejected": -0.7749085426330566, + "eval_logps/chosen": -275.3572082519531, + "eval_logps/rejected": -324.0383605957031, + "eval_loss": 0.12900209426879883, + "eval_rewards/accuracies": 0.8596742749214172, + "eval_rewards/chosen": -0.17991267144680023, + "eval_rewards/margins": 5.889674186706543, + "eval_rewards/rejected": -6.069586277008057, + "eval_runtime": 76185.8679, + "eval_samples_per_second": 2.598, + "eval_steps_per_second": 1.299, + "step": 3280 + }, + { + "epoch": 1.0, + "step": 3280, + "total_flos": 0.0, + "train_loss": 0.3931771684165408, + "train_runtime": 248473.0607, + "train_samples_per_second": 0.845, + "train_steps_per_second": 0.013 + } + ], + "logging_steps": 5, + "max_steps": 3280, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}