{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 2902, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006891798759476223, "grad_norm": 1.1716080904006958, "learning_rate": 1.718213058419244e-10, "logits/chosen": -3.184086799621582, "logits/rejected": -3.1319174766540527, "logps/chosen": -49.95408630371094, "logps/rejected": -44.33523178100586, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.006891798759476223, "grad_norm": 1.0663460493087769, "learning_rate": 1.718213058419244e-09, "logits/chosen": -3.080113172531128, "logits/rejected": -3.0596792697906494, "logps/chosen": -54.03813171386719, "logps/rejected": -53.65137481689453, "loss": 0.6932, "rewards/accuracies": 0.4565972089767456, "rewards/chosen": 8.68273782543838e-05, "rewards/margins": -1.9125265680486336e-05, "rewards/rejected": 0.00010595263302093372, "step": 10 }, { "epoch": 0.013783597518952447, "grad_norm": 1.1690140962600708, "learning_rate": 3.436426116838488e-09, "logits/chosen": -3.1165332794189453, "logits/rejected": -3.0916168689727783, "logps/chosen": -55.888938903808594, "logps/rejected": -53.246864318847656, "loss": 0.6931, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 3.240557634853758e-05, "rewards/margins": -3.6290578009356977e-06, "rewards/rejected": 3.603463846957311e-05, "step": 20 }, { "epoch": 0.02067539627842867, "grad_norm": 1.2955037355422974, "learning_rate": 5.154639175257731e-09, "logits/chosen": -3.0878665447235107, "logits/rejected": -3.058804988861084, "logps/chosen": -54.54620361328125, "logps/rejected": -52.591636657714844, "loss": 0.6932, "rewards/accuracies": 0.4921875, "rewards/chosen": -2.4173205019906163e-05, "rewards/margins": -4.490778155741282e-05, "rewards/rejected": 2.0734580175485462e-05, "step": 30 }, { "epoch": 0.027567195037904894, "grad_norm": 1.1852333545684814, "learning_rate": 6.872852233676976e-09, "logits/chosen": -3.0849013328552246, "logits/rejected": -3.0671732425689697, "logps/chosen": -53.879005432128906, "logps/rejected": -53.66566848754883, "loss": 0.6931, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -5.587830673903227e-05, "rewards/margins": 4.417077434482053e-05, "rewards/rejected": -0.00010004905925597996, "step": 40 }, { "epoch": 0.03445899379738112, "grad_norm": 1.2431070804595947, "learning_rate": 8.59106529209622e-09, "logits/chosen": -3.0804286003112793, "logits/rejected": -3.0561296939849854, "logps/chosen": -56.24019241333008, "logps/rejected": -53.092872619628906, "loss": 0.6931, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": 8.934068318922073e-05, "rewards/margins": 0.00011236695718253031, "rewards/rejected": -2.302624488947913e-05, "step": 50 }, { "epoch": 0.04135079255685734, "grad_norm": 1.1313049793243408, "learning_rate": 1.0309278350515463e-08, "logits/chosen": -3.0351052284240723, "logits/rejected": -3.0099387168884277, "logps/chosen": -52.579429626464844, "logps/rejected": -52.6761589050293, "loss": 0.6931, "rewards/accuracies": 0.48906248807907104, "rewards/chosen": 2.3904693080112338e-05, "rewards/margins": 2.025809772021603e-05, "rewards/rejected": 3.6465789889916778e-06, "step": 60 }, { "epoch": 0.048242591316333565, "grad_norm": 1.2357141971588135, "learning_rate": 1.2027491408934707e-08, "logits/chosen": -3.092390537261963, "logits/rejected": -3.0711493492126465, "logps/chosen": -54.469940185546875, "logps/rejected": -53.86017990112305, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 5.560473709920188e-06, "rewards/margins": 2.101451354974415e-05, "rewards/rejected": -1.5454041204066016e-05, "step": 70 }, { "epoch": 0.05513439007580979, "grad_norm": 1.1142845153808594, "learning_rate": 1.3745704467353952e-08, "logits/chosen": -3.0345962047576904, "logits/rejected": -3.0208940505981445, "logps/chosen": -54.06622314453125, "logps/rejected": -52.69053268432617, "loss": 0.6932, "rewards/accuracies": 0.48906248807907104, "rewards/chosen": -4.671530405175872e-05, "rewards/margins": -4.488803824642673e-05, "rewards/rejected": -1.827271603360714e-06, "step": 80 }, { "epoch": 0.06202618883528601, "grad_norm": 1.1985735893249512, "learning_rate": 1.5463917525773195e-08, "logits/chosen": -3.048698663711548, "logits/rejected": -3.0217783451080322, "logps/chosen": -54.59540939331055, "logps/rejected": -52.060035705566406, "loss": 0.6931, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": -6.244768155738711e-05, "rewards/margins": -3.21494007948786e-06, "rewards/rejected": -5.923274511587806e-05, "step": 90 }, { "epoch": 0.06891798759476224, "grad_norm": 1.3350454568862915, "learning_rate": 1.718213058419244e-08, "logits/chosen": -3.119621753692627, "logits/rejected": -3.095787763595581, "logps/chosen": -53.65461349487305, "logps/rejected": -52.88787841796875, "loss": 0.6931, "rewards/accuracies": 0.515625, "rewards/chosen": -6.203976226970553e-05, "rewards/margins": 5.055965812061913e-05, "rewards/rejected": -0.00011259941675234586, "step": 100 }, { "epoch": 0.06891798759476224, "eval_logits/chosen": -3.163339376449585, "eval_logits/rejected": -3.157687187194824, "eval_logps/chosen": -58.7006721496582, "eval_logps/rejected": -63.17026138305664, "eval_loss": 0.693140983581543, "eval_rewards/accuracies": 0.5023234486579895, "eval_rewards/chosen": 0.00011220378655707464, "eval_rewards/margins": 1.3582017345470376e-05, "eval_rewards/rejected": 9.862175647867844e-05, "eval_runtime": 383.3503, "eval_samples_per_second": 11.227, "eval_steps_per_second": 1.403, "step": 100 }, { "epoch": 0.07580978635423846, "grad_norm": 1.2324384450912476, "learning_rate": 1.8900343642611684e-08, "logits/chosen": -3.0891432762145996, "logits/rejected": -3.0738348960876465, "logps/chosen": -53.08173751831055, "logps/rejected": -54.20978546142578, "loss": 0.6932, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -4.9080466851592064e-05, "rewards/margins": -7.88484321674332e-05, "rewards/rejected": 2.976796167786233e-05, "step": 110 }, { "epoch": 0.08270158511371468, "grad_norm": 1.2855055332183838, "learning_rate": 2.0618556701030925e-08, "logits/chosen": -3.043365478515625, "logits/rejected": -3.0211169719696045, "logps/chosen": -54.957427978515625, "logps/rejected": -54.4825439453125, "loss": 0.6932, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": -7.188355084508657e-05, "rewards/margins": -0.00011629929940681905, "rewards/rejected": 4.441575947566889e-05, "step": 120 }, { "epoch": 0.08959338387319091, "grad_norm": 1.1282892227172852, "learning_rate": 2.2336769759450173e-08, "logits/chosen": -3.0101354122161865, "logits/rejected": -2.9788012504577637, "logps/chosen": -57.5596923828125, "logps/rejected": -51.651153564453125, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 2.1165338694117963e-05, "rewards/margins": 0.00015237969637382776, "rewards/rejected": -0.00013121434312779456, "step": 130 }, { "epoch": 0.09648518263266713, "grad_norm": 1.1657721996307373, "learning_rate": 2.4054982817869415e-08, "logits/chosen": -3.067199468612671, "logits/rejected": -3.046125888824463, "logps/chosen": -53.55717849731445, "logps/rejected": -52.773223876953125, "loss": 0.6931, "rewards/accuracies": 0.5234375, "rewards/chosen": 1.3996473171573598e-05, "rewards/margins": 0.00015452780644409359, "rewards/rejected": -0.00014053132326807827, "step": 140 }, { "epoch": 0.10337698139214335, "grad_norm": 1.2658566236495972, "learning_rate": 2.5773195876288656e-08, "logits/chosen": -3.04317569732666, "logits/rejected": -3.0280072689056396, "logps/chosen": -52.809234619140625, "logps/rejected": -54.64301300048828, "loss": 0.6931, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -1.0054915037471801e-05, "rewards/margins": 0.0001238631666637957, "rewards/rejected": -0.00013391808897722512, "step": 150 }, { "epoch": 0.11026878015161957, "grad_norm": 1.2126415967941284, "learning_rate": 2.7491408934707904e-08, "logits/chosen": -3.09346342086792, "logits/rejected": -3.07668399810791, "logps/chosen": -53.59107208251953, "logps/rejected": -52.9258918762207, "loss": 0.6932, "rewards/accuracies": 0.4703125059604645, "rewards/chosen": -9.462583875574637e-06, "rewards/margins": -2.1159441530471668e-05, "rewards/rejected": 1.1696849469444714e-05, "step": 160 }, { "epoch": 0.1171605789110958, "grad_norm": 1.1890392303466797, "learning_rate": 2.9209621993127148e-08, "logits/chosen": -3.0306668281555176, "logits/rejected": -3.0220158100128174, "logps/chosen": -53.26588821411133, "logps/rejected": -53.87241744995117, "loss": 0.6932, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -5.447790681500919e-05, "rewards/margins": -7.718646884313785e-06, "rewards/rejected": -4.675926174968481e-05, "step": 170 }, { "epoch": 0.12405237767057202, "grad_norm": 1.15412175655365, "learning_rate": 3.092783505154639e-08, "logits/chosen": -3.0752334594726562, "logits/rejected": -3.0524303913116455, "logps/chosen": -55.69530487060547, "logps/rejected": -53.15666961669922, "loss": 0.6931, "rewards/accuracies": 0.504687488079071, "rewards/chosen": -1.1674828783725388e-05, "rewards/margins": 9.358397619507741e-06, "rewards/rejected": -2.1033218217780814e-05, "step": 180 }, { "epoch": 0.13094417643004824, "grad_norm": 1.1720036268234253, "learning_rate": 3.264604810996564e-08, "logits/chosen": -3.1030337810516357, "logits/rejected": -3.0736050605773926, "logps/chosen": -55.423614501953125, "logps/rejected": -52.4505500793457, "loss": 0.6931, "rewards/accuracies": 0.5234375, "rewards/chosen": 9.101578143599909e-06, "rewards/margins": 0.0001561685057822615, "rewards/rejected": -0.00014706689398735762, "step": 190 }, { "epoch": 0.13783597518952448, "grad_norm": 1.2227604389190674, "learning_rate": 3.436426116838488e-08, "logits/chosen": -3.0704421997070312, "logits/rejected": -3.041954278945923, "logps/chosen": -53.747833251953125, "logps/rejected": -52.85246658325195, "loss": 0.6931, "rewards/accuracies": 0.515625, "rewards/chosen": -6.959711026865989e-05, "rewards/margins": 9.592306014383212e-05, "rewards/rejected": -0.0001655201631365344, "step": 200 }, { "epoch": 0.13783597518952448, "eval_logits/chosen": -3.1631689071655273, "eval_logits/rejected": -3.157501220703125, "eval_logps/chosen": -58.700950622558594, "eval_logps/rejected": -63.162139892578125, "eval_loss": 0.6931830048561096, "eval_rewards/accuracies": 0.48745352029800415, "eval_rewards/chosen": 0.00010945786925731227, "eval_rewards/margins": -7.042505603749305e-05, "eval_rewards/rejected": 0.00017988293257076293, "eval_runtime": 383.3981, "eval_samples_per_second": 11.226, "eval_steps_per_second": 1.403, "step": 200 }, { "epoch": 0.1447277739490007, "grad_norm": 1.1424545049667358, "learning_rate": 3.608247422680412e-08, "logits/chosen": -3.08945631980896, "logits/rejected": -3.0655088424682617, "logps/chosen": -54.22871780395508, "logps/rejected": -52.478431701660156, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.00013771439262200147, "rewards/margins": 0.00012528176011983305, "rewards/rejected": -0.00026299612363800406, "step": 210 }, { "epoch": 0.15161957270847692, "grad_norm": 1.1047999858856201, "learning_rate": 3.780068728522337e-08, "logits/chosen": -3.0537705421447754, "logits/rejected": -3.039431571960449, "logps/chosen": -51.688323974609375, "logps/rejected": -53.095741271972656, "loss": 0.6931, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -0.00010369622759753838, "rewards/margins": 0.00015849454212002456, "rewards/rejected": -0.00026219076244160533, "step": 220 }, { "epoch": 0.15851137146795313, "grad_norm": 1.2490479946136475, "learning_rate": 3.951890034364261e-08, "logits/chosen": -3.071945905685425, "logits/rejected": -3.0471181869506836, "logps/chosen": -54.49678421020508, "logps/rejected": -52.037872314453125, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -3.5270270018372685e-05, "rewards/margins": 0.0002025824796874076, "rewards/rejected": -0.0002378527569817379, "step": 230 }, { "epoch": 0.16540317022742937, "grad_norm": 1.141684889793396, "learning_rate": 4.123711340206185e-08, "logits/chosen": -3.028677225112915, "logits/rejected": -3.0117344856262207, "logps/chosen": -54.690513610839844, "logps/rejected": -55.188621520996094, "loss": 0.693, "rewards/accuracies": 0.5296875238418579, "rewards/chosen": -0.00015048097702674568, "rewards/margins": 0.00021787775040138513, "rewards/rejected": -0.00036835874198004603, "step": 240 }, { "epoch": 0.17229496898690558, "grad_norm": 1.171937108039856, "learning_rate": 4.295532646048109e-08, "logits/chosen": -3.06539249420166, "logits/rejected": -3.0387420654296875, "logps/chosen": -57.0573616027832, "logps/rejected": -52.94896697998047, "loss": 0.6931, "rewards/accuracies": 0.5296875238418579, "rewards/chosen": -0.0001992958423215896, "rewards/margins": 0.00017509344615973532, "rewards/rejected": -0.0003743892884813249, "step": 250 }, { "epoch": 0.17918676774638181, "grad_norm": 1.1496978998184204, "learning_rate": 4.4673539518900346e-08, "logits/chosen": -3.0649943351745605, "logits/rejected": -3.0493435859680176, "logps/chosen": -54.52451705932617, "logps/rejected": -54.94301223754883, "loss": 0.693, "rewards/accuracies": 0.535937488079071, "rewards/chosen": -0.00030117519781924784, "rewards/margins": 0.00021073469542898238, "rewards/rejected": -0.0005119099514558911, "step": 260 }, { "epoch": 0.18607856650585802, "grad_norm": 1.1325643062591553, "learning_rate": 4.639175257731959e-08, "logits/chosen": -3.066349506378174, "logits/rejected": -3.0383307933807373, "logps/chosen": -56.371307373046875, "logps/rejected": -52.432106018066406, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -0.00030968443024903536, "rewards/margins": 0.0001904324017232284, "rewards/rejected": -0.000500116846524179, "step": 270 }, { "epoch": 0.19297036526533426, "grad_norm": 1.2462892532348633, "learning_rate": 4.810996563573883e-08, "logits/chosen": -3.0566208362579346, "logits/rejected": -3.051412582397461, "logps/chosen": -53.14699172973633, "logps/rejected": -54.41425323486328, "loss": 0.6931, "rewards/accuracies": 0.49531251192092896, "rewards/chosen": -0.00045495276572182775, "rewards/margins": 9.360066178487614e-05, "rewards/rejected": -0.0005485534202307463, "step": 280 }, { "epoch": 0.19986216402481047, "grad_norm": 1.1743725538253784, "learning_rate": 4.982817869415808e-08, "logits/chosen": -3.0853469371795654, "logits/rejected": -3.063814640045166, "logps/chosen": -54.09833908081055, "logps/rejected": -54.12751007080078, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0002439660020172596, "rewards/margins": 0.000504250347148627, "rewards/rejected": -0.0007482162909582257, "step": 290 }, { "epoch": 0.2067539627842867, "grad_norm": 1.1194610595703125, "learning_rate": 4.999853419300577e-08, "logits/chosen": -3.012183666229248, "logits/rejected": -2.9885506629943848, "logps/chosen": -54.22556686401367, "logps/rejected": -51.91581344604492, "loss": 0.6929, "rewards/accuracies": 0.5546875, "rewards/chosen": -0.000393096124753356, "rewards/margins": 0.0003973825369030237, "rewards/rejected": -0.0007904786616563797, "step": 300 }, { "epoch": 0.2067539627842867, "eval_logits/chosen": -3.1624693870544434, "eval_logits/rejected": -3.156888484954834, "eval_logps/chosen": -58.67123794555664, "eval_logps/rejected": -63.15048599243164, "eval_loss": 0.6930928826332092, "eval_rewards/accuracies": 0.5148698687553406, "eval_rewards/chosen": 0.00040659555816091597, "eval_rewards/margins": 0.00011023049592040479, "eval_rewards/rejected": 0.0002963650331366807, "eval_runtime": 383.575, "eval_samples_per_second": 11.221, "eval_steps_per_second": 1.403, "step": 300 }, { "epoch": 0.2136457615437629, "grad_norm": 1.1925629377365112, "learning_rate": 4.9993467426542045e-08, "logits/chosen": -3.086402416229248, "logits/rejected": -3.0562937259674072, "logps/chosen": -53.876312255859375, "logps/rejected": -52.675437927246094, "loss": 0.6929, "rewards/accuracies": 0.5796874761581421, "rewards/chosen": -0.0003693565959110856, "rewards/margins": 0.0005247757071629167, "rewards/rejected": -0.0008941322448663414, "step": 310 }, { "epoch": 0.22053756030323915, "grad_norm": 1.154595136642456, "learning_rate": 4.998478233757101e-08, "logits/chosen": -3.0752129554748535, "logits/rejected": -3.0584304332733154, "logps/chosen": -52.4905891418457, "logps/rejected": -54.12751388549805, "loss": 0.6929, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.0005890514003112912, "rewards/margins": 0.0004612796474248171, "rewards/rejected": -0.0010503310477361083, "step": 320 }, { "epoch": 0.22742935906271536, "grad_norm": 1.143236517906189, "learning_rate": 4.9972480183439325e-08, "logits/chosen": -3.075157642364502, "logits/rejected": -3.0487570762634277, "logps/chosen": -53.44994354248047, "logps/rejected": -51.2059326171875, "loss": 0.6928, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.0005590206128545105, "rewards/margins": 0.0007544254185631871, "rewards/rejected": -0.0013134460896253586, "step": 330 }, { "epoch": 0.2343211578221916, "grad_norm": 1.3542113304138184, "learning_rate": 4.995656274513881e-08, "logits/chosen": -3.0580501556396484, "logits/rejected": -3.035737991333008, "logps/chosen": -54.966087341308594, "logps/rejected": -53.1796760559082, "loss": 0.6928, "rewards/accuracies": 0.5609375238418579, "rewards/chosen": -0.0005089120240882039, "rewards/margins": 0.0006240031216293573, "rewards/rejected": -0.001132915262132883, "step": 340 }, { "epoch": 0.2412129565816678, "grad_norm": 1.2170838117599487, "learning_rate": 4.993703232704862e-08, "logits/chosen": -3.0822110176086426, "logits/rejected": -3.059418201446533, "logps/chosen": -54.97810745239258, "logps/rejected": -52.979820251464844, "loss": 0.6927, "rewards/accuracies": 0.604687511920929, "rewards/chosen": -0.0005054243374615908, "rewards/margins": 0.0008770185522735119, "rewards/rejected": -0.0013824428897351027, "step": 350 }, { "epoch": 0.24810475534114404, "grad_norm": 1.107391595840454, "learning_rate": 4.991389175660163e-08, "logits/chosen": -3.0396039485931396, "logits/rejected": -3.0273656845092773, "logps/chosen": -52.375274658203125, "logps/rejected": -53.336265563964844, "loss": 0.6928, "rewards/accuracies": 0.573437511920929, "rewards/chosen": -0.0008153729140758514, "rewards/margins": 0.0007024986553005874, "rewards/rejected": -0.001517871511168778, "step": 360 }, { "epoch": 0.2549965541006203, "grad_norm": 1.1218314170837402, "learning_rate": 4.98871443838751e-08, "logits/chosen": -3.114689350128174, "logits/rejected": -3.0790865421295166, "logps/chosen": -53.952476501464844, "logps/rejected": -52.38344192504883, "loss": 0.6926, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.0007519819191657007, "rewards/margins": 0.001077468739822507, "rewards/rejected": -0.0018294507171958685, "step": 370 }, { "epoch": 0.2618883528600965, "grad_norm": 1.1392273902893066, "learning_rate": 4.985679408110568e-08, "logits/chosen": -3.0398175716400146, "logits/rejected": -3.0220084190368652, "logps/chosen": -54.647239685058594, "logps/rejected": -52.84843826293945, "loss": 0.6927, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0008999688434414566, "rewards/margins": 0.0009324215352535248, "rewards/rejected": -0.001832390553317964, "step": 380 }, { "epoch": 0.2687801516195727, "grad_norm": 1.1045254468917847, "learning_rate": 4.9822845242128844e-08, "logits/chosen": -3.0233044624328613, "logits/rejected": -3.001706600189209, "logps/chosen": -53.072547912597656, "logps/rejected": -50.9716796875, "loss": 0.6926, "rewards/accuracies": 0.589062511920929, "rewards/chosen": -0.0011974747758358717, "rewards/margins": 0.001123163616284728, "rewards/rejected": -0.0023206386249512434, "step": 390 }, { "epoch": 0.27567195037904896, "grad_norm": 1.1129488945007324, "learning_rate": 4.9785302781742763e-08, "logits/chosen": -3.050330400466919, "logits/rejected": -3.035008192062378, "logps/chosen": -52.901397705078125, "logps/rejected": -54.134605407714844, "loss": 0.6927, "rewards/accuracies": 0.582812488079071, "rewards/chosen": -0.0011613852111622691, "rewards/margins": 0.0009022338199429214, "rewards/rejected": -0.0020636192057281733, "step": 400 }, { "epoch": 0.27567195037904896, "eval_logits/chosen": -3.1611053943634033, "eval_logits/rejected": -3.1554572582244873, "eval_logps/chosen": -58.63969039916992, "eval_logps/rejected": -63.135032653808594, "eval_loss": 0.6930131316184998, "eval_rewards/accuracies": 0.5257899761199951, "eval_rewards/chosen": 0.0007220551487989724, "eval_rewards/margins": 0.0002711908018682152, "eval_rewards/rejected": 0.00045086428872309625, "eval_runtime": 383.1949, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 400 }, { "epoch": 0.28256374913852517, "grad_norm": 1.2506204843521118, "learning_rate": 4.974417213499681e-08, "logits/chosen": -3.0777323246002197, "logits/rejected": -3.049983501434326, "logps/chosen": -55.058868408203125, "logps/rejected": -53.96419143676758, "loss": 0.6924, "rewards/accuracies": 0.609375, "rewards/chosen": -0.0010929839918389916, "rewards/margins": 0.0014660651795566082, "rewards/rejected": -0.0025590492878109217, "step": 410 }, { "epoch": 0.2894555478980014, "grad_norm": 1.237091302871704, "learning_rate": 4.9699459256404706e-08, "logits/chosen": -3.105699300765991, "logits/rejected": -3.0748677253723145, "logps/chosen": -55.66558837890625, "logps/rejected": -53.8339729309082, "loss": 0.6923, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -0.000828454561997205, "rewards/margins": 0.001723860390484333, "rewards/rejected": -0.002552315127104521, "step": 420 }, { "epoch": 0.2963473466574776, "grad_norm": 1.1707303524017334, "learning_rate": 4.965117061908251e-08, "logits/chosen": -3.056098461151123, "logits/rejected": -3.035871982574463, "logps/chosen": -55.13801193237305, "logps/rejected": -53.53112030029297, "loss": 0.6927, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0015201037749648094, "rewards/margins": 0.0009773834608495235, "rewards/rejected": -0.002497487235814333, "step": 430 }, { "epoch": 0.30323914541695385, "grad_norm": 1.1965198516845703, "learning_rate": 4.959931321381145e-08, "logits/chosen": -3.082432508468628, "logits/rejected": -3.063544750213623, "logps/chosen": -54.456016540527344, "logps/rejected": -54.16331100463867, "loss": 0.6925, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.0014482419937849045, "rewards/margins": 0.0013292920775711536, "rewards/rejected": -0.002777534071356058, "step": 440 }, { "epoch": 0.31013094417643006, "grad_norm": 1.2304091453552246, "learning_rate": 4.954389454802591e-08, "logits/chosen": -3.1104228496551514, "logits/rejected": -3.090036153793335, "logps/chosen": -53.494163513183594, "logps/rejected": -53.315879821777344, "loss": 0.6923, "rewards/accuracies": 0.5921875238418579, "rewards/chosen": -0.0016971270088106394, "rewards/margins": 0.0016467798268422484, "rewards/rejected": -0.003343907417729497, "step": 450 }, { "epoch": 0.31702274293590627, "grad_norm": 1.1292587518692017, "learning_rate": 4.948492264472656e-08, "logits/chosen": -3.1166298389434814, "logits/rejected": -3.094527006149292, "logps/chosen": -55.6964111328125, "logps/rejected": -53.82384490966797, "loss": 0.6925, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0016075067687779665, "rewards/margins": 0.001251583336852491, "rewards/rejected": -0.0028590902220457792, "step": 460 }, { "epoch": 0.3239145416953825, "grad_norm": 1.197009563446045, "learning_rate": 4.9422406041318844e-08, "logits/chosen": -3.0635745525360107, "logits/rejected": -3.038623094558716, "logps/chosen": -54.91028594970703, "logps/rejected": -53.81779861450195, "loss": 0.6918, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0011848278809338808, "rewards/margins": 0.002738000126555562, "rewards/rejected": -0.0039228275418281555, "step": 470 }, { "epoch": 0.33080634045485874, "grad_norm": 1.2151196002960205, "learning_rate": 4.9356353788377026e-08, "logits/chosen": -3.055495023727417, "logits/rejected": -3.0299649238586426, "logps/chosen": -55.23993682861328, "logps/rejected": -53.810813903808594, "loss": 0.692, "rewards/accuracies": 0.598437488079071, "rewards/chosen": -0.0015645608073100448, "rewards/margins": 0.0022706836462020874, "rewards/rejected": -0.003835244569927454, "step": 480 }, { "epoch": 0.33769813921433495, "grad_norm": 1.1114208698272705, "learning_rate": 4.9286775448333944e-08, "logits/chosen": -3.0453591346740723, "logits/rejected": -3.0262703895568848, "logps/chosen": -53.14439010620117, "logps/rejected": -53.70630645751953, "loss": 0.6922, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -0.0021999510936439037, "rewards/margins": 0.0019915387965738773, "rewards/rejected": -0.0041914889588952065, "step": 490 }, { "epoch": 0.34458993797381116, "grad_norm": 1.2440327405929565, "learning_rate": 4.921368109409663e-08, "logits/chosen": -3.0790770053863525, "logits/rejected": -3.0631680488586426, "logps/chosen": -53.35895538330078, "logps/rejected": -53.36548614501953, "loss": 0.692, "rewards/accuracies": 0.614062488079071, "rewards/chosen": -0.002265265677124262, "rewards/margins": 0.002222201321274042, "rewards/rejected": -0.004487467464059591, "step": 500 }, { "epoch": 0.34458993797381116, "eval_logits/chosen": -3.1591975688934326, "eval_logits/rejected": -3.153568983078003, "eval_logps/chosen": -58.59514236450195, "eval_logps/rejected": -63.1102180480957, "eval_loss": 0.6929171681404114, "eval_rewards/accuracies": 0.5246282815933228, "eval_rewards/chosen": 0.0011674691922962666, "eval_rewards/margins": 0.0004684112500399351, "eval_rewards/rejected": 0.0006990578840486705, "eval_runtime": 382.8893, "eval_samples_per_second": 11.241, "eval_steps_per_second": 1.405, "step": 500 }, { "epoch": 0.35148173673328736, "grad_norm": 1.1804462671279907, "learning_rate": 4.913708130758806e-08, "logits/chosen": -3.0682575702667236, "logits/rejected": -3.046999454498291, "logps/chosen": -54.03418731689453, "logps/rejected": -54.376319885253906, "loss": 0.692, "rewards/accuracies": 0.5953124761581421, "rewards/chosen": -0.002343302359804511, "rewards/margins": 0.002379921730607748, "rewards/rejected": -0.0047232238575816154, "step": 510 }, { "epoch": 0.35837353549276363, "grad_norm": 1.1343954801559448, "learning_rate": 4.9056987178215176e-08, "logits/chosen": -3.1094601154327393, "logits/rejected": -3.0802154541015625, "logps/chosen": -53.637245178222656, "logps/rejected": -53.262474060058594, "loss": 0.6922, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0020912564359605312, "rewards/margins": 0.002018420724198222, "rewards/rejected": -0.004109677392989397, "step": 520 }, { "epoch": 0.36526533425223984, "grad_norm": 1.1996898651123047, "learning_rate": 4.8973410301263516e-08, "logits/chosen": -3.051212787628174, "logits/rejected": -3.0387063026428223, "logps/chosen": -53.287681579589844, "logps/rejected": -53.440711975097656, "loss": 0.6922, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.0023356422316282988, "rewards/margins": 0.0020052504260092974, "rewards/rejected": -0.004340892191976309, "step": 530 }, { "epoch": 0.37215713301171605, "grad_norm": 1.164119839668274, "learning_rate": 4.8886362776218506e-08, "logits/chosen": -3.0033349990844727, "logits/rejected": -2.9812140464782715, "logps/chosen": -53.450355529785156, "logps/rejected": -51.471229553222656, "loss": 0.6919, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0030757987406104803, "rewards/margins": 0.0025067501701414585, "rewards/rejected": -0.005582548677921295, "step": 540 }, { "epoch": 0.37904893177119225, "grad_norm": 1.28213632106781, "learning_rate": 4.879585720501382e-08, "logits/chosen": -3.148085355758667, "logits/rejected": -3.127159595489502, "logps/chosen": -54.660545349121094, "logps/rejected": -53.745887756347656, "loss": 0.6919, "rewards/accuracies": 0.609375, "rewards/chosen": -0.0028407000936567783, "rewards/margins": 0.0024546708445996046, "rewards/rejected": -0.005295370705425739, "step": 550 }, { "epoch": 0.3859407305306685, "grad_norm": 1.3044832944869995, "learning_rate": 4.870190669020703e-08, "logits/chosen": -3.0593714714050293, "logits/rejected": -3.036311388015747, "logps/chosen": -55.014060974121094, "logps/rejected": -53.53757858276367, "loss": 0.6915, "rewards/accuracies": 0.609375, "rewards/chosen": -0.002411695895716548, "rewards/margins": 0.0032335221767425537, "rewards/rejected": -0.005645217839628458, "step": 560 }, { "epoch": 0.3928325292901447, "grad_norm": 1.1550047397613525, "learning_rate": 4.860452483308266e-08, "logits/chosen": -2.9982199668884277, "logits/rejected": -2.972108840942383, "logps/chosen": -56.20374298095703, "logps/rejected": -55.09558868408203, "loss": 0.6916, "rewards/accuracies": 0.5953124761581421, "rewards/chosen": -0.0028813418466597795, "rewards/margins": 0.0031816777773201466, "rewards/rejected": -0.00606301985681057, "step": 570 }, { "epoch": 0.39972432804962094, "grad_norm": 1.230724573135376, "learning_rate": 4.8503725731683204e-08, "logits/chosen": -3.0479977130889893, "logits/rejected": -3.0179476737976074, "logps/chosen": -54.623687744140625, "logps/rejected": -53.172157287597656, "loss": 0.6912, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.002868245355784893, "rewards/margins": 0.003858409356325865, "rewards/rejected": -0.006726655177772045, "step": 580 }, { "epoch": 0.4066161268090972, "grad_norm": 1.1609071493148804, "learning_rate": 4.839952397876808e-08, "logits/chosen": -3.0574018955230713, "logits/rejected": -3.039822816848755, "logps/chosen": -54.512779235839844, "logps/rejected": -54.206886291503906, "loss": 0.6914, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.0032920341473072767, "rewards/margins": 0.0035798237659037113, "rewards/rejected": -0.006871857680380344, "step": 590 }, { "epoch": 0.4135079255685734, "grad_norm": 1.0820258855819702, "learning_rate": 4.829193465970105e-08, "logits/chosen": -3.089672327041626, "logits/rejected": -3.069746494293213, "logps/chosen": -54.53960418701172, "logps/rejected": -53.9844970703125, "loss": 0.6915, "rewards/accuracies": 0.604687511920929, "rewards/chosen": -0.003799352329224348, "rewards/margins": 0.0033282779622823, "rewards/rejected": -0.007127630058676004, "step": 600 }, { "epoch": 0.4135079255685734, "eval_logits/chosen": -3.1564178466796875, "eval_logits/rejected": -3.150780200958252, "eval_logps/chosen": -58.54813766479492, "eval_logps/rejected": -63.1104850769043, "eval_loss": 0.6926856637001038, "eval_rewards/accuracies": 0.5504181981086731, "eval_rewards/chosen": 0.001637543668039143, "eval_rewards/margins": 0.0009411590872332454, "eval_rewards/rejected": 0.0006963845225982368, "eval_runtime": 383.4087, "eval_samples_per_second": 11.226, "eval_steps_per_second": 1.403, "step": 600 }, { "epoch": 0.4203997243280496, "grad_norm": 1.1835054159164429, "learning_rate": 4.818097335026631e-08, "logits/chosen": -3.101921319961548, "logits/rejected": -3.0772037506103516, "logps/chosen": -55.26588821411133, "logps/rejected": -53.28364181518555, "loss": 0.6912, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.002828064141795039, "rewards/margins": 0.00399785814806819, "rewards/rejected": -0.006825921591371298, "step": 610 }, { "epoch": 0.4272915230875258, "grad_norm": 1.203052043914795, "learning_rate": 4.806665611441354e-08, "logits/chosen": -3.077770233154297, "logits/rejected": -3.0505123138427734, "logps/chosen": -55.078880310058594, "logps/rejected": -52.72577667236328, "loss": 0.6916, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.004233444109559059, "rewards/margins": 0.0032121867407113314, "rewards/rejected": -0.007445631083101034, "step": 620 }, { "epoch": 0.4341833218470021, "grad_norm": 1.1228797435760498, "learning_rate": 4.794899950193235e-08, "logits/chosen": -3.0409035682678223, "logits/rejected": -3.0232279300689697, "logps/chosen": -53.423980712890625, "logps/rejected": -52.98765182495117, "loss": 0.6921, "rewards/accuracies": 0.5796874761581421, "rewards/chosen": -0.004781276918947697, "rewards/margins": 0.0022252718918025494, "rewards/rejected": -0.007006548345088959, "step": 630 }, { "epoch": 0.4410751206064783, "grad_norm": 1.262542486190796, "learning_rate": 4.782802054605635e-08, "logits/chosen": -3.0899507999420166, "logits/rejected": -3.0717437267303467, "logps/chosen": -55.078704833984375, "logps/rejected": -54.794776916503906, "loss": 0.6913, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.0041534146293997765, "rewards/margins": 0.0038144378922879696, "rewards/rejected": -0.007967852056026459, "step": 640 }, { "epoch": 0.4479669193659545, "grad_norm": 1.2199469804763794, "learning_rate": 4.77037367609972e-08, "logits/chosen": -3.0735621452331543, "logits/rejected": -3.0427281856536865, "logps/chosen": -56.89426803588867, "logps/rejected": -53.209136962890625, "loss": 0.6911, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.004497360438108444, "rewards/margins": 0.004101374186575413, "rewards/rejected": -0.008598734624683857, "step": 650 }, { "epoch": 0.4548587181254307, "grad_norm": 1.1544371843338013, "learning_rate": 4.7576166139409105e-08, "logits/chosen": -3.042221784591675, "logits/rejected": -3.0110714435577393, "logps/chosen": -54.11481475830078, "logps/rejected": -52.04207229614258, "loss": 0.6907, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.004763273987919092, "rewards/margins": 0.004999758210033178, "rewards/rejected": -0.00976303219795227, "step": 660 }, { "epoch": 0.461750516884907, "grad_norm": 1.281175136566162, "learning_rate": 4.744532714978399e-08, "logits/chosen": -3.0140280723571777, "logits/rejected": -2.9848811626434326, "logps/chosen": -56.1414680480957, "logps/rejected": -54.0085334777832, "loss": 0.6908, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.0041292086243629456, "rewards/margins": 0.004748177714645863, "rewards/rejected": -0.008877387270331383, "step": 670 }, { "epoch": 0.4686423156443832, "grad_norm": 1.1695414781570435, "learning_rate": 4.7311238733777815e-08, "logits/chosen": -3.046804428100586, "logits/rejected": -3.0304887294769287, "logps/chosen": -54.355079650878906, "logps/rejected": -54.04961395263672, "loss": 0.6911, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.005136381834745407, "rewards/margins": 0.00416863476857543, "rewards/rejected": -0.009305017068982124, "step": 680 }, { "epoch": 0.4755341144038594, "grad_norm": 1.1991028785705566, "learning_rate": 4.717392030346835e-08, "logits/chosen": -3.028083562850952, "logits/rejected": -3.011951446533203, "logps/chosen": -54.25959396362305, "logps/rejected": -54.1555061340332, "loss": 0.6911, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00518420897424221, "rewards/margins": 0.004269171506166458, "rewards/rejected": -0.009453380480408669, "step": 690 }, { "epoch": 0.4824259131633356, "grad_norm": 1.2611873149871826, "learning_rate": 4.70333917385449e-08, "logits/chosen": -3.079685926437378, "logits/rejected": -3.049795627593994, "logps/chosen": -55.45751190185547, "logps/rejected": -53.548301696777344, "loss": 0.6912, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.004976336378604174, "rewards/margins": 0.003996217157691717, "rewards/rejected": -0.00897255353629589, "step": 700 }, { "epoch": 0.4824259131633356, "eval_logits/chosen": -3.1538004875183105, "eval_logits/rejected": -3.1481423377990723, "eval_logps/chosen": -58.522918701171875, "eval_logps/rejected": -63.142425537109375, "eval_loss": 0.6924082636833191, "eval_rewards/accuracies": 0.5671468377113342, "eval_rewards/chosen": 0.0018897424452006817, "eval_rewards/margins": 0.0015127337537705898, "eval_rewards/rejected": 0.0003770088078454137, "eval_runtime": 383.1967, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 700 }, { "epoch": 0.48931771192281187, "grad_norm": 1.1392662525177002, "learning_rate": 4.688967338343029e-08, "logits/chosen": -3.0261685848236084, "logits/rejected": -3.0102686882019043, "logps/chosen": -54.990821838378906, "logps/rejected": -54.767127990722656, "loss": 0.691, "rewards/accuracies": 0.625, "rewards/chosen": -0.00540867168456316, "rewards/margins": 0.004442816134542227, "rewards/rejected": -0.0098514873534441, "step": 710 }, { "epoch": 0.4962095106822881, "grad_norm": 1.1779069900512695, "learning_rate": 4.6742786044335625e-08, "logits/chosen": -3.0809476375579834, "logits/rejected": -3.057307481765747, "logps/chosen": -55.18914794921875, "logps/rejected": -53.8927116394043, "loss": 0.6898, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.0052046263590455055, "rewards/margins": 0.006748650223016739, "rewards/rejected": -0.011953277513384819, "step": 720 }, { "epoch": 0.5031013094417643, "grad_norm": 1.2199147939682007, "learning_rate": 4.6592750986248085e-08, "logits/chosen": -3.107689380645752, "logits/rejected": -3.1000123023986816, "logps/chosen": -54.34379959106445, "logps/rejected": -54.853431701660156, "loss": 0.6911, "rewards/accuracies": 0.625, "rewards/chosen": -0.005630369298160076, "rewards/margins": 0.004252653103321791, "rewards/rejected": -0.009883022867143154, "step": 730 }, { "epoch": 0.5099931082012406, "grad_norm": 1.307981014251709, "learning_rate": 4.6439589929852476e-08, "logits/chosen": -3.0687716007232666, "logits/rejected": -3.0409016609191895, "logps/chosen": -53.86914825439453, "logps/rejected": -53.336158752441406, "loss": 0.6905, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.005739855580031872, "rewards/margins": 0.0054161581210792065, "rewards/rejected": -0.011156014166772366, "step": 740 }, { "epoch": 0.5168849069607168, "grad_norm": 1.1373140811920166, "learning_rate": 4.6283325048386624e-08, "logits/chosen": -3.0201470851898193, "logits/rejected": -2.998100519180298, "logps/chosen": -55.00568389892578, "logps/rejected": -54.43558883666992, "loss": 0.6903, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0055595808662474155, "rewards/margins": 0.005786740221083164, "rewards/rejected": -0.011346321552991867, "step": 750 }, { "epoch": 0.523776705720193, "grad_norm": 1.1454448699951172, "learning_rate": 4.612397896443138e-08, "logits/chosen": -3.118800163269043, "logits/rejected": -3.0978825092315674, "logps/chosen": -54.798065185546875, "logps/rejected": -54.3465461730957, "loss": 0.6909, "rewards/accuracies": 0.5796874761581421, "rewards/chosen": -0.007258473429828882, "rewards/margins": 0.004668924491852522, "rewards/rejected": -0.011927397921681404, "step": 760 }, { "epoch": 0.5306685044796692, "grad_norm": 1.1706945896148682, "learning_rate": 4.5961574746635536e-08, "logits/chosen": -3.012247323989868, "logits/rejected": -2.993521213531494, "logps/chosen": -55.298187255859375, "logps/rejected": -55.779624938964844, "loss": 0.6913, "rewards/accuracies": 0.582812488079071, "rewards/chosen": -0.00789455696940422, "rewards/margins": 0.0037407889030873775, "rewards/rejected": -0.011635346338152885, "step": 770 }, { "epoch": 0.5375603032391454, "grad_norm": 1.2820113897323608, "learning_rate": 4.5796135906376144e-08, "logits/chosen": -3.0310168266296387, "logits/rejected": -3.015160083770752, "logps/chosen": -54.29914474487305, "logps/rejected": -55.2180290222168, "loss": 0.691, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.007602076046168804, "rewards/margins": 0.004507972858846188, "rewards/rejected": -0.012110048905014992, "step": 780 }, { "epoch": 0.5444521019986216, "grad_norm": 1.1508716344833374, "learning_rate": 4.5627686394354766e-08, "logits/chosen": -3.0379862785339355, "logits/rejected": -3.017380475997925, "logps/chosen": -53.72552490234375, "logps/rejected": -54.60520553588867, "loss": 0.6901, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.0065218256786465645, "rewards/margins": 0.006237885914742947, "rewards/rejected": -0.012759710662066936, "step": 790 }, { "epoch": 0.5513439007580979, "grad_norm": 1.1988805532455444, "learning_rate": 4.545625059713011e-08, "logits/chosen": -3.0689666271209717, "logits/rejected": -3.046346664428711, "logps/chosen": -54.87028121948242, "logps/rejected": -53.7490119934082, "loss": 0.69, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.006772381253540516, "rewards/margins": 0.006438801996409893, "rewards/rejected": -0.013211183249950409, "step": 800 }, { "epoch": 0.5513439007580979, "eval_logits/chosen": -3.1500000953674316, "eval_logits/rejected": -3.1443684101104736, "eval_logps/chosen": -58.52486038208008, "eval_logps/rejected": -63.183868408203125, "eval_loss": 0.69222092628479, "eval_rewards/accuracies": 0.5759758353233337, "eval_rewards/chosen": 0.001870311563834548, "eval_rewards/margins": 0.0019077310571447015, "eval_rewards/rejected": -3.741981345228851e-05, "eval_runtime": 383.2221, "eval_samples_per_second": 11.231, "eval_steps_per_second": 1.404, "step": 800 }, { "epoch": 0.5582356995175741, "grad_norm": 1.181986927986145, "learning_rate": 4.528185333358756e-08, "logits/chosen": -3.026899814605713, "logits/rejected": -3.0093157291412354, "logps/chosen": -54.46189498901367, "logps/rejected": -54.8513298034668, "loss": 0.6908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00789581798017025, "rewards/margins": 0.004861229099333286, "rewards/rejected": -0.01275704801082611, "step": 810 }, { "epoch": 0.5651274982770503, "grad_norm": 1.1672871112823486, "learning_rate": 4.510451985134616e-08, "logits/chosen": -3.0875649452209473, "logits/rejected": -3.0743203163146973, "logps/chosen": -53.040733337402344, "logps/rejected": -55.541954040527344, "loss": 0.6903, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.00876162201166153, "rewards/margins": 0.00582465436309576, "rewards/rejected": -0.014586275443434715, "step": 820 }, { "epoch": 0.5720192970365265, "grad_norm": 1.2362406253814697, "learning_rate": 4.492427582310346e-08, "logits/chosen": -3.0630054473876953, "logits/rejected": -3.0335052013397217, "logps/chosen": -54.42986297607422, "logps/rejected": -53.1016845703125, "loss": 0.6894, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.007075751665979624, "rewards/margins": 0.007580357138067484, "rewards/rejected": -0.014656109735369682, "step": 830 }, { "epoch": 0.5789110957960028, "grad_norm": 1.1612728834152222, "learning_rate": 4.4741147342918894e-08, "logits/chosen": -3.076169013977051, "logits/rejected": -3.0513038635253906, "logps/chosen": -55.8946533203125, "logps/rejected": -55.85911178588867, "loss": 0.689, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.0071454280987381935, "rewards/margins": 0.008417905308306217, "rewards/rejected": -0.01556333340704441, "step": 840 }, { "epoch": 0.585802894555479, "grad_norm": 1.1926907300949097, "learning_rate": 4.4555160922436074e-08, "logits/chosen": -3.079662322998047, "logits/rejected": -3.0524630546569824, "logps/chosen": -54.04046630859375, "logps/rejected": -53.262847900390625, "loss": 0.6896, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.007903190329670906, "rewards/margins": 0.007286435458809137, "rewards/rejected": -0.015189625322818756, "step": 850 }, { "epoch": 0.5926946933149552, "grad_norm": 1.2318311929702759, "learning_rate": 4.4366343487044754e-08, "logits/chosen": -3.031019926071167, "logits/rejected": -3.0084445476531982, "logps/chosen": -52.4871711730957, "logps/rejected": -53.59075927734375, "loss": 0.6897, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.009235886856913567, "rewards/margins": 0.007172322832047939, "rewards/rejected": -0.01640820875763893, "step": 860 }, { "epoch": 0.5995864920744314, "grad_norm": 1.2643660306930542, "learning_rate": 4.417472237198275e-08, "logits/chosen": -3.122987985610962, "logits/rejected": -3.097611665725708, "logps/chosen": -56.34685134887695, "logps/rejected": -55.146095275878906, "loss": 0.6896, "rewards/accuracies": 0.620312511920929, "rewards/chosen": -0.0072593227960169315, "rewards/margins": 0.007286491803824902, "rewards/rejected": -0.01454581506550312, "step": 870 }, { "epoch": 0.6064782908339077, "grad_norm": 1.2349611520767212, "learning_rate": 4.398032531837865e-08, "logits/chosen": -3.000382423400879, "logits/rejected": -2.979700803756714, "logps/chosen": -54.820579528808594, "logps/rejected": -54.77504348754883, "loss": 0.69, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.010271805338561535, "rewards/margins": 0.006492338143289089, "rewards/rejected": -0.016764143481850624, "step": 880 }, { "epoch": 0.6133700895933839, "grad_norm": 1.2228236198425293, "learning_rate": 4.378318046923567e-08, "logits/chosen": -3.046607494354248, "logits/rejected": -3.0200607776641846, "logps/chosen": -55.26753616333008, "logps/rejected": -54.108428955078125, "loss": 0.6894, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -0.00934204924851656, "rewards/margins": 0.007791099604219198, "rewards/rejected": -0.01713315024971962, "step": 890 }, { "epoch": 0.6202618883528601, "grad_norm": 1.186522126197815, "learning_rate": 4.3583316365357413e-08, "logits/chosen": -3.081699848175049, "logits/rejected": -3.0569376945495605, "logps/chosen": -56.97715377807617, "logps/rejected": -55.800636291503906, "loss": 0.6893, "rewards/accuracies": 0.614062488079071, "rewards/chosen": -0.009422613307833672, "rewards/margins": 0.007982470095157623, "rewards/rejected": -0.017405081540346146, "step": 900 }, { "epoch": 0.6202618883528601, "eval_logits/chosen": -3.145947217941284, "eval_logits/rejected": -3.140315532684326, "eval_logps/chosen": -58.54254913330078, "eval_logps/rejected": -63.26302719116211, "eval_loss": 0.6919277906417847, "eval_rewards/accuracies": 0.5708643198013306, "eval_rewards/chosen": 0.0016934837913140655, "eval_rewards/margins": 0.00252249906770885, "eval_rewards/rejected": -0.0008290152181871235, "eval_runtime": 383.2519, "eval_samples_per_second": 11.23, "eval_steps_per_second": 1.404, "step": 900 }, { "epoch": 0.6271536871123363, "grad_norm": 1.234681248664856, "learning_rate": 4.3380761941215947e-08, "logits/chosen": -3.046011447906494, "logits/rejected": -3.0302977561950684, "logps/chosen": -54.25246047973633, "logps/rejected": -55.46947479248047, "loss": 0.6893, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.009562310762703419, "rewards/margins": 0.007873213849961758, "rewards/rejected": -0.017435524612665176, "step": 910 }, { "epoch": 0.6340454858718125, "grad_norm": 1.141934871673584, "learning_rate": 4.317554652076299e-08, "logits/chosen": -3.054769992828369, "logits/rejected": -3.0311903953552246, "logps/chosen": -54.04453659057617, "logps/rejected": -54.37770462036133, "loss": 0.6894, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009277190081775188, "rewards/margins": 0.0076979040168225765, "rewards/rejected": -0.016975093632936478, "step": 920 }, { "epoch": 0.6409372846312887, "grad_norm": 1.236680269241333, "learning_rate": 4.2967699813184615e-08, "logits/chosen": -3.0500195026397705, "logits/rejected": -3.0328176021575928, "logps/chosen": -54.70762252807617, "logps/rejected": -57.55879592895508, "loss": 0.6884, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.008434431627392769, "rewards/margins": 0.009904151782393456, "rewards/rejected": -0.018338583409786224, "step": 930 }, { "epoch": 0.647829083390765, "grad_norm": 1.2360023260116577, "learning_rate": 4.275725190860027e-08, "logits/chosen": -3.073611259460449, "logits/rejected": -3.0537660121917725, "logps/chosen": -55.351104736328125, "logps/rejected": -55.8747673034668, "loss": 0.6896, "rewards/accuracies": 0.609375, "rewards/chosen": -0.010648580268025398, "rewards/margins": 0.007464288733899593, "rewards/rejected": -0.018112869933247566, "step": 940 }, { "epoch": 0.6547208821502413, "grad_norm": 1.2623155117034912, "learning_rate": 4.2544233273706585e-08, "logits/chosen": -3.0598671436309814, "logits/rejected": -3.0294106006622314, "logps/chosen": -55.8059196472168, "logps/rejected": -53.73136520385742, "loss": 0.6893, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.009804973378777504, "rewards/margins": 0.007985373958945274, "rewards/rejected": -0.01779034733772278, "step": 950 }, { "epoch": 0.6616126809097175, "grad_norm": 1.2945950031280518, "learning_rate": 4.232867474736669e-08, "logits/chosen": -3.0672502517700195, "logits/rejected": -3.0369277000427246, "logps/chosen": -56.809417724609375, "logps/rejected": -55.6953239440918, "loss": 0.6879, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.007269621826708317, "rewards/margins": 0.010824671015143394, "rewards/rejected": -0.018094293773174286, "step": 960 }, { "epoch": 0.6685044796691937, "grad_norm": 1.1434519290924072, "learning_rate": 4.211060753614565e-08, "logits/chosen": -3.1128265857696533, "logits/rejected": -3.0972368717193604, "logps/chosen": -56.41877365112305, "logps/rejected": -55.6785774230957, "loss": 0.6903, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.011154340580105782, "rewards/margins": 0.005970745347440243, "rewards/rejected": -0.0171250868588686, "step": 970 }, { "epoch": 0.6753962784286699, "grad_norm": 1.1750149726867676, "learning_rate": 4.1890063209792674e-08, "logits/chosen": -3.1158690452575684, "logits/rejected": -3.079075336456299, "logps/chosen": -57.21317672729492, "logps/rejected": -53.88423538208008, "loss": 0.6876, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.008817395195364952, "rewards/margins": 0.01136676874011755, "rewards/rejected": -0.020184166729450226, "step": 980 }, { "epoch": 0.6822880771881461, "grad_norm": 1.3042854070663452, "learning_rate": 4.166707369667073e-08, "logits/chosen": -3.045738458633423, "logits/rejected": -3.0281968116760254, "logps/chosen": -54.19719696044922, "logps/rejected": -56.0754280090332, "loss": 0.6888, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.009636206552386284, "rewards/margins": 0.009063459932804108, "rewards/rejected": -0.01869966648519039, "step": 990 }, { "epoch": 0.6891798759476223, "grad_norm": 1.22942054271698, "learning_rate": 4.144167127913426e-08, "logits/chosen": -3.075810194015503, "logits/rejected": -3.052361249923706, "logps/chosen": -55.58427810668945, "logps/rejected": -55.51273727416992, "loss": 0.6892, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.011259499937295914, "rewards/margins": 0.008291425183415413, "rewards/rejected": -0.019550926983356476, "step": 1000 }, { "epoch": 0.6891798759476223, "eval_logits/chosen": -3.1417765617370605, "eval_logits/rejected": -3.136131525039673, "eval_logps/chosen": -58.606346130371094, "eval_logps/rejected": -63.375797271728516, "eval_loss": 0.6916959881782532, "eval_rewards/accuracies": 0.5724906921386719, "eval_rewards/chosen": 0.001055453554727137, "eval_rewards/margins": 0.0030122159514576197, "eval_rewards/rejected": -0.0019567625131458044, "eval_runtime": 382.8427, "eval_samples_per_second": 11.242, "eval_steps_per_second": 1.405, "step": 1000 }, { "epoch": 0.6960716747070985, "grad_norm": 1.228550910949707, "learning_rate": 4.1213888588855636e-08, "logits/chosen": -3.0645551681518555, "logits/rejected": -3.0503764152526855, "logps/chosen": -54.256507873535156, "logps/rejected": -55.95310592651367, "loss": 0.6892, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.011437224224209785, "rewards/margins": 0.008133414201438427, "rewards/rejected": -0.019570637494325638, "step": 1010 }, { "epoch": 0.7029634734665747, "grad_norm": 1.290880560874939, "learning_rate": 4.098375860210107e-08, "logits/chosen": -3.0364532470703125, "logits/rejected": -3.0164756774902344, "logps/chosen": -54.49522018432617, "logps/rejected": -55.22959518432617, "loss": 0.6887, "rewards/accuracies": 0.598437488079071, "rewards/chosen": -0.013310156762599945, "rewards/margins": 0.0093264514580369, "rewards/rejected": -0.02263660728931427, "step": 1020 }, { "epoch": 0.709855272226051, "grad_norm": 1.234087586402893, "learning_rate": 4.075131463495657e-08, "logits/chosen": -3.0410397052764893, "logits/rejected": -3.023860454559326, "logps/chosen": -54.86391067504883, "logps/rejected": -54.73369598388672, "loss": 0.6886, "rewards/accuracies": 0.604687511920929, "rewards/chosen": -0.012769045308232307, "rewards/margins": 0.009414998814463615, "rewards/rejected": -0.022184044122695923, "step": 1030 }, { "epoch": 0.7167470709855273, "grad_norm": 1.1497515439987183, "learning_rate": 4.051659033850477e-08, "logits/chosen": -3.0711050033569336, "logits/rejected": -3.0434327125549316, "logps/chosen": -55.960113525390625, "logps/rejected": -53.39757537841797, "loss": 0.6876, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.01206748653203249, "rewards/margins": 0.011478706263005733, "rewards/rejected": -0.023546192795038223, "step": 1040 }, { "epoch": 0.7236388697450035, "grad_norm": 1.241176724433899, "learning_rate": 4.0279619693953283e-08, "logits/chosen": -3.0579118728637695, "logits/rejected": -3.044525623321533, "logps/chosen": -54.537757873535156, "logps/rejected": -55.7606201171875, "loss": 0.6891, "rewards/accuracies": 0.5953124761581421, "rewards/chosen": -0.01158697810024023, "rewards/margins": 0.008560305461287498, "rewards/rejected": -0.020147282630205154, "step": 1050 }, { "epoch": 0.7305306685044797, "grad_norm": 1.287839651107788, "learning_rate": 4.0040437007715124e-08, "logits/chosen": -3.0260822772979736, "logits/rejected": -3.0041518211364746, "logps/chosen": -55.73114013671875, "logps/rejected": -56.4024772644043, "loss": 0.6875, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.013397350907325745, "rewards/margins": 0.01179309468716383, "rewards/rejected": -0.02519044652581215, "step": 1060 }, { "epoch": 0.7374224672639559, "grad_norm": 1.1840453147888184, "learning_rate": 3.979907690644222e-08, "logits/chosen": -3.005467653274536, "logits/rejected": -2.9843525886535645, "logps/chosen": -54.47725296020508, "logps/rejected": -54.86272048950195, "loss": 0.688, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.012515179812908173, "rewards/margins": 0.010800262913107872, "rewards/rejected": -0.023315440863370895, "step": 1070 }, { "epoch": 0.7443142660234321, "grad_norm": 1.2041012048721313, "learning_rate": 3.9555574332012454e-08, "logits/chosen": -3.0442147254943848, "logits/rejected": -3.0234692096710205, "logps/chosen": -56.4234619140625, "logps/rejected": -55.07111740112305, "loss": 0.6882, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.013009254820644855, "rewards/margins": 0.01034192182123661, "rewards/rejected": -0.02335117571055889, "step": 1080 }, { "epoch": 0.7512060647829083, "grad_norm": 1.192734956741333, "learning_rate": 3.930996453647113e-08, "logits/chosen": -3.008514881134033, "logits/rejected": -2.986760139465332, "logps/chosen": -53.92486572265625, "logps/rejected": -53.8699951171875, "loss": 0.6886, "rewards/accuracies": 0.609375, "rewards/chosen": -0.016327153891324997, "rewards/margins": 0.009530487470328808, "rewards/rejected": -0.02585764229297638, "step": 1090 }, { "epoch": 0.7580978635423845, "grad_norm": 1.1945998668670654, "learning_rate": 3.906228307692747e-08, "logits/chosen": -3.050058126449585, "logits/rejected": -3.0325589179992676, "logps/chosen": -56.26338577270508, "logps/rejected": -56.20615768432617, "loss": 0.6892, "rewards/accuracies": 0.598437488079071, "rewards/chosen": -0.014123663306236267, "rewards/margins": 0.0081967543810606, "rewards/rejected": -0.022320415824651718, "step": 1100 }, { "epoch": 0.7580978635423845, "eval_logits/chosen": -3.136918067932129, "eval_logits/rejected": -3.131256103515625, "eval_logps/chosen": -58.693904876708984, "eval_logps/rejected": -63.52504348754883, "eval_loss": 0.6914047598838806, "eval_rewards/accuracies": 0.5808550119400024, "eval_rewards/chosen": 0.00017988457693718374, "eval_rewards/margins": 0.003629034385085106, "eval_rewards/rejected": -0.0034491494297981262, "eval_runtime": 382.7678, "eval_samples_per_second": 11.244, "eval_steps_per_second": 1.406, "step": 1100 }, { "epoch": 0.7649896623018608, "grad_norm": 1.2611422538757324, "learning_rate": 3.8812565810407006e-08, "logits/chosen": -3.0509583950042725, "logits/rejected": -3.019794225692749, "logps/chosen": -57.166297912597656, "logps/rejected": -55.56831741333008, "loss": 0.6876, "rewards/accuracies": 0.614062488079071, "rewards/chosen": -0.014432080090045929, "rewards/margins": 0.01164001040160656, "rewards/rejected": -0.02607208862900734, "step": 1110 }, { "epoch": 0.771881461061337, "grad_norm": 1.1777701377868652, "learning_rate": 3.856084888866052e-08, "logits/chosen": -3.0596282482147217, "logits/rejected": -3.045269250869751, "logps/chosen": -55.52899932861328, "logps/rejected": -54.93024444580078, "loss": 0.6888, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.01683993637561798, "rewards/margins": 0.009132475592195988, "rewards/rejected": -0.025972411036491394, "step": 1120 }, { "epoch": 0.7787732598208132, "grad_norm": 1.2296311855316162, "learning_rate": 3.830716875293038e-08, "logits/chosen": -3.0673999786376953, "logits/rejected": -3.0444142818450928, "logps/chosen": -54.970741271972656, "logps/rejected": -54.68275833129883, "loss": 0.6885, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.015940912067890167, "rewards/margins": 0.009639047086238861, "rewards/rejected": -0.02557995915412903, "step": 1130 }, { "epoch": 0.7856650585802895, "grad_norm": 1.1905580759048462, "learning_rate": 3.805156212867483e-08, "logits/chosen": -3.029092788696289, "logits/rejected": -3.0086588859558105, "logps/chosen": -56.229042053222656, "logps/rejected": -55.34952926635742, "loss": 0.6875, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.01458609290421009, "rewards/margins": 0.011665640398859978, "rewards/rejected": -0.02625173330307007, "step": 1140 }, { "epoch": 0.7925568573397657, "grad_norm": 1.1570724248886108, "learning_rate": 3.779406602025128e-08, "logits/chosen": -3.007833957672119, "logits/rejected": -2.9827017784118652, "logps/chosen": -55.04015350341797, "logps/rejected": -55.21650314331055, "loss": 0.6881, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.01640317775309086, "rewards/margins": 0.010581018403172493, "rewards/rejected": -0.02698419615626335, "step": 1150 }, { "epoch": 0.7994486560992419, "grad_norm": 1.211165189743042, "learning_rate": 3.7534717705559146e-08, "logits/chosen": -3.036921977996826, "logits/rejected": -3.0160536766052246, "logps/chosen": -56.46533203125, "logps/rejected": -57.42781448364258, "loss": 0.6882, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.017415925860404968, "rewards/margins": 0.01034791674464941, "rewards/rejected": -0.027763843536376953, "step": 1160 }, { "epoch": 0.8063404548587181, "grad_norm": 1.1748243570327759, "learning_rate": 3.727355473064308e-08, "logits/chosen": -3.05203914642334, "logits/rejected": -3.024839162826538, "logps/chosen": -54.88653564453125, "logps/rejected": -54.187705993652344, "loss": 0.6875, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.015425342135131359, "rewards/margins": 0.011839036829769611, "rewards/rejected": -0.02726438082754612, "step": 1170 }, { "epoch": 0.8132322536181944, "grad_norm": 1.2590429782867432, "learning_rate": 3.701061490425745e-08, "logits/chosen": -3.053898334503174, "logits/rejected": -3.0290002822875977, "logps/chosen": -57.20033645629883, "logps/rejected": -56.57124710083008, "loss": 0.6873, "rewards/accuracies": 0.604687511920929, "rewards/chosen": -0.016385816037654877, "rewards/margins": 0.012309985235333443, "rewards/rejected": -0.02869580127298832, "step": 1180 }, { "epoch": 0.8201240523776706, "grad_norm": 1.2485055923461914, "learning_rate": 3.6745936292392666e-08, "logits/chosen": -3.021477460861206, "logits/rejected": -3.0019021034240723, "logps/chosen": -55.60076141357422, "logps/rejected": -55.449058532714844, "loss": 0.6881, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.01587783917784691, "rewards/margins": 0.010499360039830208, "rewards/rejected": -0.026377201080322266, "step": 1190 }, { "epoch": 0.8270158511371468, "grad_norm": 1.2800626754760742, "learning_rate": 3.6479557212764414e-08, "logits/chosen": -3.028402090072632, "logits/rejected": -3.008002519607544, "logps/chosen": -56.96452713012695, "logps/rejected": -55.789756774902344, "loss": 0.6885, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.01752752624452114, "rewards/margins": 0.009723445400595665, "rewards/rejected": -0.027250971645116806, "step": 1200 }, { "epoch": 0.8270158511371468, "eval_logits/chosen": -3.133814811706543, "eval_logits/rejected": -3.128159284591675, "eval_logps/chosen": -58.78531265258789, "eval_logps/rejected": -63.680179595947266, "eval_loss": 0.6911039352416992, "eval_rewards/accuracies": 0.5755111575126648, "eval_rewards/chosen": -0.0007341906311921775, "eval_rewards/margins": 0.004266415257006884, "eval_rewards/rejected": -0.005000605713576078, "eval_runtime": 383.3119, "eval_samples_per_second": 11.228, "eval_steps_per_second": 1.404, "step": 1200 }, { "epoch": 0.833907649896623, "grad_norm": 1.3170323371887207, "learning_rate": 3.621151622926631e-08, "logits/chosen": -3.022981643676758, "logits/rejected": -2.9983408451080322, "logps/chosen": -56.321983337402344, "logps/rejected": -55.284454345703125, "loss": 0.6875, "rewards/accuracies": 0.609375, "rewards/chosen": -0.017917579039931297, "rewards/margins": 0.011845814064145088, "rewards/rejected": -0.029763391241431236, "step": 1210 }, { "epoch": 0.8407994486560992, "grad_norm": 1.170351505279541, "learning_rate": 3.594185214638704e-08, "logits/chosen": -3.066943407058716, "logits/rejected": -3.0385823249816895, "logps/chosen": -57.5960807800293, "logps/rejected": -54.60730743408203, "loss": 0.6872, "rewards/accuracies": 0.640625, "rewards/chosen": -0.0178567823022604, "rewards/margins": 0.012462841346859932, "rewards/rejected": -0.03031962178647518, "step": 1220 }, { "epoch": 0.8476912474155754, "grad_norm": 1.224771499633789, "learning_rate": 3.567060400359253e-08, "logits/chosen": -3.0506491661071777, "logits/rejected": -3.0242903232574463, "logps/chosen": -56.49556350708008, "logps/rejected": -55.71235275268555, "loss": 0.686, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.017950473353266716, "rewards/margins": 0.014979615807533264, "rewards/rejected": -0.03293009102344513, "step": 1230 }, { "epoch": 0.8545830461750517, "grad_norm": 1.2280082702636719, "learning_rate": 3.5397811069674256e-08, "logits/chosen": -3.037538528442383, "logits/rejected": -3.023832082748413, "logps/chosen": -56.15264129638672, "logps/rejected": -58.523162841796875, "loss": 0.6886, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.02116158790886402, "rewards/margins": 0.0096513070166111, "rewards/rejected": -0.03081289492547512, "step": 1240 }, { "epoch": 0.8614748449345279, "grad_norm": 1.3131028413772583, "learning_rate": 3.512351283706419e-08, "logits/chosen": -3.0145888328552246, "logits/rejected": -3.0035436153411865, "logps/chosen": -55.00154495239258, "logps/rejected": -56.4818229675293, "loss": 0.6892, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.02046709507703781, "rewards/margins": 0.008517416194081306, "rewards/rejected": -0.02898450754582882, "step": 1250 }, { "epoch": 0.8683666436940042, "grad_norm": 1.2719794511795044, "learning_rate": 3.484774901611753e-08, "logits/chosen": -3.037191390991211, "logits/rejected": -3.011030673980713, "logps/chosen": -56.1077880859375, "logps/rejected": -55.119110107421875, "loss": 0.6874, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.020265808328986168, "rewards/margins": 0.012135522440075874, "rewards/rejected": -0.03240133076906204, "step": 1260 }, { "epoch": 0.8752584424534804, "grad_norm": 1.223455786705017, "learning_rate": 3.4570559529363756e-08, "logits/chosen": -3.0510623455047607, "logits/rejected": -3.0273430347442627, "logps/chosen": -56.024391174316406, "logps/rejected": -54.66645431518555, "loss": 0.687, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.01942160725593567, "rewards/margins": 0.012839846312999725, "rewards/rejected": -0.032261453568935394, "step": 1270 }, { "epoch": 0.8821502412129566, "grad_norm": 1.224640965461731, "learning_rate": 3.429198450572702e-08, "logits/chosen": -3.045257568359375, "logits/rejected": -3.0113143920898438, "logps/chosen": -57.24846267700195, "logps/rejected": -55.42502975463867, "loss": 0.6862, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.019725024700164795, "rewards/margins": 0.014653083868324757, "rewards/rejected": -0.03437810391187668, "step": 1280 }, { "epoch": 0.8890420399724328, "grad_norm": 1.1708803176879883, "learning_rate": 3.401206427471665e-08, "logits/chosen": -3.083014965057373, "logits/rejected": -3.053872585296631, "logps/chosen": -55.94194412231445, "logps/rejected": -54.92655563354492, "loss": 0.6859, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.019591879099607468, "rewards/margins": 0.015089405700564384, "rewards/rejected": -0.034681286662817, "step": 1290 }, { "epoch": 0.895933838731909, "grad_norm": 1.2637726068496704, "learning_rate": 3.3730839360588633e-08, "logits/chosen": -3.0728800296783447, "logits/rejected": -3.0561113357543945, "logps/chosen": -55.9746208190918, "logps/rejected": -56.79145431518555, "loss": 0.6877, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -0.020700206980109215, "rewards/margins": 0.011441068723797798, "rewards/rejected": -0.03214127570390701, "step": 1300 }, { "epoch": 0.895933838731909, "eval_logits/chosen": -3.12795090675354, "eval_logits/rejected": -3.1222612857818604, "eval_logps/chosen": -58.95671081542969, "eval_logps/rejected": -63.90719223022461, "eval_loss": 0.690849244594574, "eval_rewards/accuracies": 0.578066885471344, "eval_rewards/chosen": -0.002448198851197958, "eval_rewards/margins": 0.004822447896003723, "eval_rewards/rejected": -0.007270646747201681, "eval_runtime": 383.2075, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 1300 }, { "epoch": 0.9028256374913852, "grad_norm": 1.2189267873764038, "learning_rate": 3.344835047647892e-08, "logits/chosen": -3.038391351699829, "logits/rejected": -3.0231399536132812, "logps/chosen": -55.052391052246094, "logps/rejected": -57.159202575683594, "loss": 0.6869, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.01980864442884922, "rewards/margins": 0.013179932720959187, "rewards/rejected": -0.03298857808113098, "step": 1310 }, { "epoch": 0.9097174362508614, "grad_norm": 1.2375820875167847, "learning_rate": 3.316463851850925e-08, "logits/chosen": -3.0543713569641113, "logits/rejected": -3.0287716388702393, "logps/chosen": -55.84870147705078, "logps/rejected": -54.72203826904297, "loss": 0.6873, "rewards/accuracies": 0.620312511920929, "rewards/chosen": -0.024270061403512955, "rewards/margins": 0.012434338219463825, "rewards/rejected": -0.036704398691654205, "step": 1320 }, { "epoch": 0.9166092350103378, "grad_norm": 1.2331100702285767, "learning_rate": 3.287974455986671e-08, "logits/chosen": -3.0482242107391357, "logits/rejected": -3.0168027877807617, "logps/chosen": -58.51416015625, "logps/rejected": -55.834266662597656, "loss": 0.6859, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -0.018537839874625206, "rewards/margins": 0.015199096873402596, "rewards/rejected": -0.0337369367480278, "step": 1330 }, { "epoch": 0.923501033769814, "grad_norm": 1.2201625108718872, "learning_rate": 3.259370984485746e-08, "logits/chosen": -3.0217771530151367, "logits/rejected": -2.998465061187744, "logps/chosen": -55.553428649902344, "logps/rejected": -56.95924758911133, "loss": 0.6867, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.022853773087263107, "rewards/margins": 0.013605187647044659, "rewards/rejected": -0.03645896166563034, "step": 1340 }, { "epoch": 0.9303928325292902, "grad_norm": 1.2265971899032593, "learning_rate": 3.2306575782935806e-08, "logits/chosen": -3.043489456176758, "logits/rejected": -3.013939380645752, "logps/chosen": -56.77729415893555, "logps/rejected": -56.281822204589844, "loss": 0.6854, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.020281706005334854, "rewards/margins": 0.016187874600291252, "rewards/rejected": -0.036469582468271255, "step": 1350 }, { "epoch": 0.9372846312887664, "grad_norm": 1.245123267173767, "learning_rate": 3.201838394270931e-08, "logits/chosen": -3.064115524291992, "logits/rejected": -3.0484519004821777, "logps/chosen": -57.521820068359375, "logps/rejected": -57.416893005371094, "loss": 0.6874, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -0.024825390428304672, "rewards/margins": 0.012256421148777008, "rewards/rejected": -0.03708181157708168, "step": 1360 }, { "epoch": 0.9441764300482426, "grad_norm": 1.2887479066848755, "learning_rate": 3.172917604592084e-08, "logits/chosen": -3.0228118896484375, "logits/rejected": -3.0045721530914307, "logps/chosen": -55.98120880126953, "logps/rejected": -55.73798751831055, "loss": 0.6871, "rewards/accuracies": 0.598437488079071, "rewards/chosen": -0.025501202791929245, "rewards/margins": 0.012837904505431652, "rewards/rejected": -0.03833910822868347, "step": 1370 }, { "epoch": 0.9510682288077188, "grad_norm": 1.2927711009979248, "learning_rate": 3.143899396140856e-08, "logits/chosen": -3.03184175491333, "logits/rejected": -3.010230779647827, "logps/chosen": -56.847740173339844, "logps/rejected": -56.31840133666992, "loss": 0.6871, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.022407762706279755, "rewards/margins": 0.01289152167737484, "rewards/rejected": -0.035299282521009445, "step": 1380 }, { "epoch": 0.957960027567195, "grad_norm": 1.2551859617233276, "learning_rate": 3.114787969904446e-08, "logits/chosen": -3.001943826675415, "logits/rejected": -2.9838249683380127, "logps/chosen": -56.46649169921875, "logps/rejected": -57.01629638671875, "loss": 0.6865, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.022768724709749222, "rewards/margins": 0.014040583744645119, "rewards/rejected": -0.03680930659174919, "step": 1390 }, { "epoch": 0.9648518263266712, "grad_norm": 1.1776050329208374, "learning_rate": 3.085587540365262e-08, "logits/chosen": -3.054063081741333, "logits/rejected": -3.0365357398986816, "logps/chosen": -55.7647819519043, "logps/rejected": -59.496559143066406, "loss": 0.6874, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -0.024700012058019638, "rewards/margins": 0.01230792049318552, "rewards/rejected": -0.03700793534517288, "step": 1400 }, { "epoch": 0.9648518263266712, "eval_logits/chosen": -3.1261656284332275, "eval_logits/rejected": -3.1204779148101807, "eval_logps/chosen": -59.10846710205078, "eval_logps/rejected": -64.10256958007812, "eval_loss": 0.6906503438949585, "eval_rewards/accuracies": 0.5771375298500061, "eval_rewards/chosen": -0.003965714480727911, "eval_rewards/margins": 0.0052587250247597694, "eval_rewards/rejected": -0.009224439039826393, "eval_runtime": 383.1495, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 1400 }, { "epoch": 0.9717436250861475, "grad_norm": 1.347545862197876, "learning_rate": 3.056302334890786e-08, "logits/chosen": -3.0551466941833496, "logits/rejected": -3.0341668128967285, "logps/chosen": -56.55133056640625, "logps/rejected": -57.29961395263672, "loss": 0.6861, "rewards/accuracies": 0.609375, "rewards/chosen": -0.02232871949672699, "rewards/margins": 0.014838054776191711, "rewards/rejected": -0.037166766822338104, "step": 1410 }, { "epoch": 0.9786354238456237, "grad_norm": 1.2241698503494263, "learning_rate": 3.02693659312157e-08, "logits/chosen": -2.9941155910491943, "logits/rejected": -2.9760937690734863, "logps/chosen": -55.75859451293945, "logps/rejected": -55.846839904785156, "loss": 0.6873, "rewards/accuracies": 0.598437488079071, "rewards/chosen": -0.02684764564037323, "rewards/margins": 0.012402022257447243, "rewards/rejected": -0.039249666035175323, "step": 1420 }, { "epoch": 0.9855272226051, "grad_norm": 1.3626172542572021, "learning_rate": 2.9974945663574684e-08, "logits/chosen": -3.026280403137207, "logits/rejected": -3.0005555152893066, "logps/chosen": -57.986793518066406, "logps/rejected": -56.02061080932617, "loss": 0.684, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.02259395457804203, "rewards/margins": 0.019132796674966812, "rewards/rejected": -0.041726745665073395, "step": 1430 }, { "epoch": 0.9924190213645762, "grad_norm": 1.289384126663208, "learning_rate": 2.967980516942168e-08, "logits/chosen": -3.0657455921173096, "logits/rejected": -3.040670394897461, "logps/chosen": -58.47277069091797, "logps/rejected": -57.19081497192383, "loss": 0.6851, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.024709826335310936, "rewards/margins": 0.016781199723482132, "rewards/rejected": -0.04149102419614792, "step": 1440 }, { "epoch": 0.9993108201240524, "grad_norm": 1.288656234741211, "learning_rate": 2.9383987176461268e-08, "logits/chosen": -2.991293430328369, "logits/rejected": -2.9657962322235107, "logps/chosen": -54.06468963623047, "logps/rejected": -57.25890350341797, "loss": 0.6871, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.024288879707455635, "rewards/margins": 0.012961057014763355, "rewards/rejected": -0.037249937653541565, "step": 1450 }, { "epoch": 1.0062026188835287, "grad_norm": 1.3280855417251587, "learning_rate": 2.9087534510480032e-08, "logits/chosen": -3.045292377471924, "logits/rejected": -3.0192904472351074, "logps/chosen": -55.768096923828125, "logps/rejected": -56.869842529296875, "loss": 0.6848, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.02522682026028633, "rewards/margins": 0.017563799396157265, "rewards/rejected": -0.04279061779379845, "step": 1460 }, { "epoch": 1.0130944176430048, "grad_norm": 1.2552244663238525, "learning_rate": 2.879049008914664e-08, "logits/chosen": -3.015263080596924, "logits/rejected": -2.999004602432251, "logps/chosen": -55.86402130126953, "logps/rejected": -58.1766471862793, "loss": 0.6849, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.024612870067358017, "rewards/margins": 0.01742735505104065, "rewards/rejected": -0.04204022139310837, "step": 1470 }, { "epoch": 1.019986216402481, "grad_norm": 1.2557737827301025, "learning_rate": 2.8492896915798605e-08, "logits/chosen": -3.021721601486206, "logits/rejected": -3.0094776153564453, "logps/chosen": -56.1937141418457, "logps/rejected": -59.952537536621094, "loss": 0.6864, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.024868253618478775, "rewards/margins": 0.014400708489120007, "rewards/rejected": -0.039268963038921356, "step": 1480 }, { "epoch": 1.0268780151619572, "grad_norm": 1.3632835149765015, "learning_rate": 2.8194798073216665e-08, "logits/chosen": -2.9897549152374268, "logits/rejected": -2.9639222621917725, "logps/chosen": -56.546180725097656, "logps/rejected": -56.685096740722656, "loss": 0.6847, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.024227097630500793, "rewards/margins": 0.017609497532248497, "rewards/rejected": -0.04183660075068474, "step": 1490 }, { "epoch": 1.0337698139214335, "grad_norm": 1.359270691871643, "learning_rate": 2.7896236717387662e-08, "logits/chosen": -2.9973204135894775, "logits/rejected": -2.979785680770874, "logps/chosen": -55.88494110107422, "logps/rejected": -56.941490173339844, "loss": 0.6871, "rewards/accuracies": 0.614062488079071, "rewards/chosen": -0.029738834127783775, "rewards/margins": 0.012930555269122124, "rewards/rejected": -0.0426693893969059, "step": 1500 }, { "epoch": 1.0337698139214335, "eval_logits/chosen": -3.121001720428467, "eval_logits/rejected": -3.1152734756469727, "eval_logps/chosen": -59.26029968261719, "eval_logps/rejected": -64.31062316894531, "eval_loss": 0.6903930306434631, "eval_rewards/accuracies": 0.5824813842773438, "eval_rewards/chosen": -0.005484058987349272, "eval_rewards/margins": 0.005820916499942541, "eval_rewards/rejected": -0.011304975487291813, "eval_runtime": 383.1818, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 1500 }, { "epoch": 1.0406616126809096, "grad_norm": 1.1823673248291016, "learning_rate": 2.7597256071256836e-08, "logits/chosen": -3.0232627391815186, "logits/rejected": -3.002992630004883, "logps/chosen": -55.69109344482422, "logps/rejected": -55.20794677734375, "loss": 0.6864, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -0.029489045962691307, "rewards/margins": 0.014345052652060986, "rewards/rejected": -0.04383409768342972, "step": 1510 }, { "epoch": 1.047553411440386, "grad_norm": 1.2642569541931152, "learning_rate": 2.7297899418470372e-08, "logits/chosen": -2.9904372692108154, "logits/rejected": -2.9669933319091797, "logps/chosen": -59.17595672607422, "logps/rejected": -57.17033767700195, "loss": 0.685, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.027122503146529198, "rewards/margins": 0.017171606421470642, "rewards/rejected": -0.04429411143064499, "step": 1520 }, { "epoch": 1.0544452101998623, "grad_norm": 1.3126106262207031, "learning_rate": 2.6998210097109213e-08, "logits/chosen": -3.062737226486206, "logits/rejected": -3.041637659072876, "logps/chosen": -55.9976806640625, "logps/rejected": -57.3626823425293, "loss": 0.6843, "rewards/accuracies": 0.6546875238418579, "rewards/chosen": -0.0244468804448843, "rewards/margins": 0.018709514290094376, "rewards/rejected": -0.043156400322914124, "step": 1530 }, { "epoch": 1.0613370089593384, "grad_norm": 1.1926969289779663, "learning_rate": 2.669823149341498e-08, "logits/chosen": -3.0017895698547363, "logits/rejected": -2.9859423637390137, "logps/chosen": -55.688560485839844, "logps/rejected": -56.54026412963867, "loss": 0.6862, "rewards/accuracies": 0.620312511920929, "rewards/chosen": -0.02824980393052101, "rewards/margins": 0.01474563218653202, "rewards/rejected": -0.04299543425440788, "step": 1540 }, { "epoch": 1.0682288077188147, "grad_norm": 1.2355592250823975, "learning_rate": 2.6398007035508906e-08, "logits/chosen": -3.020545482635498, "logits/rejected": -2.9991073608398438, "logps/chosen": -60.19884490966797, "logps/rejected": -58.834068298339844, "loss": 0.6847, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02622285485267639, "rewards/margins": 0.01788989268243313, "rewards/rejected": -0.04411274939775467, "step": 1550 }, { "epoch": 1.0751206064782908, "grad_norm": 1.2842044830322266, "learning_rate": 2.609758018710473e-08, "logits/chosen": -3.0513670444488525, "logits/rejected": -3.0258631706237793, "logps/chosen": -58.162193298339844, "logps/rejected": -58.37177276611328, "loss": 0.6845, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.026162385940551758, "rewards/margins": 0.01831069216132164, "rewards/rejected": -0.0444730743765831, "step": 1560 }, { "epoch": 1.082012405237767, "grad_norm": 1.2734873294830322, "learning_rate": 2.5796994441216392e-08, "logits/chosen": -3.025871753692627, "logits/rejected": -3.008380174636841, "logps/chosen": -57.0385856628418, "logps/rejected": -57.39351272583008, "loss": 0.6851, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.027973037213087082, "rewards/margins": 0.017069904133677483, "rewards/rejected": -0.045042943209409714, "step": 1570 }, { "epoch": 1.0889042039972432, "grad_norm": 1.2615596055984497, "learning_rate": 2.5496293313861533e-08, "logits/chosen": -3.0057692527770996, "logits/rejected": -2.9775753021240234, "logps/chosen": -55.155418395996094, "logps/rejected": -56.9516716003418, "loss": 0.685, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02907036617398262, "rewards/margins": 0.01720438338816166, "rewards/rejected": -0.04627475142478943, "step": 1580 }, { "epoch": 1.0957960027567195, "grad_norm": 1.260473370552063, "learning_rate": 2.519552033776168e-08, "logits/chosen": -2.992969512939453, "logits/rejected": -2.9799506664276123, "logps/chosen": -57.432411193847656, "logps/rejected": -58.8470458984375, "loss": 0.6859, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.029672112315893173, "rewards/margins": 0.015414416790008545, "rewards/rejected": -0.04508653283119202, "step": 1590 }, { "epoch": 1.1026878015161956, "grad_norm": 1.3105090856552124, "learning_rate": 2.4894719056039933e-08, "logits/chosen": -3.059690237045288, "logits/rejected": -3.0416653156280518, "logps/chosen": -57.756431579589844, "logps/rejected": -58.930335998535156, "loss": 0.6863, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.02842426858842373, "rewards/margins": 0.014618475921452045, "rewards/rejected": -0.04304274171590805, "step": 1600 }, { "epoch": 1.1026878015161956, "eval_logits/chosen": -3.117943286895752, "eval_logits/rejected": -3.112224817276001, "eval_logps/chosen": -59.45923614501953, "eval_logps/rejected": -64.5576171875, "eval_loss": 0.6901748180389404, "eval_rewards/accuracies": 0.5887546539306641, "eval_rewards/chosen": -0.007473426405340433, "eval_rewards/margins": 0.00630148034542799, "eval_rewards/rejected": -0.013774906285107136, "eval_runtime": 383.3589, "eval_samples_per_second": 11.227, "eval_steps_per_second": 1.403, "step": 1600 }, { "epoch": 1.109579600275672, "grad_norm": 1.2811577320098877, "learning_rate": 2.459393301591723e-08, "logits/chosen": -3.0472471714019775, "logits/rejected": -3.0280234813690186, "logps/chosen": -56.1804313659668, "logps/rejected": -56.194740295410156, "loss": 0.6864, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.029790574684739113, "rewards/margins": 0.014425704255700111, "rewards/rejected": -0.044216278940439224, "step": 1610 }, { "epoch": 1.1164713990351482, "grad_norm": 1.2719690799713135, "learning_rate": 2.429320576240796e-08, "logits/chosen": -2.983424186706543, "logits/rejected": -2.960758686065674, "logps/chosen": -57.0593147277832, "logps/rejected": -57.68733596801758, "loss": 0.684, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -0.0267618540674448, "rewards/margins": 0.01918993890285492, "rewards/rejected": -0.04595179110765457, "step": 1620 }, { "epoch": 1.1233631977946243, "grad_norm": 1.274936556816101, "learning_rate": 2.3992580832015937e-08, "logits/chosen": -3.0748069286346436, "logits/rejected": -3.0516257286071777, "logps/chosen": -57.92633056640625, "logps/rejected": -57.392669677734375, "loss": 0.6851, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.028706436976790428, "rewards/margins": 0.016950782388448715, "rewards/rejected": -0.04565722122788429, "step": 1630 }, { "epoch": 1.1302549965541007, "grad_norm": 1.3350669145584106, "learning_rate": 2.3692101746431582e-08, "logits/chosen": -2.999372720718384, "logits/rejected": -2.9686479568481445, "logps/chosen": -57.523155212402344, "logps/rejected": -57.58971405029297, "loss": 0.6842, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -0.0288604237139225, "rewards/margins": 0.018762212246656418, "rewards/rejected": -0.04762263223528862, "step": 1640 }, { "epoch": 1.1371467953135768, "grad_norm": 1.1888097524642944, "learning_rate": 2.3391812006231252e-08, "logits/chosen": -3.051567792892456, "logits/rejected": -3.024486780166626, "logps/chosen": -57.35553741455078, "logps/rejected": -57.0074462890625, "loss": 0.6863, "rewards/accuracies": 0.5921875238418579, "rewards/chosen": -0.030893787741661072, "rewards/margins": 0.01476077176630497, "rewards/rejected": -0.04565456137061119, "step": 1650 }, { "epoch": 1.144038594073053, "grad_norm": 1.1799283027648926, "learning_rate": 2.3091755084579655e-08, "logits/chosen": -3.032055377960205, "logits/rejected": -3.004883289337158, "logps/chosen": -56.0220832824707, "logps/rejected": -55.85259246826172, "loss": 0.6844, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.033284805715084076, "rewards/margins": 0.018482720479369164, "rewards/rejected": -0.05176752805709839, "step": 1660 }, { "epoch": 1.1509303928325294, "grad_norm": 1.3657642602920532, "learning_rate": 2.2791974420936168e-08, "logits/chosen": -3.0568909645080566, "logits/rejected": -3.0396854877471924, "logps/chosen": -55.1065673828125, "logps/rejected": -58.755226135253906, "loss": 0.6845, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.03204982727766037, "rewards/margins": 0.018315287306904793, "rewards/rejected": -0.05036511272192001, "step": 1670 }, { "epoch": 1.1578221915920055, "grad_norm": 1.238609790802002, "learning_rate": 2.2492513414766092e-08, "logits/chosen": -3.015735626220703, "logits/rejected": -2.9980287551879883, "logps/chosen": -58.169593811035156, "logps/rejected": -59.08977127075195, "loss": 0.6831, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.027626004070043564, "rewards/margins": 0.0211968831717968, "rewards/rejected": -0.04882288724184036, "step": 1680 }, { "epoch": 1.1647139903514818, "grad_norm": 1.2068655490875244, "learning_rate": 2.2193415419257697e-08, "logits/chosen": -3.023995876312256, "logits/rejected": -3.0071871280670166, "logps/chosen": -57.2905158996582, "logps/rejected": -58.23944854736328, "loss": 0.6866, "rewards/accuracies": 0.5953124761581421, "rewards/chosen": -0.03379104658961296, "rewards/margins": 0.014122662134468555, "rewards/rejected": -0.04791371151804924, "step": 1690 }, { "epoch": 1.171605789110958, "grad_norm": 1.2340092658996582, "learning_rate": 2.1894723735045923e-08, "logits/chosen": -3.015665054321289, "logits/rejected": -2.995542526245117, "logps/chosen": -56.78801727294922, "logps/rejected": -58.15932083129883, "loss": 0.6854, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.034682370722293854, "rewards/margins": 0.01648074761033058, "rewards/rejected": -0.051163118332624435, "step": 1700 }, { "epoch": 1.171605789110958, "eval_logits/chosen": -3.11429500579834, "eval_logits/rejected": -3.1086244583129883, "eval_logps/chosen": -59.66813278198242, "eval_logps/rejected": -64.80902099609375, "eval_loss": 0.6899911165237427, "eval_rewards/accuracies": 0.586663544178009, "eval_rewards/chosen": -0.009562356397509575, "eval_rewards/margins": 0.006726610474288464, "eval_rewards/rejected": -0.016288965940475464, "eval_runtime": 383.0366, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.405, "step": 1700 }, { "epoch": 1.1784975878704342, "grad_norm": 1.361463189125061, "learning_rate": 2.159648160394373e-08, "logits/chosen": -3.077711582183838, "logits/rejected": -3.057525396347046, "logps/chosen": -58.2719612121582, "logps/rejected": -58.065284729003906, "loss": 0.6842, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.028816580772399902, "rewards/margins": 0.018897056579589844, "rewards/rejected": -0.047713637351989746, "step": 1710 }, { "epoch": 1.1853893866299103, "grad_norm": 1.315765619277954, "learning_rate": 2.1298732202681956e-08, "logits/chosen": -2.9981892108917236, "logits/rejected": -2.9703431129455566, "logps/chosen": -57.9752311706543, "logps/rejected": -57.5850944519043, "loss": 0.6835, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.03318381309509277, "rewards/margins": 0.02043316699564457, "rewards/rejected": -0.053616978228092194, "step": 1720 }, { "epoch": 1.1922811853893867, "grad_norm": 1.2448103427886963, "learning_rate": 2.1001518636658567e-08, "logits/chosen": -3.0659806728363037, "logits/rejected": -3.0397450923919678, "logps/chosen": -57.8054313659668, "logps/rejected": -58.7362174987793, "loss": 0.684, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.030917003750801086, "rewards/margins": 0.019438743591308594, "rewards/rejected": -0.05035575106739998, "step": 1730 }, { "epoch": 1.1991729841488628, "grad_norm": 1.3165340423583984, "learning_rate": 2.0704883933698286e-08, "logits/chosen": -3.0220000743865967, "logits/rejected": -2.988614559173584, "logps/chosen": -59.221092224121094, "logps/rejected": -56.499412536621094, "loss": 0.6833, "rewards/accuracies": 0.640625, "rewards/chosen": -0.03028092160820961, "rewards/margins": 0.020808402448892593, "rewards/rejected": -0.0510893277823925, "step": 1740 }, { "epoch": 1.206064782908339, "grad_norm": 1.2597706317901611, "learning_rate": 2.0408871037823392e-08, "logits/chosen": -3.050657033920288, "logits/rejected": -3.0249600410461426, "logps/chosen": -58.73793411254883, "logps/rejected": -58.411109924316406, "loss": 0.6832, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.032266996800899506, "rewards/margins": 0.020968910306692123, "rewards/rejected": -0.05323590710759163, "step": 1750 }, { "epoch": 1.2129565816678154, "grad_norm": 1.3488494157791138, "learning_rate": 2.0113522803036697e-08, "logits/chosen": -3.0145840644836426, "logits/rejected": -2.9919447898864746, "logps/chosen": -57.56928253173828, "logps/rejected": -59.618064880371094, "loss": 0.6833, "rewards/accuracies": 0.614062488079071, "rewards/chosen": -0.033350322395563126, "rewards/margins": 0.020749535411596298, "rewards/rejected": -0.054099857807159424, "step": 1760 }, { "epoch": 1.2198483804272915, "grad_norm": 1.2959116697311401, "learning_rate": 1.981888198711757e-08, "logits/chosen": -3.0438103675842285, "logits/rejected": -3.0201594829559326, "logps/chosen": -56.78424072265625, "logps/rejected": -58.80998992919922, "loss": 0.6845, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.03599938377737999, "rewards/margins": 0.018340986222028732, "rewards/rejected": -0.05434036999940872, "step": 1770 }, { "epoch": 1.2267401791867678, "grad_norm": 1.3248341083526611, "learning_rate": 1.952499124543181e-08, "logits/chosen": -3.050183057785034, "logits/rejected": -3.0215609073638916, "logps/chosen": -59.94126510620117, "logps/rejected": -58.557289123535156, "loss": 0.6838, "rewards/accuracies": 0.614062488079071, "rewards/chosen": -0.03428710997104645, "rewards/margins": 0.019783692434430122, "rewards/rejected": -0.05407080054283142, "step": 1780 }, { "epoch": 1.233631977946244, "grad_norm": 1.2419155836105347, "learning_rate": 1.923189312475642e-08, "logits/chosen": -3.0126285552978516, "logits/rejected": -2.989089250564575, "logps/chosen": -57.96059036254883, "logps/rejected": -58.968994140625, "loss": 0.6844, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.035335466265678406, "rewards/margins": 0.01862800493836403, "rewards/rejected": -0.05396346375346184, "step": 1790 }, { "epoch": 1.2405237767057202, "grad_norm": 1.3300213813781738, "learning_rate": 1.8939630057120098e-08, "logits/chosen": -3.000619888305664, "logits/rejected": -2.9799740314483643, "logps/chosen": -58.02915573120117, "logps/rejected": -60.03089141845703, "loss": 0.6855, "rewards/accuracies": 0.614062488079071, "rewards/chosen": -0.03870057314634323, "rewards/margins": 0.01641600951552391, "rewards/rejected": -0.05511658638715744, "step": 1800 }, { "epoch": 1.2405237767057202, "eval_logits/chosen": -3.11264705657959, "eval_logits/rejected": -3.1069631576538086, "eval_logps/chosen": -59.91139221191406, "eval_logps/rejected": -65.09736633300781, "eval_loss": 0.6897966265678406, "eval_rewards/accuracies": 0.582713782787323, "eval_rewards/chosen": -0.01199500635266304, "eval_rewards/margins": 0.007177378050982952, "eval_rewards/rejected": -0.019172383472323418, "eval_runtime": 383.0708, "eval_samples_per_second": 11.236, "eval_steps_per_second": 1.404, "step": 1800 }, { "epoch": 1.2474155754651963, "grad_norm": 1.2643280029296875, "learning_rate": 1.8648244353660288e-08, "logits/chosen": -3.0149238109588623, "logits/rejected": -2.9926235675811768, "logps/chosen": -59.53852081298828, "logps/rejected": -58.4305419921875, "loss": 0.6858, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.03731811046600342, "rewards/margins": 0.015873271971940994, "rewards/rejected": -0.05319138243794441, "step": 1810 }, { "epoch": 1.2543073742246726, "grad_norm": 1.353582501411438, "learning_rate": 1.835777819849779e-08, "logits/chosen": -3.084817886352539, "logits/rejected": -3.052018880844116, "logps/chosen": -58.27227783203125, "logps/rejected": -58.49510955810547, "loss": 0.6821, "rewards/accuracies": 0.676562488079071, "rewards/chosen": -0.034875739365816116, "rewards/margins": 0.023144185543060303, "rewards/rejected": -0.05801992490887642, "step": 1820 }, { "epoch": 1.2611991729841487, "grad_norm": 1.3098019361495972, "learning_rate": 1.806827364262974e-08, "logits/chosen": -2.970393657684326, "logits/rejected": -2.9504239559173584, "logps/chosen": -57.546607971191406, "logps/rejected": -58.0165901184082, "loss": 0.687, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0393044538795948, "rewards/margins": 0.013580908067524433, "rewards/rejected": -0.05288536101579666, "step": 1830 }, { "epoch": 1.268090971743625, "grad_norm": 1.2913509607315063, "learning_rate": 1.7779772597841818e-08, "logits/chosen": -3.0347402095794678, "logits/rejected": -3.008613109588623, "logps/chosen": -58.36812210083008, "logps/rejected": -58.02600860595703, "loss": 0.6827, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.03452888876199722, "rewards/margins": 0.022128187119960785, "rewards/rejected": -0.05665707588195801, "step": 1840 }, { "epoch": 1.2749827705031014, "grad_norm": 1.252109169960022, "learning_rate": 1.749231683064069e-08, "logits/chosen": -2.9613466262817383, "logits/rejected": -2.9358131885528564, "logps/chosen": -57.309776306152344, "logps/rejected": -57.78460693359375, "loss": 0.6849, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.03900003433227539, "rewards/margins": 0.017551228404045105, "rewards/rejected": -0.056551266461610794, "step": 1850 }, { "epoch": 1.2818745692625775, "grad_norm": 1.3581938743591309, "learning_rate": 1.7205947956207416e-08, "logits/chosen": -2.9560749530792236, "logits/rejected": -2.9271953105926514, "logps/chosen": -58.55373001098633, "logps/rejected": -58.74352264404297, "loss": 0.6828, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.03720385953783989, "rewards/margins": 0.021904241293668747, "rewards/rejected": -0.059108100831508636, "step": 1860 }, { "epoch": 1.2887663680220538, "grad_norm": 1.377907395362854, "learning_rate": 1.69207074323728e-08, "logits/chosen": -3.007751703262329, "logits/rejected": -2.990427255630493, "logps/chosen": -57.58440017700195, "logps/rejected": -57.02080154418945, "loss": 0.685, "rewards/accuracies": 0.598437488079071, "rewards/chosen": -0.0374065637588501, "rewards/margins": 0.01753416657447815, "rewards/rejected": -0.05494073033332825, "step": 1870 }, { "epoch": 1.29565816678153, "grad_norm": 1.3684296607971191, "learning_rate": 1.6636636553615502e-08, "logits/chosen": -2.969104290008545, "logits/rejected": -2.9459190368652344, "logps/chosen": -57.26690673828125, "logps/rejected": -58.23255157470703, "loss": 0.6839, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.03711014613509178, "rewards/margins": 0.019704418256878853, "rewards/rejected": -0.056814562529325485, "step": 1880 }, { "epoch": 1.3025499655410062, "grad_norm": 1.2850284576416016, "learning_rate": 1.6353776445083815e-08, "logits/chosen": -3.0240912437438965, "logits/rejected": -3.0120110511779785, "logps/chosen": -55.608795166015625, "logps/rejected": -58.87140655517578, "loss": 0.6839, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.03577902913093567, "rewards/margins": 0.019716601818799973, "rewards/rejected": -0.05549562722444534, "step": 1890 }, { "epoch": 1.3094417643004825, "grad_norm": 1.2016737461090088, "learning_rate": 1.6072168056641944e-08, "logits/chosen": -3.0512993335723877, "logits/rejected": -3.0232746601104736, "logps/chosen": -59.6502571105957, "logps/rejected": -57.75080490112305, "loss": 0.6824, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.03732553869485855, "rewards/margins": 0.02288132533431053, "rewards/rejected": -0.06020686775445938, "step": 1900 }, { "epoch": 1.3094417643004825, "eval_logits/chosen": -3.109053134918213, "eval_logits/rejected": -3.1033873558044434, "eval_logps/chosen": -60.10012435913086, "eval_logps/rejected": -65.30886840820312, "eval_loss": 0.6897038221359253, "eval_rewards/accuracies": 0.5824813842773438, "eval_rewards/chosen": -0.013882317580282688, "eval_rewards/margins": 0.007405092474073172, "eval_rewards/rejected": -0.02128741145133972, "eval_runtime": 383.365, "eval_samples_per_second": 11.227, "eval_steps_per_second": 1.403, "step": 1900 }, { "epoch": 1.3163335630599586, "grad_norm": 1.3704478740692139, "learning_rate": 1.5791852156941672e-08, "logits/chosen": -2.9737439155578613, "logits/rejected": -2.9562785625457764, "logps/chosen": -58.3253173828125, "logps/rejected": -58.146751403808594, "loss": 0.685, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.03781484067440033, "rewards/margins": 0.017499810084700584, "rewards/rejected": -0.05531464144587517, "step": 1910 }, { "epoch": 1.323225361819435, "grad_norm": 1.304793119430542, "learning_rate": 1.5512869327520234e-08, "logits/chosen": -3.0345849990844727, "logits/rejected": -3.006624937057495, "logps/chosen": -59.780555725097656, "logps/rejected": -59.3910026550293, "loss": 0.6828, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.038787275552749634, "rewards/margins": 0.022125843912363052, "rewards/rejected": -0.060913123190402985, "step": 1920 }, { "epoch": 1.330117160578911, "grad_norm": 1.281746506690979, "learning_rate": 1.52352599569253e-08, "logits/chosen": -3.0547759532928467, "logits/rejected": -3.0221850872039795, "logps/chosen": -57.975791931152344, "logps/rejected": -56.89446258544922, "loss": 0.6843, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.03873300552368164, "rewards/margins": 0.018973171710968018, "rewards/rejected": -0.05770616978406906, "step": 1930 }, { "epoch": 1.3370089593383874, "grad_norm": 1.3232277631759644, "learning_rate": 1.4959064234867925e-08, "logits/chosen": -2.9585065841674805, "logits/rejected": -2.936213970184326, "logps/chosen": -56.48392868041992, "logps/rejected": -58.73712158203125, "loss": 0.6824, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.03964737430214882, "rewards/margins": 0.022885087877511978, "rewards/rejected": -0.0625324696302414, "step": 1940 }, { "epoch": 1.3439007580978635, "grad_norm": 1.266557216644287, "learning_rate": 1.4684322146404215e-08, "logits/chosen": -3.035268783569336, "logits/rejected": -3.01952862739563, "logps/chosen": -56.704620361328125, "logps/rejected": -59.3856086730957, "loss": 0.6851, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.04403103142976761, "rewards/margins": 0.017256928607821465, "rewards/rejected": -0.061287958174943924, "step": 1950 }, { "epoch": 1.3507925568573398, "grad_norm": 1.2548446655273438, "learning_rate": 1.4411073466146698e-08, "logits/chosen": -3.0059549808502197, "logits/rejected": -2.9832520484924316, "logps/chosen": -58.505836486816406, "logps/rejected": -60.24690628051758, "loss": 0.6838, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.03818178176879883, "rewards/margins": 0.01996403932571411, "rewards/rejected": -0.058145828545093536, "step": 1960 }, { "epoch": 1.3576843556168159, "grad_norm": 1.3594894409179688, "learning_rate": 1.413935775250609e-08, "logits/chosen": -2.9915037155151367, "logits/rejected": -2.9657304286956787, "logps/chosen": -58.22015380859375, "logps/rejected": -58.225196838378906, "loss": 0.6821, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03904888778924942, "rewards/margins": 0.023439262062311172, "rewards/rejected": -0.06248814985156059, "step": 1970 }, { "epoch": 1.3645761543762922, "grad_norm": 1.305829644203186, "learning_rate": 1.3869214341964411e-08, "logits/chosen": -2.9901621341705322, "logits/rejected": -2.972623825073242, "logps/chosen": -58.67400360107422, "logps/rejected": -59.30767059326172, "loss": 0.6854, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0412491112947464, "rewards/margins": 0.01679963245987892, "rewards/rejected": -0.058048736304044724, "step": 1980 }, { "epoch": 1.3714679531357685, "grad_norm": 1.2267251014709473, "learning_rate": 1.3600682343380247e-08, "logits/chosen": -2.953930616378784, "logits/rejected": -2.9335622787475586, "logps/chosen": -58.66155242919922, "logps/rejected": -59.33677291870117, "loss": 0.6835, "rewards/accuracies": 0.609375, "rewards/chosen": -0.039415620267391205, "rewards/margins": 0.020520631223917007, "rewards/rejected": -0.05993625521659851, "step": 1990 }, { "epoch": 1.3783597518952446, "grad_norm": 1.2556020021438599, "learning_rate": 1.3333800632326858e-08, "logits/chosen": -3.0334630012512207, "logits/rejected": -3.0166397094726562, "logps/chosen": -58.55223846435547, "logps/rejected": -59.941978454589844, "loss": 0.6851, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.0418681763112545, "rewards/margins": 0.017354335635900497, "rewards/rejected": -0.0592225082218647, "step": 2000 }, { "epoch": 1.3783597518952446, "eval_logits/chosen": -3.1071271896362305, "eval_logits/rejected": -3.101414680480957, "eval_logps/chosen": -60.261566162109375, "eval_logps/rejected": -65.51657104492188, "eval_loss": 0.6894936561584473, "eval_rewards/accuracies": 0.5906133651733398, "eval_rewards/chosen": -0.015496725216507912, "eval_rewards/margins": 0.007867763750255108, "eval_rewards/rejected": -0.023364488035440445, "eval_runtime": 383.0695, "eval_samples_per_second": 11.236, "eval_steps_per_second": 1.404, "step": 2000 }, { "epoch": 1.385251550654721, "grad_norm": 1.4159228801727295, "learning_rate": 1.3068607845464202e-08, "logits/chosen": -2.9797048568725586, "logits/rejected": -2.952303171157837, "logps/chosen": -59.8831901550293, "logps/rejected": -59.22021484375, "loss": 0.6842, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.041412778198719025, "rewards/margins": 0.0192607082426548, "rewards/rejected": -0.06067349389195442, "step": 2010 }, { "epoch": 1.392143349414197, "grad_norm": 1.3155369758605957, "learning_rate": 1.2805142374945437e-08, "logits/chosen": -3.0014488697052, "logits/rejected": -2.978201389312744, "logps/chosen": -57.95537185668945, "logps/rejected": -59.4213752746582, "loss": 0.6827, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.03732657432556152, "rewards/margins": 0.022235842421650887, "rewards/rejected": -0.05956241488456726, "step": 2020 }, { "epoch": 1.3990351481736734, "grad_norm": 1.2982782125473022, "learning_rate": 1.254344236285888e-08, "logits/chosen": -2.984819173812866, "logits/rejected": -2.9616565704345703, "logps/chosen": -58.2531623840332, "logps/rejected": -59.219261169433594, "loss": 0.684, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.039121102541685104, "rewards/margins": 0.019388314336538315, "rewards/rejected": -0.05850941687822342, "step": 2030 }, { "epoch": 1.4059269469331497, "grad_norm": 1.328587532043457, "learning_rate": 1.2283545695706135e-08, "logits/chosen": -2.9852476119995117, "logits/rejected": -2.9641222953796387, "logps/chosen": -58.166831970214844, "logps/rejected": -58.347557067871094, "loss": 0.6842, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.042398639023303986, "rewards/margins": 0.019147690385580063, "rewards/rejected": -0.06154633313417435, "step": 2040 }, { "epoch": 1.4128187456926258, "grad_norm": 1.2532403469085693, "learning_rate": 1.2025489998917254e-08, "logits/chosen": -3.011920690536499, "logits/rejected": -2.985671043395996, "logps/chosen": -61.16510009765625, "logps/rejected": -59.185394287109375, "loss": 0.6842, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.03920884430408478, "rewards/margins": 0.019202571362257004, "rewards/rejected": -0.058411408215761185, "step": 2050 }, { "epoch": 1.4197105444521019, "grad_norm": 1.379606008529663, "learning_rate": 1.1769312631403659e-08, "logits/chosen": -2.9937710762023926, "logits/rejected": -2.9706432819366455, "logps/chosen": -59.001708984375, "logps/rejected": -58.9688720703125, "loss": 0.6849, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0409664623439312, "rewards/margins": 0.017764370888471603, "rewards/rejected": -0.0587308332324028, "step": 2060 }, { "epoch": 1.4266023432115782, "grad_norm": 1.3206267356872559, "learning_rate": 1.1515050680149687e-08, "logits/chosen": -3.0447440147399902, "logits/rejected": -3.0277929306030273, "logps/chosen": -59.411956787109375, "logps/rejected": -60.502716064453125, "loss": 0.6859, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.04452786594629288, "rewards/margins": 0.01587734930217266, "rewards/rejected": -0.06040521338582039, "step": 2070 }, { "epoch": 1.4334941419710545, "grad_norm": 1.2896159887313843, "learning_rate": 1.1262740954843456e-08, "logits/chosen": -3.0021471977233887, "logits/rejected": -2.971998691558838, "logps/chosen": -60.27617645263672, "logps/rejected": -59.302833557128906, "loss": 0.6832, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.03701635077595711, "rewards/margins": 0.021184273064136505, "rewards/rejected": -0.058200620114803314, "step": 2080 }, { "epoch": 1.4403859407305306, "grad_norm": 1.326338768005371, "learning_rate": 1.1012419982547905e-08, "logits/chosen": -2.961845636367798, "logits/rejected": -2.941849946975708, "logps/chosen": -57.25081253051758, "logps/rejected": -59.027015686035156, "loss": 0.684, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.04554927721619606, "rewards/margins": 0.019669197499752045, "rewards/rejected": -0.0652184709906578, "step": 2090 }, { "epoch": 1.447277739490007, "grad_norm": 1.316919207572937, "learning_rate": 1.0764124002412758e-08, "logits/chosen": -3.0356943607330322, "logits/rejected": -3.012575626373291, "logps/chosen": -56.51226806640625, "logps/rejected": -59.925514221191406, "loss": 0.6834, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.04187322035431862, "rewards/margins": 0.02075764164328575, "rewards/rejected": -0.06263085454702377, "step": 2100 }, { "epoch": 1.447277739490007, "eval_logits/chosen": -3.1055357456207275, "eval_logits/rejected": -3.099832534790039, "eval_logps/chosen": -60.38422775268555, "eval_logps/rejected": -65.65011596679688, "eval_loss": 0.6894546151161194, "eval_rewards/accuracies": 0.5861988663673401, "eval_rewards/chosen": -0.01672333851456642, "eval_rewards/margins": 0.00797655712813139, "eval_rewards/rejected": -0.024699894711375237, "eval_runtime": 383.1595, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 2100 }, { "epoch": 1.454169538249483, "grad_norm": 1.3180962800979614, "learning_rate": 1.0517888960428139e-08, "logits/chosen": -2.964921236038208, "logits/rejected": -2.94468355178833, "logps/chosen": -58.661651611328125, "logps/rejected": -59.41533660888672, "loss": 0.6821, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -0.042625896632671356, "rewards/margins": 0.023367973044514656, "rewards/rejected": -0.06599386781454086, "step": 2110 }, { "epoch": 1.4610613370089593, "grad_norm": 1.2736924886703491, "learning_rate": 1.0273750504220666e-08, "logits/chosen": -2.982936382293701, "logits/rejected": -2.9598629474639893, "logps/chosen": -56.829505920410156, "logps/rejected": -59.5037956237793, "loss": 0.6822, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.04348963871598244, "rewards/margins": 0.023124249652028084, "rewards/rejected": -0.06661389768123627, "step": 2120 }, { "epoch": 1.4679531357684357, "grad_norm": 1.305767297744751, "learning_rate": 1.003174397789269e-08, "logits/chosen": -2.981260299682617, "logits/rejected": -2.9577889442443848, "logps/chosen": -59.381996154785156, "logps/rejected": -59.3449821472168, "loss": 0.6821, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.04265505075454712, "rewards/margins": 0.023621436208486557, "rewards/rejected": -0.06627649068832397, "step": 2130 }, { "epoch": 1.4748449345279118, "grad_norm": 1.2957626581192017, "learning_rate": 9.791904416905526e-09, "logits/chosen": -3.0431559085845947, "logits/rejected": -3.0270159244537354, "logps/chosen": -58.642250061035156, "logps/rejected": -59.5418586730957, "loss": 0.6855, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.044856615364551544, "rewards/margins": 0.016749290749430656, "rewards/rejected": -0.06160591170191765, "step": 2140 }, { "epoch": 1.481736733287388, "grad_norm": 1.3108528852462769, "learning_rate": 9.554266543007328e-09, "logits/chosen": -3.007779598236084, "logits/rejected": -2.9805774688720703, "logps/chosen": -58.5262565612793, "logps/rejected": -59.313941955566406, "loss": 0.6809, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.0379471480846405, "rewards/margins": 0.02591646835207939, "rewards/rejected": -0.0638636127114296, "step": 2150 }, { "epoch": 1.4886285320468642, "grad_norm": 1.2914735078811646, "learning_rate": 9.318864759206429e-09, "logits/chosen": -2.9647016525268555, "logits/rejected": -2.9434256553649902, "logps/chosen": -58.31499481201172, "logps/rejected": -57.80283737182617, "loss": 0.6842, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -0.04223569482564926, "rewards/margins": 0.0191799309104681, "rewards/rejected": -0.061415620148181915, "step": 2160 }, { "epoch": 1.4955203308063405, "grad_norm": 1.3237831592559814, "learning_rate": 9.085733144790744e-09, "logits/chosen": -3.0011582374572754, "logits/rejected": -2.9859848022460938, "logps/chosen": -57.24296188354492, "logps/rejected": -59.595008850097656, "loss": 0.6826, "rewards/accuracies": 0.625, "rewards/chosen": -0.04495619237422943, "rewards/margins": 0.022577274590730667, "rewards/rejected": -0.0675334706902504, "step": 2170 }, { "epoch": 1.5024121295658168, "grad_norm": 1.409790277481079, "learning_rate": 8.854905450394113e-09, "logits/chosen": -3.0098440647125244, "logits/rejected": -2.983942747116089, "logps/chosen": -59.406158447265625, "logps/rejected": -58.25774002075195, "loss": 0.6837, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.04101915657520294, "rewards/margins": 0.020175766199827194, "rewards/rejected": -0.06119491904973984, "step": 2180 }, { "epoch": 1.509303928325293, "grad_norm": 1.3202848434448242, "learning_rate": 8.626415093110202e-09, "logits/chosen": -2.9948554039001465, "logits/rejected": -2.975142240524292, "logps/chosen": -56.601722717285156, "logps/rejected": -59.769569396972656, "loss": 0.6834, "rewards/accuracies": 0.625, "rewards/chosen": -0.04384131729602814, "rewards/margins": 0.020800283178687096, "rewards/rejected": -0.06464160233736038, "step": 2190 }, { "epoch": 1.516195727084769, "grad_norm": 1.366294503211975, "learning_rate": 8.40029515165467e-09, "logits/chosen": -3.006235361099243, "logits/rejected": -2.983431577682495, "logps/chosen": -57.9134635925293, "logps/rejected": -58.527076721191406, "loss": 0.6828, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.04390479251742363, "rewards/margins": 0.022031091153621674, "rewards/rejected": -0.065935879945755, "step": 2200 }, { "epoch": 1.516195727084769, "eval_logits/chosen": -3.104142904281616, "eval_logits/rejected": -3.098437547683716, "eval_logps/chosen": -60.504878997802734, "eval_logps/rejected": -65.79142761230469, "eval_loss": 0.6893645524978638, "eval_rewards/accuracies": 0.5873606204986572, "eval_rewards/chosen": -0.01792982593178749, "eval_rewards/margins": 0.008183243684470654, "eval_rewards/rejected": -0.02611307054758072, "eval_runtime": 382.8386, "eval_samples_per_second": 11.242, "eval_steps_per_second": 1.405, "step": 2200 }, { "epoch": 1.5230875258442453, "grad_norm": 1.290756106376648, "learning_rate": 8.176578361576358e-09, "logits/chosen": -2.9781322479248047, "logits/rejected": -2.959134340286255, "logps/chosen": -56.6655387878418, "logps/rejected": -58.62583541870117, "loss": 0.6848, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.04583312198519707, "rewards/margins": 0.01810682937502861, "rewards/rejected": -0.06393995136022568, "step": 2210 }, { "epoch": 1.5299793246037217, "grad_norm": 1.2776232957839966, "learning_rate": 7.955297110518117e-09, "logits/chosen": -3.0543761253356934, "logits/rejected": -3.028890371322632, "logps/chosen": -58.1224365234375, "logps/rejected": -59.595802307128906, "loss": 0.684, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.04209282249212265, "rewards/margins": 0.019542943686246872, "rewards/rejected": -0.06163576990365982, "step": 2220 }, { "epoch": 1.5368711233631978, "grad_norm": 1.349440574645996, "learning_rate": 7.73648343352806e-09, "logits/chosen": -3.0225765705108643, "logits/rejected": -2.996718168258667, "logps/chosen": -59.868263244628906, "logps/rejected": -58.85590744018555, "loss": 0.6824, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.043428223580121994, "rewards/margins": 0.022792860865592957, "rewards/rejected": -0.06622108817100525, "step": 2230 }, { "epoch": 1.5437629221226739, "grad_norm": 1.3181602954864502, "learning_rate": 7.520169008421775e-09, "logits/chosen": -2.999849796295166, "logits/rejected": -2.9812533855438232, "logps/chosen": -59.130516052246094, "logps/rejected": -60.17681884765625, "loss": 0.6842, "rewards/accuracies": 0.625, "rewards/chosen": -0.04423438385128975, "rewards/margins": 0.019376900047063828, "rewards/rejected": -0.06361128389835358, "step": 2240 }, { "epoch": 1.5506547208821502, "grad_norm": 1.389096975326538, "learning_rate": 7.3063851511963535e-09, "logits/chosen": -3.01939058303833, "logits/rejected": -2.992645740509033, "logps/chosen": -59.214324951171875, "logps/rejected": -59.09142303466797, "loss": 0.6823, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.04182130843400955, "rewards/margins": 0.023091908544301987, "rewards/rejected": -0.06491322070360184, "step": 2250 }, { "epoch": 1.5575465196416265, "grad_norm": 1.2921773195266724, "learning_rate": 7.095162811496716e-09, "logits/chosen": -2.9625678062438965, "logits/rejected": -2.947840452194214, "logps/chosen": -57.61260986328125, "logps/rejected": -58.932640075683594, "loss": 0.6843, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.046415556222200394, "rewards/margins": 0.01893490180373192, "rewards/rejected": -0.06535045802593231, "step": 2260 }, { "epoch": 1.5644383184011028, "grad_norm": 1.2538701295852661, "learning_rate": 6.886532568135017e-09, "logits/chosen": -2.9978413581848145, "logits/rejected": -2.98121976852417, "logps/chosen": -58.2430305480957, "logps/rejected": -60.4543571472168, "loss": 0.6838, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.044888339936733246, "rewards/margins": 0.019984986633062363, "rewards/rejected": -0.06487332284450531, "step": 2270 }, { "epoch": 1.571330117160579, "grad_norm": 1.3960515260696411, "learning_rate": 6.680524624663763e-09, "logits/chosen": -3.0089173316955566, "logits/rejected": -2.977341413497925, "logps/chosen": -60.883209228515625, "logps/rejected": -59.40845489501953, "loss": 0.6812, "rewards/accuracies": 0.6234375238418579, "rewards/chosen": -0.03985728323459625, "rewards/margins": 0.025174889713525772, "rewards/rejected": -0.06503216922283173, "step": 2280 }, { "epoch": 1.578221915920055, "grad_norm": 1.1940710544586182, "learning_rate": 6.477168805003166e-09, "logits/chosen": -3.00933575630188, "logits/rejected": -2.982250690460205, "logps/chosen": -59.1096076965332, "logps/rejected": -59.2675895690918, "loss": 0.6819, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.04165520519018173, "rewards/margins": 0.023930717259645462, "rewards/rejected": -0.06558592617511749, "step": 2290 }, { "epoch": 1.5851137146795313, "grad_norm": 1.3209586143493652, "learning_rate": 6.276494549123546e-09, "logits/chosen": -3.050356388092041, "logits/rejected": -3.02972412109375, "logps/chosen": -58.97772979736328, "logps/rejected": -59.48607635498047, "loss": 0.6833, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.04832325503230095, "rewards/margins": 0.021100293844938278, "rewards/rejected": -0.06942354887723923, "step": 2300 }, { "epoch": 1.5851137146795313, "eval_logits/chosen": -3.1029651165008545, "eval_logits/rejected": -3.097285509109497, "eval_logps/chosen": -60.59327697753906, "eval_logps/rejected": -65.90728759765625, "eval_loss": 0.6892400979995728, "eval_rewards/accuracies": 0.5901486873626709, "eval_rewards/chosen": -0.018813807517290115, "eval_rewards/margins": 0.008457801304757595, "eval_rewards/rejected": -0.027271609753370285, "eval_runtime": 383.2974, "eval_samples_per_second": 11.229, "eval_steps_per_second": 1.404, "step": 2300 }, { "epoch": 1.5920055134390076, "grad_norm": 1.2838672399520874, "learning_rate": 6.078530908783283e-09, "logits/chosen": -2.946258068084717, "logits/rejected": -2.9289188385009766, "logps/chosen": -57.40240478515625, "logps/rejected": -58.80238723754883, "loss": 0.6837, "rewards/accuracies": 0.609375, "rewards/chosen": -0.04573064297437668, "rewards/margins": 0.020323526114225388, "rewards/rejected": -0.06605416536331177, "step": 2310 }, { "epoch": 1.598897312198484, "grad_norm": 1.3529164791107178, "learning_rate": 5.883306543322963e-09, "logits/chosen": -3.0067434310913086, "logits/rejected": -2.983191967010498, "logps/chosen": -57.39630126953125, "logps/rejected": -58.9193000793457, "loss": 0.6836, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.045682333409786224, "rewards/margins": 0.020494289696216583, "rewards/rejected": -0.06617662310600281, "step": 2320 }, { "epoch": 1.60578911095796, "grad_norm": 1.3721948862075806, "learning_rate": 5.690849715516346e-09, "logits/chosen": -2.9921982288360596, "logits/rejected": -2.972947597503662, "logps/chosen": -58.18434524536133, "logps/rejected": -59.79640579223633, "loss": 0.684, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.049553245306015015, "rewards/margins": 0.019603563472628593, "rewards/rejected": -0.06915681809186935, "step": 2330 }, { "epoch": 1.6126809097174362, "grad_norm": 1.326244831085205, "learning_rate": 5.50118828747877e-09, "logits/chosen": -3.013467311859131, "logits/rejected": -2.985992908477783, "logps/chosen": -59.114105224609375, "logps/rejected": -59.962989807128906, "loss": 0.6816, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.04709188640117645, "rewards/margins": 0.024677757173776627, "rewards/rejected": -0.07176963984966278, "step": 2340 }, { "epoch": 1.6195727084769125, "grad_norm": 1.3307464122772217, "learning_rate": 5.314349716633484e-09, "logits/chosen": -2.999783515930176, "logits/rejected": -2.978919744491577, "logps/chosen": -58.51006317138672, "logps/rejected": -59.8082389831543, "loss": 0.6859, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.04656077176332474, "rewards/margins": 0.015919920057058334, "rewards/rejected": -0.06248069554567337, "step": 2350 }, { "epoch": 1.6264645072363888, "grad_norm": 1.3505630493164062, "learning_rate": 5.130361051736656e-09, "logits/chosen": -2.992077589035034, "logits/rejected": -2.9786789417266846, "logps/chosen": -57.856048583984375, "logps/rejected": -58.31081008911133, "loss": 0.685, "rewards/accuracies": 0.59375, "rewards/chosen": -0.05111172795295715, "rewards/margins": 0.01770883984863758, "rewards/rejected": -0.06882055848836899, "step": 2360 }, { "epoch": 1.633356305995865, "grad_norm": 1.3144451379776, "learning_rate": 4.9492489289614884e-09, "logits/chosen": -2.9724109172821045, "logits/rejected": -2.9529943466186523, "logps/chosen": -58.423919677734375, "logps/rejected": -59.29913330078125, "loss": 0.6839, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.053545523434877396, "rewards/margins": 0.019974233582615852, "rewards/rejected": -0.0735197439789772, "step": 2370 }, { "epoch": 1.640248104755341, "grad_norm": 1.3916033506393433, "learning_rate": 4.771039568042076e-09, "logits/chosen": -3.004544734954834, "logits/rejected": -2.988704204559326, "logps/chosen": -57.34346389770508, "logps/rejected": -61.55018997192383, "loss": 0.683, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -0.04598530754446983, "rewards/margins": 0.021662291139364243, "rewards/rejected": -0.06764759868383408, "step": 2380 }, { "epoch": 1.6471399035148173, "grad_norm": 1.4496431350708008, "learning_rate": 4.595758768477576e-09, "logits/chosen": -3.0240254402160645, "logits/rejected": -3.011583089828491, "logps/chosen": -58.51326370239258, "logps/rejected": -60.47749710083008, "loss": 0.6844, "rewards/accuracies": 0.5921875238418579, "rewards/chosen": -0.05048090219497681, "rewards/margins": 0.01893479749560356, "rewards/rejected": -0.06941570341587067, "step": 2390 }, { "epoch": 1.6540317022742936, "grad_norm": 1.3277703523635864, "learning_rate": 4.423431905797162e-09, "logits/chosen": -3.039842128753662, "logits/rejected": -3.0183472633361816, "logps/chosen": -58.69083786010742, "logps/rejected": -60.8518180847168, "loss": 0.6835, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -0.04716577008366585, "rewards/margins": 0.020657068118453026, "rewards/rejected": -0.06782282888889313, "step": 2400 }, { "epoch": 1.6540317022742936, "eval_logits/chosen": -3.1017863750457764, "eval_logits/rejected": -3.09609055519104, "eval_logps/chosen": -60.646873474121094, "eval_logps/rejected": -65.97390747070312, "eval_loss": 0.6891800761222839, "eval_rewards/accuracies": 0.5861988663673401, "eval_rewards/chosen": -0.019349750131368637, "eval_rewards/margins": 0.008588053286075592, "eval_rewards/rejected": -0.027937807142734528, "eval_runtime": 383.0908, "eval_samples_per_second": 11.235, "eval_steps_per_second": 1.404, "step": 2400 }, { "epoch": 1.66092350103377, "grad_norm": 1.3616927862167358, "learning_rate": 4.254083927886443e-09, "logits/chosen": -3.052434206008911, "logits/rejected": -3.0306789875030518, "logps/chosen": -60.23524856567383, "logps/rejected": -59.88490676879883, "loss": 0.685, "rewards/accuracies": 0.598437488079071, "rewards/chosen": -0.04647786170244217, "rewards/margins": 0.017560753971338272, "rewards/rejected": -0.06403861939907074, "step": 2410 }, { "epoch": 1.667815299793246, "grad_norm": 1.3540840148925781, "learning_rate": 4.0877393513756795e-09, "logits/chosen": -3.0015170574188232, "logits/rejected": -2.9834518432617188, "logps/chosen": -58.74982452392578, "logps/rejected": -59.712005615234375, "loss": 0.6847, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.045878536999225616, "rewards/margins": 0.018289810046553612, "rewards/rejected": -0.06416834890842438, "step": 2420 }, { "epoch": 1.6747070985527222, "grad_norm": 1.319036841392517, "learning_rate": 3.924422258090529e-09, "logits/chosen": -2.939756155014038, "logits/rejected": -2.919666290283203, "logps/chosen": -58.5392951965332, "logps/rejected": -59.17338943481445, "loss": 0.684, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.04658854380249977, "rewards/margins": 0.019758421927690506, "rewards/rejected": -0.06634696573019028, "step": 2430 }, { "epoch": 1.6815988973121985, "grad_norm": 1.3268150091171265, "learning_rate": 3.764156291565693e-09, "logits/chosen": -3.0177316665649414, "logits/rejected": -2.9926140308380127, "logps/chosen": -58.570648193359375, "logps/rejected": -58.78978729248047, "loss": 0.6825, "rewards/accuracies": 0.625, "rewards/chosen": -0.04440145939588547, "rewards/margins": 0.022650301456451416, "rewards/rejected": -0.06705176085233688, "step": 2440 }, { "epoch": 1.6884906960716748, "grad_norm": 1.3211112022399902, "learning_rate": 3.6069646536220357e-09, "logits/chosen": -2.998032331466675, "logits/rejected": -2.9721641540527344, "logps/chosen": -60.38201904296875, "logps/rejected": -60.990257263183594, "loss": 0.6805, "rewards/accuracies": 0.682812511920929, "rewards/chosen": -0.040809061378240585, "rewards/margins": 0.026985710486769676, "rewards/rejected": -0.06779477745294571, "step": 2450 }, { "epoch": 1.6953824948311509, "grad_norm": 1.285194993019104, "learning_rate": 3.4528701010076155e-09, "logits/chosen": -3.003739833831787, "logits/rejected": -2.9794375896453857, "logps/chosen": -60.3626823425293, "logps/rejected": -61.65105438232422, "loss": 0.6822, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.045110072940588, "rewards/margins": 0.02330555021762848, "rewards/rejected": -0.06841562688350677, "step": 2460 }, { "epoch": 1.7022742935906272, "grad_norm": 1.2852113246917725, "learning_rate": 3.3018949421032003e-09, "logits/chosen": -3.0109405517578125, "logits/rejected": -2.9975745677948, "logps/chosen": -58.778053283691406, "logps/rejected": -59.56703567504883, "loss": 0.6851, "rewards/accuracies": 0.5703125, "rewards/chosen": -0.04846884682774544, "rewards/margins": 0.01748683862388134, "rewards/rejected": -0.06595568358898163, "step": 2470 }, { "epoch": 1.7091660923501033, "grad_norm": 1.258186936378479, "learning_rate": 3.154061033692651e-09, "logits/chosen": -3.0072379112243652, "logits/rejected": -2.979935884475708, "logps/chosen": -59.062705993652344, "logps/rejected": -58.03764724731445, "loss": 0.6807, "rewards/accuracies": 0.640625, "rewards/chosen": -0.04467375949025154, "rewards/margins": 0.026565441861748695, "rewards/rejected": -0.07123919576406479, "step": 2480 }, { "epoch": 1.7160578911095796, "grad_norm": 1.3505935668945312, "learning_rate": 3.0093897777987098e-09, "logits/chosen": -3.0517494678497314, "logits/rejected": -3.0366005897521973, "logps/chosen": -58.27477264404297, "logps/rejected": -61.6846923828125, "loss": 0.6858, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -0.04682334139943123, "rewards/margins": 0.016174782067537308, "rewards/rejected": -0.06299812346696854, "step": 2490 }, { "epoch": 1.722949689869056, "grad_norm": 1.3154429197311401, "learning_rate": 2.8679021185845975e-09, "logits/chosen": -3.023200273513794, "logits/rejected": -2.997267246246338, "logps/chosen": -58.07569122314453, "logps/rejected": -59.87085723876953, "loss": 0.6826, "rewards/accuracies": 0.620312511920929, "rewards/chosen": -0.046663668006658554, "rewards/margins": 0.022471796721220016, "rewards/rejected": -0.06913547217845917, "step": 2500 }, { "epoch": 1.722949689869056, "eval_logits/chosen": -3.1012966632843018, "eval_logits/rejected": -3.095568895339966, "eval_logps/chosen": -60.681880950927734, "eval_logps/rejected": -66.00990295410156, "eval_loss": 0.689177393913269, "eval_rewards/accuracies": 0.5850371718406677, "eval_rewards/chosen": -0.019699882715940475, "eval_rewards/margins": 0.008597951382398605, "eval_rewards/rejected": -0.02829783223569393, "eval_runtime": 383.2161, "eval_samples_per_second": 11.231, "eval_steps_per_second": 1.404, "step": 2500 }, { "epoch": 1.729841488628532, "grad_norm": 1.3162225484848022, "learning_rate": 2.7296185393219316e-09, "logits/chosen": -3.0459542274475098, "logits/rejected": -3.0207812786102295, "logps/chosen": -59.15156173706055, "logps/rejected": -59.254676818847656, "loss": 0.6833, "rewards/accuracies": 0.625, "rewards/chosen": -0.045805253088474274, "rewards/margins": 0.020932307466864586, "rewards/rejected": -0.06673755496740341, "step": 2510 }, { "epoch": 1.7367332873880081, "grad_norm": 1.328461766242981, "learning_rate": 2.5945590594253305e-09, "logits/chosen": -2.9799602031707764, "logits/rejected": -2.9713258743286133, "logps/chosen": -58.0362548828125, "logps/rejected": -60.381080627441406, "loss": 0.6877, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.0511082224547863, "rewards/margins": 0.012231842614710331, "rewards/rejected": -0.06334006786346436, "step": 2520 }, { "epoch": 1.7436250861474845, "grad_norm": 1.2932238578796387, "learning_rate": 2.4627432315541986e-09, "logits/chosen": -3.055954694747925, "logits/rejected": -3.0452940464019775, "logps/chosen": -58.16063690185547, "logps/rejected": -61.20969772338867, "loss": 0.6834, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.04746638238430023, "rewards/margins": 0.02094622328877449, "rewards/rejected": -0.06841260939836502, "step": 2530 }, { "epoch": 1.7505168849069608, "grad_norm": 1.3644834756851196, "learning_rate": 2.3341901387820717e-09, "logits/chosen": -3.0201711654663086, "logits/rejected": -2.995832681655884, "logps/chosen": -59.7026481628418, "logps/rejected": -60.433990478515625, "loss": 0.6828, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04398275539278984, "rewards/margins": 0.02229396626353264, "rewards/rejected": -0.06627672165632248, "step": 2540 }, { "epoch": 1.757408683666437, "grad_norm": 1.3037844896316528, "learning_rate": 2.2089183918339445e-09, "logits/chosen": -2.996652126312256, "logits/rejected": -2.976022243499756, "logps/chosen": -57.28471755981445, "logps/rejected": -59.08478546142578, "loss": 0.684, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.04602036252617836, "rewards/margins": 0.019708681851625443, "rewards/rejected": -0.0657290443778038, "step": 2550 }, { "epoch": 1.7643004824259132, "grad_norm": 1.3543256521224976, "learning_rate": 2.086946126391981e-09, "logits/chosen": -2.9888083934783936, "logits/rejected": -2.9732577800750732, "logps/chosen": -56.89265823364258, "logps/rejected": -60.136573791503906, "loss": 0.6841, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.05068554729223251, "rewards/margins": 0.019495617598295212, "rewards/rejected": -0.07018117606639862, "step": 2560 }, { "epoch": 1.7711922811853893, "grad_norm": 1.3135391473770142, "learning_rate": 1.9682910004700155e-09, "logits/chosen": -3.000701904296875, "logits/rejected": -2.9841086864471436, "logps/chosen": -59.79181671142578, "logps/rejected": -60.78386306762695, "loss": 0.6832, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.048417720943689346, "rewards/margins": 0.02133244276046753, "rewards/rejected": -0.06975016742944717, "step": 2570 }, { "epoch": 1.7780840799448656, "grad_norm": 1.2864971160888672, "learning_rate": 1.852970191857159e-09, "logits/chosen": -2.9674811363220215, "logits/rejected": -2.94804048538208, "logps/chosen": -59.39619064331055, "logps/rejected": -60.783851623535156, "loss": 0.6818, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.04619085043668747, "rewards/margins": 0.02428482472896576, "rewards/rejected": -0.07047567516565323, "step": 2580 }, { "epoch": 1.784975878704342, "grad_norm": 1.2950899600982666, "learning_rate": 1.741000395630976e-09, "logits/chosen": -3.034547805786133, "logits/rejected": -3.0095696449279785, "logps/chosen": -58.98634719848633, "logps/rejected": -60.004661560058594, "loss": 0.6821, "rewards/accuracies": 0.625, "rewards/chosen": -0.044211823493242264, "rewards/margins": 0.02347356267273426, "rewards/rejected": -0.06768538057804108, "step": 2590 }, { "epoch": 1.791867677463818, "grad_norm": 1.3900827169418335, "learning_rate": 1.6323978217405277e-09, "logits/chosen": -2.962319850921631, "logits/rejected": -2.9394538402557373, "logps/chosen": -57.80451202392578, "logps/rejected": -60.771339416503906, "loss": 0.6825, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.0485307052731514, "rewards/margins": 0.022718578577041626, "rewards/rejected": -0.07124929130077362, "step": 2600 }, { "epoch": 1.791867677463818, "eval_logits/chosen": -3.1006553173065186, "eval_logits/rejected": -3.094916820526123, "eval_logps/chosen": -60.688175201416016, "eval_logps/rejected": -66.03443145751953, "eval_loss": 0.6890937089920044, "eval_rewards/accuracies": 0.5889869928359985, "eval_rewards/chosen": -0.019762787967920303, "eval_rewards/margins": 0.008780322037637234, "eval_rewards/rejected": -0.028543109074234962, "eval_runtime": 383.3128, "eval_samples_per_second": 11.228, "eval_steps_per_second": 1.404, "step": 2600 }, { "epoch": 1.7987594762232941, "grad_norm": 1.325190544128418, "learning_rate": 1.5271781926596449e-09, "logits/chosen": -3.0393474102020264, "logits/rejected": -3.0156404972076416, "logps/chosen": -60.42161178588867, "logps/rejected": -61.184486389160156, "loss": 0.6827, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.04576939716935158, "rewards/margins": 0.022201048210263252, "rewards/rejected": -0.06797045469284058, "step": 2610 }, { "epoch": 1.8056512749827704, "grad_norm": 1.3093925714492798, "learning_rate": 1.4253567411107643e-09, "logits/chosen": -2.989856243133545, "logits/rejected": -2.9666576385498047, "logps/chosen": -58.600990295410156, "logps/rejected": -60.18854904174805, "loss": 0.6831, "rewards/accuracies": 0.609375, "rewards/chosen": -0.0458240807056427, "rewards/margins": 0.021440699696540833, "rewards/rejected": -0.06726478040218353, "step": 2620 }, { "epoch": 1.8125430737422468, "grad_norm": 1.2918739318847656, "learning_rate": 1.326948207859685e-09, "logits/chosen": -3.0238237380981445, "logits/rejected": -3.0077781677246094, "logps/chosen": -57.79582595825195, "logps/rejected": -60.6348762512207, "loss": 0.6832, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.046499475836753845, "rewards/margins": 0.021402059122920036, "rewards/rejected": -0.06790152937173843, "step": 2630 }, { "epoch": 1.819434872501723, "grad_norm": 1.349001407623291, "learning_rate": 1.2319668395815358e-09, "logits/chosen": -3.0028393268585205, "logits/rejected": -2.9857001304626465, "logps/chosen": -58.69614791870117, "logps/rejected": -59.920021057128906, "loss": 0.6839, "rewards/accuracies": 0.604687511920929, "rewards/chosen": -0.04932459071278572, "rewards/margins": 0.0200694240629673, "rewards/rejected": -0.06939400732517242, "step": 2640 }, { "epoch": 1.8263266712611992, "grad_norm": 1.2818963527679443, "learning_rate": 1.1404263867982738e-09, "logits/chosen": -3.0455758571624756, "logits/rejected": -3.0237960815429688, "logps/chosen": -59.25251388549805, "logps/rejected": -60.45496368408203, "loss": 0.683, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -0.04728539660573006, "rewards/margins": 0.021762443706393242, "rewards/rejected": -0.06904784590005875, "step": 2650 }, { "epoch": 1.8332184700206753, "grad_norm": 1.2889119386672974, "learning_rate": 1.0523401018880134e-09, "logits/chosen": -2.983532428741455, "logits/rejected": -2.9646944999694824, "logps/chosen": -58.277976989746094, "logps/rejected": -59.491722106933594, "loss": 0.6837, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.047298818826675415, "rewards/margins": 0.020261693745851517, "rewards/rejected": -0.06756050884723663, "step": 2660 }, { "epoch": 1.8401102687801516, "grad_norm": 1.3029212951660156, "learning_rate": 9.677207371664608e-10, "logits/chosen": -3.0146260261535645, "logits/rejected": -2.9899539947509766, "logps/chosen": -59.18970489501953, "logps/rejected": -60.14207077026367, "loss": 0.6824, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.05002979189157486, "rewards/margins": 0.02303471975028515, "rewards/rejected": -0.07306452095508575, "step": 2670 }, { "epoch": 1.847002067539628, "grad_norm": 1.260703444480896, "learning_rate": 8.865805430407575e-10, "logits/chosen": -3.0160889625549316, "logits/rejected": -2.9888625144958496, "logps/chosen": -58.788368225097656, "logps/rejected": -59.13869094848633, "loss": 0.6827, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.048010729253292084, "rewards/margins": 0.0222895760089159, "rewards/rejected": -0.07030030339956284, "step": 2680 }, { "epoch": 1.853893866299104, "grad_norm": 1.3435821533203125, "learning_rate": 8.089312662359904e-10, "logits/chosen": -3.000649929046631, "logits/rejected": -2.9763736724853516, "logps/chosen": -58.345603942871094, "logps/rejected": -59.42350387573242, "loss": 0.6836, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04748475179076195, "rewards/margins": 0.02055184543132782, "rewards/rejected": -0.06803660839796066, "step": 2690 }, { "epoch": 1.8607856650585803, "grad_norm": 1.3498848676681519, "learning_rate": 7.34784148094586e-10, "logits/chosen": -3.062194347381592, "logits/rejected": -3.040365219116211, "logps/chosen": -58.02552032470703, "logps/rejected": -60.96380615234375, "loss": 0.6823, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.04390079155564308, "rewards/margins": 0.023236598819494247, "rewards/rejected": -0.06713739782571793, "step": 2700 }, { "epoch": 1.8607856650585803, "eval_logits/chosen": -3.1006381511688232, "eval_logits/rejected": -3.0949079990386963, "eval_logps/chosen": -60.71648025512695, "eval_logps/rejected": -66.05257415771484, "eval_loss": 0.6891458034515381, "eval_rewards/accuracies": 0.5889869928359985, "eval_rewards/chosen": -0.020045887678861618, "eval_rewards/margins": 0.008678610436618328, "eval_rewards/rejected": -0.02872449718415737, "eval_runtime": 383.351, "eval_samples_per_second": 11.227, "eval_steps_per_second": 1.403, "step": 2700 }, { "epoch": 1.8676774638180564, "grad_norm": 1.2470507621765137, "learning_rate": 6.641499229489145e-10, "logits/chosen": -3.003091335296631, "logits/rejected": -2.9715797901153564, "logps/chosen": -58.2459831237793, "logps/rejected": -58.3682975769043, "loss": 0.6813, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -0.04425545781850815, "rewards/margins": 0.02511006034910679, "rewards/rejected": -0.06936550885438919, "step": 2710 }, { "epoch": 1.8745692625775328, "grad_norm": 1.3162429332733154, "learning_rate": 5.970388165672691e-10, "logits/chosen": -2.99006724357605, "logits/rejected": -2.971386671066284, "logps/chosen": -57.105255126953125, "logps/rejected": -60.72968673706055, "loss": 0.6826, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.0457664355635643, "rewards/margins": 0.02246815897524357, "rewards/rejected": -0.06823460012674332, "step": 2720 }, { "epoch": 1.881461061337009, "grad_norm": 1.4187453985214233, "learning_rate": 5.334605446734585e-10, "logits/chosen": -3.0353336334228516, "logits/rejected": -3.007887363433838, "logps/chosen": -59.300804138183594, "logps/rejected": -59.99883270263672, "loss": 0.6814, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -0.043865978717803955, "rewards/margins": 0.02479901909828186, "rewards/rejected": -0.06866499781608582, "step": 2730 }, { "epoch": 1.8883528600964852, "grad_norm": 1.3848966360092163, "learning_rate": 4.734243115402825e-10, "logits/chosen": -2.9592947959899902, "logits/rejected": -2.9372572898864746, "logps/chosen": -59.5694465637207, "logps/rejected": -60.315895080566406, "loss": 0.6837, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.047974247485399246, "rewards/margins": 0.02036571130156517, "rewards/rejected": -0.06833995878696442, "step": 2740 }, { "epoch": 1.8952446588559613, "grad_norm": 1.4468791484832764, "learning_rate": 4.169388086569886e-10, "logits/chosen": -3.0385963916778564, "logits/rejected": -3.0236904621124268, "logps/chosen": -58.946388244628906, "logps/rejected": -61.70532989501953, "loss": 0.6838, "rewards/accuracies": 0.604687511920929, "rewards/chosen": -0.04743208736181259, "rewards/margins": 0.020196830853819847, "rewards/rejected": -0.06762892007827759, "step": 2750 }, { "epoch": 1.9021364576154376, "grad_norm": 1.3899776935577393, "learning_rate": 3.640122134710294e-10, "logits/chosen": -3.06270170211792, "logits/rejected": -3.0440831184387207, "logps/chosen": -59.356605529785156, "logps/rejected": -60.465003967285156, "loss": 0.6821, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0449027419090271, "rewards/margins": 0.023394212126731873, "rewards/rejected": -0.06829695403575897, "step": 2760 }, { "epoch": 1.909028256374914, "grad_norm": 1.328192114830017, "learning_rate": 3.1465218820418415e-10, "logits/chosen": -3.037365436553955, "logits/rejected": -3.0028114318847656, "logps/chosen": -58.11069869995117, "logps/rejected": -59.29419708251953, "loss": 0.6807, "rewards/accuracies": 0.667187511920929, "rewards/chosen": -0.04381219670176506, "rewards/margins": 0.026473551988601685, "rewards/rejected": -0.07028575241565704, "step": 2770 }, { "epoch": 1.9159200551343902, "grad_norm": 1.2775218486785889, "learning_rate": 2.688658787433157e-10, "logits/chosen": -3.022888660430908, "logits/rejected": -3.000300884246826, "logps/chosen": -60.5079460144043, "logps/rejected": -60.63434600830078, "loss": 0.6831, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.04978417605161667, "rewards/margins": 0.021695107221603394, "rewards/rejected": -0.07147928327322006, "step": 2780 }, { "epoch": 1.9228118538938663, "grad_norm": 1.2735910415649414, "learning_rate": 2.266599136058367e-10, "logits/chosen": -3.0028035640716553, "logits/rejected": -2.9840023517608643, "logps/chosen": -59.78889083862305, "logps/rejected": -59.44769287109375, "loss": 0.6861, "rewards/accuracies": 0.589062511920929, "rewards/chosen": -0.04813474044203758, "rewards/margins": 0.015620408579707146, "rewards/rejected": -0.06375513970851898, "step": 2790 }, { "epoch": 1.9297036526533424, "grad_norm": 1.376592755317688, "learning_rate": 1.8804040298009693e-10, "logits/chosen": -3.0288257598876953, "logits/rejected": -3.0026650428771973, "logps/chosen": -59.218475341796875, "logps/rejected": -58.154075622558594, "loss": 0.6816, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.04387739300727844, "rewards/margins": 0.02458575740456581, "rewards/rejected": -0.06846315413713455, "step": 2800 }, { "epoch": 1.9297036526533424, "eval_logits/chosen": -3.1007766723632812, "eval_logits/rejected": -3.095076084136963, "eval_logps/chosen": -60.726348876953125, "eval_logps/rejected": -66.07279968261719, "eval_loss": 0.6890966892242432, "eval_rewards/accuracies": 0.5841078162193298, "eval_rewards/chosen": -0.020144494250416756, "eval_rewards/margins": 0.008782317861914635, "eval_rewards/rejected": -0.02892681024968624, "eval_runtime": 383.6267, "eval_samples_per_second": 11.219, "eval_steps_per_second": 1.402, "step": 2800 }, { "epoch": 1.9365954514128187, "grad_norm": 1.2682085037231445, "learning_rate": 1.5301293784081847e-10, "logits/chosen": -2.9736599922180176, "logits/rejected": -2.9589531421661377, "logps/chosen": -58.26537322998047, "logps/rejected": -60.63109588623047, "loss": 0.6842, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.0504550039768219, "rewards/margins": 0.019359614700078964, "rewards/rejected": -0.06981462240219116, "step": 2810 }, { "epoch": 1.943487250172295, "grad_norm": 1.3405542373657227, "learning_rate": 1.2158258913967102e-10, "logits/chosen": -3.0063540935516357, "logits/rejected": -2.9755642414093018, "logps/chosen": -60.50700759887695, "logps/rejected": -58.998687744140625, "loss": 0.6822, "rewards/accuracies": 0.6015625, "rewards/chosen": -0.04506916552782059, "rewards/margins": 0.023324180394411087, "rewards/rejected": -0.06839334219694138, "step": 2820 }, { "epoch": 1.9503790489317712, "grad_norm": 1.3689327239990234, "learning_rate": 9.37539070711646e-11, "logits/chosen": -3.0321671962738037, "logits/rejected": -3.012648820877075, "logps/chosen": -60.28644943237305, "logps/rejected": -60.640167236328125, "loss": 0.681, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -0.042195506393909454, "rewards/margins": 0.02599485218524933, "rewards/rejected": -0.06819035857915878, "step": 2830 }, { "epoch": 1.9572708476912473, "grad_norm": 1.3046759366989136, "learning_rate": 6.953092041389607e-11, "logits/chosen": -3.014383554458618, "logits/rejected": -2.9899418354034424, "logps/chosen": -59.1005859375, "logps/rejected": -59.03815460205078, "loss": 0.6822, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.04825712740421295, "rewards/margins": 0.023372991010546684, "rewards/rejected": -0.07163011282682419, "step": 2840 }, { "epoch": 1.9641626464507236, "grad_norm": 1.3823450803756714, "learning_rate": 4.891713594731006e-11, "logits/chosen": -3.0164265632629395, "logits/rejected": -2.993161678314209, "logps/chosen": -58.6377067565918, "logps/rejected": -59.821807861328125, "loss": 0.6829, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04551283270120621, "rewards/margins": 0.021898990496993065, "rewards/rejected": -0.06741182506084442, "step": 2850 }, { "epoch": 1.9710544452102, "grad_norm": 1.2972795963287354, "learning_rate": 3.191553794401336e-11, "logits/chosen": -3.0041041374206543, "logits/rejected": -2.9793601036071777, "logps/chosen": -58.8316535949707, "logps/rejected": -59.15874481201172, "loss": 0.6844, "rewards/accuracies": 0.5921875238418579, "rewards/chosen": -0.04608858376741409, "rewards/margins": 0.018982943147420883, "rewards/rejected": -0.06507153064012527, "step": 2860 }, { "epoch": 1.9779462439696762, "grad_norm": 1.4782917499542236, "learning_rate": 1.8528587737753898e-11, "logits/chosen": -3.00868558883667, "logits/rejected": -2.981982707977295, "logps/chosen": -59.979164123535156, "logps/rejected": -59.065032958984375, "loss": 0.6813, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.04683419317007065, "rewards/margins": 0.025204036384820938, "rewards/rejected": -0.07203822582960129, "step": 2870 }, { "epoch": 1.9848380427291523, "grad_norm": 1.387197494506836, "learning_rate": 8.758223367075212e-12, "logits/chosen": -3.0078389644622803, "logits/rejected": -2.978468656539917, "logps/chosen": -59.639747619628906, "logps/rejected": -58.05632781982422, "loss": 0.683, "rewards/accuracies": 0.6265624761581421, "rewards/chosen": -0.04575073719024658, "rewards/margins": 0.021932676434516907, "rewards/rejected": -0.06768341362476349, "step": 2880 }, { "epoch": 1.9917298414886284, "grad_norm": 1.3310401439666748, "learning_rate": 2.605859294749213e-12, "logits/chosen": -3.018655300140381, "logits/rejected": -2.994286298751831, "logps/chosen": -57.234657287597656, "logps/rejected": -59.07853317260742, "loss": 0.6825, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.046042539179325104, "rewards/margins": 0.022625811398029327, "rewards/rejected": -0.06866835057735443, "step": 2890 }, { "epoch": 1.9986216402481047, "grad_norm": 1.3212953805923462, "learning_rate": 7.2386203012198e-14, "logits/chosen": -3.0002474784851074, "logits/rejected": -2.9772603511810303, "logps/chosen": -59.88157272338867, "logps/rejected": -60.886680603027344, "loss": 0.6836, "rewards/accuracies": 0.614062488079071, "rewards/chosen": -0.04801579564809799, "rewards/margins": 0.0204045120626688, "rewards/rejected": -0.06842031329870224, "step": 2900 }, { "epoch": 1.9986216402481047, "eval_logits/chosen": -3.100578546524048, "eval_logits/rejected": -3.094856023788452, "eval_logps/chosen": -60.72254180908203, "eval_logps/rejected": -66.06378173828125, "eval_loss": 0.6891194581985474, "eval_rewards/accuracies": 0.5910780429840088, "eval_rewards/chosen": -0.020106395706534386, "eval_rewards/margins": 0.008730227127671242, "eval_rewards/rejected": -0.028836622834205627, "eval_runtime": 383.6501, "eval_samples_per_second": 11.219, "eval_steps_per_second": 1.402, "step": 2900 }, { "epoch": 2.0, "step": 2902, "total_flos": 0.0, "train_loss": 0.6870454205553093, "train_runtime": 56536.4846, "train_samples_per_second": 3.285, "train_steps_per_second": 0.051 } ], "logging_steps": 10, "max_steps": 2902, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }