diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4871 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 100, + "global_step": 2902, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006891798759476223, + "grad_norm": 1.1716080904006958, + "learning_rate": 1.718213058419244e-10, + "logits/chosen": -3.184086799621582, + "logits/rejected": -3.1319174766540527, + "logps/chosen": -49.95408630371094, + "logps/rejected": -44.33523178100586, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.006891798759476223, + "grad_norm": 1.0663460493087769, + "learning_rate": 1.718213058419244e-09, + "logits/chosen": -3.080113172531128, + "logits/rejected": -3.0596792697906494, + "logps/chosen": -54.03813171386719, + "logps/rejected": -53.65137481689453, + "loss": 0.6932, + "rewards/accuracies": 0.4565972089767456, + "rewards/chosen": 8.68273782543838e-05, + "rewards/margins": -1.9125265680486336e-05, + "rewards/rejected": 0.00010595263302093372, + "step": 10 + }, + { + "epoch": 0.013783597518952447, + "grad_norm": 1.1690140962600708, + "learning_rate": 3.436426116838488e-09, + "logits/chosen": -3.1165332794189453, + "logits/rejected": -3.0916168689727783, + "logps/chosen": -55.888938903808594, + "logps/rejected": -53.246864318847656, + "loss": 0.6931, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": 3.240557634853758e-05, + "rewards/margins": -3.6290578009356977e-06, + "rewards/rejected": 3.603463846957311e-05, + "step": 20 + }, + { + "epoch": 0.02067539627842867, + "grad_norm": 1.2955037355422974, + "learning_rate": 5.154639175257731e-09, + "logits/chosen": -3.0878665447235107, + "logits/rejected": -3.058804988861084, + "logps/chosen": -54.54620361328125, + "logps/rejected": -52.591636657714844, + "loss": 0.6932, + "rewards/accuracies": 0.4921875, + "rewards/chosen": -2.4173205019906163e-05, + "rewards/margins": -4.490778155741282e-05, + "rewards/rejected": 2.0734580175485462e-05, + "step": 30 + }, + { + "epoch": 0.027567195037904894, + "grad_norm": 1.1852333545684814, + "learning_rate": 6.872852233676976e-09, + "logits/chosen": -3.0849013328552246, + "logits/rejected": -3.0671732425689697, + "logps/chosen": -53.879005432128906, + "logps/rejected": -53.66566848754883, + "loss": 0.6931, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -5.587830673903227e-05, + "rewards/margins": 4.417077434482053e-05, + "rewards/rejected": -0.00010004905925597996, + "step": 40 + }, + { + "epoch": 0.03445899379738112, + "grad_norm": 1.2431070804595947, + "learning_rate": 8.59106529209622e-09, + "logits/chosen": -3.0804286003112793, + "logits/rejected": -3.0561296939849854, + "logps/chosen": -56.24019241333008, + "logps/rejected": -53.092872619628906, + "loss": 0.6931, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": 8.934068318922073e-05, + "rewards/margins": 0.00011236695718253031, + "rewards/rejected": -2.302624488947913e-05, + "step": 50 + }, + { + "epoch": 0.04135079255685734, + "grad_norm": 1.1313049793243408, + "learning_rate": 1.0309278350515463e-08, + "logits/chosen": -3.0351052284240723, + "logits/rejected": -3.0099387168884277, + "logps/chosen": -52.579429626464844, + "logps/rejected": -52.6761589050293, + "loss": 0.6931, + "rewards/accuracies": 0.48906248807907104, + "rewards/chosen": 2.3904693080112338e-05, + "rewards/margins": 2.025809772021603e-05, + "rewards/rejected": 3.6465789889916778e-06, + "step": 60 + }, + { + "epoch": 0.048242591316333565, + "grad_norm": 1.2357141971588135, + "learning_rate": 1.2027491408934707e-08, + "logits/chosen": -3.092390537261963, + "logits/rejected": -3.0711493492126465, + "logps/chosen": -54.469940185546875, + "logps/rejected": -53.86017990112305, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 5.560473709920188e-06, + "rewards/margins": 2.101451354974415e-05, + "rewards/rejected": -1.5454041204066016e-05, + "step": 70 + }, + { + "epoch": 0.05513439007580979, + "grad_norm": 1.1142845153808594, + "learning_rate": 1.3745704467353952e-08, + "logits/chosen": -3.0345962047576904, + "logits/rejected": -3.0208940505981445, + "logps/chosen": -54.06622314453125, + "logps/rejected": -52.69053268432617, + "loss": 0.6932, + "rewards/accuracies": 0.48906248807907104, + "rewards/chosen": -4.671530405175872e-05, + "rewards/margins": -4.488803824642673e-05, + "rewards/rejected": -1.827271603360714e-06, + "step": 80 + }, + { + "epoch": 0.06202618883528601, + "grad_norm": 1.1985735893249512, + "learning_rate": 1.5463917525773195e-08, + "logits/chosen": -3.048698663711548, + "logits/rejected": -3.0217783451080322, + "logps/chosen": -54.59540939331055, + "logps/rejected": -52.060035705566406, + "loss": 0.6931, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": -6.244768155738711e-05, + "rewards/margins": -3.21494007948786e-06, + "rewards/rejected": -5.923274511587806e-05, + "step": 90 + }, + { + "epoch": 0.06891798759476224, + "grad_norm": 1.3350454568862915, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": -3.119621753692627, + "logits/rejected": -3.095787763595581, + "logps/chosen": -53.65461349487305, + "logps/rejected": -52.88787841796875, + "loss": 0.6931, + "rewards/accuracies": 0.515625, + "rewards/chosen": -6.203976226970553e-05, + "rewards/margins": 5.055965812061913e-05, + "rewards/rejected": -0.00011259941675234586, + "step": 100 + }, + { + "epoch": 0.06891798759476224, + "eval_logits/chosen": -3.163339376449585, + "eval_logits/rejected": -3.157687187194824, + "eval_logps/chosen": -58.7006721496582, + "eval_logps/rejected": -63.17026138305664, + "eval_loss": 0.693140983581543, + "eval_rewards/accuracies": 0.5023234486579895, + "eval_rewards/chosen": 0.00011220378655707464, + "eval_rewards/margins": 1.3582017345470376e-05, + "eval_rewards/rejected": 9.862175647867844e-05, + "eval_runtime": 383.3503, + "eval_samples_per_second": 11.227, + "eval_steps_per_second": 1.403, + "step": 100 + }, + { + "epoch": 0.07580978635423846, + "grad_norm": 1.2324384450912476, + "learning_rate": 1.8900343642611684e-08, + "logits/chosen": -3.0891432762145996, + "logits/rejected": -3.0738348960876465, + "logps/chosen": -53.08173751831055, + "logps/rejected": -54.20978546142578, + "loss": 0.6932, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": -4.9080466851592064e-05, + "rewards/margins": -7.88484321674332e-05, + "rewards/rejected": 2.976796167786233e-05, + "step": 110 + }, + { + "epoch": 0.08270158511371468, + "grad_norm": 1.2855055332183838, + "learning_rate": 2.0618556701030925e-08, + "logits/chosen": -3.043365478515625, + "logits/rejected": -3.0211169719696045, + "logps/chosen": -54.957427978515625, + "logps/rejected": -54.4825439453125, + "loss": 0.6932, + "rewards/accuracies": 0.47187501192092896, + "rewards/chosen": -7.188355084508657e-05, + "rewards/margins": -0.00011629929940681905, + "rewards/rejected": 4.441575947566889e-05, + "step": 120 + }, + { + "epoch": 0.08959338387319091, + "grad_norm": 1.1282892227172852, + "learning_rate": 2.2336769759450173e-08, + "logits/chosen": -3.0101354122161865, + "logits/rejected": -2.9788012504577637, + "logps/chosen": -57.5596923828125, + "logps/rejected": -51.651153564453125, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 2.1165338694117963e-05, + "rewards/margins": 0.00015237969637382776, + "rewards/rejected": -0.00013121434312779456, + "step": 130 + }, + { + "epoch": 0.09648518263266713, + "grad_norm": 1.1657721996307373, + "learning_rate": 2.4054982817869415e-08, + "logits/chosen": -3.067199468612671, + "logits/rejected": -3.046125888824463, + "logps/chosen": -53.55717849731445, + "logps/rejected": -52.773223876953125, + "loss": 0.6931, + "rewards/accuracies": 0.5234375, + "rewards/chosen": 1.3996473171573598e-05, + "rewards/margins": 0.00015452780644409359, + "rewards/rejected": -0.00014053132326807827, + "step": 140 + }, + { + "epoch": 0.10337698139214335, + "grad_norm": 1.2658566236495972, + "learning_rate": 2.5773195876288656e-08, + "logits/chosen": -3.04317569732666, + "logits/rejected": -3.0280072689056396, + "logps/chosen": -52.809234619140625, + "logps/rejected": -54.64301300048828, + "loss": 0.6931, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": -1.0054915037471801e-05, + "rewards/margins": 0.0001238631666637957, + "rewards/rejected": -0.00013391808897722512, + "step": 150 + }, + { + "epoch": 0.11026878015161957, + "grad_norm": 1.2126415967941284, + "learning_rate": 2.7491408934707904e-08, + "logits/chosen": -3.09346342086792, + "logits/rejected": -3.07668399810791, + "logps/chosen": -53.59107208251953, + "logps/rejected": -52.9258918762207, + "loss": 0.6932, + "rewards/accuracies": 0.4703125059604645, + "rewards/chosen": -9.462583875574637e-06, + "rewards/margins": -2.1159441530471668e-05, + "rewards/rejected": 1.1696849469444714e-05, + "step": 160 + }, + { + "epoch": 0.1171605789110958, + "grad_norm": 1.1890392303466797, + "learning_rate": 2.9209621993127148e-08, + "logits/chosen": -3.0306668281555176, + "logits/rejected": -3.0220158100128174, + "logps/chosen": -53.26588821411133, + "logps/rejected": -53.87241744995117, + "loss": 0.6932, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -5.447790681500919e-05, + "rewards/margins": -7.718646884313785e-06, + "rewards/rejected": -4.675926174968481e-05, + "step": 170 + }, + { + "epoch": 0.12405237767057202, + "grad_norm": 1.15412175655365, + "learning_rate": 3.092783505154639e-08, + "logits/chosen": -3.0752334594726562, + "logits/rejected": -3.0524303913116455, + "logps/chosen": -55.69530487060547, + "logps/rejected": -53.15666961669922, + "loss": 0.6931, + "rewards/accuracies": 0.504687488079071, + "rewards/chosen": -1.1674828783725388e-05, + "rewards/margins": 9.358397619507741e-06, + "rewards/rejected": -2.1033218217780814e-05, + "step": 180 + }, + { + "epoch": 0.13094417643004824, + "grad_norm": 1.1720036268234253, + "learning_rate": 3.264604810996564e-08, + "logits/chosen": -3.1030337810516357, + "logits/rejected": -3.0736050605773926, + "logps/chosen": -55.423614501953125, + "logps/rejected": -52.4505500793457, + "loss": 0.6931, + "rewards/accuracies": 0.5234375, + "rewards/chosen": 9.101578143599909e-06, + "rewards/margins": 0.0001561685057822615, + "rewards/rejected": -0.00014706689398735762, + "step": 190 + }, + { + "epoch": 0.13783597518952448, + "grad_norm": 1.2227604389190674, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": -3.0704421997070312, + "logits/rejected": -3.041954278945923, + "logps/chosen": -53.747833251953125, + "logps/rejected": -52.85246658325195, + "loss": 0.6931, + "rewards/accuracies": 0.515625, + "rewards/chosen": -6.959711026865989e-05, + "rewards/margins": 9.592306014383212e-05, + "rewards/rejected": -0.0001655201631365344, + "step": 200 + }, + { + "epoch": 0.13783597518952448, + "eval_logits/chosen": -3.1631689071655273, + "eval_logits/rejected": -3.157501220703125, + "eval_logps/chosen": -58.700950622558594, + "eval_logps/rejected": -63.162139892578125, + "eval_loss": 0.6931830048561096, + "eval_rewards/accuracies": 0.48745352029800415, + "eval_rewards/chosen": 0.00010945786925731227, + "eval_rewards/margins": -7.042505603749305e-05, + "eval_rewards/rejected": 0.00017988293257076293, + "eval_runtime": 383.3981, + "eval_samples_per_second": 11.226, + "eval_steps_per_second": 1.403, + "step": 200 + }, + { + "epoch": 0.1447277739490007, + "grad_norm": 1.1424545049667358, + "learning_rate": 3.608247422680412e-08, + "logits/chosen": -3.08945631980896, + "logits/rejected": -3.0655088424682617, + "logps/chosen": -54.22871780395508, + "logps/rejected": -52.478431701660156, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.00013771439262200147, + "rewards/margins": 0.00012528176011983305, + "rewards/rejected": -0.00026299612363800406, + "step": 210 + }, + { + "epoch": 0.15161957270847692, + "grad_norm": 1.1047999858856201, + "learning_rate": 3.780068728522337e-08, + "logits/chosen": -3.0537705421447754, + "logits/rejected": -3.039431571960449, + "logps/chosen": -51.688323974609375, + "logps/rejected": -53.095741271972656, + "loss": 0.6931, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.00010369622759753838, + "rewards/margins": 0.00015849454212002456, + "rewards/rejected": -0.00026219076244160533, + "step": 220 + }, + { + "epoch": 0.15851137146795313, + "grad_norm": 1.2490479946136475, + "learning_rate": 3.951890034364261e-08, + "logits/chosen": -3.071945905685425, + "logits/rejected": -3.0471181869506836, + "logps/chosen": -54.49678421020508, + "logps/rejected": -52.037872314453125, + "loss": 0.693, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.5270270018372685e-05, + "rewards/margins": 0.0002025824796874076, + "rewards/rejected": -0.0002378527569817379, + "step": 230 + }, + { + "epoch": 0.16540317022742937, + "grad_norm": 1.141684889793396, + "learning_rate": 4.123711340206185e-08, + "logits/chosen": -3.028677225112915, + "logits/rejected": -3.0117344856262207, + "logps/chosen": -54.690513610839844, + "logps/rejected": -55.188621520996094, + "loss": 0.693, + "rewards/accuracies": 0.5296875238418579, + "rewards/chosen": -0.00015048097702674568, + "rewards/margins": 0.00021787775040138513, + "rewards/rejected": -0.00036835874198004603, + "step": 240 + }, + { + "epoch": 0.17229496898690558, + "grad_norm": 1.171937108039856, + "learning_rate": 4.295532646048109e-08, + "logits/chosen": -3.06539249420166, + "logits/rejected": -3.0387420654296875, + "logps/chosen": -57.0573616027832, + "logps/rejected": -52.94896697998047, + "loss": 0.6931, + "rewards/accuracies": 0.5296875238418579, + "rewards/chosen": -0.0001992958423215896, + "rewards/margins": 0.00017509344615973532, + "rewards/rejected": -0.0003743892884813249, + "step": 250 + }, + { + "epoch": 0.17918676774638181, + "grad_norm": 1.1496978998184204, + "learning_rate": 4.4673539518900346e-08, + "logits/chosen": -3.0649943351745605, + "logits/rejected": -3.0493435859680176, + "logps/chosen": -54.52451705932617, + "logps/rejected": -54.94301223754883, + "loss": 0.693, + "rewards/accuracies": 0.535937488079071, + "rewards/chosen": -0.00030117519781924784, + "rewards/margins": 0.00021073469542898238, + "rewards/rejected": -0.0005119099514558911, + "step": 260 + }, + { + "epoch": 0.18607856650585802, + "grad_norm": 1.1325643062591553, + "learning_rate": 4.639175257731959e-08, + "logits/chosen": -3.066349506378174, + "logits/rejected": -3.0383307933807373, + "logps/chosen": -56.371307373046875, + "logps/rejected": -52.432106018066406, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.00030968443024903536, + "rewards/margins": 0.0001904324017232284, + "rewards/rejected": -0.000500116846524179, + "step": 270 + }, + { + "epoch": 0.19297036526533426, + "grad_norm": 1.2462892532348633, + "learning_rate": 4.810996563573883e-08, + "logits/chosen": -3.0566208362579346, + "logits/rejected": -3.051412582397461, + "logps/chosen": -53.14699172973633, + "logps/rejected": -54.41425323486328, + "loss": 0.6931, + "rewards/accuracies": 0.49531251192092896, + "rewards/chosen": -0.00045495276572182775, + "rewards/margins": 9.360066178487614e-05, + "rewards/rejected": -0.0005485534202307463, + "step": 280 + }, + { + "epoch": 0.19986216402481047, + "grad_norm": 1.1743725538253784, + "learning_rate": 4.982817869415808e-08, + "logits/chosen": -3.0853469371795654, + "logits/rejected": -3.063814640045166, + "logps/chosen": -54.09833908081055, + "logps/rejected": -54.12751007080078, + "loss": 0.6929, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0002439660020172596, + "rewards/margins": 0.000504250347148627, + "rewards/rejected": -0.0007482162909582257, + "step": 290 + }, + { + "epoch": 0.2067539627842867, + "grad_norm": 1.1194610595703125, + "learning_rate": 4.999853419300577e-08, + "logits/chosen": -3.012183666229248, + "logits/rejected": -2.9885506629943848, + "logps/chosen": -54.22556686401367, + "logps/rejected": -51.91581344604492, + "loss": 0.6929, + "rewards/accuracies": 0.5546875, + "rewards/chosen": -0.000393096124753356, + "rewards/margins": 0.0003973825369030237, + "rewards/rejected": -0.0007904786616563797, + "step": 300 + }, + { + "epoch": 0.2067539627842867, + "eval_logits/chosen": -3.1624693870544434, + "eval_logits/rejected": -3.156888484954834, + "eval_logps/chosen": -58.67123794555664, + "eval_logps/rejected": -63.15048599243164, + "eval_loss": 0.6930928826332092, + "eval_rewards/accuracies": 0.5148698687553406, + "eval_rewards/chosen": 0.00040659555816091597, + "eval_rewards/margins": 0.00011023049592040479, + "eval_rewards/rejected": 0.0002963650331366807, + "eval_runtime": 383.575, + "eval_samples_per_second": 11.221, + "eval_steps_per_second": 1.403, + "step": 300 + }, + { + "epoch": 0.2136457615437629, + "grad_norm": 1.1925629377365112, + "learning_rate": 4.9993467426542045e-08, + "logits/chosen": -3.086402416229248, + "logits/rejected": -3.0562937259674072, + "logps/chosen": -53.876312255859375, + "logps/rejected": -52.675437927246094, + "loss": 0.6929, + "rewards/accuracies": 0.5796874761581421, + "rewards/chosen": -0.0003693565959110856, + "rewards/margins": 0.0005247757071629167, + "rewards/rejected": -0.0008941322448663414, + "step": 310 + }, + { + "epoch": 0.22053756030323915, + "grad_norm": 1.154595136642456, + "learning_rate": 4.998478233757101e-08, + "logits/chosen": -3.0752129554748535, + "logits/rejected": -3.0584304332733154, + "logps/chosen": -52.4905891418457, + "logps/rejected": -54.12751388549805, + "loss": 0.6929, + "rewards/accuracies": 0.5718749761581421, + "rewards/chosen": -0.0005890514003112912, + "rewards/margins": 0.0004612796474248171, + "rewards/rejected": -0.0010503310477361083, + "step": 320 + }, + { + "epoch": 0.22742935906271536, + "grad_norm": 1.143236517906189, + "learning_rate": 4.9972480183439325e-08, + "logits/chosen": -3.075157642364502, + "logits/rejected": -3.0487570762634277, + "logps/chosen": -53.44994354248047, + "logps/rejected": -51.2059326171875, + "loss": 0.6928, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.0005590206128545105, + "rewards/margins": 0.0007544254185631871, + "rewards/rejected": -0.0013134460896253586, + "step": 330 + }, + { + "epoch": 0.2343211578221916, + "grad_norm": 1.3542113304138184, + "learning_rate": 4.995656274513881e-08, + "logits/chosen": -3.0580501556396484, + "logits/rejected": -3.035737991333008, + "logps/chosen": -54.966087341308594, + "logps/rejected": -53.1796760559082, + "loss": 0.6928, + "rewards/accuracies": 0.5609375238418579, + "rewards/chosen": -0.0005089120240882039, + "rewards/margins": 0.0006240031216293573, + "rewards/rejected": -0.001132915262132883, + "step": 340 + }, + { + "epoch": 0.2412129565816678, + "grad_norm": 1.2170838117599487, + "learning_rate": 4.993703232704862e-08, + "logits/chosen": -3.0822110176086426, + "logits/rejected": -3.059418201446533, + "logps/chosen": -54.97810745239258, + "logps/rejected": -52.979820251464844, + "loss": 0.6927, + "rewards/accuracies": 0.604687511920929, + "rewards/chosen": -0.0005054243374615908, + "rewards/margins": 0.0008770185522735119, + "rewards/rejected": -0.0013824428897351027, + "step": 350 + }, + { + "epoch": 0.24810475534114404, + "grad_norm": 1.107391595840454, + "learning_rate": 4.991389175660163e-08, + "logits/chosen": -3.0396039485931396, + "logits/rejected": -3.0273656845092773, + "logps/chosen": -52.375274658203125, + "logps/rejected": -53.336265563964844, + "loss": 0.6928, + "rewards/accuracies": 0.573437511920929, + "rewards/chosen": -0.0008153729140758514, + "rewards/margins": 0.0007024986553005874, + "rewards/rejected": -0.001517871511168778, + "step": 360 + }, + { + "epoch": 0.2549965541006203, + "grad_norm": 1.1218314170837402, + "learning_rate": 4.98871443838751e-08, + "logits/chosen": -3.114689350128174, + "logits/rejected": -3.0790865421295166, + "logps/chosen": -53.952476501464844, + "logps/rejected": -52.38344192504883, + "loss": 0.6926, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.0007519819191657007, + "rewards/margins": 0.001077468739822507, + "rewards/rejected": -0.0018294507171958685, + "step": 370 + }, + { + "epoch": 0.2618883528600965, + "grad_norm": 1.1392273902893066, + "learning_rate": 4.985679408110568e-08, + "logits/chosen": -3.0398175716400146, + "logits/rejected": -3.0220084190368652, + "logps/chosen": -54.647239685058594, + "logps/rejected": -52.84843826293945, + "loss": 0.6927, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0008999688434414566, + "rewards/margins": 0.0009324215352535248, + "rewards/rejected": -0.001832390553317964, + "step": 380 + }, + { + "epoch": 0.2687801516195727, + "grad_norm": 1.1045254468917847, + "learning_rate": 4.9822845242128844e-08, + "logits/chosen": -3.0233044624328613, + "logits/rejected": -3.001706600189209, + "logps/chosen": -53.072547912597656, + "logps/rejected": -50.9716796875, + "loss": 0.6926, + "rewards/accuracies": 0.589062511920929, + "rewards/chosen": -0.0011974747758358717, + "rewards/margins": 0.001123163616284728, + "rewards/rejected": -0.0023206386249512434, + "step": 390 + }, + { + "epoch": 0.27567195037904896, + "grad_norm": 1.1129488945007324, + "learning_rate": 4.9785302781742763e-08, + "logits/chosen": -3.050330400466919, + "logits/rejected": -3.035008192062378, + "logps/chosen": -52.901397705078125, + "logps/rejected": -54.134605407714844, + "loss": 0.6927, + "rewards/accuracies": 0.582812488079071, + "rewards/chosen": -0.0011613852111622691, + "rewards/margins": 0.0009022338199429214, + "rewards/rejected": -0.0020636192057281733, + "step": 400 + }, + { + "epoch": 0.27567195037904896, + "eval_logits/chosen": -3.1611053943634033, + "eval_logits/rejected": -3.1554572582244873, + "eval_logps/chosen": -58.63969039916992, + "eval_logps/rejected": -63.135032653808594, + "eval_loss": 0.6930131316184998, + "eval_rewards/accuracies": 0.5257899761199951, + "eval_rewards/chosen": 0.0007220551487989724, + "eval_rewards/margins": 0.0002711908018682152, + "eval_rewards/rejected": 0.00045086428872309625, + "eval_runtime": 383.1949, + "eval_samples_per_second": 11.232, + "eval_steps_per_second": 1.404, + "step": 400 + }, + { + "epoch": 0.28256374913852517, + "grad_norm": 1.2506204843521118, + "learning_rate": 4.974417213499681e-08, + "logits/chosen": -3.0777323246002197, + "logits/rejected": -3.049983501434326, + "logps/chosen": -55.058868408203125, + "logps/rejected": -53.96419143676758, + "loss": 0.6924, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.0010929839918389916, + "rewards/margins": 0.0014660651795566082, + "rewards/rejected": -0.0025590492878109217, + "step": 410 + }, + { + "epoch": 0.2894555478980014, + "grad_norm": 1.237091302871704, + "learning_rate": 4.9699459256404706e-08, + "logits/chosen": -3.105699300765991, + "logits/rejected": -3.0748677253723145, + "logps/chosen": -55.66558837890625, + "logps/rejected": -53.8339729309082, + "loss": 0.6923, + "rewards/accuracies": 0.6109374761581421, + "rewards/chosen": -0.000828454561997205, + "rewards/margins": 0.001723860390484333, + "rewards/rejected": -0.002552315127104521, + "step": 420 + }, + { + "epoch": 0.2963473466574776, + "grad_norm": 1.1707303524017334, + "learning_rate": 4.965117061908251e-08, + "logits/chosen": -3.056098461151123, + "logits/rejected": -3.035871982574463, + "logps/chosen": -55.13801193237305, + "logps/rejected": -53.53112030029297, + "loss": 0.6927, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.0015201037749648094, + "rewards/margins": 0.0009773834608495235, + "rewards/rejected": -0.002497487235814333, + "step": 430 + }, + { + "epoch": 0.30323914541695385, + "grad_norm": 1.1965198516845703, + "learning_rate": 4.959931321381145e-08, + "logits/chosen": -3.082432508468628, + "logits/rejected": -3.063544750213623, + "logps/chosen": -54.456016540527344, + "logps/rejected": -54.16331100463867, + "loss": 0.6925, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.0014482419937849045, + "rewards/margins": 0.0013292920775711536, + "rewards/rejected": -0.002777534071356058, + "step": 440 + }, + { + "epoch": 0.31013094417643006, + "grad_norm": 1.2304091453552246, + "learning_rate": 4.954389454802591e-08, + "logits/chosen": -3.1104228496551514, + "logits/rejected": -3.090036153793335, + "logps/chosen": -53.494163513183594, + "logps/rejected": -53.315879821777344, + "loss": 0.6923, + "rewards/accuracies": 0.5921875238418579, + "rewards/chosen": -0.0016971270088106394, + "rewards/margins": 0.0016467798268422484, + "rewards/rejected": -0.003343907417729497, + "step": 450 + }, + { + "epoch": 0.31702274293590627, + "grad_norm": 1.1292587518692017, + "learning_rate": 4.948492264472656e-08, + "logits/chosen": -3.1166298389434814, + "logits/rejected": -3.094527006149292, + "logps/chosen": -55.6964111328125, + "logps/rejected": -53.82384490966797, + "loss": 0.6925, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0016075067687779665, + "rewards/margins": 0.001251583336852491, + "rewards/rejected": -0.0028590902220457792, + "step": 460 + }, + { + "epoch": 0.3239145416953825, + "grad_norm": 1.197009563446045, + "learning_rate": 4.9422406041318844e-08, + "logits/chosen": -3.0635745525360107, + "logits/rejected": -3.038623094558716, + "logps/chosen": -54.91028594970703, + "logps/rejected": -53.81779861450195, + "loss": 0.6918, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0011848278809338808, + "rewards/margins": 0.002738000126555562, + "rewards/rejected": -0.0039228275418281555, + "step": 470 + }, + { + "epoch": 0.33080634045485874, + "grad_norm": 1.2151196002960205, + "learning_rate": 4.9356353788377026e-08, + "logits/chosen": -3.055495023727417, + "logits/rejected": -3.0299649238586426, + "logps/chosen": -55.23993682861328, + "logps/rejected": -53.810813903808594, + "loss": 0.692, + "rewards/accuracies": 0.598437488079071, + "rewards/chosen": -0.0015645608073100448, + "rewards/margins": 0.0022706836462020874, + "rewards/rejected": -0.003835244569927454, + "step": 480 + }, + { + "epoch": 0.33769813921433495, + "grad_norm": 1.1114208698272705, + "learning_rate": 4.9286775448333944e-08, + "logits/chosen": -3.0453591346740723, + "logits/rejected": -3.0262703895568848, + "logps/chosen": -53.14439010620117, + "logps/rejected": -53.70630645751953, + "loss": 0.6922, + "rewards/accuracies": 0.6078125238418579, + "rewards/chosen": -0.0021999510936439037, + "rewards/margins": 0.0019915387965738773, + "rewards/rejected": -0.0041914889588952065, + "step": 490 + }, + { + "epoch": 0.34458993797381116, + "grad_norm": 1.2440327405929565, + "learning_rate": 4.921368109409663e-08, + "logits/chosen": -3.0790770053863525, + "logits/rejected": -3.0631680488586426, + "logps/chosen": -53.35895538330078, + "logps/rejected": -53.36548614501953, + "loss": 0.692, + "rewards/accuracies": 0.614062488079071, + "rewards/chosen": -0.002265265677124262, + "rewards/margins": 0.002222201321274042, + "rewards/rejected": -0.004487467464059591, + "step": 500 + }, + { + "epoch": 0.34458993797381116, + "eval_logits/chosen": -3.1591975688934326, + "eval_logits/rejected": -3.153568983078003, + "eval_logps/chosen": -58.59514236450195, + "eval_logps/rejected": -63.1102180480957, + "eval_loss": 0.6929171681404114, + "eval_rewards/accuracies": 0.5246282815933228, + "eval_rewards/chosen": 0.0011674691922962666, + "eval_rewards/margins": 0.0004684112500399351, + "eval_rewards/rejected": 0.0006990578840486705, + "eval_runtime": 382.8893, + "eval_samples_per_second": 11.241, + "eval_steps_per_second": 1.405, + "step": 500 + }, + { + "epoch": 0.35148173673328736, + "grad_norm": 1.1804462671279907, + "learning_rate": 4.913708130758806e-08, + "logits/chosen": -3.0682575702667236, + "logits/rejected": -3.046999454498291, + "logps/chosen": -54.03418731689453, + "logps/rejected": -54.376319885253906, + "loss": 0.692, + "rewards/accuracies": 0.5953124761581421, + "rewards/chosen": -0.002343302359804511, + "rewards/margins": 0.002379921730607748, + "rewards/rejected": -0.0047232238575816154, + "step": 510 + }, + { + "epoch": 0.35837353549276363, + "grad_norm": 1.1343954801559448, + "learning_rate": 4.9056987178215176e-08, + "logits/chosen": -3.1094601154327393, + "logits/rejected": -3.0802154541015625, + "logps/chosen": -53.637245178222656, + "logps/rejected": -53.262474060058594, + "loss": 0.6922, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0020912564359605312, + "rewards/margins": 0.002018420724198222, + "rewards/rejected": -0.004109677392989397, + "step": 520 + }, + { + "epoch": 0.36526533425223984, + "grad_norm": 1.1996898651123047, + "learning_rate": 4.8973410301263516e-08, + "logits/chosen": -3.051212787628174, + "logits/rejected": -3.0387063026428223, + "logps/chosen": -53.287681579589844, + "logps/rejected": -53.440711975097656, + "loss": 0.6922, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.0023356422316282988, + "rewards/margins": 0.0020052504260092974, + "rewards/rejected": -0.004340892191976309, + "step": 530 + }, + { + "epoch": 0.37215713301171605, + "grad_norm": 1.164119839668274, + "learning_rate": 4.8886362776218506e-08, + "logits/chosen": -3.0033349990844727, + "logits/rejected": -2.9812140464782715, + "logps/chosen": -53.450355529785156, + "logps/rejected": -51.471229553222656, + "loss": 0.6919, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0030757987406104803, + "rewards/margins": 0.0025067501701414585, + "rewards/rejected": -0.005582548677921295, + "step": 540 + }, + { + "epoch": 0.37904893177119225, + "grad_norm": 1.28213632106781, + "learning_rate": 4.879585720501382e-08, + "logits/chosen": -3.148085355758667, + "logits/rejected": -3.127159595489502, + "logps/chosen": -54.660545349121094, + "logps/rejected": -53.745887756347656, + "loss": 0.6919, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.0028407000936567783, + "rewards/margins": 0.0024546708445996046, + "rewards/rejected": -0.005295370705425739, + "step": 550 + }, + { + "epoch": 0.3859407305306685, + "grad_norm": 1.3044832944869995, + "learning_rate": 4.870190669020703e-08, + "logits/chosen": -3.0593714714050293, + "logits/rejected": -3.036311388015747, + "logps/chosen": -55.014060974121094, + "logps/rejected": -53.53757858276367, + "loss": 0.6915, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.002411695895716548, + "rewards/margins": 0.0032335221767425537, + "rewards/rejected": -0.005645217839628458, + "step": 560 + }, + { + "epoch": 0.3928325292901447, + "grad_norm": 1.1550047397613525, + "learning_rate": 4.860452483308266e-08, + "logits/chosen": -2.9982199668884277, + "logits/rejected": -2.972108840942383, + "logps/chosen": -56.20374298095703, + "logps/rejected": -55.09558868408203, + "loss": 0.6916, + "rewards/accuracies": 0.5953124761581421, + "rewards/chosen": -0.0028813418466597795, + "rewards/margins": 0.0031816777773201466, + "rewards/rejected": -0.00606301985681057, + "step": 570 + }, + { + "epoch": 0.39972432804962094, + "grad_norm": 1.230724573135376, + "learning_rate": 4.8503725731683204e-08, + "logits/chosen": -3.0479977130889893, + "logits/rejected": -3.0179476737976074, + "logps/chosen": -54.623687744140625, + "logps/rejected": -53.172157287597656, + "loss": 0.6912, + "rewards/accuracies": 0.659375011920929, + "rewards/chosen": -0.002868245355784893, + "rewards/margins": 0.003858409356325865, + "rewards/rejected": -0.006726655177772045, + "step": 580 + }, + { + "epoch": 0.4066161268090972, + "grad_norm": 1.1609071493148804, + "learning_rate": 4.839952397876808e-08, + "logits/chosen": -3.0574018955230713, + "logits/rejected": -3.039822816848755, + "logps/chosen": -54.512779235839844, + "logps/rejected": -54.206886291503906, + "loss": 0.6914, + "rewards/accuracies": 0.629687488079071, + "rewards/chosen": -0.0032920341473072767, + "rewards/margins": 0.0035798237659037113, + "rewards/rejected": -0.006871857680380344, + "step": 590 + }, + { + "epoch": 0.4135079255685734, + "grad_norm": 1.0820258855819702, + "learning_rate": 4.829193465970105e-08, + "logits/chosen": -3.089672327041626, + "logits/rejected": -3.069746494293213, + "logps/chosen": -54.53960418701172, + "logps/rejected": -53.9844970703125, + "loss": 0.6915, + "rewards/accuracies": 0.604687511920929, + "rewards/chosen": -0.003799352329224348, + "rewards/margins": 0.0033282779622823, + "rewards/rejected": -0.007127630058676004, + "step": 600 + }, + { + "epoch": 0.4135079255685734, + "eval_logits/chosen": -3.1564178466796875, + "eval_logits/rejected": -3.150780200958252, + "eval_logps/chosen": -58.54813766479492, + "eval_logps/rejected": -63.1104850769043, + "eval_loss": 0.6926856637001038, + "eval_rewards/accuracies": 0.5504181981086731, + "eval_rewards/chosen": 0.001637543668039143, + "eval_rewards/margins": 0.0009411590872332454, + "eval_rewards/rejected": 0.0006963845225982368, + "eval_runtime": 383.4087, + "eval_samples_per_second": 11.226, + "eval_steps_per_second": 1.403, + "step": 600 + }, + { + "epoch": 0.4203997243280496, + "grad_norm": 1.1835054159164429, + "learning_rate": 4.818097335026631e-08, + "logits/chosen": -3.101921319961548, + "logits/rejected": -3.0772037506103516, + "logps/chosen": -55.26588821411133, + "logps/rejected": -53.28364181518555, + "loss": 0.6912, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.002828064141795039, + "rewards/margins": 0.00399785814806819, + "rewards/rejected": -0.006825921591371298, + "step": 610 + }, + { + "epoch": 0.4272915230875258, + "grad_norm": 1.203052043914795, + "learning_rate": 4.806665611441354e-08, + "logits/chosen": -3.077770233154297, + "logits/rejected": -3.0505123138427734, + "logps/chosen": -55.078880310058594, + "logps/rejected": -52.72577667236328, + "loss": 0.6916, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.004233444109559059, + "rewards/margins": 0.0032121867407113314, + "rewards/rejected": -0.007445631083101034, + "step": 620 + }, + { + "epoch": 0.4341833218470021, + "grad_norm": 1.1228797435760498, + "learning_rate": 4.794899950193235e-08, + "logits/chosen": -3.0409035682678223, + "logits/rejected": -3.0232279300689697, + "logps/chosen": -53.423980712890625, + "logps/rejected": -52.98765182495117, + "loss": 0.6921, + "rewards/accuracies": 0.5796874761581421, + "rewards/chosen": -0.004781276918947697, + "rewards/margins": 0.0022252718918025494, + "rewards/rejected": -0.007006548345088959, + "step": 630 + }, + { + "epoch": 0.4410751206064783, + "grad_norm": 1.262542486190796, + "learning_rate": 4.782802054605635e-08, + "logits/chosen": -3.0899507999420166, + "logits/rejected": -3.0717437267303467, + "logps/chosen": -55.078704833984375, + "logps/rejected": -54.794776916503906, + "loss": 0.6913, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.0041534146293997765, + "rewards/margins": 0.0038144378922879696, + "rewards/rejected": -0.007967852056026459, + "step": 640 + }, + { + "epoch": 0.4479669193659545, + "grad_norm": 1.2199469804763794, + "learning_rate": 4.77037367609972e-08, + "logits/chosen": -3.0735621452331543, + "logits/rejected": -3.0427281856536865, + "logps/chosen": -56.89426803588867, + "logps/rejected": -53.209136962890625, + "loss": 0.6911, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.004497360438108444, + "rewards/margins": 0.004101374186575413, + "rewards/rejected": -0.008598734624683857, + "step": 650 + }, + { + "epoch": 0.4548587181254307, + "grad_norm": 1.1544371843338013, + "learning_rate": 4.7576166139409105e-08, + "logits/chosen": -3.042221784591675, + "logits/rejected": -3.0110714435577393, + "logps/chosen": -54.11481475830078, + "logps/rejected": -52.04207229614258, + "loss": 0.6907, + "rewards/accuracies": 0.6390625238418579, + "rewards/chosen": -0.004763273987919092, + "rewards/margins": 0.004999758210033178, + "rewards/rejected": -0.00976303219795227, + "step": 660 + }, + { + "epoch": 0.461750516884907, + "grad_norm": 1.281175136566162, + "learning_rate": 4.744532714978399e-08, + "logits/chosen": -3.0140280723571777, + "logits/rejected": -2.9848811626434326, + "logps/chosen": -56.1414680480957, + "logps/rejected": -54.0085334777832, + "loss": 0.6908, + "rewards/accuracies": 0.6265624761581421, + "rewards/chosen": -0.0041292086243629456, + "rewards/margins": 0.004748177714645863, + "rewards/rejected": -0.008877387270331383, + "step": 670 + }, + { + "epoch": 0.4686423156443832, + "grad_norm": 1.1695414781570435, + "learning_rate": 4.7311238733777815e-08, + "logits/chosen": -3.046804428100586, + "logits/rejected": -3.0304887294769287, + "logps/chosen": -54.355079650878906, + "logps/rejected": -54.04961395263672, + "loss": 0.6911, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.005136381834745407, + "rewards/margins": 0.00416863476857543, + "rewards/rejected": -0.009305017068982124, + "step": 680 + }, + { + "epoch": 0.4755341144038594, + "grad_norm": 1.1991028785705566, + "learning_rate": 4.717392030346835e-08, + "logits/chosen": -3.028083562850952, + "logits/rejected": -3.011951446533203, + "logps/chosen": -54.25959396362305, + "logps/rejected": -54.1555061340332, + "loss": 0.6911, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.00518420897424221, + "rewards/margins": 0.004269171506166458, + "rewards/rejected": -0.009453380480408669, + "step": 690 + }, + { + "epoch": 0.4824259131633356, + "grad_norm": 1.2611873149871826, + "learning_rate": 4.70333917385449e-08, + "logits/chosen": -3.079685926437378, + "logits/rejected": -3.049795627593994, + "logps/chosen": -55.45751190185547, + "logps/rejected": -53.548301696777344, + "loss": 0.6912, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.004976336378604174, + "rewards/margins": 0.003996217157691717, + "rewards/rejected": -0.00897255353629589, + "step": 700 + }, + { + "epoch": 0.4824259131633356, + "eval_logits/chosen": -3.1538004875183105, + "eval_logits/rejected": -3.1481423377990723, + "eval_logps/chosen": -58.522918701171875, + "eval_logps/rejected": -63.142425537109375, + "eval_loss": 0.6924082636833191, + "eval_rewards/accuracies": 0.5671468377113342, + "eval_rewards/chosen": 0.0018897424452006817, + "eval_rewards/margins": 0.0015127337537705898, + "eval_rewards/rejected": 0.0003770088078454137, + "eval_runtime": 383.1967, + "eval_samples_per_second": 11.232, + "eval_steps_per_second": 1.404, + "step": 700 + }, + { + "epoch": 0.48931771192281187, + "grad_norm": 1.1392662525177002, + "learning_rate": 4.688967338343029e-08, + "logits/chosen": -3.0261685848236084, + "logits/rejected": -3.0102686882019043, + "logps/chosen": -54.990821838378906, + "logps/rejected": -54.767127990722656, + "loss": 0.691, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00540867168456316, + "rewards/margins": 0.004442816134542227, + "rewards/rejected": -0.0098514873534441, + "step": 710 + }, + { + "epoch": 0.4962095106822881, + "grad_norm": 1.1779069900512695, + "learning_rate": 4.6742786044335625e-08, + "logits/chosen": -3.0809476375579834, + "logits/rejected": -3.057307481765747, + "logps/chosen": -55.18914794921875, + "logps/rejected": -53.8927116394043, + "loss": 0.6898, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.0052046263590455055, + "rewards/margins": 0.006748650223016739, + "rewards/rejected": -0.011953277513384819, + "step": 720 + }, + { + "epoch": 0.5031013094417643, + "grad_norm": 1.2199147939682007, + "learning_rate": 4.6592750986248085e-08, + "logits/chosen": -3.107689380645752, + "logits/rejected": -3.1000123023986816, + "logps/chosen": -54.34379959106445, + "logps/rejected": -54.853431701660156, + "loss": 0.6911, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005630369298160076, + "rewards/margins": 0.004252653103321791, + "rewards/rejected": -0.009883022867143154, + "step": 730 + }, + { + "epoch": 0.5099931082012406, + "grad_norm": 1.307981014251709, + "learning_rate": 4.6439589929852476e-08, + "logits/chosen": -3.0687716007232666, + "logits/rejected": -3.0409016609191895, + "logps/chosen": -53.86914825439453, + "logps/rejected": -53.336158752441406, + "loss": 0.6905, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.005739855580031872, + "rewards/margins": 0.0054161581210792065, + "rewards/rejected": -0.011156014166772366, + "step": 740 + }, + { + "epoch": 0.5168849069607168, + "grad_norm": 1.1373140811920166, + "learning_rate": 4.6283325048386624e-08, + "logits/chosen": -3.0201470851898193, + "logits/rejected": -2.998100519180298, + "logps/chosen": -55.00568389892578, + "logps/rejected": -54.43558883666992, + "loss": 0.6903, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0055595808662474155, + "rewards/margins": 0.005786740221083164, + "rewards/rejected": -0.011346321552991867, + "step": 750 + }, + { + "epoch": 0.523776705720193, + "grad_norm": 1.1454448699951172, + "learning_rate": 4.612397896443138e-08, + "logits/chosen": -3.118800163269043, + "logits/rejected": -3.0978825092315674, + "logps/chosen": -54.798065185546875, + "logps/rejected": -54.3465461730957, + "loss": 0.6909, + "rewards/accuracies": 0.5796874761581421, + "rewards/chosen": -0.007258473429828882, + "rewards/margins": 0.004668924491852522, + "rewards/rejected": -0.011927397921681404, + "step": 760 + }, + { + "epoch": 0.5306685044796692, + "grad_norm": 1.1706945896148682, + "learning_rate": 4.5961574746635536e-08, + "logits/chosen": -3.012247323989868, + "logits/rejected": -2.993521213531494, + "logps/chosen": -55.298187255859375, + "logps/rejected": -55.779624938964844, + "loss": 0.6913, + "rewards/accuracies": 0.582812488079071, + "rewards/chosen": -0.00789455696940422, + "rewards/margins": 0.0037407889030873775, + "rewards/rejected": -0.011635346338152885, + "step": 770 + }, + { + "epoch": 0.5375603032391454, + "grad_norm": 1.2820113897323608, + "learning_rate": 4.5796135906376144e-08, + "logits/chosen": -3.0310168266296387, + "logits/rejected": -3.015160083770752, + "logps/chosen": -54.29914474487305, + "logps/rejected": -55.2180290222168, + "loss": 0.691, + "rewards/accuracies": 0.590624988079071, + "rewards/chosen": -0.007602076046168804, + "rewards/margins": 0.004507972858846188, + "rewards/rejected": -0.012110048905014992, + "step": 780 + }, + { + "epoch": 0.5444521019986216, + "grad_norm": 1.1508716344833374, + "learning_rate": 4.5627686394354766e-08, + "logits/chosen": -3.0379862785339355, + "logits/rejected": -3.017380475997925, + "logps/chosen": -53.72552490234375, + "logps/rejected": -54.60520553588867, + "loss": 0.6901, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.0065218256786465645, + "rewards/margins": 0.006237885914742947, + "rewards/rejected": -0.012759710662066936, + "step": 790 + }, + { + "epoch": 0.5513439007580979, + "grad_norm": 1.1988805532455444, + "learning_rate": 4.545625059713011e-08, + "logits/chosen": -3.0689666271209717, + "logits/rejected": -3.046346664428711, + "logps/chosen": -54.87028121948242, + "logps/rejected": -53.7490119934082, + "loss": 0.69, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.006772381253540516, + "rewards/margins": 0.006438801996409893, + "rewards/rejected": -0.013211183249950409, + "step": 800 + }, + { + "epoch": 0.5513439007580979, + "eval_logits/chosen": -3.1500000953674316, + "eval_logits/rejected": -3.1443684101104736, + "eval_logps/chosen": -58.52486038208008, + "eval_logps/rejected": -63.183868408203125, + "eval_loss": 0.69222092628479, + "eval_rewards/accuracies": 0.5759758353233337, + "eval_rewards/chosen": 0.001870311563834548, + "eval_rewards/margins": 0.0019077310571447015, + "eval_rewards/rejected": -3.741981345228851e-05, + "eval_runtime": 383.2221, + "eval_samples_per_second": 11.231, + "eval_steps_per_second": 1.404, + "step": 800 + }, + { + "epoch": 0.5582356995175741, + "grad_norm": 1.181986927986145, + "learning_rate": 4.528185333358756e-08, + "logits/chosen": -3.026899814605713, + "logits/rejected": -3.0093157291412354, + "logps/chosen": -54.46189498901367, + "logps/rejected": -54.8513298034668, + "loss": 0.6908, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.00789581798017025, + "rewards/margins": 0.004861229099333286, + "rewards/rejected": -0.01275704801082611, + "step": 810 + }, + { + "epoch": 0.5651274982770503, + "grad_norm": 1.1672871112823486, + "learning_rate": 4.510451985134616e-08, + "logits/chosen": -3.0875649452209473, + "logits/rejected": -3.0743203163146973, + "logps/chosen": -53.040733337402344, + "logps/rejected": -55.541954040527344, + "loss": 0.6903, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.00876162201166153, + "rewards/margins": 0.00582465436309576, + "rewards/rejected": -0.014586275443434715, + "step": 820 + }, + { + "epoch": 0.5720192970365265, + "grad_norm": 1.2362406253814697, + "learning_rate": 4.492427582310346e-08, + "logits/chosen": -3.0630054473876953, + "logits/rejected": -3.0335052013397217, + "logps/chosen": -54.42986297607422, + "logps/rejected": -53.1016845703125, + "loss": 0.6894, + "rewards/accuracies": 0.6390625238418579, + "rewards/chosen": -0.007075751665979624, + "rewards/margins": 0.007580357138067484, + "rewards/rejected": -0.014656109735369682, + "step": 830 + }, + { + "epoch": 0.5789110957960028, + "grad_norm": 1.1612728834152222, + "learning_rate": 4.4741147342918894e-08, + "logits/chosen": -3.076169013977051, + "logits/rejected": -3.0513038635253906, + "logps/chosen": -55.8946533203125, + "logps/rejected": -55.85911178588867, + "loss": 0.689, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.0071454280987381935, + "rewards/margins": 0.008417905308306217, + "rewards/rejected": -0.01556333340704441, + "step": 840 + }, + { + "epoch": 0.585802894555479, + "grad_norm": 1.1926907300949097, + "learning_rate": 4.4555160922436074e-08, + "logits/chosen": -3.079662322998047, + "logits/rejected": -3.0524630546569824, + "logps/chosen": -54.04046630859375, + "logps/rejected": -53.262847900390625, + "loss": 0.6896, + "rewards/accuracies": 0.6234375238418579, + "rewards/chosen": -0.007903190329670906, + "rewards/margins": 0.007286435458809137, + "rewards/rejected": -0.015189625322818756, + "step": 850 + }, + { + "epoch": 0.5926946933149552, + "grad_norm": 1.2318311929702759, + "learning_rate": 4.4366343487044754e-08, + "logits/chosen": -3.031019926071167, + "logits/rejected": -3.0084445476531982, + "logps/chosen": -52.4871711730957, + "logps/rejected": -53.59075927734375, + "loss": 0.6897, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.009235886856913567, + "rewards/margins": 0.007172322832047939, + "rewards/rejected": -0.01640820875763893, + "step": 860 + }, + { + "epoch": 0.5995864920744314, + "grad_norm": 1.2643660306930542, + "learning_rate": 4.417472237198275e-08, + "logits/chosen": -3.122987985610962, + "logits/rejected": -3.097611665725708, + "logps/chosen": -56.34685134887695, + "logps/rejected": -55.146095275878906, + "loss": 0.6896, + "rewards/accuracies": 0.620312511920929, + "rewards/chosen": -0.0072593227960169315, + "rewards/margins": 0.007286491803824902, + "rewards/rejected": -0.01454581506550312, + "step": 870 + }, + { + "epoch": 0.6064782908339077, + "grad_norm": 1.2349611520767212, + "learning_rate": 4.398032531837865e-08, + "logits/chosen": -3.000382423400879, + "logits/rejected": -2.979700803756714, + "logps/chosen": -54.820579528808594, + "logps/rejected": -54.77504348754883, + "loss": 0.69, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.010271805338561535, + "rewards/margins": 0.006492338143289089, + "rewards/rejected": -0.016764143481850624, + "step": 880 + }, + { + "epoch": 0.6133700895933839, + "grad_norm": 1.2228236198425293, + "learning_rate": 4.378318046923567e-08, + "logits/chosen": -3.046607494354248, + "logits/rejected": -3.0200607776641846, + "logps/chosen": -55.26753616333008, + "logps/rejected": -54.108428955078125, + "loss": 0.6894, + "rewards/accuracies": 0.6078125238418579, + "rewards/chosen": -0.00934204924851656, + "rewards/margins": 0.007791099604219198, + "rewards/rejected": -0.01713315024971962, + "step": 890 + }, + { + "epoch": 0.6202618883528601, + "grad_norm": 1.186522126197815, + "learning_rate": 4.3583316365357413e-08, + "logits/chosen": -3.081699848175049, + "logits/rejected": -3.0569376945495605, + "logps/chosen": -56.97715377807617, + "logps/rejected": -55.800636291503906, + "loss": 0.6893, + "rewards/accuracies": 0.614062488079071, + "rewards/chosen": -0.009422613307833672, + "rewards/margins": 0.007982470095157623, + "rewards/rejected": -0.017405081540346146, + "step": 900 + }, + { + "epoch": 0.6202618883528601, + "eval_logits/chosen": -3.145947217941284, + "eval_logits/rejected": -3.140315532684326, + "eval_logps/chosen": -58.54254913330078, + "eval_logps/rejected": -63.26302719116211, + "eval_loss": 0.6919277906417847, + "eval_rewards/accuracies": 0.5708643198013306, + "eval_rewards/chosen": 0.0016934837913140655, + "eval_rewards/margins": 0.00252249906770885, + "eval_rewards/rejected": -0.0008290152181871235, + "eval_runtime": 383.2519, + "eval_samples_per_second": 11.23, + "eval_steps_per_second": 1.404, + "step": 900 + }, + { + "epoch": 0.6271536871123363, + "grad_norm": 1.234681248664856, + "learning_rate": 4.3380761941215947e-08, + "logits/chosen": -3.046011447906494, + "logits/rejected": -3.0302977561950684, + "logps/chosen": -54.25246047973633, + "logps/rejected": -55.46947479248047, + "loss": 0.6893, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.009562310762703419, + "rewards/margins": 0.007873213849961758, + "rewards/rejected": -0.017435524612665176, + "step": 910 + }, + { + "epoch": 0.6340454858718125, + "grad_norm": 1.141934871673584, + "learning_rate": 4.317554652076299e-08, + "logits/chosen": -3.054769992828369, + "logits/rejected": -3.0311903953552246, + "logps/chosen": -54.04453659057617, + "logps/rejected": -54.37770462036133, + "loss": 0.6894, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.009277190081775188, + "rewards/margins": 0.0076979040168225765, + "rewards/rejected": -0.016975093632936478, + "step": 920 + }, + { + "epoch": 0.6409372846312887, + "grad_norm": 1.236680269241333, + "learning_rate": 4.2967699813184615e-08, + "logits/chosen": -3.0500195026397705, + "logits/rejected": -3.0328176021575928, + "logps/chosen": -54.70762252807617, + "logps/rejected": -57.55879592895508, + "loss": 0.6884, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.008434431627392769, + "rewards/margins": 0.009904151782393456, + "rewards/rejected": -0.018338583409786224, + "step": 930 + }, + { + "epoch": 0.647829083390765, + "grad_norm": 1.2360023260116577, + "learning_rate": 4.275725190860027e-08, + "logits/chosen": -3.073611259460449, + "logits/rejected": -3.0537660121917725, + "logps/chosen": -55.351104736328125, + "logps/rejected": -55.8747673034668, + "loss": 0.6896, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.010648580268025398, + "rewards/margins": 0.007464288733899593, + "rewards/rejected": -0.018112869933247566, + "step": 940 + }, + { + "epoch": 0.6547208821502413, + "grad_norm": 1.2623155117034912, + "learning_rate": 4.2544233273706585e-08, + "logits/chosen": -3.0598671436309814, + "logits/rejected": -3.0294106006622314, + "logps/chosen": -55.8059196472168, + "logps/rejected": -53.73136520385742, + "loss": 0.6893, + "rewards/accuracies": 0.6234375238418579, + "rewards/chosen": -0.009804973378777504, + "rewards/margins": 0.007985373958945274, + "rewards/rejected": -0.01779034733772278, + "step": 950 + }, + { + "epoch": 0.6616126809097175, + "grad_norm": 1.2945950031280518, + "learning_rate": 4.232867474736669e-08, + "logits/chosen": -3.0672502517700195, + "logits/rejected": -3.0369277000427246, + "logps/chosen": -56.809417724609375, + "logps/rejected": -55.6953239440918, + "loss": 0.6879, + "rewards/accuracies": 0.660937488079071, + "rewards/chosen": -0.007269621826708317, + "rewards/margins": 0.010824671015143394, + "rewards/rejected": -0.018094293773174286, + "step": 960 + }, + { + "epoch": 0.6685044796691937, + "grad_norm": 1.1434519290924072, + "learning_rate": 4.211060753614565e-08, + "logits/chosen": -3.1128265857696533, + "logits/rejected": -3.0972368717193604, + "logps/chosen": -56.41877365112305, + "logps/rejected": -55.6785774230957, + "loss": 0.6903, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.011154340580105782, + "rewards/margins": 0.005970745347440243, + "rewards/rejected": -0.0171250868588686, + "step": 970 + }, + { + "epoch": 0.6753962784286699, + "grad_norm": 1.1750149726867676, + "learning_rate": 4.1890063209792674e-08, + "logits/chosen": -3.1158690452575684, + "logits/rejected": -3.079075336456299, + "logps/chosen": -57.21317672729492, + "logps/rejected": -53.88423538208008, + "loss": 0.6876, + "rewards/accuracies": 0.6421874761581421, + "rewards/chosen": -0.008817395195364952, + "rewards/margins": 0.01136676874011755, + "rewards/rejected": -0.020184166729450226, + "step": 980 + }, + { + "epoch": 0.6822880771881461, + "grad_norm": 1.3042854070663452, + "learning_rate": 4.166707369667073e-08, + "logits/chosen": -3.045738458633423, + "logits/rejected": -3.0281968116760254, + "logps/chosen": -54.19719696044922, + "logps/rejected": -56.0754280090332, + "loss": 0.6888, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.009636206552386284, + "rewards/margins": 0.009063459932804108, + "rewards/rejected": -0.01869966648519039, + "step": 990 + }, + { + "epoch": 0.6891798759476223, + "grad_norm": 1.22942054271698, + "learning_rate": 4.144167127913426e-08, + "logits/chosen": -3.075810194015503, + "logits/rejected": -3.052361249923706, + "logps/chosen": -55.58427810668945, + "logps/rejected": -55.51273727416992, + "loss": 0.6892, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.011259499937295914, + "rewards/margins": 0.008291425183415413, + "rewards/rejected": -0.019550926983356476, + "step": 1000 + }, + { + "epoch": 0.6891798759476223, + "eval_logits/chosen": -3.1417765617370605, + "eval_logits/rejected": -3.136131525039673, + "eval_logps/chosen": -58.606346130371094, + "eval_logps/rejected": -63.375797271728516, + "eval_loss": 0.6916959881782532, + "eval_rewards/accuracies": 0.5724906921386719, + "eval_rewards/chosen": 0.001055453554727137, + "eval_rewards/margins": 0.0030122159514576197, + "eval_rewards/rejected": -0.0019567625131458044, + "eval_runtime": 382.8427, + "eval_samples_per_second": 11.242, + "eval_steps_per_second": 1.405, + "step": 1000 + }, + { + "epoch": 0.6960716747070985, + "grad_norm": 1.228550910949707, + "learning_rate": 4.1213888588855636e-08, + "logits/chosen": -3.0645551681518555, + "logits/rejected": -3.0503764152526855, + "logps/chosen": -54.256507873535156, + "logps/rejected": -55.95310592651367, + "loss": 0.6892, + "rewards/accuracies": 0.6234375238418579, + "rewards/chosen": -0.011437224224209785, + "rewards/margins": 0.008133414201438427, + "rewards/rejected": -0.019570637494325638, + "step": 1010 + }, + { + "epoch": 0.7029634734665747, + "grad_norm": 1.290880560874939, + "learning_rate": 4.098375860210107e-08, + "logits/chosen": -3.0364532470703125, + "logits/rejected": -3.0164756774902344, + "logps/chosen": -54.49522018432617, + "logps/rejected": -55.22959518432617, + "loss": 0.6887, + "rewards/accuracies": 0.598437488079071, + "rewards/chosen": -0.013310156762599945, + "rewards/margins": 0.0093264514580369, + "rewards/rejected": -0.02263660728931427, + "step": 1020 + }, + { + "epoch": 0.709855272226051, + "grad_norm": 1.234087586402893, + "learning_rate": 4.075131463495657e-08, + "logits/chosen": -3.0410397052764893, + "logits/rejected": -3.023860454559326, + "logps/chosen": -54.86391067504883, + "logps/rejected": -54.73369598388672, + "loss": 0.6886, + "rewards/accuracies": 0.604687511920929, + "rewards/chosen": -0.012769045308232307, + "rewards/margins": 0.009414998814463615, + "rewards/rejected": -0.022184044122695923, + "step": 1030 + }, + { + "epoch": 0.7167470709855273, + "grad_norm": 1.1497515439987183, + "learning_rate": 4.051659033850477e-08, + "logits/chosen": -3.0711050033569336, + "logits/rejected": -3.0434327125549316, + "logps/chosen": -55.960113525390625, + "logps/rejected": -53.39757537841797, + "loss": 0.6876, + "rewards/accuracies": 0.6656249761581421, + "rewards/chosen": -0.01206748653203249, + "rewards/margins": 0.011478706263005733, + "rewards/rejected": -0.023546192795038223, + "step": 1040 + }, + { + "epoch": 0.7236388697450035, + "grad_norm": 1.241176724433899, + "learning_rate": 4.0279619693953283e-08, + "logits/chosen": -3.0579118728637695, + "logits/rejected": -3.044525623321533, + "logps/chosen": -54.537757873535156, + "logps/rejected": -55.7606201171875, + "loss": 0.6891, + "rewards/accuracies": 0.5953124761581421, + "rewards/chosen": -0.01158697810024023, + "rewards/margins": 0.008560305461287498, + "rewards/rejected": -0.020147282630205154, + "step": 1050 + }, + { + "epoch": 0.7305306685044797, + "grad_norm": 1.287839651107788, + "learning_rate": 4.0040437007715124e-08, + "logits/chosen": -3.0260822772979736, + "logits/rejected": -3.0041518211364746, + "logps/chosen": -55.73114013671875, + "logps/rejected": -56.4024772644043, + "loss": 0.6875, + "rewards/accuracies": 0.6234375238418579, + "rewards/chosen": -0.013397350907325745, + "rewards/margins": 0.01179309468716383, + "rewards/rejected": -0.02519044652581215, + "step": 1060 + }, + { + "epoch": 0.7374224672639559, + "grad_norm": 1.1840453147888184, + "learning_rate": 3.979907690644222e-08, + "logits/chosen": -3.005467653274536, + "logits/rejected": -2.9843525886535645, + "logps/chosen": -54.47725296020508, + "logps/rejected": -54.86272048950195, + "loss": 0.688, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.012515179812908173, + "rewards/margins": 0.010800262913107872, + "rewards/rejected": -0.023315440863370895, + "step": 1070 + }, + { + "epoch": 0.7443142660234321, + "grad_norm": 1.2041012048721313, + "learning_rate": 3.9555574332012454e-08, + "logits/chosen": -3.0442147254943848, + "logits/rejected": -3.0234692096710205, + "logps/chosen": -56.4234619140625, + "logps/rejected": -55.07111740112305, + "loss": 0.6882, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.013009254820644855, + "rewards/margins": 0.01034192182123661, + "rewards/rejected": -0.02335117571055889, + "step": 1080 + }, + { + "epoch": 0.7512060647829083, + "grad_norm": 1.192734956741333, + "learning_rate": 3.930996453647113e-08, + "logits/chosen": -3.008514881134033, + "logits/rejected": -2.986760139465332, + "logps/chosen": -53.92486572265625, + "logps/rejected": -53.8699951171875, + "loss": 0.6886, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.016327153891324997, + "rewards/margins": 0.009530487470328808, + "rewards/rejected": -0.02585764229297638, + "step": 1090 + }, + { + "epoch": 0.7580978635423845, + "grad_norm": 1.1945998668670654, + "learning_rate": 3.906228307692747e-08, + "logits/chosen": -3.050058126449585, + "logits/rejected": -3.0325589179992676, + "logps/chosen": -56.26338577270508, + "logps/rejected": -56.20615768432617, + "loss": 0.6892, + "rewards/accuracies": 0.598437488079071, + "rewards/chosen": -0.014123663306236267, + "rewards/margins": 0.0081967543810606, + "rewards/rejected": -0.022320415824651718, + "step": 1100 + }, + { + "epoch": 0.7580978635423845, + "eval_logits/chosen": -3.136918067932129, + "eval_logits/rejected": -3.131256103515625, + "eval_logps/chosen": -58.693904876708984, + "eval_logps/rejected": -63.52504348754883, + "eval_loss": 0.6914047598838806, + "eval_rewards/accuracies": 0.5808550119400024, + "eval_rewards/chosen": 0.00017988457693718374, + "eval_rewards/margins": 0.003629034385085106, + "eval_rewards/rejected": -0.0034491494297981262, + "eval_runtime": 382.7678, + "eval_samples_per_second": 11.244, + "eval_steps_per_second": 1.406, + "step": 1100 + }, + { + "epoch": 0.7649896623018608, + "grad_norm": 1.2611422538757324, + "learning_rate": 3.8812565810407006e-08, + "logits/chosen": -3.0509583950042725, + "logits/rejected": -3.019794225692749, + "logps/chosen": -57.166297912597656, + "logps/rejected": -55.56831741333008, + "loss": 0.6876, + "rewards/accuracies": 0.614062488079071, + "rewards/chosen": -0.014432080090045929, + "rewards/margins": 0.01164001040160656, + "rewards/rejected": -0.02607208862900734, + "step": 1110 + }, + { + "epoch": 0.771881461061337, + "grad_norm": 1.1777701377868652, + "learning_rate": 3.856084888866052e-08, + "logits/chosen": -3.0596282482147217, + "logits/rejected": -3.045269250869751, + "logps/chosen": -55.52899932861328, + "logps/rejected": -54.93024444580078, + "loss": 0.6888, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.01683993637561798, + "rewards/margins": 0.009132475592195988, + "rewards/rejected": -0.025972411036491394, + "step": 1120 + }, + { + "epoch": 0.7787732598208132, + "grad_norm": 1.2296311855316162, + "learning_rate": 3.830716875293038e-08, + "logits/chosen": -3.0673999786376953, + "logits/rejected": -3.0444142818450928, + "logps/chosen": -54.970741271972656, + "logps/rejected": -54.68275833129883, + "loss": 0.6885, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.015940912067890167, + "rewards/margins": 0.009639047086238861, + "rewards/rejected": -0.02557995915412903, + "step": 1130 + }, + { + "epoch": 0.7856650585802895, + "grad_norm": 1.1905580759048462, + "learning_rate": 3.805156212867483e-08, + "logits/chosen": -3.029092788696289, + "logits/rejected": -3.0086588859558105, + "logps/chosen": -56.229042053222656, + "logps/rejected": -55.34952926635742, + "loss": 0.6875, + "rewards/accuracies": 0.6421874761581421, + "rewards/chosen": -0.01458609290421009, + "rewards/margins": 0.011665640398859978, + "rewards/rejected": -0.02625173330307007, + "step": 1140 + }, + { + "epoch": 0.7925568573397657, + "grad_norm": 1.1570724248886108, + "learning_rate": 3.779406602025128e-08, + "logits/chosen": -3.007833957672119, + "logits/rejected": -2.9827017784118652, + "logps/chosen": -55.04015350341797, + "logps/rejected": -55.21650314331055, + "loss": 0.6881, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.01640317775309086, + "rewards/margins": 0.010581018403172493, + "rewards/rejected": -0.02698419615626335, + "step": 1150 + }, + { + "epoch": 0.7994486560992419, + "grad_norm": 1.211165189743042, + "learning_rate": 3.7534717705559146e-08, + "logits/chosen": -3.036921977996826, + "logits/rejected": -3.0160536766052246, + "logps/chosen": -56.46533203125, + "logps/rejected": -57.42781448364258, + "loss": 0.6882, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.017415925860404968, + "rewards/margins": 0.01034791674464941, + "rewards/rejected": -0.027763843536376953, + "step": 1160 + }, + { + "epoch": 0.8063404548587181, + "grad_norm": 1.1748243570327759, + "learning_rate": 3.727355473064308e-08, + "logits/chosen": -3.05203914642334, + "logits/rejected": -3.024839162826538, + "logps/chosen": -54.88653564453125, + "logps/rejected": -54.187705993652344, + "loss": 0.6875, + "rewards/accuracies": 0.6234375238418579, + "rewards/chosen": -0.015425342135131359, + "rewards/margins": 0.011839036829769611, + "rewards/rejected": -0.02726438082754612, + "step": 1170 + }, + { + "epoch": 0.8132322536181944, + "grad_norm": 1.2590429782867432, + "learning_rate": 3.701061490425745e-08, + "logits/chosen": -3.053898334503174, + "logits/rejected": -3.0290002822875977, + "logps/chosen": -57.20033645629883, + "logps/rejected": -56.57124710083008, + "loss": 0.6873, + "rewards/accuracies": 0.604687511920929, + "rewards/chosen": -0.016385816037654877, + "rewards/margins": 0.012309985235333443, + "rewards/rejected": -0.02869580127298832, + "step": 1180 + }, + { + "epoch": 0.8201240523776706, + "grad_norm": 1.2485055923461914, + "learning_rate": 3.6745936292392666e-08, + "logits/chosen": -3.021477460861206, + "logits/rejected": -3.0019021034240723, + "logps/chosen": -55.60076141357422, + "logps/rejected": -55.449058532714844, + "loss": 0.6881, + "rewards/accuracies": 0.6343749761581421, + "rewards/chosen": -0.01587783917784691, + "rewards/margins": 0.010499360039830208, + "rewards/rejected": -0.026377201080322266, + "step": 1190 + }, + { + "epoch": 0.8270158511371468, + "grad_norm": 1.2800626754760742, + "learning_rate": 3.6479557212764414e-08, + "logits/chosen": -3.028402090072632, + "logits/rejected": -3.008002519607544, + "logps/chosen": -56.96452713012695, + "logps/rejected": -55.789756774902344, + "loss": 0.6885, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.01752752624452114, + "rewards/margins": 0.009723445400595665, + "rewards/rejected": -0.027250971645116806, + "step": 1200 + }, + { + "epoch": 0.8270158511371468, + "eval_logits/chosen": -3.133814811706543, + "eval_logits/rejected": -3.128159284591675, + "eval_logps/chosen": -58.78531265258789, + "eval_logps/rejected": -63.680179595947266, + "eval_loss": 0.6911039352416992, + "eval_rewards/accuracies": 0.5755111575126648, + "eval_rewards/chosen": -0.0007341906311921775, + "eval_rewards/margins": 0.004266415257006884, + "eval_rewards/rejected": -0.005000605713576078, + "eval_runtime": 383.3119, + "eval_samples_per_second": 11.228, + "eval_steps_per_second": 1.404, + "step": 1200 + }, + { + "epoch": 0.833907649896623, + "grad_norm": 1.3170323371887207, + "learning_rate": 3.621151622926631e-08, + "logits/chosen": -3.022981643676758, + "logits/rejected": -2.9983408451080322, + "logps/chosen": -56.321983337402344, + "logps/rejected": -55.284454345703125, + "loss": 0.6875, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.017917579039931297, + "rewards/margins": 0.011845814064145088, + "rewards/rejected": -0.029763391241431236, + "step": 1210 + }, + { + "epoch": 0.8407994486560992, + "grad_norm": 1.170351505279541, + "learning_rate": 3.594185214638704e-08, + "logits/chosen": -3.066943407058716, + "logits/rejected": -3.0385823249816895, + "logps/chosen": -57.5960807800293, + "logps/rejected": -54.60730743408203, + "loss": 0.6872, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.0178567823022604, + "rewards/margins": 0.012462841346859932, + "rewards/rejected": -0.03031962178647518, + "step": 1220 + }, + { + "epoch": 0.8476912474155754, + "grad_norm": 1.224771499633789, + "learning_rate": 3.567060400359253e-08, + "logits/chosen": -3.0506491661071777, + "logits/rejected": -3.0242903232574463, + "logps/chosen": -56.49556350708008, + "logps/rejected": -55.71235275268555, + "loss": 0.686, + "rewards/accuracies": 0.635937511920929, + "rewards/chosen": -0.017950473353266716, + "rewards/margins": 0.014979615807533264, + "rewards/rejected": -0.03293009102344513, + "step": 1230 + }, + { + "epoch": 0.8545830461750517, + "grad_norm": 1.2280082702636719, + "learning_rate": 3.5397811069674256e-08, + "logits/chosen": -3.037538528442383, + "logits/rejected": -3.023832082748413, + "logps/chosen": -56.15264129638672, + "logps/rejected": -58.523162841796875, + "loss": 0.6886, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.02116158790886402, + "rewards/margins": 0.0096513070166111, + "rewards/rejected": -0.03081289492547512, + "step": 1240 + }, + { + "epoch": 0.8614748449345279, + "grad_norm": 1.3131028413772583, + "learning_rate": 3.512351283706419e-08, + "logits/chosen": -3.0145888328552246, + "logits/rejected": -3.0035436153411865, + "logps/chosen": -55.00154495239258, + "logps/rejected": -56.4818229675293, + "loss": 0.6892, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.02046709507703781, + "rewards/margins": 0.008517416194081306, + "rewards/rejected": -0.02898450754582882, + "step": 1250 + }, + { + "epoch": 0.8683666436940042, + "grad_norm": 1.2719794511795044, + "learning_rate": 3.484774901611753e-08, + "logits/chosen": -3.037191390991211, + "logits/rejected": -3.011030673980713, + "logps/chosen": -56.1077880859375, + "logps/rejected": -55.119110107421875, + "loss": 0.6874, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.020265808328986168, + "rewards/margins": 0.012135522440075874, + "rewards/rejected": -0.03240133076906204, + "step": 1260 + }, + { + "epoch": 0.8752584424534804, + "grad_norm": 1.223455786705017, + "learning_rate": 3.4570559529363756e-08, + "logits/chosen": -3.0510623455047607, + "logits/rejected": -3.0273430347442627, + "logps/chosen": -56.024391174316406, + "logps/rejected": -54.66645431518555, + "loss": 0.687, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.01942160725593567, + "rewards/margins": 0.012839846312999725, + "rewards/rejected": -0.032261453568935394, + "step": 1270 + }, + { + "epoch": 0.8821502412129566, + "grad_norm": 1.224640965461731, + "learning_rate": 3.429198450572702e-08, + "logits/chosen": -3.045257568359375, + "logits/rejected": -3.0113143920898438, + "logps/chosen": -57.24846267700195, + "logps/rejected": -55.42502975463867, + "loss": 0.6862, + "rewards/accuracies": 0.635937511920929, + "rewards/chosen": -0.019725024700164795, + "rewards/margins": 0.014653083868324757, + "rewards/rejected": -0.03437810391187668, + "step": 1280 + }, + { + "epoch": 0.8890420399724328, + "grad_norm": 1.1708803176879883, + "learning_rate": 3.401206427471665e-08, + "logits/chosen": -3.083014965057373, + "logits/rejected": -3.053872585296631, + "logps/chosen": -55.94194412231445, + "logps/rejected": -54.92655563354492, + "loss": 0.6859, + "rewards/accuracies": 0.6234375238418579, + "rewards/chosen": -0.019591879099607468, + "rewards/margins": 0.015089405700564384, + "rewards/rejected": -0.034681286662817, + "step": 1290 + }, + { + "epoch": 0.895933838731909, + "grad_norm": 1.2637726068496704, + "learning_rate": 3.3730839360588633e-08, + "logits/chosen": -3.0728800296783447, + "logits/rejected": -3.0561113357543945, + "logps/chosen": -55.9746208190918, + "logps/rejected": -56.79145431518555, + "loss": 0.6877, + "rewards/accuracies": 0.6078125238418579, + "rewards/chosen": -0.020700206980109215, + "rewards/margins": 0.011441068723797798, + "rewards/rejected": -0.03214127570390701, + "step": 1300 + }, + { + "epoch": 0.895933838731909, + "eval_logits/chosen": -3.12795090675354, + "eval_logits/rejected": -3.1222612857818604, + "eval_logps/chosen": -58.95671081542969, + "eval_logps/rejected": -63.90719223022461, + "eval_loss": 0.690849244594574, + "eval_rewards/accuracies": 0.578066885471344, + "eval_rewards/chosen": -0.002448198851197958, + "eval_rewards/margins": 0.004822447896003723, + "eval_rewards/rejected": -0.007270646747201681, + "eval_runtime": 383.2075, + "eval_samples_per_second": 11.232, + "eval_steps_per_second": 1.404, + "step": 1300 + }, + { + "epoch": 0.9028256374913852, + "grad_norm": 1.2189267873764038, + "learning_rate": 3.344835047647892e-08, + "logits/chosen": -3.038391351699829, + "logits/rejected": -3.0231399536132812, + "logps/chosen": -55.052391052246094, + "logps/rejected": -57.159202575683594, + "loss": 0.6869, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.01980864442884922, + "rewards/margins": 0.013179932720959187, + "rewards/rejected": -0.03298857808113098, + "step": 1310 + }, + { + "epoch": 0.9097174362508614, + "grad_norm": 1.2375820875167847, + "learning_rate": 3.316463851850925e-08, + "logits/chosen": -3.0543713569641113, + "logits/rejected": -3.0287716388702393, + "logps/chosen": -55.84870147705078, + "logps/rejected": -54.72203826904297, + "loss": 0.6873, + "rewards/accuracies": 0.620312511920929, + "rewards/chosen": -0.024270061403512955, + "rewards/margins": 0.012434338219463825, + "rewards/rejected": -0.036704398691654205, + "step": 1320 + }, + { + "epoch": 0.9166092350103378, + "grad_norm": 1.2331100702285767, + "learning_rate": 3.287974455986671e-08, + "logits/chosen": -3.0482242107391357, + "logits/rejected": -3.0168027877807617, + "logps/chosen": -58.51416015625, + "logps/rejected": -55.834266662597656, + "loss": 0.6859, + "rewards/accuracies": 0.6578124761581421, + "rewards/chosen": -0.018537839874625206, + "rewards/margins": 0.015199096873402596, + "rewards/rejected": -0.0337369367480278, + "step": 1330 + }, + { + "epoch": 0.923501033769814, + "grad_norm": 1.2201625108718872, + "learning_rate": 3.259370984485746e-08, + "logits/chosen": -3.0217771530151367, + "logits/rejected": -2.998465061187744, + "logps/chosen": -55.553428649902344, + "logps/rejected": -56.95924758911133, + "loss": 0.6867, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.022853773087263107, + "rewards/margins": 0.013605187647044659, + "rewards/rejected": -0.03645896166563034, + "step": 1340 + }, + { + "epoch": 0.9303928325292902, + "grad_norm": 1.2265971899032593, + "learning_rate": 3.2306575782935806e-08, + "logits/chosen": -3.043489456176758, + "logits/rejected": -3.013939380645752, + "logps/chosen": -56.77729415893555, + "logps/rejected": -56.281822204589844, + "loss": 0.6854, + "rewards/accuracies": 0.635937511920929, + "rewards/chosen": -0.020281706005334854, + "rewards/margins": 0.016187874600291252, + "rewards/rejected": -0.036469582468271255, + "step": 1350 + }, + { + "epoch": 0.9372846312887664, + "grad_norm": 1.245123267173767, + "learning_rate": 3.201838394270931e-08, + "logits/chosen": -3.064115524291992, + "logits/rejected": -3.0484519004821777, + "logps/chosen": -57.521820068359375, + "logps/rejected": -57.416893005371094, + "loss": 0.6874, + "rewards/accuracies": 0.6109374761581421, + "rewards/chosen": -0.024825390428304672, + "rewards/margins": 0.012256421148777008, + "rewards/rejected": -0.03708181157708168, + "step": 1360 + }, + { + "epoch": 0.9441764300482426, + "grad_norm": 1.2887479066848755, + "learning_rate": 3.172917604592084e-08, + "logits/chosen": -3.0228118896484375, + "logits/rejected": -3.0045721530914307, + "logps/chosen": -55.98120880126953, + "logps/rejected": -55.73798751831055, + "loss": 0.6871, + "rewards/accuracies": 0.598437488079071, + "rewards/chosen": -0.025501202791929245, + "rewards/margins": 0.012837904505431652, + "rewards/rejected": -0.03833910822868347, + "step": 1370 + }, + { + "epoch": 0.9510682288077188, + "grad_norm": 1.2927711009979248, + "learning_rate": 3.143899396140856e-08, + "logits/chosen": -3.03184175491333, + "logits/rejected": -3.010230779647827, + "logps/chosen": -56.847740173339844, + "logps/rejected": -56.31840133666992, + "loss": 0.6871, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.022407762706279755, + "rewards/margins": 0.01289152167737484, + "rewards/rejected": -0.035299282521009445, + "step": 1380 + }, + { + "epoch": 0.957960027567195, + "grad_norm": 1.2551859617233276, + "learning_rate": 3.114787969904446e-08, + "logits/chosen": -3.001943826675415, + "logits/rejected": -2.9838249683380127, + "logps/chosen": -56.46649169921875, + "logps/rejected": -57.01629638671875, + "loss": 0.6865, + "rewards/accuracies": 0.629687488079071, + "rewards/chosen": -0.022768724709749222, + "rewards/margins": 0.014040583744645119, + "rewards/rejected": -0.03680930659174919, + "step": 1390 + }, + { + "epoch": 0.9648518263266712, + "grad_norm": 1.1776050329208374, + "learning_rate": 3.085587540365262e-08, + "logits/chosen": -3.054063081741333, + "logits/rejected": -3.0365357398986816, + "logps/chosen": -55.7647819519043, + "logps/rejected": -59.496559143066406, + "loss": 0.6874, + "rewards/accuracies": 0.6078125238418579, + "rewards/chosen": -0.024700012058019638, + "rewards/margins": 0.01230792049318552, + "rewards/rejected": -0.03700793534517288, + "step": 1400 + }, + { + "epoch": 0.9648518263266712, + "eval_logits/chosen": -3.1261656284332275, + "eval_logits/rejected": -3.1204779148101807, + "eval_logps/chosen": -59.10846710205078, + "eval_logps/rejected": -64.10256958007812, + "eval_loss": 0.6906503438949585, + "eval_rewards/accuracies": 0.5771375298500061, + "eval_rewards/chosen": -0.003965714480727911, + "eval_rewards/margins": 0.0052587250247597694, + "eval_rewards/rejected": -0.009224439039826393, + "eval_runtime": 383.1495, + "eval_samples_per_second": 11.233, + "eval_steps_per_second": 1.404, + "step": 1400 + }, + { + "epoch": 0.9717436250861475, + "grad_norm": 1.347545862197876, + "learning_rate": 3.056302334890786e-08, + "logits/chosen": -3.0551466941833496, + "logits/rejected": -3.0341668128967285, + "logps/chosen": -56.55133056640625, + "logps/rejected": -57.29961395263672, + "loss": 0.6861, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.02232871949672699, + "rewards/margins": 0.014838054776191711, + "rewards/rejected": -0.037166766822338104, + "step": 1410 + }, + { + "epoch": 0.9786354238456237, + "grad_norm": 1.2241698503494263, + "learning_rate": 3.02693659312157e-08, + "logits/chosen": -2.9941155910491943, + "logits/rejected": -2.9760937690734863, + "logps/chosen": -55.75859451293945, + "logps/rejected": -55.846839904785156, + "loss": 0.6873, + "rewards/accuracies": 0.598437488079071, + "rewards/chosen": -0.02684764564037323, + "rewards/margins": 0.012402022257447243, + "rewards/rejected": -0.039249666035175323, + "step": 1420 + }, + { + "epoch": 0.9855272226051, + "grad_norm": 1.3626172542572021, + "learning_rate": 2.9974945663574684e-08, + "logits/chosen": -3.026280403137207, + "logits/rejected": -3.0005555152893066, + "logps/chosen": -57.986793518066406, + "logps/rejected": -56.02061080932617, + "loss": 0.684, + "rewards/accuracies": 0.645312488079071, + "rewards/chosen": -0.02259395457804203, + "rewards/margins": 0.019132796674966812, + "rewards/rejected": -0.041726745665073395, + "step": 1430 + }, + { + "epoch": 0.9924190213645762, + "grad_norm": 1.289384126663208, + "learning_rate": 2.967980516942168e-08, + "logits/chosen": -3.0657455921173096, + "logits/rejected": -3.040670394897461, + "logps/chosen": -58.47277069091797, + "logps/rejected": -57.19081497192383, + "loss": 0.6851, + "rewards/accuracies": 0.6234375238418579, + "rewards/chosen": -0.024709826335310936, + "rewards/margins": 0.016781199723482132, + "rewards/rejected": -0.04149102419614792, + "step": 1440 + }, + { + "epoch": 0.9993108201240524, + "grad_norm": 1.288656234741211, + "learning_rate": 2.9383987176461268e-08, + "logits/chosen": -2.991293430328369, + "logits/rejected": -2.9657962322235107, + "logps/chosen": -54.06468963623047, + "logps/rejected": -57.25890350341797, + "loss": 0.6871, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.024288879707455635, + "rewards/margins": 0.012961057014763355, + "rewards/rejected": -0.037249937653541565, + "step": 1450 + }, + { + "epoch": 1.0062026188835287, + "grad_norm": 1.3280855417251587, + "learning_rate": 2.9087534510480032e-08, + "logits/chosen": -3.045292377471924, + "logits/rejected": -3.0192904472351074, + "logps/chosen": -55.768096923828125, + "logps/rejected": -56.869842529296875, + "loss": 0.6848, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.02522682026028633, + "rewards/margins": 0.017563799396157265, + "rewards/rejected": -0.04279061779379845, + "step": 1460 + }, + { + "epoch": 1.0130944176430048, + "grad_norm": 1.2552244663238525, + "learning_rate": 2.879049008914664e-08, + "logits/chosen": -3.015263080596924, + "logits/rejected": -2.999004602432251, + "logps/chosen": -55.86402130126953, + "logps/rejected": -58.1766471862793, + "loss": 0.6849, + "rewards/accuracies": 0.645312488079071, + "rewards/chosen": -0.024612870067358017, + "rewards/margins": 0.01742735505104065, + "rewards/rejected": -0.04204022139310837, + "step": 1470 + }, + { + "epoch": 1.019986216402481, + "grad_norm": 1.2557737827301025, + "learning_rate": 2.8492896915798605e-08, + "logits/chosen": -3.021721601486206, + "logits/rejected": -3.0094776153564453, + "logps/chosen": -56.1937141418457, + "logps/rejected": -59.952537536621094, + "loss": 0.6864, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.024868253618478775, + "rewards/margins": 0.014400708489120007, + "rewards/rejected": -0.039268963038921356, + "step": 1480 + }, + { + "epoch": 1.0268780151619572, + "grad_norm": 1.3632835149765015, + "learning_rate": 2.8194798073216665e-08, + "logits/chosen": -2.9897549152374268, + "logits/rejected": -2.9639222621917725, + "logps/chosen": -56.546180725097656, + "logps/rejected": -56.685096740722656, + "loss": 0.6847, + "rewards/accuracies": 0.635937511920929, + "rewards/chosen": -0.024227097630500793, + "rewards/margins": 0.017609497532248497, + "rewards/rejected": -0.04183660075068474, + "step": 1490 + }, + { + "epoch": 1.0337698139214335, + "grad_norm": 1.359270691871643, + "learning_rate": 2.7896236717387662e-08, + "logits/chosen": -2.9973204135894775, + "logits/rejected": -2.979785680770874, + "logps/chosen": -55.88494110107422, + "logps/rejected": -56.941490173339844, + "loss": 0.6871, + "rewards/accuracies": 0.614062488079071, + "rewards/chosen": -0.029738834127783775, + "rewards/margins": 0.012930555269122124, + "rewards/rejected": -0.0426693893969059, + "step": 1500 + }, + { + "epoch": 1.0337698139214335, + "eval_logits/chosen": -3.121001720428467, + "eval_logits/rejected": -3.1152734756469727, + "eval_logps/chosen": -59.26029968261719, + "eval_logps/rejected": -64.31062316894531, + "eval_loss": 0.6903930306434631, + "eval_rewards/accuracies": 0.5824813842773438, + "eval_rewards/chosen": -0.005484058987349272, + "eval_rewards/margins": 0.005820916499942541, + "eval_rewards/rejected": -0.011304975487291813, + "eval_runtime": 383.1818, + "eval_samples_per_second": 11.232, + "eval_steps_per_second": 1.404, + "step": 1500 + }, + { + "epoch": 1.0406616126809096, + "grad_norm": 1.1823673248291016, + "learning_rate": 2.7597256071256836e-08, + "logits/chosen": -3.0232627391815186, + "logits/rejected": -3.002992630004883, + "logps/chosen": -55.69109344482422, + "logps/rejected": -55.20794677734375, + "loss": 0.6864, + "rewards/accuracies": 0.6078125238418579, + "rewards/chosen": -0.029489045962691307, + "rewards/margins": 0.014345052652060986, + "rewards/rejected": -0.04383409768342972, + "step": 1510 + }, + { + "epoch": 1.047553411440386, + "grad_norm": 1.2642569541931152, + "learning_rate": 2.7297899418470372e-08, + "logits/chosen": -2.9904372692108154, + "logits/rejected": -2.9669933319091797, + "logps/chosen": -59.17595672607422, + "logps/rejected": -57.17033767700195, + "loss": 0.685, + "rewards/accuracies": 0.651562511920929, + "rewards/chosen": -0.027122503146529198, + "rewards/margins": 0.017171606421470642, + "rewards/rejected": -0.04429411143064499, + "step": 1520 + }, + { + "epoch": 1.0544452101998623, + "grad_norm": 1.3126106262207031, + "learning_rate": 2.6998210097109213e-08, + "logits/chosen": -3.062737226486206, + "logits/rejected": -3.041637659072876, + "logps/chosen": -55.9976806640625, + "logps/rejected": -57.3626823425293, + "loss": 0.6843, + "rewards/accuracies": 0.6546875238418579, + "rewards/chosen": -0.0244468804448843, + "rewards/margins": 0.018709514290094376, + "rewards/rejected": -0.043156400322914124, + "step": 1530 + }, + { + "epoch": 1.0613370089593384, + "grad_norm": 1.1926969289779663, + "learning_rate": 2.669823149341498e-08, + "logits/chosen": -3.0017895698547363, + "logits/rejected": -2.9859423637390137, + "logps/chosen": -55.688560485839844, + "logps/rejected": -56.54026412963867, + "loss": 0.6862, + "rewards/accuracies": 0.620312511920929, + "rewards/chosen": -0.02824980393052101, + "rewards/margins": 0.01474563218653202, + "rewards/rejected": -0.04299543425440788, + "step": 1540 + }, + { + "epoch": 1.0682288077188147, + "grad_norm": 1.2355592250823975, + "learning_rate": 2.6398007035508906e-08, + "logits/chosen": -3.020545482635498, + "logits/rejected": -2.9991073608398438, + "logps/chosen": -60.19884490966797, + "logps/rejected": -58.834068298339844, + "loss": 0.6847, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.02622285485267639, + "rewards/margins": 0.01788989268243313, + "rewards/rejected": -0.04411274939775467, + "step": 1550 + }, + { + "epoch": 1.0751206064782908, + "grad_norm": 1.2842044830322266, + "learning_rate": 2.609758018710473e-08, + "logits/chosen": -3.0513670444488525, + "logits/rejected": -3.0258631706237793, + "logps/chosen": -58.162193298339844, + "logps/rejected": -58.37177276611328, + "loss": 0.6845, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.026162385940551758, + "rewards/margins": 0.01831069216132164, + "rewards/rejected": -0.0444730743765831, + "step": 1560 + }, + { + "epoch": 1.082012405237767, + "grad_norm": 1.2734873294830322, + "learning_rate": 2.5796994441216392e-08, + "logits/chosen": -3.025871753692627, + "logits/rejected": -3.008380174636841, + "logps/chosen": -57.0385856628418, + "logps/rejected": -57.39351272583008, + "loss": 0.6851, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.027973037213087082, + "rewards/margins": 0.017069904133677483, + "rewards/rejected": -0.045042943209409714, + "step": 1570 + }, + { + "epoch": 1.0889042039972432, + "grad_norm": 1.2615596055984497, + "learning_rate": 2.5496293313861533e-08, + "logits/chosen": -3.0057692527770996, + "logits/rejected": -2.9775753021240234, + "logps/chosen": -55.155418395996094, + "logps/rejected": -56.9516716003418, + "loss": 0.685, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02907036617398262, + "rewards/margins": 0.01720438338816166, + "rewards/rejected": -0.04627475142478943, + "step": 1580 + }, + { + "epoch": 1.0957960027567195, + "grad_norm": 1.260473370552063, + "learning_rate": 2.519552033776168e-08, + "logits/chosen": -2.992969512939453, + "logits/rejected": -2.9799506664276123, + "logps/chosen": -57.432411193847656, + "logps/rejected": -58.8470458984375, + "loss": 0.6859, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.029672112315893173, + "rewards/margins": 0.015414416790008545, + "rewards/rejected": -0.04508653283119202, + "step": 1590 + }, + { + "epoch": 1.1026878015161956, + "grad_norm": 1.3105090856552124, + "learning_rate": 2.4894719056039933e-08, + "logits/chosen": -3.059690237045288, + "logits/rejected": -3.0416653156280518, + "logps/chosen": -57.756431579589844, + "logps/rejected": -58.930335998535156, + "loss": 0.6863, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.02842426858842373, + "rewards/margins": 0.014618475921452045, + "rewards/rejected": -0.04304274171590805, + "step": 1600 + }, + { + "epoch": 1.1026878015161956, + "eval_logits/chosen": -3.117943286895752, + "eval_logits/rejected": -3.112224817276001, + "eval_logps/chosen": -59.45923614501953, + "eval_logps/rejected": -64.5576171875, + "eval_loss": 0.6901748180389404, + "eval_rewards/accuracies": 0.5887546539306641, + "eval_rewards/chosen": -0.007473426405340433, + "eval_rewards/margins": 0.00630148034542799, + "eval_rewards/rejected": -0.013774906285107136, + "eval_runtime": 383.3589, + "eval_samples_per_second": 11.227, + "eval_steps_per_second": 1.403, + "step": 1600 + }, + { + "epoch": 1.109579600275672, + "grad_norm": 1.2811577320098877, + "learning_rate": 2.459393301591723e-08, + "logits/chosen": -3.0472471714019775, + "logits/rejected": -3.0280234813690186, + "logps/chosen": -56.1804313659668, + "logps/rejected": -56.194740295410156, + "loss": 0.6864, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.029790574684739113, + "rewards/margins": 0.014425704255700111, + "rewards/rejected": -0.044216278940439224, + "step": 1610 + }, + { + "epoch": 1.1164713990351482, + "grad_norm": 1.2719690799713135, + "learning_rate": 2.429320576240796e-08, + "logits/chosen": -2.983424186706543, + "logits/rejected": -2.960758686065674, + "logps/chosen": -57.0593147277832, + "logps/rejected": -57.68733596801758, + "loss": 0.684, + "rewards/accuracies": 0.6703125238418579, + "rewards/chosen": -0.0267618540674448, + "rewards/margins": 0.01918993890285492, + "rewards/rejected": -0.04595179110765457, + "step": 1620 + }, + { + "epoch": 1.1233631977946243, + "grad_norm": 1.274936556816101, + "learning_rate": 2.3992580832015937e-08, + "logits/chosen": -3.0748069286346436, + "logits/rejected": -3.0516257286071777, + "logps/chosen": -57.92633056640625, + "logps/rejected": -57.392669677734375, + "loss": 0.6851, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.028706436976790428, + "rewards/margins": 0.016950782388448715, + "rewards/rejected": -0.04565722122788429, + "step": 1630 + }, + { + "epoch": 1.1302549965541007, + "grad_norm": 1.3350669145584106, + "learning_rate": 2.3692101746431582e-08, + "logits/chosen": -2.999372720718384, + "logits/rejected": -2.9686479568481445, + "logps/chosen": -57.523155212402344, + "logps/rejected": -57.58971405029297, + "loss": 0.6842, + "rewards/accuracies": 0.645312488079071, + "rewards/chosen": -0.0288604237139225, + "rewards/margins": 0.018762212246656418, + "rewards/rejected": -0.04762263223528862, + "step": 1640 + }, + { + "epoch": 1.1371467953135768, + "grad_norm": 1.1888097524642944, + "learning_rate": 2.3391812006231252e-08, + "logits/chosen": -3.051567792892456, + "logits/rejected": -3.024486780166626, + "logps/chosen": -57.35553741455078, + "logps/rejected": -57.0074462890625, + "loss": 0.6863, + "rewards/accuracies": 0.5921875238418579, + "rewards/chosen": -0.030893787741661072, + "rewards/margins": 0.01476077176630497, + "rewards/rejected": -0.04565456137061119, + "step": 1650 + }, + { + "epoch": 1.144038594073053, + "grad_norm": 1.1799283027648926, + "learning_rate": 2.3091755084579655e-08, + "logits/chosen": -3.032055377960205, + "logits/rejected": -3.004883289337158, + "logps/chosen": -56.0220832824707, + "logps/rejected": -55.85259246826172, + "loss": 0.6844, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.033284805715084076, + "rewards/margins": 0.018482720479369164, + "rewards/rejected": -0.05176752805709839, + "step": 1660 + }, + { + "epoch": 1.1509303928325294, + "grad_norm": 1.3657642602920532, + "learning_rate": 2.2791974420936168e-08, + "logits/chosen": -3.0568909645080566, + "logits/rejected": -3.0396854877471924, + "logps/chosen": -55.1065673828125, + "logps/rejected": -58.755226135253906, + "loss": 0.6845, + "rewards/accuracies": 0.6390625238418579, + "rewards/chosen": -0.03204982727766037, + "rewards/margins": 0.018315287306904793, + "rewards/rejected": -0.05036511272192001, + "step": 1670 + }, + { + "epoch": 1.1578221915920055, + "grad_norm": 1.238609790802002, + "learning_rate": 2.2492513414766092e-08, + "logits/chosen": -3.015735626220703, + "logits/rejected": -2.9980287551879883, + "logps/chosen": -58.169593811035156, + "logps/rejected": -59.08977127075195, + "loss": 0.6831, + "rewards/accuracies": 0.6390625238418579, + "rewards/chosen": -0.027626004070043564, + "rewards/margins": 0.0211968831717968, + "rewards/rejected": -0.04882288724184036, + "step": 1680 + }, + { + "epoch": 1.1647139903514818, + "grad_norm": 1.2068655490875244, + "learning_rate": 2.2193415419257697e-08, + "logits/chosen": -3.023995876312256, + "logits/rejected": -3.0071871280670166, + "logps/chosen": -57.2905158996582, + "logps/rejected": -58.23944854736328, + "loss": 0.6866, + "rewards/accuracies": 0.5953124761581421, + "rewards/chosen": -0.03379104658961296, + "rewards/margins": 0.014122662134468555, + "rewards/rejected": -0.04791371151804924, + "step": 1690 + }, + { + "epoch": 1.171605789110958, + "grad_norm": 1.2340092658996582, + "learning_rate": 2.1894723735045923e-08, + "logits/chosen": -3.015665054321289, + "logits/rejected": -2.995542526245117, + "logps/chosen": -56.78801727294922, + "logps/rejected": -58.15932083129883, + "loss": 0.6854, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.034682370722293854, + "rewards/margins": 0.01648074761033058, + "rewards/rejected": -0.051163118332624435, + "step": 1700 + }, + { + "epoch": 1.171605789110958, + "eval_logits/chosen": -3.11429500579834, + "eval_logits/rejected": -3.1086244583129883, + "eval_logps/chosen": -59.66813278198242, + "eval_logps/rejected": -64.80902099609375, + "eval_loss": 0.6899911165237427, + "eval_rewards/accuracies": 0.586663544178009, + "eval_rewards/chosen": -0.009562356397509575, + "eval_rewards/margins": 0.006726610474288464, + "eval_rewards/rejected": -0.016288965940475464, + "eval_runtime": 383.0366, + "eval_samples_per_second": 11.237, + "eval_steps_per_second": 1.405, + "step": 1700 + }, + { + "epoch": 1.1784975878704342, + "grad_norm": 1.361463189125061, + "learning_rate": 2.159648160394373e-08, + "logits/chosen": -3.077711582183838, + "logits/rejected": -3.057525396347046, + "logps/chosen": -58.2719612121582, + "logps/rejected": -58.065284729003906, + "loss": 0.6842, + "rewards/accuracies": 0.629687488079071, + "rewards/chosen": -0.028816580772399902, + "rewards/margins": 0.018897056579589844, + "rewards/rejected": -0.047713637351989746, + "step": 1710 + }, + { + "epoch": 1.1853893866299103, + "grad_norm": 1.315765619277954, + "learning_rate": 2.1298732202681956e-08, + "logits/chosen": -2.9981892108917236, + "logits/rejected": -2.9703431129455566, + "logps/chosen": -57.9752311706543, + "logps/rejected": -57.5850944519043, + "loss": 0.6835, + "rewards/accuracies": 0.6234375238418579, + "rewards/chosen": -0.03318381309509277, + "rewards/margins": 0.02043316699564457, + "rewards/rejected": -0.053616978228092194, + "step": 1720 + }, + { + "epoch": 1.1922811853893867, + "grad_norm": 1.2448103427886963, + "learning_rate": 2.1001518636658567e-08, + "logits/chosen": -3.0659806728363037, + "logits/rejected": -3.0397450923919678, + "logps/chosen": -57.8054313659668, + "logps/rejected": -58.7362174987793, + "loss": 0.684, + "rewards/accuracies": 0.6390625238418579, + "rewards/chosen": -0.030917003750801086, + "rewards/margins": 0.019438743591308594, + "rewards/rejected": -0.05035575106739998, + "step": 1730 + }, + { + "epoch": 1.1991729841488628, + "grad_norm": 1.3165340423583984, + "learning_rate": 2.0704883933698286e-08, + "logits/chosen": -3.0220000743865967, + "logits/rejected": -2.988614559173584, + "logps/chosen": -59.221092224121094, + "logps/rejected": -56.499412536621094, + "loss": 0.6833, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.03028092160820961, + "rewards/margins": 0.020808402448892593, + "rewards/rejected": -0.0510893277823925, + "step": 1740 + }, + { + "epoch": 1.206064782908339, + "grad_norm": 1.2597706317901611, + "learning_rate": 2.0408871037823392e-08, + "logits/chosen": -3.050657033920288, + "logits/rejected": -3.0249600410461426, + "logps/chosen": -58.73793411254883, + "logps/rejected": -58.411109924316406, + "loss": 0.6832, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.032266996800899506, + "rewards/margins": 0.020968910306692123, + "rewards/rejected": -0.05323590710759163, + "step": 1750 + }, + { + "epoch": 1.2129565816678154, + "grad_norm": 1.3488494157791138, + "learning_rate": 2.0113522803036697e-08, + "logits/chosen": -3.0145840644836426, + "logits/rejected": -2.9919447898864746, + "logps/chosen": -57.56928253173828, + "logps/rejected": -59.618064880371094, + "loss": 0.6833, + "rewards/accuracies": 0.614062488079071, + "rewards/chosen": -0.033350322395563126, + "rewards/margins": 0.020749535411596298, + "rewards/rejected": -0.054099857807159424, + "step": 1760 + }, + { + "epoch": 1.2198483804272915, + "grad_norm": 1.2959116697311401, + "learning_rate": 1.981888198711757e-08, + "logits/chosen": -3.0438103675842285, + "logits/rejected": -3.0201594829559326, + "logps/chosen": -56.78424072265625, + "logps/rejected": -58.80998992919922, + "loss": 0.6845, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.03599938377737999, + "rewards/margins": 0.018340986222028732, + "rewards/rejected": -0.05434036999940872, + "step": 1770 + }, + { + "epoch": 1.2267401791867678, + "grad_norm": 1.3248341083526611, + "learning_rate": 1.952499124543181e-08, + "logits/chosen": -3.050183057785034, + "logits/rejected": -3.0215609073638916, + "logps/chosen": -59.94126510620117, + "logps/rejected": -58.557289123535156, + "loss": 0.6838, + "rewards/accuracies": 0.614062488079071, + "rewards/chosen": -0.03428710997104645, + "rewards/margins": 0.019783692434430122, + "rewards/rejected": -0.05407080054283142, + "step": 1780 + }, + { + "epoch": 1.233631977946244, + "grad_norm": 1.2419155836105347, + "learning_rate": 1.923189312475642e-08, + "logits/chosen": -3.0126285552978516, + "logits/rejected": -2.989089250564575, + "logps/chosen": -57.96059036254883, + "logps/rejected": -58.968994140625, + "loss": 0.6844, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.035335466265678406, + "rewards/margins": 0.01862800493836403, + "rewards/rejected": -0.05396346375346184, + "step": 1790 + }, + { + "epoch": 1.2405237767057202, + "grad_norm": 1.3300213813781738, + "learning_rate": 1.8939630057120098e-08, + "logits/chosen": -3.000619888305664, + "logits/rejected": -2.9799740314483643, + "logps/chosen": -58.02915573120117, + "logps/rejected": -60.03089141845703, + "loss": 0.6855, + "rewards/accuracies": 0.614062488079071, + "rewards/chosen": -0.03870057314634323, + "rewards/margins": 0.01641600951552391, + "rewards/rejected": -0.05511658638715744, + "step": 1800 + }, + { + "epoch": 1.2405237767057202, + "eval_logits/chosen": -3.11264705657959, + "eval_logits/rejected": -3.1069631576538086, + "eval_logps/chosen": -59.91139221191406, + "eval_logps/rejected": -65.09736633300781, + "eval_loss": 0.6897966265678406, + "eval_rewards/accuracies": 0.582713782787323, + "eval_rewards/chosen": -0.01199500635266304, + "eval_rewards/margins": 0.007177378050982952, + "eval_rewards/rejected": -0.019172383472323418, + "eval_runtime": 383.0708, + "eval_samples_per_second": 11.236, + "eval_steps_per_second": 1.404, + "step": 1800 + }, + { + "epoch": 1.2474155754651963, + "grad_norm": 1.2643280029296875, + "learning_rate": 1.8648244353660288e-08, + "logits/chosen": -3.0149238109588623, + "logits/rejected": -2.9926235675811768, + "logps/chosen": -59.53852081298828, + "logps/rejected": -58.4305419921875, + "loss": 0.6858, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.03731811046600342, + "rewards/margins": 0.015873271971940994, + "rewards/rejected": -0.05319138243794441, + "step": 1810 + }, + { + "epoch": 1.2543073742246726, + "grad_norm": 1.353582501411438, + "learning_rate": 1.835777819849779e-08, + "logits/chosen": -3.084817886352539, + "logits/rejected": -3.052018880844116, + "logps/chosen": -58.27227783203125, + "logps/rejected": -58.49510955810547, + "loss": 0.6821, + "rewards/accuracies": 0.676562488079071, + "rewards/chosen": -0.034875739365816116, + "rewards/margins": 0.023144185543060303, + "rewards/rejected": -0.05801992490887642, + "step": 1820 + }, + { + "epoch": 1.2611991729841487, + "grad_norm": 1.3098019361495972, + "learning_rate": 1.806827364262974e-08, + "logits/chosen": -2.970393657684326, + "logits/rejected": -2.9504239559173584, + "logps/chosen": -57.546607971191406, + "logps/rejected": -58.0165901184082, + "loss": 0.687, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0393044538795948, + "rewards/margins": 0.013580908067524433, + "rewards/rejected": -0.05288536101579666, + "step": 1830 + }, + { + "epoch": 1.268090971743625, + "grad_norm": 1.2913509607315063, + "learning_rate": 1.7779772597841818e-08, + "logits/chosen": -3.0347402095794678, + "logits/rejected": -3.008613109588623, + "logps/chosen": -58.36812210083008, + "logps/rejected": -58.02600860595703, + "loss": 0.6827, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.03452888876199722, + "rewards/margins": 0.022128187119960785, + "rewards/rejected": -0.05665707588195801, + "step": 1840 + }, + { + "epoch": 1.2749827705031014, + "grad_norm": 1.252109169960022, + "learning_rate": 1.749231683064069e-08, + "logits/chosen": -2.9613466262817383, + "logits/rejected": -2.9358131885528564, + "logps/chosen": -57.309776306152344, + "logps/rejected": -57.78460693359375, + "loss": 0.6849, + "rewards/accuracies": 0.635937511920929, + "rewards/chosen": -0.03900003433227539, + "rewards/margins": 0.017551228404045105, + "rewards/rejected": -0.056551266461610794, + "step": 1850 + }, + { + "epoch": 1.2818745692625775, + "grad_norm": 1.3581938743591309, + "learning_rate": 1.7205947956207416e-08, + "logits/chosen": -2.9560749530792236, + "logits/rejected": -2.9271953105926514, + "logps/chosen": -58.55373001098633, + "logps/rejected": -58.74352264404297, + "loss": 0.6828, + "rewards/accuracies": 0.6265624761581421, + "rewards/chosen": -0.03720385953783989, + "rewards/margins": 0.021904241293668747, + "rewards/rejected": -0.059108100831508636, + "step": 1860 + }, + { + "epoch": 1.2887663680220538, + "grad_norm": 1.377907395362854, + "learning_rate": 1.69207074323728e-08, + "logits/chosen": -3.007751703262329, + "logits/rejected": -2.990427255630493, + "logps/chosen": -57.58440017700195, + "logps/rejected": -57.02080154418945, + "loss": 0.685, + "rewards/accuracies": 0.598437488079071, + "rewards/chosen": -0.0374065637588501, + "rewards/margins": 0.01753416657447815, + "rewards/rejected": -0.05494073033332825, + "step": 1870 + }, + { + "epoch": 1.29565816678153, + "grad_norm": 1.3684296607971191, + "learning_rate": 1.6636636553615502e-08, + "logits/chosen": -2.969104290008545, + "logits/rejected": -2.9459190368652344, + "logps/chosen": -57.26690673828125, + "logps/rejected": -58.23255157470703, + "loss": 0.6839, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.03711014613509178, + "rewards/margins": 0.019704418256878853, + "rewards/rejected": -0.056814562529325485, + "step": 1880 + }, + { + "epoch": 1.3025499655410062, + "grad_norm": 1.2850284576416016, + "learning_rate": 1.6353776445083815e-08, + "logits/chosen": -3.0240912437438965, + "logits/rejected": -3.0120110511779785, + "logps/chosen": -55.608795166015625, + "logps/rejected": -58.87140655517578, + "loss": 0.6839, + "rewards/accuracies": 0.6234375238418579, + "rewards/chosen": -0.03577902913093567, + "rewards/margins": 0.019716601818799973, + "rewards/rejected": -0.05549562722444534, + "step": 1890 + }, + { + "epoch": 1.3094417643004825, + "grad_norm": 1.2016737461090088, + "learning_rate": 1.6072168056641944e-08, + "logits/chosen": -3.0512993335723877, + "logits/rejected": -3.0232746601104736, + "logps/chosen": -59.6502571105957, + "logps/rejected": -57.75080490112305, + "loss": 0.6824, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.03732553869485855, + "rewards/margins": 0.02288132533431053, + "rewards/rejected": -0.06020686775445938, + "step": 1900 + }, + { + "epoch": 1.3094417643004825, + "eval_logits/chosen": -3.109053134918213, + "eval_logits/rejected": -3.1033873558044434, + "eval_logps/chosen": -60.10012435913086, + "eval_logps/rejected": -65.30886840820312, + "eval_loss": 0.6897038221359253, + "eval_rewards/accuracies": 0.5824813842773438, + "eval_rewards/chosen": -0.013882317580282688, + "eval_rewards/margins": 0.007405092474073172, + "eval_rewards/rejected": -0.02128741145133972, + "eval_runtime": 383.365, + "eval_samples_per_second": 11.227, + "eval_steps_per_second": 1.403, + "step": 1900 + }, + { + "epoch": 1.3163335630599586, + "grad_norm": 1.3704478740692139, + "learning_rate": 1.5791852156941672e-08, + "logits/chosen": -2.9737439155578613, + "logits/rejected": -2.9562785625457764, + "logps/chosen": -58.3253173828125, + "logps/rejected": -58.146751403808594, + "loss": 0.685, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.03781484067440033, + "rewards/margins": 0.017499810084700584, + "rewards/rejected": -0.05531464144587517, + "step": 1910 + }, + { + "epoch": 1.323225361819435, + "grad_norm": 1.304793119430542, + "learning_rate": 1.5512869327520234e-08, + "logits/chosen": -3.0345849990844727, + "logits/rejected": -3.006624937057495, + "logps/chosen": -59.780555725097656, + "logps/rejected": -59.3910026550293, + "loss": 0.6828, + "rewards/accuracies": 0.6390625238418579, + "rewards/chosen": -0.038787275552749634, + "rewards/margins": 0.022125843912363052, + "rewards/rejected": -0.060913123190402985, + "step": 1920 + }, + { + "epoch": 1.330117160578911, + "grad_norm": 1.281746506690979, + "learning_rate": 1.52352599569253e-08, + "logits/chosen": -3.0547759532928467, + "logits/rejected": -3.0221850872039795, + "logps/chosen": -57.975791931152344, + "logps/rejected": -56.89446258544922, + "loss": 0.6843, + "rewards/accuracies": 0.629687488079071, + "rewards/chosen": -0.03873300552368164, + "rewards/margins": 0.018973171710968018, + "rewards/rejected": -0.05770616978406906, + "step": 1930 + }, + { + "epoch": 1.3370089593383874, + "grad_norm": 1.3232277631759644, + "learning_rate": 1.4959064234867925e-08, + "logits/chosen": -2.9585065841674805, + "logits/rejected": -2.936213970184326, + "logps/chosen": -56.48392868041992, + "logps/rejected": -58.73712158203125, + "loss": 0.6824, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.03964737430214882, + "rewards/margins": 0.022885087877511978, + "rewards/rejected": -0.0625324696302414, + "step": 1940 + }, + { + "epoch": 1.3439007580978635, + "grad_norm": 1.266557216644287, + "learning_rate": 1.4684322146404215e-08, + "logits/chosen": -3.035268783569336, + "logits/rejected": -3.01952862739563, + "logps/chosen": -56.704620361328125, + "logps/rejected": -59.3856086730957, + "loss": 0.6851, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.04403103142976761, + "rewards/margins": 0.017256928607821465, + "rewards/rejected": -0.061287958174943924, + "step": 1950 + }, + { + "epoch": 1.3507925568573398, + "grad_norm": 1.2548446655273438, + "learning_rate": 1.4411073466146698e-08, + "logits/chosen": -3.0059549808502197, + "logits/rejected": -2.9832520484924316, + "logps/chosen": -58.505836486816406, + "logps/rejected": -60.24690628051758, + "loss": 0.6838, + "rewards/accuracies": 0.629687488079071, + "rewards/chosen": -0.03818178176879883, + "rewards/margins": 0.01996403932571411, + "rewards/rejected": -0.058145828545093536, + "step": 1960 + }, + { + "epoch": 1.3576843556168159, + "grad_norm": 1.3594894409179688, + "learning_rate": 1.413935775250609e-08, + "logits/chosen": -2.9915037155151367, + "logits/rejected": -2.9657304286956787, + "logps/chosen": -58.22015380859375, + "logps/rejected": -58.225196838378906, + "loss": 0.6821, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.03904888778924942, + "rewards/margins": 0.023439262062311172, + "rewards/rejected": -0.06248814985156059, + "step": 1970 + }, + { + "epoch": 1.3645761543762922, + "grad_norm": 1.305829644203186, + "learning_rate": 1.3869214341964411e-08, + "logits/chosen": -2.9901621341705322, + "logits/rejected": -2.972623825073242, + "logps/chosen": -58.67400360107422, + "logps/rejected": -59.30767059326172, + "loss": 0.6854, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0412491112947464, + "rewards/margins": 0.01679963245987892, + "rewards/rejected": -0.058048736304044724, + "step": 1980 + }, + { + "epoch": 1.3714679531357685, + "grad_norm": 1.2267251014709473, + "learning_rate": 1.3600682343380247e-08, + "logits/chosen": -2.953930616378784, + "logits/rejected": -2.9335622787475586, + "logps/chosen": -58.66155242919922, + "logps/rejected": -59.33677291870117, + "loss": 0.6835, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.039415620267391205, + "rewards/margins": 0.020520631223917007, + "rewards/rejected": -0.05993625521659851, + "step": 1990 + }, + { + "epoch": 1.3783597518952446, + "grad_norm": 1.2556020021438599, + "learning_rate": 1.3333800632326858e-08, + "logits/chosen": -3.0334630012512207, + "logits/rejected": -3.0166397094726562, + "logps/chosen": -58.55223846435547, + "logps/rejected": -59.941978454589844, + "loss": 0.6851, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.0418681763112545, + "rewards/margins": 0.017354335635900497, + "rewards/rejected": -0.0592225082218647, + "step": 2000 + }, + { + "epoch": 1.3783597518952446, + "eval_logits/chosen": -3.1071271896362305, + "eval_logits/rejected": -3.101414680480957, + "eval_logps/chosen": -60.261566162109375, + "eval_logps/rejected": -65.51657104492188, + "eval_loss": 0.6894936561584473, + "eval_rewards/accuracies": 0.5906133651733398, + "eval_rewards/chosen": -0.015496725216507912, + "eval_rewards/margins": 0.007867763750255108, + "eval_rewards/rejected": -0.023364488035440445, + "eval_runtime": 383.0695, + "eval_samples_per_second": 11.236, + "eval_steps_per_second": 1.404, + "step": 2000 + }, + { + "epoch": 1.385251550654721, + "grad_norm": 1.4159228801727295, + "learning_rate": 1.3068607845464202e-08, + "logits/chosen": -2.9797048568725586, + "logits/rejected": -2.952303171157837, + "logps/chosen": -59.8831901550293, + "logps/rejected": -59.22021484375, + "loss": 0.6842, + "rewards/accuracies": 0.5843750238418579, + "rewards/chosen": -0.041412778198719025, + "rewards/margins": 0.0192607082426548, + "rewards/rejected": -0.06067349389195442, + "step": 2010 + }, + { + "epoch": 1.392143349414197, + "grad_norm": 1.3155369758605957, + "learning_rate": 1.2805142374945437e-08, + "logits/chosen": -3.0014488697052, + "logits/rejected": -2.978201389312744, + "logps/chosen": -57.95537185668945, + "logps/rejected": -59.4213752746582, + "loss": 0.6827, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.03732657432556152, + "rewards/margins": 0.022235842421650887, + "rewards/rejected": -0.05956241488456726, + "step": 2020 + }, + { + "epoch": 1.3990351481736734, + "grad_norm": 1.2982782125473022, + "learning_rate": 1.254344236285888e-08, + "logits/chosen": -2.984819173812866, + "logits/rejected": -2.9616565704345703, + "logps/chosen": -58.2531623840332, + "logps/rejected": -59.219261169433594, + "loss": 0.684, + "rewards/accuracies": 0.629687488079071, + "rewards/chosen": -0.039121102541685104, + "rewards/margins": 0.019388314336538315, + "rewards/rejected": -0.05850941687822342, + "step": 2030 + }, + { + "epoch": 1.4059269469331497, + "grad_norm": 1.328587532043457, + "learning_rate": 1.2283545695706135e-08, + "logits/chosen": -2.9852476119995117, + "logits/rejected": -2.9641222953796387, + "logps/chosen": -58.166831970214844, + "logps/rejected": -58.347557067871094, + "loss": 0.6842, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.042398639023303986, + "rewards/margins": 0.019147690385580063, + "rewards/rejected": -0.06154633313417435, + "step": 2040 + }, + { + "epoch": 1.4128187456926258, + "grad_norm": 1.2532403469085693, + "learning_rate": 1.2025489998917254e-08, + "logits/chosen": -3.011920690536499, + "logits/rejected": -2.985671043395996, + "logps/chosen": -61.16510009765625, + "logps/rejected": -59.185394287109375, + "loss": 0.6842, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.03920884430408478, + "rewards/margins": 0.019202571362257004, + "rewards/rejected": -0.058411408215761185, + "step": 2050 + }, + { + "epoch": 1.4197105444521019, + "grad_norm": 1.379606008529663, + "learning_rate": 1.1769312631403659e-08, + "logits/chosen": -2.9937710762023926, + "logits/rejected": -2.9706432819366455, + "logps/chosen": -59.001708984375, + "logps/rejected": -58.9688720703125, + "loss": 0.6849, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0409664623439312, + "rewards/margins": 0.017764370888471603, + "rewards/rejected": -0.0587308332324028, + "step": 2060 + }, + { + "epoch": 1.4266023432115782, + "grad_norm": 1.3206267356872559, + "learning_rate": 1.1515050680149687e-08, + "logits/chosen": -3.0447440147399902, + "logits/rejected": -3.0277929306030273, + "logps/chosen": -59.411956787109375, + "logps/rejected": -60.502716064453125, + "loss": 0.6859, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.04452786594629288, + "rewards/margins": 0.01587734930217266, + "rewards/rejected": -0.06040521338582039, + "step": 2070 + }, + { + "epoch": 1.4334941419710545, + "grad_norm": 1.2896159887313843, + "learning_rate": 1.1262740954843456e-08, + "logits/chosen": -3.0021471977233887, + "logits/rejected": -2.971998691558838, + "logps/chosen": -60.27617645263672, + "logps/rejected": -59.302833557128906, + "loss": 0.6832, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.03701635077595711, + "rewards/margins": 0.021184273064136505, + "rewards/rejected": -0.058200620114803314, + "step": 2080 + }, + { + "epoch": 1.4403859407305306, + "grad_norm": 1.326338768005371, + "learning_rate": 1.1012419982547905e-08, + "logits/chosen": -2.961845636367798, + "logits/rejected": -2.941849946975708, + "logps/chosen": -57.25081253051758, + "logps/rejected": -59.027015686035156, + "loss": 0.684, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.04554927721619606, + "rewards/margins": 0.019669197499752045, + "rewards/rejected": -0.0652184709906578, + "step": 2090 + }, + { + "epoch": 1.447277739490007, + "grad_norm": 1.316919207572937, + "learning_rate": 1.0764124002412758e-08, + "logits/chosen": -3.0356943607330322, + "logits/rejected": -3.012575626373291, + "logps/chosen": -56.51226806640625, + "logps/rejected": -59.925514221191406, + "loss": 0.6834, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.04187322035431862, + "rewards/margins": 0.02075764164328575, + "rewards/rejected": -0.06263085454702377, + "step": 2100 + }, + { + "epoch": 1.447277739490007, + "eval_logits/chosen": -3.1055357456207275, + "eval_logits/rejected": -3.099832534790039, + "eval_logps/chosen": -60.38422775268555, + "eval_logps/rejected": -65.65011596679688, + "eval_loss": 0.6894546151161194, + "eval_rewards/accuracies": 0.5861988663673401, + "eval_rewards/chosen": -0.01672333851456642, + "eval_rewards/margins": 0.00797655712813139, + "eval_rewards/rejected": -0.024699894711375237, + "eval_runtime": 383.1595, + "eval_samples_per_second": 11.233, + "eval_steps_per_second": 1.404, + "step": 2100 + }, + { + "epoch": 1.454169538249483, + "grad_norm": 1.3180962800979614, + "learning_rate": 1.0517888960428139e-08, + "logits/chosen": -2.964921236038208, + "logits/rejected": -2.94468355178833, + "logps/chosen": -58.661651611328125, + "logps/rejected": -59.41533660888672, + "loss": 0.6821, + "rewards/accuracies": 0.651562511920929, + "rewards/chosen": -0.042625896632671356, + "rewards/margins": 0.023367973044514656, + "rewards/rejected": -0.06599386781454086, + "step": 2110 + }, + { + "epoch": 1.4610613370089593, + "grad_norm": 1.2736924886703491, + "learning_rate": 1.0273750504220666e-08, + "logits/chosen": -2.982936382293701, + "logits/rejected": -2.9598629474639893, + "logps/chosen": -56.829505920410156, + "logps/rejected": -59.5037956237793, + "loss": 0.6822, + "rewards/accuracies": 0.629687488079071, + "rewards/chosen": -0.04348963871598244, + "rewards/margins": 0.023124249652028084, + "rewards/rejected": -0.06661389768123627, + "step": 2120 + }, + { + "epoch": 1.4679531357684357, + "grad_norm": 1.305767297744751, + "learning_rate": 1.003174397789269e-08, + "logits/chosen": -2.981260299682617, + "logits/rejected": -2.9577889442443848, + "logps/chosen": -59.381996154785156, + "logps/rejected": -59.3449821472168, + "loss": 0.6821, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.04265505075454712, + "rewards/margins": 0.023621436208486557, + "rewards/rejected": -0.06627649068832397, + "step": 2130 + }, + { + "epoch": 1.4748449345279118, + "grad_norm": 1.2957626581192017, + "learning_rate": 9.791904416905526e-09, + "logits/chosen": -3.0431559085845947, + "logits/rejected": -3.0270159244537354, + "logps/chosen": -58.642250061035156, + "logps/rejected": -59.5418586730957, + "loss": 0.6855, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.044856615364551544, + "rewards/margins": 0.016749290749430656, + "rewards/rejected": -0.06160591170191765, + "step": 2140 + }, + { + "epoch": 1.481736733287388, + "grad_norm": 1.3108528852462769, + "learning_rate": 9.554266543007328e-09, + "logits/chosen": -3.007779598236084, + "logits/rejected": -2.9805774688720703, + "logps/chosen": -58.5262565612793, + "logps/rejected": -59.313941955566406, + "loss": 0.6809, + "rewards/accuracies": 0.6421874761581421, + "rewards/chosen": -0.0379471480846405, + "rewards/margins": 0.02591646835207939, + "rewards/rejected": -0.0638636127114296, + "step": 2150 + }, + { + "epoch": 1.4886285320468642, + "grad_norm": 1.2914735078811646, + "learning_rate": 9.318864759206429e-09, + "logits/chosen": -2.9647016525268555, + "logits/rejected": -2.9434256553649902, + "logps/chosen": -58.31499481201172, + "logps/rejected": -57.80283737182617, + "loss": 0.6842, + "rewards/accuracies": 0.6078125238418579, + "rewards/chosen": -0.04223569482564926, + "rewards/margins": 0.0191799309104681, + "rewards/rejected": -0.061415620148181915, + "step": 2160 + }, + { + "epoch": 1.4955203308063405, + "grad_norm": 1.3237831592559814, + "learning_rate": 9.085733144790744e-09, + "logits/chosen": -3.0011582374572754, + "logits/rejected": -2.9859848022460938, + "logps/chosen": -57.24296188354492, + "logps/rejected": -59.595008850097656, + "loss": 0.6826, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04495619237422943, + "rewards/margins": 0.022577274590730667, + "rewards/rejected": -0.0675334706902504, + "step": 2170 + }, + { + "epoch": 1.5024121295658168, + "grad_norm": 1.409790277481079, + "learning_rate": 8.854905450394113e-09, + "logits/chosen": -3.0098440647125244, + "logits/rejected": -2.983942747116089, + "logps/chosen": -59.406158447265625, + "logps/rejected": -58.25774002075195, + "loss": 0.6837, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.04101915657520294, + "rewards/margins": 0.020175766199827194, + "rewards/rejected": -0.06119491904973984, + "step": 2180 + }, + { + "epoch": 1.509303928325293, + "grad_norm": 1.3202848434448242, + "learning_rate": 8.626415093110202e-09, + "logits/chosen": -2.9948554039001465, + "logits/rejected": -2.975142240524292, + "logps/chosen": -56.601722717285156, + "logps/rejected": -59.769569396972656, + "loss": 0.6834, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04384131729602814, + "rewards/margins": 0.020800283178687096, + "rewards/rejected": -0.06464160233736038, + "step": 2190 + }, + { + "epoch": 1.516195727084769, + "grad_norm": 1.366294503211975, + "learning_rate": 8.40029515165467e-09, + "logits/chosen": -3.006235361099243, + "logits/rejected": -2.983431577682495, + "logps/chosen": -57.9134635925293, + "logps/rejected": -58.527076721191406, + "loss": 0.6828, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.04390479251742363, + "rewards/margins": 0.022031091153621674, + "rewards/rejected": -0.065935879945755, + "step": 2200 + }, + { + "epoch": 1.516195727084769, + "eval_logits/chosen": -3.104142904281616, + "eval_logits/rejected": -3.098437547683716, + "eval_logps/chosen": -60.504878997802734, + "eval_logps/rejected": -65.79142761230469, + "eval_loss": 0.6893645524978638, + "eval_rewards/accuracies": 0.5873606204986572, + "eval_rewards/chosen": -0.01792982593178749, + "eval_rewards/margins": 0.008183243684470654, + "eval_rewards/rejected": -0.02611307054758072, + "eval_runtime": 382.8386, + "eval_samples_per_second": 11.242, + "eval_steps_per_second": 1.405, + "step": 2200 + }, + { + "epoch": 1.5230875258442453, + "grad_norm": 1.290756106376648, + "learning_rate": 8.176578361576358e-09, + "logits/chosen": -2.9781322479248047, + "logits/rejected": -2.959134340286255, + "logps/chosen": -56.6655387878418, + "logps/rejected": -58.62583541870117, + "loss": 0.6848, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.04583312198519707, + "rewards/margins": 0.01810682937502861, + "rewards/rejected": -0.06393995136022568, + "step": 2210 + }, + { + "epoch": 1.5299793246037217, + "grad_norm": 1.2776232957839966, + "learning_rate": 7.955297110518117e-09, + "logits/chosen": -3.0543761253356934, + "logits/rejected": -3.028890371322632, + "logps/chosen": -58.1224365234375, + "logps/rejected": -59.595802307128906, + "loss": 0.684, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.04209282249212265, + "rewards/margins": 0.019542943686246872, + "rewards/rejected": -0.06163576990365982, + "step": 2220 + }, + { + "epoch": 1.5368711233631978, + "grad_norm": 1.349440574645996, + "learning_rate": 7.73648343352806e-09, + "logits/chosen": -3.0225765705108643, + "logits/rejected": -2.996718168258667, + "logps/chosen": -59.868263244628906, + "logps/rejected": -58.85590744018555, + "loss": 0.6824, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.043428223580121994, + "rewards/margins": 0.022792860865592957, + "rewards/rejected": -0.06622108817100525, + "step": 2230 + }, + { + "epoch": 1.5437629221226739, + "grad_norm": 1.3181602954864502, + "learning_rate": 7.520169008421775e-09, + "logits/chosen": -2.999849796295166, + "logits/rejected": -2.9812533855438232, + "logps/chosen": -59.130516052246094, + "logps/rejected": -60.17681884765625, + "loss": 0.6842, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04423438385128975, + "rewards/margins": 0.019376900047063828, + "rewards/rejected": -0.06361128389835358, + "step": 2240 + }, + { + "epoch": 1.5506547208821502, + "grad_norm": 1.389096975326538, + "learning_rate": 7.3063851511963535e-09, + "logits/chosen": -3.01939058303833, + "logits/rejected": -2.992645740509033, + "logps/chosen": -59.214324951171875, + "logps/rejected": -59.09142303466797, + "loss": 0.6823, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.04182130843400955, + "rewards/margins": 0.023091908544301987, + "rewards/rejected": -0.06491322070360184, + "step": 2250 + }, + { + "epoch": 1.5575465196416265, + "grad_norm": 1.2921773195266724, + "learning_rate": 7.095162811496716e-09, + "logits/chosen": -2.9625678062438965, + "logits/rejected": -2.947840452194214, + "logps/chosen": -57.61260986328125, + "logps/rejected": -58.932640075683594, + "loss": 0.6843, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.046415556222200394, + "rewards/margins": 0.01893490180373192, + "rewards/rejected": -0.06535045802593231, + "step": 2260 + }, + { + "epoch": 1.5644383184011028, + "grad_norm": 1.2538701295852661, + "learning_rate": 6.886532568135017e-09, + "logits/chosen": -2.9978413581848145, + "logits/rejected": -2.98121976852417, + "logps/chosen": -58.2430305480957, + "logps/rejected": -60.4543571472168, + "loss": 0.6838, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.044888339936733246, + "rewards/margins": 0.019984986633062363, + "rewards/rejected": -0.06487332284450531, + "step": 2270 + }, + { + "epoch": 1.571330117160579, + "grad_norm": 1.3960515260696411, + "learning_rate": 6.680524624663763e-09, + "logits/chosen": -3.0089173316955566, + "logits/rejected": -2.977341413497925, + "logps/chosen": -60.883209228515625, + "logps/rejected": -59.40845489501953, + "loss": 0.6812, + "rewards/accuracies": 0.6234375238418579, + "rewards/chosen": -0.03985728323459625, + "rewards/margins": 0.025174889713525772, + "rewards/rejected": -0.06503216922283173, + "step": 2280 + }, + { + "epoch": 1.578221915920055, + "grad_norm": 1.1940710544586182, + "learning_rate": 6.477168805003166e-09, + "logits/chosen": -3.00933575630188, + "logits/rejected": -2.982250690460205, + "logps/chosen": -59.1096076965332, + "logps/rejected": -59.2675895690918, + "loss": 0.6819, + "rewards/accuracies": 0.6390625238418579, + "rewards/chosen": -0.04165520519018173, + "rewards/margins": 0.023930717259645462, + "rewards/rejected": -0.06558592617511749, + "step": 2290 + }, + { + "epoch": 1.5851137146795313, + "grad_norm": 1.3209586143493652, + "learning_rate": 6.276494549123546e-09, + "logits/chosen": -3.050356388092041, + "logits/rejected": -3.02972412109375, + "logps/chosen": -58.97772979736328, + "logps/rejected": -59.48607635498047, + "loss": 0.6833, + "rewards/accuracies": 0.635937511920929, + "rewards/chosen": -0.04832325503230095, + "rewards/margins": 0.021100293844938278, + "rewards/rejected": -0.06942354887723923, + "step": 2300 + }, + { + "epoch": 1.5851137146795313, + "eval_logits/chosen": -3.1029651165008545, + "eval_logits/rejected": -3.097285509109497, + "eval_logps/chosen": -60.59327697753906, + "eval_logps/rejected": -65.90728759765625, + "eval_loss": 0.6892400979995728, + "eval_rewards/accuracies": 0.5901486873626709, + "eval_rewards/chosen": -0.018813807517290115, + "eval_rewards/margins": 0.008457801304757595, + "eval_rewards/rejected": -0.027271609753370285, + "eval_runtime": 383.2974, + "eval_samples_per_second": 11.229, + "eval_steps_per_second": 1.404, + "step": 2300 + }, + { + "epoch": 1.5920055134390076, + "grad_norm": 1.2838672399520874, + "learning_rate": 6.078530908783283e-09, + "logits/chosen": -2.946258068084717, + "logits/rejected": -2.9289188385009766, + "logps/chosen": -57.40240478515625, + "logps/rejected": -58.80238723754883, + "loss": 0.6837, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.04573064297437668, + "rewards/margins": 0.020323526114225388, + "rewards/rejected": -0.06605416536331177, + "step": 2310 + }, + { + "epoch": 1.598897312198484, + "grad_norm": 1.3529164791107178, + "learning_rate": 5.883306543322963e-09, + "logits/chosen": -3.0067434310913086, + "logits/rejected": -2.983191967010498, + "logps/chosen": -57.39630126953125, + "logps/rejected": -58.9193000793457, + "loss": 0.6836, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.045682333409786224, + "rewards/margins": 0.020494289696216583, + "rewards/rejected": -0.06617662310600281, + "step": 2320 + }, + { + "epoch": 1.60578911095796, + "grad_norm": 1.3721948862075806, + "learning_rate": 5.690849715516346e-09, + "logits/chosen": -2.9921982288360596, + "logits/rejected": -2.972947597503662, + "logps/chosen": -58.18434524536133, + "logps/rejected": -59.79640579223633, + "loss": 0.684, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.049553245306015015, + "rewards/margins": 0.019603563472628593, + "rewards/rejected": -0.06915681809186935, + "step": 2330 + }, + { + "epoch": 1.6126809097174362, + "grad_norm": 1.326244831085205, + "learning_rate": 5.50118828747877e-09, + "logits/chosen": -3.013467311859131, + "logits/rejected": -2.985992908477783, + "logps/chosen": -59.114105224609375, + "logps/rejected": -59.962989807128906, + "loss": 0.6816, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.04709188640117645, + "rewards/margins": 0.024677757173776627, + "rewards/rejected": -0.07176963984966278, + "step": 2340 + }, + { + "epoch": 1.6195727084769125, + "grad_norm": 1.3307464122772217, + "learning_rate": 5.314349716633484e-09, + "logits/chosen": -2.999783515930176, + "logits/rejected": -2.978919744491577, + "logps/chosen": -58.51006317138672, + "logps/rejected": -59.8082389831543, + "loss": 0.6859, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04656077176332474, + "rewards/margins": 0.015919920057058334, + "rewards/rejected": -0.06248069554567337, + "step": 2350 + }, + { + "epoch": 1.6264645072363888, + "grad_norm": 1.3505630493164062, + "learning_rate": 5.130361051736656e-09, + "logits/chosen": -2.992077589035034, + "logits/rejected": -2.9786789417266846, + "logps/chosen": -57.856048583984375, + "logps/rejected": -58.31081008911133, + "loss": 0.685, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.05111172795295715, + "rewards/margins": 0.01770883984863758, + "rewards/rejected": -0.06882055848836899, + "step": 2360 + }, + { + "epoch": 1.633356305995865, + "grad_norm": 1.3144451379776, + "learning_rate": 4.9492489289614884e-09, + "logits/chosen": -2.9724109172821045, + "logits/rejected": -2.9529943466186523, + "logps/chosen": -58.423919677734375, + "logps/rejected": -59.29913330078125, + "loss": 0.6839, + "rewards/accuracies": 0.596875011920929, + "rewards/chosen": -0.053545523434877396, + "rewards/margins": 0.019974233582615852, + "rewards/rejected": -0.0735197439789772, + "step": 2370 + }, + { + "epoch": 1.640248104755341, + "grad_norm": 1.3916033506393433, + "learning_rate": 4.771039568042076e-09, + "logits/chosen": -3.004544734954834, + "logits/rejected": -2.988704204559326, + "logps/chosen": -57.34346389770508, + "logps/rejected": -61.55018997192383, + "loss": 0.683, + "rewards/accuracies": 0.635937511920929, + "rewards/chosen": -0.04598530754446983, + "rewards/margins": 0.021662291139364243, + "rewards/rejected": -0.06764759868383408, + "step": 2380 + }, + { + "epoch": 1.6471399035148173, + "grad_norm": 1.4496431350708008, + "learning_rate": 4.595758768477576e-09, + "logits/chosen": -3.0240254402160645, + "logits/rejected": -3.011583089828491, + "logps/chosen": -58.51326370239258, + "logps/rejected": -60.47749710083008, + "loss": 0.6844, + "rewards/accuracies": 0.5921875238418579, + "rewards/chosen": -0.05048090219497681, + "rewards/margins": 0.01893479749560356, + "rewards/rejected": -0.06941570341587067, + "step": 2390 + }, + { + "epoch": 1.6540317022742936, + "grad_norm": 1.3277703523635864, + "learning_rate": 4.423431905797162e-09, + "logits/chosen": -3.039842128753662, + "logits/rejected": -3.0183472633361816, + "logps/chosen": -58.69083786010742, + "logps/rejected": -60.8518180847168, + "loss": 0.6835, + "rewards/accuracies": 0.6109374761581421, + "rewards/chosen": -0.04716577008366585, + "rewards/margins": 0.020657068118453026, + "rewards/rejected": -0.06782282888889313, + "step": 2400 + }, + { + "epoch": 1.6540317022742936, + "eval_logits/chosen": -3.1017863750457764, + "eval_logits/rejected": -3.09609055519104, + "eval_logps/chosen": -60.646873474121094, + "eval_logps/rejected": -65.97390747070312, + "eval_loss": 0.6891800761222839, + "eval_rewards/accuracies": 0.5861988663673401, + "eval_rewards/chosen": -0.019349750131368637, + "eval_rewards/margins": 0.008588053286075592, + "eval_rewards/rejected": -0.027937807142734528, + "eval_runtime": 383.0908, + "eval_samples_per_second": 11.235, + "eval_steps_per_second": 1.404, + "step": 2400 + }, + { + "epoch": 1.66092350103377, + "grad_norm": 1.3616927862167358, + "learning_rate": 4.254083927886443e-09, + "logits/chosen": -3.052434206008911, + "logits/rejected": -3.0306789875030518, + "logps/chosen": -60.23524856567383, + "logps/rejected": -59.88490676879883, + "loss": 0.685, + "rewards/accuracies": 0.598437488079071, + "rewards/chosen": -0.04647786170244217, + "rewards/margins": 0.017560753971338272, + "rewards/rejected": -0.06403861939907074, + "step": 2410 + }, + { + "epoch": 1.667815299793246, + "grad_norm": 1.3540840148925781, + "learning_rate": 4.0877393513756795e-09, + "logits/chosen": -3.0015170574188232, + "logits/rejected": -2.9834518432617188, + "logps/chosen": -58.74982452392578, + "logps/rejected": -59.712005615234375, + "loss": 0.6847, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.045878536999225616, + "rewards/margins": 0.018289810046553612, + "rewards/rejected": -0.06416834890842438, + "step": 2420 + }, + { + "epoch": 1.6747070985527222, + "grad_norm": 1.319036841392517, + "learning_rate": 3.924422258090529e-09, + "logits/chosen": -2.939756155014038, + "logits/rejected": -2.919666290283203, + "logps/chosen": -58.5392951965332, + "logps/rejected": -59.17338943481445, + "loss": 0.684, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.04658854380249977, + "rewards/margins": 0.019758421927690506, + "rewards/rejected": -0.06634696573019028, + "step": 2430 + }, + { + "epoch": 1.6815988973121985, + "grad_norm": 1.3268150091171265, + "learning_rate": 3.764156291565693e-09, + "logits/chosen": -3.0177316665649414, + "logits/rejected": -2.9926140308380127, + "logps/chosen": -58.570648193359375, + "logps/rejected": -58.78978729248047, + "loss": 0.6825, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04440145939588547, + "rewards/margins": 0.022650301456451416, + "rewards/rejected": -0.06705176085233688, + "step": 2440 + }, + { + "epoch": 1.6884906960716748, + "grad_norm": 1.3211112022399902, + "learning_rate": 3.6069646536220357e-09, + "logits/chosen": -2.998032331466675, + "logits/rejected": -2.9721641540527344, + "logps/chosen": -60.38201904296875, + "logps/rejected": -60.990257263183594, + "loss": 0.6805, + "rewards/accuracies": 0.682812511920929, + "rewards/chosen": -0.040809061378240585, + "rewards/margins": 0.026985710486769676, + "rewards/rejected": -0.06779477745294571, + "step": 2450 + }, + { + "epoch": 1.6953824948311509, + "grad_norm": 1.285194993019104, + "learning_rate": 3.4528701010076155e-09, + "logits/chosen": -3.003739833831787, + "logits/rejected": -2.9794375896453857, + "logps/chosen": -60.3626823425293, + "logps/rejected": -61.65105438232422, + "loss": 0.6822, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.045110072940588, + "rewards/margins": 0.02330555021762848, + "rewards/rejected": -0.06841562688350677, + "step": 2460 + }, + { + "epoch": 1.7022742935906272, + "grad_norm": 1.2852113246917725, + "learning_rate": 3.3018949421032003e-09, + "logits/chosen": -3.0109405517578125, + "logits/rejected": -2.9975745677948, + "logps/chosen": -58.778053283691406, + "logps/rejected": -59.56703567504883, + "loss": 0.6851, + "rewards/accuracies": 0.5703125, + "rewards/chosen": -0.04846884682774544, + "rewards/margins": 0.01748683862388134, + "rewards/rejected": -0.06595568358898163, + "step": 2470 + }, + { + "epoch": 1.7091660923501033, + "grad_norm": 1.258186936378479, + "learning_rate": 3.154061033692651e-09, + "logits/chosen": -3.0072379112243652, + "logits/rejected": -2.979935884475708, + "logps/chosen": -59.062705993652344, + "logps/rejected": -58.03764724731445, + "loss": 0.6807, + "rewards/accuracies": 0.640625, + "rewards/chosen": -0.04467375949025154, + "rewards/margins": 0.026565441861748695, + "rewards/rejected": -0.07123919576406479, + "step": 2480 + }, + { + "epoch": 1.7160578911095796, + "grad_norm": 1.3505935668945312, + "learning_rate": 3.0093897777987098e-09, + "logits/chosen": -3.0517494678497314, + "logits/rejected": -3.0366005897521973, + "logps/chosen": -58.27477264404297, + "logps/rejected": -61.6846923828125, + "loss": 0.6858, + "rewards/accuracies": 0.6109374761581421, + "rewards/chosen": -0.04682334139943123, + "rewards/margins": 0.016174782067537308, + "rewards/rejected": -0.06299812346696854, + "step": 2490 + }, + { + "epoch": 1.722949689869056, + "grad_norm": 1.3154429197311401, + "learning_rate": 2.8679021185845975e-09, + "logits/chosen": -3.023200273513794, + "logits/rejected": -2.997267246246338, + "logps/chosen": -58.07569122314453, + "logps/rejected": -59.87085723876953, + "loss": 0.6826, + "rewards/accuracies": 0.620312511920929, + "rewards/chosen": -0.046663668006658554, + "rewards/margins": 0.022471796721220016, + "rewards/rejected": -0.06913547217845917, + "step": 2500 + }, + { + "epoch": 1.722949689869056, + "eval_logits/chosen": -3.1012966632843018, + "eval_logits/rejected": -3.095568895339966, + "eval_logps/chosen": -60.681880950927734, + "eval_logps/rejected": -66.00990295410156, + "eval_loss": 0.689177393913269, + "eval_rewards/accuracies": 0.5850371718406677, + "eval_rewards/chosen": -0.019699882715940475, + "eval_rewards/margins": 0.008597951382398605, + "eval_rewards/rejected": -0.02829783223569393, + "eval_runtime": 383.2161, + "eval_samples_per_second": 11.231, + "eval_steps_per_second": 1.404, + "step": 2500 + }, + { + "epoch": 1.729841488628532, + "grad_norm": 1.3162225484848022, + "learning_rate": 2.7296185393219316e-09, + "logits/chosen": -3.0459542274475098, + "logits/rejected": -3.0207812786102295, + "logps/chosen": -59.15156173706055, + "logps/rejected": -59.254676818847656, + "loss": 0.6833, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.045805253088474274, + "rewards/margins": 0.020932307466864586, + "rewards/rejected": -0.06673755496740341, + "step": 2510 + }, + { + "epoch": 1.7367332873880081, + "grad_norm": 1.328461766242981, + "learning_rate": 2.5945590594253305e-09, + "logits/chosen": -2.9799602031707764, + "logits/rejected": -2.9713258743286133, + "logps/chosen": -58.0362548828125, + "logps/rejected": -60.381080627441406, + "loss": 0.6877, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": -0.0511082224547863, + "rewards/margins": 0.012231842614710331, + "rewards/rejected": -0.06334006786346436, + "step": 2520 + }, + { + "epoch": 1.7436250861474845, + "grad_norm": 1.2932238578796387, + "learning_rate": 2.4627432315541986e-09, + "logits/chosen": -3.055954694747925, + "logits/rejected": -3.0452940464019775, + "logps/chosen": -58.16063690185547, + "logps/rejected": -61.20969772338867, + "loss": 0.6834, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.04746638238430023, + "rewards/margins": 0.02094622328877449, + "rewards/rejected": -0.06841260939836502, + "step": 2530 + }, + { + "epoch": 1.7505168849069608, + "grad_norm": 1.3644834756851196, + "learning_rate": 2.3341901387820717e-09, + "logits/chosen": -3.0201711654663086, + "logits/rejected": -2.995832681655884, + "logps/chosen": -59.7026481628418, + "logps/rejected": -60.433990478515625, + "loss": 0.6828, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.04398275539278984, + "rewards/margins": 0.02229396626353264, + "rewards/rejected": -0.06627672165632248, + "step": 2540 + }, + { + "epoch": 1.757408683666437, + "grad_norm": 1.3037844896316528, + "learning_rate": 2.2089183918339445e-09, + "logits/chosen": -2.996652126312256, + "logits/rejected": -2.976022243499756, + "logps/chosen": -57.28471755981445, + "logps/rejected": -59.08478546142578, + "loss": 0.684, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.04602036252617836, + "rewards/margins": 0.019708681851625443, + "rewards/rejected": -0.0657290443778038, + "step": 2550 + }, + { + "epoch": 1.7643004824259132, + "grad_norm": 1.3543256521224976, + "learning_rate": 2.086946126391981e-09, + "logits/chosen": -2.9888083934783936, + "logits/rejected": -2.9732577800750732, + "logps/chosen": -56.89265823364258, + "logps/rejected": -60.136573791503906, + "loss": 0.6841, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.05068554729223251, + "rewards/margins": 0.019495617598295212, + "rewards/rejected": -0.07018117606639862, + "step": 2560 + }, + { + "epoch": 1.7711922811853893, + "grad_norm": 1.3135391473770142, + "learning_rate": 1.9682910004700155e-09, + "logits/chosen": -3.000701904296875, + "logits/rejected": -2.9841086864471436, + "logps/chosen": -59.79181671142578, + "logps/rejected": -60.78386306762695, + "loss": 0.6832, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.048417720943689346, + "rewards/margins": 0.02133244276046753, + "rewards/rejected": -0.06975016742944717, + "step": 2570 + }, + { + "epoch": 1.7780840799448656, + "grad_norm": 1.2864971160888672, + "learning_rate": 1.852970191857159e-09, + "logits/chosen": -2.9674811363220215, + "logits/rejected": -2.94804048538208, + "logps/chosen": -59.39619064331055, + "logps/rejected": -60.783851623535156, + "loss": 0.6818, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.04619085043668747, + "rewards/margins": 0.02428482472896576, + "rewards/rejected": -0.07047567516565323, + "step": 2580 + }, + { + "epoch": 1.784975878704342, + "grad_norm": 1.2950899600982666, + "learning_rate": 1.741000395630976e-09, + "logits/chosen": -3.034547805786133, + "logits/rejected": -3.0095696449279785, + "logps/chosen": -58.98634719848633, + "logps/rejected": -60.004661560058594, + "loss": 0.6821, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.044211823493242264, + "rewards/margins": 0.02347356267273426, + "rewards/rejected": -0.06768538057804108, + "step": 2590 + }, + { + "epoch": 1.791867677463818, + "grad_norm": 1.3900827169418335, + "learning_rate": 1.6323978217405277e-09, + "logits/chosen": -2.962319850921631, + "logits/rejected": -2.9394538402557373, + "logps/chosen": -57.80451202392578, + "logps/rejected": -60.771339416503906, + "loss": 0.6825, + "rewards/accuracies": 0.6390625238418579, + "rewards/chosen": -0.0485307052731514, + "rewards/margins": 0.022718578577041626, + "rewards/rejected": -0.07124929130077362, + "step": 2600 + }, + { + "epoch": 1.791867677463818, + "eval_logits/chosen": -3.1006553173065186, + "eval_logits/rejected": -3.094916820526123, + "eval_logps/chosen": -60.688175201416016, + "eval_logps/rejected": -66.03443145751953, + "eval_loss": 0.6890937089920044, + "eval_rewards/accuracies": 0.5889869928359985, + "eval_rewards/chosen": -0.019762787967920303, + "eval_rewards/margins": 0.008780322037637234, + "eval_rewards/rejected": -0.028543109074234962, + "eval_runtime": 383.3128, + "eval_samples_per_second": 11.228, + "eval_steps_per_second": 1.404, + "step": 2600 + }, + { + "epoch": 1.7987594762232941, + "grad_norm": 1.325190544128418, + "learning_rate": 1.5271781926596449e-09, + "logits/chosen": -3.0393474102020264, + "logits/rejected": -3.0156404972076416, + "logps/chosen": -60.42161178588867, + "logps/rejected": -61.184486389160156, + "loss": 0.6827, + "rewards/accuracies": 0.6484375, + "rewards/chosen": -0.04576939716935158, + "rewards/margins": 0.022201048210263252, + "rewards/rejected": -0.06797045469284058, + "step": 2610 + }, + { + "epoch": 1.8056512749827704, + "grad_norm": 1.3093925714492798, + "learning_rate": 1.4253567411107643e-09, + "logits/chosen": -2.989856243133545, + "logits/rejected": -2.9666576385498047, + "logps/chosen": -58.600990295410156, + "logps/rejected": -60.18854904174805, + "loss": 0.6831, + "rewards/accuracies": 0.609375, + "rewards/chosen": -0.0458240807056427, + "rewards/margins": 0.021440699696540833, + "rewards/rejected": -0.06726478040218353, + "step": 2620 + }, + { + "epoch": 1.8125430737422468, + "grad_norm": 1.2918739318847656, + "learning_rate": 1.326948207859685e-09, + "logits/chosen": -3.0238237380981445, + "logits/rejected": -3.0077781677246094, + "logps/chosen": -57.79582595825195, + "logps/rejected": -60.6348762512207, + "loss": 0.6832, + "rewards/accuracies": 0.6328125, + "rewards/chosen": -0.046499475836753845, + "rewards/margins": 0.021402059122920036, + "rewards/rejected": -0.06790152937173843, + "step": 2630 + }, + { + "epoch": 1.819434872501723, + "grad_norm": 1.349001407623291, + "learning_rate": 1.2319668395815358e-09, + "logits/chosen": -3.0028393268585205, + "logits/rejected": -2.9857001304626465, + "logps/chosen": -58.69614791870117, + "logps/rejected": -59.920021057128906, + "loss": 0.6839, + "rewards/accuracies": 0.604687511920929, + "rewards/chosen": -0.04932459071278572, + "rewards/margins": 0.0200694240629673, + "rewards/rejected": -0.06939400732517242, + "step": 2640 + }, + { + "epoch": 1.8263266712611992, + "grad_norm": 1.2818963527679443, + "learning_rate": 1.1404263867982738e-09, + "logits/chosen": -3.0455758571624756, + "logits/rejected": -3.0237960815429688, + "logps/chosen": -59.25251388549805, + "logps/rejected": -60.45496368408203, + "loss": 0.683, + "rewards/accuracies": 0.629687488079071, + "rewards/chosen": -0.04728539660573006, + "rewards/margins": 0.021762443706393242, + "rewards/rejected": -0.06904784590005875, + "step": 2650 + }, + { + "epoch": 1.8332184700206753, + "grad_norm": 1.2889119386672974, + "learning_rate": 1.0523401018880134e-09, + "logits/chosen": -2.983532428741455, + "logits/rejected": -2.9646944999694824, + "logps/chosen": -58.277976989746094, + "logps/rejected": -59.491722106933594, + "loss": 0.6837, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.047298818826675415, + "rewards/margins": 0.020261693745851517, + "rewards/rejected": -0.06756050884723663, + "step": 2660 + }, + { + "epoch": 1.8401102687801516, + "grad_norm": 1.3029212951660156, + "learning_rate": 9.677207371664608e-10, + "logits/chosen": -3.0146260261535645, + "logits/rejected": -2.9899539947509766, + "logps/chosen": -59.18970489501953, + "logps/rejected": -60.14207077026367, + "loss": 0.6824, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.05002979189157486, + "rewards/margins": 0.02303471975028515, + "rewards/rejected": -0.07306452095508575, + "step": 2670 + }, + { + "epoch": 1.847002067539628, + "grad_norm": 1.260703444480896, + "learning_rate": 8.865805430407575e-10, + "logits/chosen": -3.0160889625549316, + "logits/rejected": -2.9888625144958496, + "logps/chosen": -58.788368225097656, + "logps/rejected": -59.13869094848633, + "loss": 0.6827, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.048010729253292084, + "rewards/margins": 0.0222895760089159, + "rewards/rejected": -0.07030030339956284, + "step": 2680 + }, + { + "epoch": 1.853893866299104, + "grad_norm": 1.3435821533203125, + "learning_rate": 8.089312662359904e-10, + "logits/chosen": -3.000649929046631, + "logits/rejected": -2.9763736724853516, + "logps/chosen": -58.345603942871094, + "logps/rejected": -59.42350387573242, + "loss": 0.6836, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.04748475179076195, + "rewards/margins": 0.02055184543132782, + "rewards/rejected": -0.06803660839796066, + "step": 2690 + }, + { + "epoch": 1.8607856650585803, + "grad_norm": 1.3498848676681519, + "learning_rate": 7.34784148094586e-10, + "logits/chosen": -3.062194347381592, + "logits/rejected": -3.040365219116211, + "logps/chosen": -58.02552032470703, + "logps/rejected": -60.96380615234375, + "loss": 0.6823, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.04390079155564308, + "rewards/margins": 0.023236598819494247, + "rewards/rejected": -0.06713739782571793, + "step": 2700 + }, + { + "epoch": 1.8607856650585803, + "eval_logits/chosen": -3.1006381511688232, + "eval_logits/rejected": -3.0949079990386963, + "eval_logps/chosen": -60.71648025512695, + "eval_logps/rejected": -66.05257415771484, + "eval_loss": 0.6891458034515381, + "eval_rewards/accuracies": 0.5889869928359985, + "eval_rewards/chosen": -0.020045887678861618, + "eval_rewards/margins": 0.008678610436618328, + "eval_rewards/rejected": -0.02872449718415737, + "eval_runtime": 383.351, + "eval_samples_per_second": 11.227, + "eval_steps_per_second": 1.403, + "step": 2700 + }, + { + "epoch": 1.8676774638180564, + "grad_norm": 1.2470507621765137, + "learning_rate": 6.641499229489145e-10, + "logits/chosen": -3.003091335296631, + "logits/rejected": -2.9715797901153564, + "logps/chosen": -58.2459831237793, + "logps/rejected": -58.3682975769043, + "loss": 0.6813, + "rewards/accuracies": 0.6390625238418579, + "rewards/chosen": -0.04425545781850815, + "rewards/margins": 0.02511006034910679, + "rewards/rejected": -0.06936550885438919, + "step": 2710 + }, + { + "epoch": 1.8745692625775328, + "grad_norm": 1.3162429332733154, + "learning_rate": 5.970388165672691e-10, + "logits/chosen": -2.99006724357605, + "logits/rejected": -2.971386671066284, + "logps/chosen": -57.105255126953125, + "logps/rejected": -60.72968673706055, + "loss": 0.6826, + "rewards/accuracies": 0.6171875, + "rewards/chosen": -0.0457664355635643, + "rewards/margins": 0.02246815897524357, + "rewards/rejected": -0.06823460012674332, + "step": 2720 + }, + { + "epoch": 1.881461061337009, + "grad_norm": 1.4187453985214233, + "learning_rate": 5.334605446734585e-10, + "logits/chosen": -3.0353336334228516, + "logits/rejected": -3.007887363433838, + "logps/chosen": -59.300804138183594, + "logps/rejected": -59.99883270263672, + "loss": 0.6814, + "rewards/accuracies": 0.6421874761581421, + "rewards/chosen": -0.043865978717803955, + "rewards/margins": 0.02479901909828186, + "rewards/rejected": -0.06866499781608582, + "step": 2730 + }, + { + "epoch": 1.8883528600964852, + "grad_norm": 1.3848966360092163, + "learning_rate": 4.734243115402825e-10, + "logits/chosen": -2.9592947959899902, + "logits/rejected": -2.9372572898864746, + "logps/chosen": -59.5694465637207, + "logps/rejected": -60.315895080566406, + "loss": 0.6837, + "rewards/accuracies": 0.6031249761581421, + "rewards/chosen": -0.047974247485399246, + "rewards/margins": 0.02036571130156517, + "rewards/rejected": -0.06833995878696442, + "step": 2740 + }, + { + "epoch": 1.8952446588559613, + "grad_norm": 1.4468791484832764, + "learning_rate": 4.169388086569886e-10, + "logits/chosen": -3.0385963916778564, + "logits/rejected": -3.0236904621124268, + "logps/chosen": -58.946388244628906, + "logps/rejected": -61.70532989501953, + "loss": 0.6838, + "rewards/accuracies": 0.604687511920929, + "rewards/chosen": -0.04743208736181259, + "rewards/margins": 0.020196830853819847, + "rewards/rejected": -0.06762892007827759, + "step": 2750 + }, + { + "epoch": 1.9021364576154376, + "grad_norm": 1.3899776935577393, + "learning_rate": 3.640122134710294e-10, + "logits/chosen": -3.06270170211792, + "logits/rejected": -3.0440831184387207, + "logps/chosen": -59.356605529785156, + "logps/rejected": -60.465003967285156, + "loss": 0.6821, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0449027419090271, + "rewards/margins": 0.023394212126731873, + "rewards/rejected": -0.06829695403575897, + "step": 2760 + }, + { + "epoch": 1.909028256374914, + "grad_norm": 1.328192114830017, + "learning_rate": 3.1465218820418415e-10, + "logits/chosen": -3.037365436553955, + "logits/rejected": -3.0028114318847656, + "logps/chosen": -58.11069869995117, + "logps/rejected": -59.29419708251953, + "loss": 0.6807, + "rewards/accuracies": 0.667187511920929, + "rewards/chosen": -0.04381219670176506, + "rewards/margins": 0.026473551988601685, + "rewards/rejected": -0.07028575241565704, + "step": 2770 + }, + { + "epoch": 1.9159200551343902, + "grad_norm": 1.2775218486785889, + "learning_rate": 2.688658787433157e-10, + "logits/chosen": -3.022888660430908, + "logits/rejected": -3.000300884246826, + "logps/chosen": -60.5079460144043, + "logps/rejected": -60.63434600830078, + "loss": 0.6831, + "rewards/accuracies": 0.628125011920929, + "rewards/chosen": -0.04978417605161667, + "rewards/margins": 0.021695107221603394, + "rewards/rejected": -0.07147928327322006, + "step": 2780 + }, + { + "epoch": 1.9228118538938663, + "grad_norm": 1.2735910415649414, + "learning_rate": 2.266599136058367e-10, + "logits/chosen": -3.0028035640716553, + "logits/rejected": -2.9840023517608643, + "logps/chosen": -59.78889083862305, + "logps/rejected": -59.44769287109375, + "loss": 0.6861, + "rewards/accuracies": 0.589062511920929, + "rewards/chosen": -0.04813474044203758, + "rewards/margins": 0.015620408579707146, + "rewards/rejected": -0.06375513970851898, + "step": 2790 + }, + { + "epoch": 1.9297036526533424, + "grad_norm": 1.376592755317688, + "learning_rate": 1.8804040298009693e-10, + "logits/chosen": -3.0288257598876953, + "logits/rejected": -3.0026650428771973, + "logps/chosen": -59.218475341796875, + "logps/rejected": -58.154075622558594, + "loss": 0.6816, + "rewards/accuracies": 0.6468750238418579, + "rewards/chosen": -0.04387739300727844, + "rewards/margins": 0.02458575740456581, + "rewards/rejected": -0.06846315413713455, + "step": 2800 + }, + { + "epoch": 1.9297036526533424, + "eval_logits/chosen": -3.1007766723632812, + "eval_logits/rejected": -3.095076084136963, + "eval_logps/chosen": -60.726348876953125, + "eval_logps/rejected": -66.07279968261719, + "eval_loss": 0.6890966892242432, + "eval_rewards/accuracies": 0.5841078162193298, + "eval_rewards/chosen": -0.020144494250416756, + "eval_rewards/margins": 0.008782317861914635, + "eval_rewards/rejected": -0.02892681024968624, + "eval_runtime": 383.6267, + "eval_samples_per_second": 11.219, + "eval_steps_per_second": 1.402, + "step": 2800 + }, + { + "epoch": 1.9365954514128187, + "grad_norm": 1.2682085037231445, + "learning_rate": 1.5301293784081847e-10, + "logits/chosen": -2.9736599922180176, + "logits/rejected": -2.9589531421661377, + "logps/chosen": -58.26537322998047, + "logps/rejected": -60.63109588623047, + "loss": 0.6842, + "rewards/accuracies": 0.6156250238418579, + "rewards/chosen": -0.0504550039768219, + "rewards/margins": 0.019359614700078964, + "rewards/rejected": -0.06981462240219116, + "step": 2810 + }, + { + "epoch": 1.943487250172295, + "grad_norm": 1.3405542373657227, + "learning_rate": 1.2158258913967102e-10, + "logits/chosen": -3.0063540935516357, + "logits/rejected": -2.9755642414093018, + "logps/chosen": -60.50700759887695, + "logps/rejected": -58.998687744140625, + "loss": 0.6822, + "rewards/accuracies": 0.6015625, + "rewards/chosen": -0.04506916552782059, + "rewards/margins": 0.023324180394411087, + "rewards/rejected": -0.06839334219694138, + "step": 2820 + }, + { + "epoch": 1.9503790489317712, + "grad_norm": 1.3689327239990234, + "learning_rate": 9.37539070711646e-11, + "logits/chosen": -3.0321671962738037, + "logits/rejected": -3.012648820877075, + "logps/chosen": -60.28644943237305, + "logps/rejected": -60.640167236328125, + "loss": 0.681, + "rewards/accuracies": 0.6109374761581421, + "rewards/chosen": -0.042195506393909454, + "rewards/margins": 0.02599485218524933, + "rewards/rejected": -0.06819035857915878, + "step": 2830 + }, + { + "epoch": 1.9572708476912473, + "grad_norm": 1.3046759366989136, + "learning_rate": 6.953092041389607e-11, + "logits/chosen": -3.014383554458618, + "logits/rejected": -2.9899418354034424, + "logps/chosen": -59.1005859375, + "logps/rejected": -59.03815460205078, + "loss": 0.6822, + "rewards/accuracies": 0.621874988079071, + "rewards/chosen": -0.04825712740421295, + "rewards/margins": 0.023372991010546684, + "rewards/rejected": -0.07163011282682419, + "step": 2840 + }, + { + "epoch": 1.9641626464507236, + "grad_norm": 1.3823450803756714, + "learning_rate": 4.891713594731006e-11, + "logits/chosen": -3.0164265632629395, + "logits/rejected": -2.993161678314209, + "logps/chosen": -58.6377067565918, + "logps/rejected": -59.821807861328125, + "loss": 0.6829, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04551283270120621, + "rewards/margins": 0.021898990496993065, + "rewards/rejected": -0.06741182506084442, + "step": 2850 + }, + { + "epoch": 1.9710544452102, + "grad_norm": 1.2972795963287354, + "learning_rate": 3.191553794401336e-11, + "logits/chosen": -3.0041041374206543, + "logits/rejected": -2.9793601036071777, + "logps/chosen": -58.8316535949707, + "logps/rejected": -59.15874481201172, + "loss": 0.6844, + "rewards/accuracies": 0.5921875238418579, + "rewards/chosen": -0.04608858376741409, + "rewards/margins": 0.018982943147420883, + "rewards/rejected": -0.06507153064012527, + "step": 2860 + }, + { + "epoch": 1.9779462439696762, + "grad_norm": 1.4782917499542236, + "learning_rate": 1.8528587737753898e-11, + "logits/chosen": -3.00868558883667, + "logits/rejected": -2.981982707977295, + "logps/chosen": -59.979164123535156, + "logps/rejected": -59.065032958984375, + "loss": 0.6813, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.04683419317007065, + "rewards/margins": 0.025204036384820938, + "rewards/rejected": -0.07203822582960129, + "step": 2870 + }, + { + "epoch": 1.9848380427291523, + "grad_norm": 1.387197494506836, + "learning_rate": 8.758223367075212e-12, + "logits/chosen": -3.0078389644622803, + "logits/rejected": -2.978468656539917, + "logps/chosen": -59.639747619628906, + "logps/rejected": -58.05632781982422, + "loss": 0.683, + "rewards/accuracies": 0.6265624761581421, + "rewards/chosen": -0.04575073719024658, + "rewards/margins": 0.021932676434516907, + "rewards/rejected": -0.06768341362476349, + "step": 2880 + }, + { + "epoch": 1.9917298414886284, + "grad_norm": 1.3310401439666748, + "learning_rate": 2.605859294749213e-12, + "logits/chosen": -3.018655300140381, + "logits/rejected": -2.994286298751831, + "logps/chosen": -57.234657287597656, + "logps/rejected": -59.07853317260742, + "loss": 0.6825, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.046042539179325104, + "rewards/margins": 0.022625811398029327, + "rewards/rejected": -0.06866835057735443, + "step": 2890 + }, + { + "epoch": 1.9986216402481047, + "grad_norm": 1.3212953805923462, + "learning_rate": 7.2386203012198e-14, + "logits/chosen": -3.0002474784851074, + "logits/rejected": -2.9772603511810303, + "logps/chosen": -59.88157272338867, + "logps/rejected": -60.886680603027344, + "loss": 0.6836, + "rewards/accuracies": 0.614062488079071, + "rewards/chosen": -0.04801579564809799, + "rewards/margins": 0.0204045120626688, + "rewards/rejected": -0.06842031329870224, + "step": 2900 + }, + { + "epoch": 1.9986216402481047, + "eval_logits/chosen": -3.100578546524048, + "eval_logits/rejected": -3.094856023788452, + "eval_logps/chosen": -60.72254180908203, + "eval_logps/rejected": -66.06378173828125, + "eval_loss": 0.6891194581985474, + "eval_rewards/accuracies": 0.5910780429840088, + "eval_rewards/chosen": -0.020106395706534386, + "eval_rewards/margins": 0.008730227127671242, + "eval_rewards/rejected": -0.028836622834205627, + "eval_runtime": 383.6501, + "eval_samples_per_second": 11.219, + "eval_steps_per_second": 1.402, + "step": 2900 + }, + { + "epoch": 2.0, + "step": 2902, + "total_flos": 0.0, + "train_loss": 0.6870454205553093, + "train_runtime": 56536.4846, + "train_samples_per_second": 3.285, + "train_steps_per_second": 0.051 + } + ], + "logging_steps": 10, + "max_steps": 2902, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}