{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 11608, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00017229496898690558, "grad_norm": 2.1822969913482666, "learning_rate": 1.7226528854435833e-11, "logits/chosen": -2.967046022415161, "logits/rejected": -2.9243061542510986, "logps/chosen": -43.99115753173828, "logps/rejected": -41.627906799316406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0017229496898690559, "grad_norm": 2.3854050636291504, "learning_rate": 1.7226528854435832e-10, "logits/chosen": -3.055169105529785, "logits/rejected": -3.025726795196533, "logps/chosen": -50.45830535888672, "logps/rejected": -49.59857177734375, "loss": 0.693, "rewards/accuracies": 0.3819444477558136, "rewards/chosen": 7.992664905032143e-05, "rewards/margins": 0.00021500879665836692, "rewards/rejected": -0.00013508212578017265, "step": 10 }, { "epoch": 0.0034458993797381117, "grad_norm": 2.243231773376465, "learning_rate": 3.4453057708871663e-10, "logits/chosen": -3.1189680099487305, "logits/rejected": -3.110758066177368, "logps/chosen": -52.657142639160156, "logps/rejected": -52.99263381958008, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 5.9444828366395086e-05, "rewards/margins": -6.718172517139465e-05, "rewards/rejected": 0.0001266265317099169, "step": 20 }, { "epoch": 0.005168849069607168, "grad_norm": 2.578056573867798, "learning_rate": 5.167958656330749e-10, "logits/chosen": -3.0915324687957764, "logits/rejected": -3.067788600921631, "logps/chosen": -56.78974151611328, "logps/rejected": -58.443809509277344, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 6.385785673046485e-05, "rewards/margins": 0.0001299582072533667, "rewards/rejected": -6.610035052290186e-05, "step": 30 }, { "epoch": 0.006891798759476223, "grad_norm": 2.0117297172546387, "learning_rate": 6.890611541774333e-10, "logits/chosen": -3.105164051055908, "logits/rejected": -3.0736613273620605, "logps/chosen": -55.2633056640625, "logps/rejected": -50.67898178100586, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 8.863389666657895e-05, "rewards/margins": 2.5206496502505615e-05, "rewards/rejected": 6.342738925013691e-05, "step": 40 }, { "epoch": 0.00861474844934528, "grad_norm": 2.3875701427459717, "learning_rate": 8.613264427217916e-10, "logits/chosen": -3.1009817123413086, "logits/rejected": -3.0846290588378906, "logps/chosen": -53.1203498840332, "logps/rejected": -51.499549865722656, "loss": 0.6932, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -3.746306902030483e-05, "rewards/margins": -4.0330953197553754e-05, "rewards/rejected": 2.86787394543353e-06, "step": 50 }, { "epoch": 0.010337698139214336, "grad_norm": 2.789041757583618, "learning_rate": 1.0335917312661499e-09, "logits/chosen": -3.153869867324829, "logits/rejected": -3.1241626739501953, "logps/chosen": -57.59900665283203, "logps/rejected": -54.145477294921875, "loss": 0.6934, "rewards/accuracies": 0.40625, "rewards/chosen": -0.00012567281373776495, "rewards/margins": -0.0004897199687547982, "rewards/rejected": 0.0003640470968093723, "step": 60 }, { "epoch": 0.012060647829083391, "grad_norm": 2.1988277435302734, "learning_rate": 1.2058570198105082e-09, "logits/chosen": -3.0509283542633057, "logits/rejected": -3.030928134918213, "logps/chosen": -53.77088165283203, "logps/rejected": -53.22446823120117, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.639079346205108e-05, "rewards/margins": -1.7232532627531327e-05, "rewards/rejected": 8.417169965468929e-07, "step": 70 }, { "epoch": 0.013783597518952447, "grad_norm": 2.439988136291504, "learning_rate": 1.3781223083548665e-09, "logits/chosen": -3.159721851348877, "logits/rejected": -3.126398801803589, "logps/chosen": -59.07847213745117, "logps/rejected": -54.11749267578125, "loss": 0.6929, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.00017951996414922178, "rewards/margins": 0.0003985256771557033, "rewards/rejected": -0.0002190056984545663, "step": 80 }, { "epoch": 0.015506547208821502, "grad_norm": 2.474202871322632, "learning_rate": 1.5503875968992249e-09, "logits/chosen": -2.9933810234069824, "logits/rejected": -2.9786269664764404, "logps/chosen": -53.468894958496094, "logps/rejected": -52.8430290222168, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 2.1566442228504457e-05, "rewards/margins": 3.089628808083944e-05, "rewards/rejected": -9.329845852334984e-06, "step": 90 }, { "epoch": 0.01722949689869056, "grad_norm": 2.488950252532959, "learning_rate": 1.7226528854435832e-09, "logits/chosen": -3.169787883758545, "logits/rejected": -3.107752561569214, "logps/chosen": -55.961708068847656, "logps/rejected": -49.63905715942383, "loss": 0.6932, "rewards/accuracies": 0.53125, "rewards/chosen": -0.00016789429355412722, "rewards/margins": -7.354038825724274e-05, "rewards/rejected": -9.435390529688448e-05, "step": 100 }, { "epoch": 0.01722949689869056, "eval_logits/chosen": -3.1630771160125732, "eval_logits/rejected": -3.157426118850708, "eval_logps/chosen": -58.701412200927734, "eval_logps/rejected": -63.16501998901367, "eval_loss": 0.6931708455085754, "eval_rewards/accuracies": 0.49465614557266235, "eval_rewards/chosen": 0.00010478924377821386, "eval_rewards/margins": -4.6228298742789775e-05, "eval_rewards/rejected": 0.00015101753524504602, "eval_runtime": 383.2857, "eval_samples_per_second": 11.229, "eval_steps_per_second": 1.404, "step": 100 }, { "epoch": 0.018952446588559616, "grad_norm": 2.542346715927124, "learning_rate": 1.8949181739879414e-09, "logits/chosen": -3.121835470199585, "logits/rejected": -3.0979726314544678, "logps/chosen": -55.59098434448242, "logps/rejected": -52.3350715637207, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.00013937894254922867, "rewards/margins": -9.85497172223404e-05, "rewards/rejected": -4.08292435167823e-05, "step": 110 }, { "epoch": 0.02067539627842867, "grad_norm": 2.5591018199920654, "learning_rate": 2.0671834625322997e-09, "logits/chosen": -3.065739870071411, "logits/rejected": -3.0502285957336426, "logps/chosen": -53.17518997192383, "logps/rejected": -55.5625, "loss": 0.6932, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 6.941436004126444e-05, "rewards/margins": -2.7006733944290318e-05, "rewards/rejected": 9.642112127039582e-05, "step": 120 }, { "epoch": 0.022398345968297727, "grad_norm": 2.138871908187866, "learning_rate": 2.239448751076658e-09, "logits/chosen": -3.1010239124298096, "logits/rejected": -3.087006092071533, "logps/chosen": -55.1888313293457, "logps/rejected": -53.76519775390625, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 4.798931331606582e-05, "rewards/margins": 9.56843578023836e-05, "rewards/rejected": -4.769503721036017e-05, "step": 130 }, { "epoch": 0.024121295658166782, "grad_norm": 2.427386999130249, "learning_rate": 2.4117140396210164e-09, "logits/chosen": -3.122720241546631, "logits/rejected": -3.1043787002563477, "logps/chosen": -54.18378829956055, "logps/rejected": -53.78192901611328, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 2.194441003666725e-05, "rewards/margins": 0.0002684879000298679, "rewards/rejected": -0.0002465434663463384, "step": 140 }, { "epoch": 0.025844245348035838, "grad_norm": 2.211906909942627, "learning_rate": 2.5839793281653743e-09, "logits/chosen": -3.027405261993408, "logits/rejected": -3.0095810890197754, "logps/chosen": -52.62847137451172, "logps/rejected": -52.40435791015625, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.676296778896358e-05, "rewards/margins": 5.470390988193685e-06, "rewards/rejected": -2.2233365598367527e-05, "step": 150 }, { "epoch": 0.027567195037904894, "grad_norm": 2.1558420658111572, "learning_rate": 2.756244616709733e-09, "logits/chosen": -3.0889861583709717, "logits/rejected": -3.0681259632110596, "logps/chosen": -53.5107536315918, "logps/rejected": -54.70793914794922, "loss": 0.6932, "rewards/accuracies": 0.46875, "rewards/chosen": -0.00023508230515290052, "rewards/margins": -0.0001835847506299615, "rewards/rejected": -5.1497579988790676e-05, "step": 160 }, { "epoch": 0.02929014472777395, "grad_norm": 2.360699415206909, "learning_rate": 2.9285099052540914e-09, "logits/chosen": -3.076531410217285, "logits/rejected": -3.056931734085083, "logps/chosen": -56.27750778198242, "logps/rejected": -51.34051513671875, "loss": 0.693, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 7.097180059645325e-05, "rewards/margins": 0.00026213008095510304, "rewards/rejected": -0.00019115829491056502, "step": 170 }, { "epoch": 0.031013094417643005, "grad_norm": 2.610349655151367, "learning_rate": 3.1007751937984498e-09, "logits/chosen": -3.0624287128448486, "logits/rejected": -3.043592929840088, "logps/chosen": -56.41620635986328, "logps/rejected": -53.785743713378906, "loss": 0.6932, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.00010555762855801731, "rewards/margins": -5.6592289183754474e-05, "rewards/rejected": -4.8965321184368804e-05, "step": 180 }, { "epoch": 0.03273604410751206, "grad_norm": 2.6376547813415527, "learning_rate": 3.2730404823428077e-09, "logits/chosen": -3.124223232269287, "logits/rejected": -3.0807528495788574, "logps/chosen": -58.18146896362305, "logps/rejected": -52.5531120300293, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 2.0690204109996557e-05, "rewards/margins": 0.00017370580462738872, "rewards/rejected": -0.0001530156150693074, "step": 190 }, { "epoch": 0.03445899379738112, "grad_norm": 2.581326484680176, "learning_rate": 3.4453057708871665e-09, "logits/chosen": -3.059694528579712, "logits/rejected": -3.0440192222595215, "logps/chosen": -54.1214714050293, "logps/rejected": -54.7166862487793, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 1.2389538824209012e-05, "rewards/margins": -4.3077256123069674e-05, "rewards/rejected": 5.546676038648002e-05, "step": 200 }, { "epoch": 0.03445899379738112, "eval_logits/chosen": -3.163004159927368, "eval_logits/rejected": -3.1573050022125244, "eval_logps/chosen": -58.7076301574707, "eval_logps/rejected": -63.1617546081543, "eval_loss": 0.6932182908058167, "eval_rewards/accuracies": 0.4839684069156647, "eval_rewards/chosen": 4.26560036430601e-05, "eval_rewards/margins": -0.00014103209832683206, "eval_rewards/rejected": 0.00018368809833191335, "eval_runtime": 383.3111, "eval_samples_per_second": 11.228, "eval_steps_per_second": 1.404, "step": 200 }, { "epoch": 0.03618194348725017, "grad_norm": 2.2879650592803955, "learning_rate": 3.617571059431525e-09, "logits/chosen": -3.014284610748291, "logits/rejected": -3.005575656890869, "logps/chosen": -53.272361755371094, "logps/rejected": -57.27521896362305, "loss": 0.6933, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -8.317607716890052e-05, "rewards/margins": -0.00023591746867168695, "rewards/rejected": 0.00015274141333065927, "step": 210 }, { "epoch": 0.03790489317711923, "grad_norm": 2.327847957611084, "learning_rate": 3.789836347975883e-09, "logits/chosen": -3.0507373809814453, "logits/rejected": -3.019326686859131, "logps/chosen": -52.20600128173828, "logps/rejected": -51.31000900268555, "loss": 0.6932, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -6.532550469273701e-05, "rewards/margins": -0.0001269731583306566, "rewards/rejected": 6.164763908600435e-05, "step": 220 }, { "epoch": 0.03962784286698828, "grad_norm": 2.3976006507873535, "learning_rate": 3.962101636520241e-09, "logits/chosen": -3.0511653423309326, "logits/rejected": -3.032834529876709, "logps/chosen": -48.90367889404297, "logps/rejected": -49.952972412109375, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 5.9426492953207344e-05, "rewards/margins": 0.0001413938298355788, "rewards/rejected": -8.196735143428668e-05, "step": 230 }, { "epoch": 0.04135079255685734, "grad_norm": 2.251117706298828, "learning_rate": 4.134366925064599e-09, "logits/chosen": -3.0247445106506348, "logits/rejected": -2.982290029525757, "logps/chosen": -55.94219207763672, "logps/rejected": -52.15814208984375, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0001196784432977438, "rewards/margins": 0.0001547091087559238, "rewards/rejected": -3.503066545818001e-05, "step": 240 }, { "epoch": 0.043073742246726394, "grad_norm": 2.3111438751220703, "learning_rate": 4.306632213608958e-09, "logits/chosen": -3.1180098056793213, "logits/rejected": -3.0976080894470215, "logps/chosen": -52.28120803833008, "logps/rejected": -51.09599685668945, "loss": 0.6932, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1321669262542855e-05, "rewards/margins": -2.742862307059113e-05, "rewards/rejected": 1.61069674504688e-05, "step": 250 }, { "epoch": 0.044796691936595454, "grad_norm": 2.313485622406006, "learning_rate": 4.478897502153316e-09, "logits/chosen": -3.09426212310791, "logits/rejected": -3.0820038318634033, "logps/chosen": -54.8555908203125, "logps/rejected": -56.63024139404297, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00013580010272562504, "rewards/margins": 0.0002656346478033811, "rewards/rejected": -0.0001298345159739256, "step": 260 }, { "epoch": 0.046519641626464506, "grad_norm": 2.2110888957977295, "learning_rate": 4.6511627906976744e-09, "logits/chosen": -3.03316593170166, "logits/rejected": -3.0150184631347656, "logps/chosen": -53.122108459472656, "logps/rejected": -54.308738708496094, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.5679284590296447e-05, "rewards/margins": 9.607634638086893e-06, "rewards/rejected": -3.52868992194999e-05, "step": 270 }, { "epoch": 0.048242591316333565, "grad_norm": 2.4297540187835693, "learning_rate": 4.823428079242033e-09, "logits/chosen": -3.124643325805664, "logits/rejected": -3.090567111968994, "logps/chosen": -57.60612106323242, "logps/rejected": -53.4041862487793, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 7.05211132299155e-05, "rewards/margins": -3.20916369673796e-05, "rewards/rejected": 0.00010261273564537987, "step": 280 }, { "epoch": 0.04996554100620262, "grad_norm": 2.245150566101074, "learning_rate": 4.995693367786391e-09, "logits/chosen": -3.046997547149658, "logits/rejected": -3.0325276851654053, "logps/chosen": -55.36955642700195, "logps/rejected": -54.27775192260742, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 6.89334119670093e-05, "rewards/margins": 2.902208325394895e-05, "rewards/rejected": 3.991132689407095e-05, "step": 290 }, { "epoch": 0.051688490696071676, "grad_norm": 2.361457347869873, "learning_rate": 5.167958656330749e-09, "logits/chosen": -3.0027079582214355, "logits/rejected": -2.9940621852874756, "logps/chosen": -52.8454475402832, "logps/rejected": -53.935462951660156, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -3.082995681324974e-05, "rewards/margins": -0.00011777142208302394, "rewards/rejected": 8.694147254573181e-05, "step": 300 }, { "epoch": 0.051688490696071676, "eval_logits/chosen": -3.163076639175415, "eval_logits/rejected": -3.1573708057403564, "eval_logps/chosen": -58.71210479736328, "eval_logps/rejected": -63.17204284667969, "eval_loss": 0.6931892037391663, "eval_rewards/accuracies": 0.48420074582099915, "eval_rewards/chosen": -2.114654535034788e-06, "eval_rewards/margins": -8.291260019177571e-05, "eval_rewards/rejected": 8.079793769866228e-05, "eval_runtime": 383.4681, "eval_samples_per_second": 11.224, "eval_steps_per_second": 1.403, "step": 300 }, { "epoch": 0.05341144038594073, "grad_norm": 2.4729714393615723, "learning_rate": 5.340223944875108e-09, "logits/chosen": -3.0653252601623535, "logits/rejected": -3.060011625289917, "logps/chosen": -53.5208854675293, "logps/rejected": -53.29474639892578, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.4837738490314223e-05, "rewards/margins": -2.0687919459305704e-05, "rewards/rejected": 5.8501582316239364e-06, "step": 310 }, { "epoch": 0.05513439007580979, "grad_norm": 2.3792271614074707, "learning_rate": 5.512489233419466e-09, "logits/chosen": -3.023054599761963, "logits/rejected": -2.9967429637908936, "logps/chosen": -54.5090446472168, "logps/rejected": -49.25890350341797, "loss": 0.6931, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.0441066478961147e-05, "rewards/margins": 0.00017671639216132462, "rewards/rejected": -0.00018715743499342352, "step": 320 }, { "epoch": 0.05685733976567884, "grad_norm": 2.3428661823272705, "learning_rate": 5.6847545219638245e-09, "logits/chosen": -3.083404064178467, "logits/rejected": -3.059711456298828, "logps/chosen": -55.03895950317383, "logps/rejected": -52.21526336669922, "loss": 0.6932, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -5.828489156556316e-05, "rewards/margins": -8.575782885600347e-06, "rewards/rejected": -4.9709080485627055e-05, "step": 330 }, { "epoch": 0.0585802894555479, "grad_norm": 2.1629602909088135, "learning_rate": 5.857019810508183e-09, "logits/chosen": -3.005366802215576, "logits/rejected": -2.983691692352295, "logps/chosen": -52.5191535949707, "logps/rejected": -51.93767166137695, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 7.401472248602659e-05, "rewards/margins": 9.557695739204064e-05, "rewards/rejected": -2.1562223992077634e-05, "step": 340 }, { "epoch": 0.06030323914541695, "grad_norm": 2.317671298980713, "learning_rate": 6.02928509905254e-09, "logits/chosen": -2.977421998977661, "logits/rejected": -2.937859296798706, "logps/chosen": -56.217933654785156, "logps/rejected": -53.575584411621094, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -8.471935871057212e-05, "rewards/margins": 0.00010555875633144751, "rewards/rejected": -0.00019027813686989248, "step": 350 }, { "epoch": 0.06202618883528601, "grad_norm": 2.4035069942474365, "learning_rate": 6.2015503875968995e-09, "logits/chosen": -3.128420114517212, "logits/rejected": -3.105487585067749, "logps/chosen": -54.573753356933594, "logps/rejected": -50.5115966796875, "loss": 0.6931, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0001374700659653172, "rewards/margins": 0.00011274970893282443, "rewards/rejected": 2.4720366127439775e-05, "step": 360 }, { "epoch": 0.06374913852515507, "grad_norm": 2.323314666748047, "learning_rate": 6.373815676141258e-09, "logits/chosen": -3.10359263420105, "logits/rejected": -3.074420213699341, "logps/chosen": -52.39251708984375, "logps/rejected": -51.35547637939453, "loss": 0.6931, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -7.354580156970769e-05, "rewards/margins": 7.115851622074842e-05, "rewards/rejected": -0.00014470433234237134, "step": 370 }, { "epoch": 0.06547208821502412, "grad_norm": 2.0820016860961914, "learning_rate": 6.546080964685615e-09, "logits/chosen": -3.203951597213745, "logits/rejected": -3.17911958694458, "logps/chosen": -53.507789611816406, "logps/rejected": -52.19771194458008, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -4.075709875905886e-05, "rewards/margins": 3.0424340366153046e-05, "rewards/rejected": -7.11814864189364e-05, "step": 380 }, { "epoch": 0.06719503790489317, "grad_norm": 2.4141087532043457, "learning_rate": 6.7183462532299746e-09, "logits/chosen": -3.1004703044891357, "logits/rejected": -3.0749688148498535, "logps/chosen": -56.06284713745117, "logps/rejected": -55.22441864013672, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 2.6862904633162543e-05, "rewards/margins": 0.00019700443954207003, "rewards/rejected": -0.00017014151671901345, "step": 390 }, { "epoch": 0.06891798759476224, "grad_norm": 2.0874099731445312, "learning_rate": 6.890611541774333e-09, "logits/chosen": -3.0706710815429688, "logits/rejected": -3.054839849472046, "logps/chosen": -52.64934539794922, "logps/rejected": -52.7557258605957, "loss": 0.6933, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.00010136842320207506, "rewards/margins": -0.00021895563986618072, "rewards/rejected": 0.00011758720211219043, "step": 400 }, { "epoch": 0.06891798759476224, "eval_logits/chosen": -3.163289785385132, "eval_logits/rejected": -3.1576626300811768, "eval_logps/chosen": -58.715309143066406, "eval_logps/rejected": -63.17881774902344, "eval_loss": 0.6931713819503784, "eval_rewards/accuracies": 0.48559480905532837, "eval_rewards/chosen": -3.4172830055467784e-05, "eval_rewards/margins": -4.7213809011736885e-05, "eval_rewards/rejected": 1.304098896071082e-05, "eval_runtime": 383.4664, "eval_samples_per_second": 11.224, "eval_steps_per_second": 1.403, "step": 400 }, { "epoch": 0.07064093728463129, "grad_norm": 2.1497490406036377, "learning_rate": 7.0628768303186904e-09, "logits/chosen": -3.0745506286621094, "logits/rejected": -3.0703208446502686, "logps/chosen": -50.75954818725586, "logps/rejected": -55.532379150390625, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.00014055120118428022, "rewards/margins": -9.702378883957863e-05, "rewards/rejected": -4.35274196206592e-05, "step": 410 }, { "epoch": 0.07236388697450034, "grad_norm": 2.540682792663574, "learning_rate": 7.23514211886305e-09, "logits/chosen": -3.0597333908081055, "logits/rejected": -3.0517995357513428, "logps/chosen": -54.171173095703125, "logps/rejected": -53.88414764404297, "loss": 0.6933, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -5.193626202526502e-05, "rewards/margins": -0.00034823891473934054, "rewards/rejected": 0.00029630266362801194, "step": 420 }, { "epoch": 0.0740868366643694, "grad_norm": 2.244760274887085, "learning_rate": 7.407407407407407e-09, "logits/chosen": -3.0882716178894043, "logits/rejected": -3.074573040008545, "logps/chosen": -53.08251953125, "logps/rejected": -54.14112091064453, "loss": 0.6932, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 8.592494123149663e-05, "rewards/margins": -0.00013404383207671344, "rewards/rejected": 0.00021996880241204053, "step": 430 }, { "epoch": 0.07580978635423846, "grad_norm": 2.5090692043304443, "learning_rate": 7.579672695951765e-09, "logits/chosen": -3.1335277557373047, "logits/rejected": -3.098327398300171, "logps/chosen": -54.31789016723633, "logps/rejected": -53.2615966796875, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.00013140590453986079, "rewards/margins": 2.339702950848732e-05, "rewards/rejected": -0.00015480289584957063, "step": 440 }, { "epoch": 0.07753273604410751, "grad_norm": 2.2801172733306885, "learning_rate": 7.751937984496125e-09, "logits/chosen": -3.056034803390503, "logits/rejected": -3.023494005203247, "logps/chosen": -56.08808517456055, "logps/rejected": -54.58913040161133, "loss": 0.6932, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 7.79370020609349e-05, "rewards/margins": -4.6285451389849186e-05, "rewards/rejected": 0.0001242224534507841, "step": 450 }, { "epoch": 0.07925568573397657, "grad_norm": 2.3933279514312744, "learning_rate": 7.924203273040482e-09, "logits/chosen": -3.0245463848114014, "logits/rejected": -3.0049405097961426, "logps/chosen": -56.2148551940918, "logps/rejected": -52.991844177246094, "loss": 0.6932, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.00014541010023094714, "rewards/margins": -0.00010043817746918648, "rewards/rejected": -4.497191548580304e-05, "step": 460 }, { "epoch": 0.08097863542384562, "grad_norm": 2.2035293579101562, "learning_rate": 8.096468561584841e-09, "logits/chosen": -3.0475990772247314, "logits/rejected": -3.016028642654419, "logps/chosen": -53.28934860229492, "logps/rejected": -51.39487838745117, "loss": 0.6933, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.0002375897893216461, "rewards/margins": -0.0002828095166478306, "rewards/rejected": 4.52197200502269e-05, "step": 470 }, { "epoch": 0.08270158511371468, "grad_norm": 2.4459140300750732, "learning_rate": 8.268733850129199e-09, "logits/chosen": -3.045457363128662, "logits/rejected": -3.040175199508667, "logps/chosen": -54.21479415893555, "logps/rejected": -58.94578170776367, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00024394100182689726, "rewards/margins": 0.00010521084914216772, "rewards/rejected": 0.0001387301308568567, "step": 480 }, { "epoch": 0.08442453480358374, "grad_norm": 2.470845937728882, "learning_rate": 8.440999138673558e-09, "logits/chosen": -2.955177068710327, "logits/rejected": -2.905369520187378, "logps/chosen": -60.59124755859375, "logps/rejected": -51.36162567138672, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.00024259297060780227, "rewards/margins": 0.00047093481407500803, "rewards/rejected": -0.000228341858019121, "step": 490 }, { "epoch": 0.08614748449345279, "grad_norm": 2.259693145751953, "learning_rate": 8.613264427217916e-09, "logits/chosen": -3.0174524784088135, "logits/rejected": -2.9887466430664062, "logps/chosen": -54.963653564453125, "logps/rejected": -51.62682342529297, "loss": 0.693, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00013784432667307556, "rewards/margins": 0.00024374081112910062, "rewards/rejected": -0.0003815850941464305, "step": 500 }, { "epoch": 0.08614748449345279, "eval_logits/chosen": -3.1633033752441406, "eval_logits/rejected": -3.157639265060425, "eval_logps/chosen": -58.70395278930664, "eval_logps/rejected": -63.16484832763672, "eval_loss": 0.6931844353675842, "eval_rewards/accuracies": 0.4846654236316681, "eval_rewards/chosen": 7.943623495521024e-05, "eval_rewards/margins": -7.329090294661e-05, "eval_rewards/rejected": 0.00015272715245373547, "eval_runtime": 383.5733, "eval_samples_per_second": 11.221, "eval_steps_per_second": 1.403, "step": 500 }, { "epoch": 0.08787043418332184, "grad_norm": 2.237663984298706, "learning_rate": 8.785529715762273e-09, "logits/chosen": -3.0099520683288574, "logits/rejected": -2.9886584281921387, "logps/chosen": -58.27411651611328, "logps/rejected": -51.98424530029297, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 8.301792149723042e-06, "rewards/margins": -9.65881918091327e-05, "rewards/rejected": 0.00010488999396329746, "step": 510 }, { "epoch": 0.08959338387319091, "grad_norm": 2.0645158290863037, "learning_rate": 8.957795004306632e-09, "logits/chosen": -3.0576930046081543, "logits/rejected": -3.032172441482544, "logps/chosen": -56.39656448364258, "logps/rejected": -51.65898895263672, "loss": 0.693, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.00010374795965617523, "rewards/margins": 0.00039428164018318057, "rewards/rejected": -0.00029053367325104773, "step": 520 }, { "epoch": 0.09131633356305996, "grad_norm": 2.061357259750366, "learning_rate": 9.130060292850991e-09, "logits/chosen": -3.0545859336853027, "logits/rejected": -3.012650966644287, "logps/chosen": -55.69318771362305, "logps/rejected": -51.20554733276367, "loss": 0.693, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 6.437679257942364e-05, "rewards/margins": 0.0002606067282613367, "rewards/rejected": -0.00019622994295787066, "step": 530 }, { "epoch": 0.09303928325292901, "grad_norm": 2.2379069328308105, "learning_rate": 9.302325581395349e-09, "logits/chosen": -3.0406105518341064, "logits/rejected": -3.0240485668182373, "logps/chosen": -52.84473419189453, "logps/rejected": -52.934791564941406, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 7.517748599639162e-05, "rewards/margins": 0.0002457521914038807, "rewards/rejected": -0.00017057466902770102, "step": 540 }, { "epoch": 0.09476223294279806, "grad_norm": 2.266610622406006, "learning_rate": 9.474590869939708e-09, "logits/chosen": -3.1011712551116943, "logits/rejected": -3.0831127166748047, "logps/chosen": -53.59284210205078, "logps/rejected": -51.971588134765625, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0001898288755910471, "rewards/margins": -0.0001018575276248157, "rewards/rejected": -8.797131886240095e-05, "step": 550 }, { "epoch": 0.09648518263266713, "grad_norm": 2.573582649230957, "learning_rate": 9.646856158484066e-09, "logits/chosen": -3.072469472885132, "logits/rejected": -3.0646400451660156, "logps/chosen": -52.102210998535156, "logps/rejected": -54.95751190185547, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 6.365185981849208e-05, "rewards/margins": -6.325983122223988e-05, "rewards/rejected": 0.00012691169104073197, "step": 560 }, { "epoch": 0.09820813232253618, "grad_norm": 2.2397186756134033, "learning_rate": 9.819121447028425e-09, "logits/chosen": -3.0452890396118164, "logits/rejected": -3.0365958213806152, "logps/chosen": -51.370277404785156, "logps/rejected": -53.8305778503418, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -8.120768325170502e-05, "rewards/margins": 4.172849003225565e-06, "rewards/rejected": -8.538054680684581e-05, "step": 570 }, { "epoch": 0.09993108201240523, "grad_norm": 1.768967628479004, "learning_rate": 9.991386735572782e-09, "logits/chosen": -3.0521512031555176, "logits/rejected": -3.0463624000549316, "logps/chosen": -51.12982940673828, "logps/rejected": -53.257110595703125, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.00021762735559605062, "rewards/margins": -6.527120422106236e-05, "rewards/rejected": -0.00015235615137498826, "step": 580 }, { "epoch": 0.1016540317022743, "grad_norm": 2.06492018699646, "learning_rate": 1.016365202411714e-08, "logits/chosen": -3.051334857940674, "logits/rejected": -3.0294275283813477, "logps/chosen": -54.873023986816406, "logps/rejected": -54.8026008605957, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 5.9067795518785715e-05, "rewards/margins": 0.000223180977627635, "rewards/rejected": -0.00016411316755693406, "step": 590 }, { "epoch": 0.10337698139214335, "grad_norm": 2.32629132270813, "learning_rate": 1.0335917312661497e-08, "logits/chosen": -3.024348735809326, "logits/rejected": -3.0000967979431152, "logps/chosen": -53.887939453125, "logps/rejected": -56.68596649169922, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -4.189003448118456e-05, "rewards/margins": 0.00013417910668067634, "rewards/rejected": -0.0001760691375238821, "step": 600 }, { "epoch": 0.10337698139214335, "eval_logits/chosen": -3.162902355194092, "eval_logits/rejected": -3.157241106033325, "eval_logps/chosen": -58.70703125, "eval_logps/rejected": -63.17946243286133, "eval_loss": 0.6931267976760864, "eval_rewards/accuracies": 0.4960501790046692, "eval_rewards/chosen": 4.8649228119757026e-05, "eval_rewards/margins": 4.204766082693823e-05, "eval_rewards/rejected": 6.601568657060852e-06, "eval_runtime": 383.5477, "eval_samples_per_second": 11.222, "eval_steps_per_second": 1.403, "step": 600 }, { "epoch": 0.1050999310820124, "grad_norm": 2.2237448692321777, "learning_rate": 1.0508182601205858e-08, "logits/chosen": -2.9901814460754395, "logits/rejected": -2.987638473510742, "logps/chosen": -52.5461540222168, "logps/rejected": -53.254302978515625, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 0.00018452919903211296, "rewards/margins": 0.00011761534551624209, "rewards/rejected": 6.691385351587087e-05, "step": 610 }, { "epoch": 0.10682288077188146, "grad_norm": 2.368446111679077, "learning_rate": 1.0680447889750216e-08, "logits/chosen": -3.149543523788452, "logits/rejected": -3.122969150543213, "logps/chosen": -55.02565383911133, "logps/rejected": -53.259056091308594, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -6.642246444243938e-05, "rewards/margins": -0.00014482057304121554, "rewards/rejected": 7.839810859877616e-05, "step": 620 }, { "epoch": 0.10854583046175052, "grad_norm": 2.478334903717041, "learning_rate": 1.0852713178294573e-08, "logits/chosen": -3.1329097747802734, "logits/rejected": -3.1060264110565186, "logps/chosen": -53.866172790527344, "logps/rejected": -50.85861587524414, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": -1.3715110071643721e-05, "rewards/margins": 3.362820280017331e-05, "rewards/rejected": -4.7343302867375314e-05, "step": 630 }, { "epoch": 0.11026878015161957, "grad_norm": 2.4779860973358154, "learning_rate": 1.1024978466838932e-08, "logits/chosen": -3.1013245582580566, "logits/rejected": -3.0902600288391113, "logps/chosen": -52.90496063232422, "logps/rejected": -54.33183670043945, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 7.12585897417739e-05, "rewards/margins": 0.00012497395800892264, "rewards/rejected": -5.371534643927589e-05, "step": 640 }, { "epoch": 0.11199172984148863, "grad_norm": 2.6754417419433594, "learning_rate": 1.1197243755383291e-08, "logits/chosen": -3.1137092113494873, "logits/rejected": -3.1150639057159424, "logps/chosen": -51.477821350097656, "logps/rejected": -54.8226318359375, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -2.929338370449841e-05, "rewards/margins": -1.7334494259557687e-05, "rewards/rejected": -1.1958894901908934e-05, "step": 650 }, { "epoch": 0.11371467953135768, "grad_norm": 2.2342963218688965, "learning_rate": 1.1369509043927649e-08, "logits/chosen": -3.0037097930908203, "logits/rejected": -2.9974873065948486, "logps/chosen": -54.63112258911133, "logps/rejected": -52.28680419921875, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.072547769988887e-05, "rewards/margins": 0.00020177674014121294, "rewards/rejected": -0.0002325022069271654, "step": 660 }, { "epoch": 0.11543762922122675, "grad_norm": 2.200890064239502, "learning_rate": 1.1541774332472008e-08, "logits/chosen": -3.0266995429992676, "logits/rejected": -3.02182936668396, "logps/chosen": -53.01947021484375, "logps/rejected": -57.498291015625, "loss": 0.6933, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.00013879637117497623, "rewards/margins": -0.00036192036350257695, "rewards/rejected": 0.0002231239777756855, "step": 670 }, { "epoch": 0.1171605789110958, "grad_norm": 2.317094564437866, "learning_rate": 1.1714039621016366e-08, "logits/chosen": -2.97847056388855, "logits/rejected": -2.9534149169921875, "logps/chosen": -53.9498405456543, "logps/rejected": -50.85622024536133, "loss": 0.6933, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.00016623221745248884, "rewards/margins": -0.00025777897099033, "rewards/rejected": 9.154676081379876e-05, "step": 680 }, { "epoch": 0.11888352860096485, "grad_norm": 2.5922181606292725, "learning_rate": 1.1886304909560723e-08, "logits/chosen": -3.1273410320281982, "logits/rejected": -3.0974478721618652, "logps/chosen": -59.101234436035156, "logps/rejected": -50.5319709777832, "loss": 0.6933, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.00018734775949269533, "rewards/margins": -0.00028559009660966694, "rewards/rejected": 9.8242329841014e-05, "step": 690 }, { "epoch": 0.1206064782908339, "grad_norm": 2.1908469200134277, "learning_rate": 1.205857019810508e-08, "logits/chosen": -3.0850281715393066, "logits/rejected": -3.0565896034240723, "logps/chosen": -55.72981643676758, "logps/rejected": -52.97819137573242, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -1.571721804793924e-05, "rewards/margins": -4.1126662836177275e-05, "rewards/rejected": 2.540946297813207e-05, "step": 700 }, { "epoch": 0.1206064782908339, "eval_logits/chosen": -3.16308331489563, "eval_logits/rejected": -3.1573946475982666, "eval_logps/chosen": -58.70769119262695, "eval_logps/rejected": -63.170066833496094, "eval_loss": 0.6931769847869873, "eval_rewards/accuracies": 0.4911710023880005, "eval_rewards/chosen": 4.198389797238633e-05, "eval_rewards/margins": -5.857193536940031e-05, "eval_rewards/rejected": 0.00010055583697976544, "eval_runtime": 383.7985, "eval_samples_per_second": 11.214, "eval_steps_per_second": 1.402, "step": 700 }, { "epoch": 0.12232942798070297, "grad_norm": 2.2663283348083496, "learning_rate": 1.2230835486649442e-08, "logits/chosen": -3.063431978225708, "logits/rejected": -3.0346083641052246, "logps/chosen": -54.637359619140625, "logps/rejected": -54.698516845703125, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00011339558841427788, "rewards/margins": 0.00011806526163127273, "rewards/rejected": -4.669668669521343e-06, "step": 710 }, { "epoch": 0.12405237767057202, "grad_norm": 2.4980812072753906, "learning_rate": 1.2403100775193799e-08, "logits/chosen": -3.0252647399902344, "logits/rejected": -3.0209569931030273, "logps/chosen": -53.32914352416992, "logps/rejected": -54.40614700317383, "loss": 0.6932, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.00012032913218718022, "rewards/margins": -3.563683640095405e-05, "rewards/rejected": -8.469230670016259e-05, "step": 720 }, { "epoch": 0.12577532736044109, "grad_norm": 2.3726646900177, "learning_rate": 1.2575366063738157e-08, "logits/chosen": -3.1419854164123535, "logits/rejected": -3.116884708404541, "logps/chosen": -56.341453552246094, "logps/rejected": -52.523292541503906, "loss": 0.6932, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -9.19977537705563e-05, "rewards/margins": -7.002463098615408e-05, "rewards/rejected": -2.197313733631745e-05, "step": 730 }, { "epoch": 0.12749827705031014, "grad_norm": 2.221714735031128, "learning_rate": 1.2747631352282516e-08, "logits/chosen": -3.02409291267395, "logits/rejected": -2.9982411861419678, "logps/chosen": -54.91374588012695, "logps/rejected": -53.7316780090332, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 3.111477053607814e-05, "rewards/margins": 0.00035662594018504024, "rewards/rejected": -0.0003255111805628985, "step": 740 }, { "epoch": 0.1292212267401792, "grad_norm": 2.4172091484069824, "learning_rate": 1.2919896640826873e-08, "logits/chosen": -3.1980748176574707, "logits/rejected": -3.169914960861206, "logps/chosen": -56.0087776184082, "logps/rejected": -54.03033447265625, "loss": 0.693, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 3.72965369024314e-05, "rewards/margins": 0.0002863496483769268, "rewards/rejected": -0.000249053118750453, "step": 750 }, { "epoch": 0.13094417643004824, "grad_norm": 2.5564565658569336, "learning_rate": 1.309216192937123e-08, "logits/chosen": -3.0481514930725098, "logits/rejected": -3.009295701980591, "logps/chosen": -54.42664337158203, "logps/rejected": -49.540592193603516, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 9.81381963356398e-05, "rewards/margins": 0.0003268247819505632, "rewards/rejected": -0.00022868656378705055, "step": 760 }, { "epoch": 0.1326671261199173, "grad_norm": 2.1019692420959473, "learning_rate": 1.3264427217915592e-08, "logits/chosen": -3.098783254623413, "logits/rejected": -3.075409412384033, "logps/chosen": -52.925331115722656, "logps/rejected": -52.193687438964844, "loss": 0.6933, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -6.0318860050756484e-05, "rewards/margins": -0.00021020628628320992, "rewards/rejected": 0.0001498874044045806, "step": 770 }, { "epoch": 0.13439007580978635, "grad_norm": 2.6707863807678223, "learning_rate": 1.3436692506459949e-08, "logits/chosen": -3.093926191329956, "logits/rejected": -3.062821865081787, "logps/chosen": -53.164955139160156, "logps/rejected": -51.190185546875, "loss": 0.6932, "rewards/accuracies": 0.46875, "rewards/chosen": -0.00024976825807243586, "rewards/margins": -5.909635729040019e-05, "rewards/rejected": -0.00019067191169597208, "step": 780 }, { "epoch": 0.1361130254996554, "grad_norm": 2.2457172870635986, "learning_rate": 1.3608957795004307e-08, "logits/chosen": -3.101630687713623, "logits/rejected": -3.0681843757629395, "logps/chosen": -53.92539596557617, "logps/rejected": -53.56317138671875, "loss": 0.6933, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.00018752229516394436, "rewards/margins": -0.00024204616784118116, "rewards/rejected": 5.452388359117322e-05, "step": 790 }, { "epoch": 0.13783597518952448, "grad_norm": 2.69878888130188, "learning_rate": 1.3781223083548666e-08, "logits/chosen": -2.9877758026123047, "logits/rejected": -2.961682081222534, "logps/chosen": -55.00641632080078, "logps/rejected": -54.427734375, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -8.858703949954361e-05, "rewards/margins": 0.00023625604808330536, "rewards/rejected": -0.0003248431021347642, "step": 800 }, { "epoch": 0.13783597518952448, "eval_logits/chosen": -3.163255214691162, "eval_logits/rejected": -3.1575722694396973, "eval_logps/chosen": -58.69498062133789, "eval_logps/rejected": -63.16041946411133, "eval_loss": 0.6931617259979248, "eval_rewards/accuracies": 0.490938663482666, "eval_rewards/chosen": 0.0001691343932179734, "eval_rewards/margins": -2.792733175738249e-05, "eval_rewards/rejected": 0.00019706170132849365, "eval_runtime": 383.156, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 800 }, { "epoch": 0.13955892487939353, "grad_norm": 2.2642996311187744, "learning_rate": 1.3953488372093023e-08, "logits/chosen": -3.0624594688415527, "logits/rejected": -3.0343310832977295, "logps/chosen": -56.66850662231445, "logps/rejected": -55.576751708984375, "loss": 0.6932, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.00031107664108276367, "rewards/margins": -0.00017458898946642876, "rewards/rejected": -0.00013648762251250446, "step": 810 }, { "epoch": 0.14128187456926258, "grad_norm": 2.155913829803467, "learning_rate": 1.4125753660637381e-08, "logits/chosen": -3.1119720935821533, "logits/rejected": -3.086433172225952, "logps/chosen": -51.55561447143555, "logps/rejected": -50.739524841308594, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -7.932411972433329e-05, "rewards/margins": 4.6575911255786195e-05, "rewards/rejected": -0.0001259000418940559, "step": 820 }, { "epoch": 0.14300482425913164, "grad_norm": 2.428403615951538, "learning_rate": 1.429801894918174e-08, "logits/chosen": -3.039891481399536, "logits/rejected": -3.0244946479797363, "logps/chosen": -54.586090087890625, "logps/rejected": -54.00188446044922, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -6.285020208451897e-05, "rewards/margins": 0.00019378354772925377, "rewards/rejected": -0.0002566337352618575, "step": 830 }, { "epoch": 0.1447277739490007, "grad_norm": 2.541550397872925, "learning_rate": 1.44702842377261e-08, "logits/chosen": -3.14371919631958, "logits/rejected": -3.117215156555176, "logps/chosen": -54.11638259887695, "logps/rejected": -49.55205535888672, "loss": 0.6932, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.00021488538186531514, "rewards/margins": -0.00011695261491695419, "rewards/rejected": -9.793278150027618e-05, "step": 840 }, { "epoch": 0.14645072363886974, "grad_norm": 2.313411235809326, "learning_rate": 1.4642549526270457e-08, "logits/chosen": -3.0190269947052, "logits/rejected": -3.008479118347168, "logps/chosen": -50.86606979370117, "logps/rejected": -54.94173049926758, "loss": 0.6933, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.00044976655044592917, "rewards/margins": -0.00038919810322113335, "rewards/rejected": -6.056845450075343e-05, "step": 850 }, { "epoch": 0.1481736733287388, "grad_norm": 2.3254234790802, "learning_rate": 1.4814814814814814e-08, "logits/chosen": -3.0448098182678223, "logits/rejected": -3.0247676372528076, "logps/chosen": -52.861968994140625, "logps/rejected": -52.325035095214844, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.7349426090950146e-05, "rewards/margins": 0.00015882976003922522, "rewards/rejected": -0.0001861791533883661, "step": 860 }, { "epoch": 0.14989662301860784, "grad_norm": 1.9174031019210815, "learning_rate": 1.4987080103359175e-08, "logits/chosen": -3.1186442375183105, "logits/rejected": -3.114907741546631, "logps/chosen": -51.29471969604492, "logps/rejected": -53.679344177246094, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 6.848828343208879e-05, "rewards/margins": 0.0003035779227502644, "rewards/rejected": -0.00023508965387009084, "step": 870 }, { "epoch": 0.15161957270847692, "grad_norm": 1.8897603750228882, "learning_rate": 1.515934539190353e-08, "logits/chosen": -3.032423734664917, "logits/rejected": -3.009608507156372, "logps/chosen": -51.75300979614258, "logps/rejected": -51.415382385253906, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0002309719566255808, "rewards/margins": 0.00012134017742937431, "rewards/rejected": -0.0003523121413309127, "step": 880 }, { "epoch": 0.15334252239834598, "grad_norm": 2.2520952224731445, "learning_rate": 1.533161068044789e-08, "logits/chosen": -3.060913562774658, "logits/rejected": -3.0235142707824707, "logps/chosen": -58.44805145263672, "logps/rejected": -54.19799041748047, "loss": 0.6928, "rewards/accuracies": 0.625, "rewards/chosen": 0.0003900781739503145, "rewards/margins": 0.000767029938288033, "rewards/rejected": -0.0003769517061300576, "step": 890 }, { "epoch": 0.15506547208821503, "grad_norm": 2.146815538406372, "learning_rate": 1.550387596899225e-08, "logits/chosen": -3.0814216136932373, "logits/rejected": -3.070983409881592, "logps/chosen": -54.16011428833008, "logps/rejected": -52.36823272705078, "loss": 0.6934, "rewards/accuracies": 0.375, "rewards/chosen": -0.00023367287940345705, "rewards/margins": -0.00040599278872832656, "rewards/rejected": 0.00017231988022103906, "step": 900 }, { "epoch": 0.15506547208821503, "eval_logits/chosen": -3.1633338928222656, "eval_logits/rejected": -3.1576449871063232, "eval_logps/chosen": -58.702510833740234, "eval_logps/rejected": -63.16947937011719, "eval_loss": 0.6931542754173279, "eval_rewards/accuracies": 0.5060408711433411, "eval_rewards/chosen": 9.381815470987931e-05, "eval_rewards/margins": -1.2631733625312336e-05, "eval_rewards/rejected": 0.00010644988651620224, "eval_runtime": 383.1861, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 900 }, { "epoch": 0.15678842177808408, "grad_norm": 2.1222777366638184, "learning_rate": 1.567614125753661e-08, "logits/chosen": -3.0520882606506348, "logits/rejected": -3.0439276695251465, "logps/chosen": -51.022674560546875, "logps/rejected": -52.06635284423828, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 4.3686806748155504e-05, "rewards/margins": 0.0001305050973314792, "rewards/rejected": -8.68182978592813e-05, "step": 910 }, { "epoch": 0.15851137146795313, "grad_norm": 2.3210599422454834, "learning_rate": 1.5848406546080964e-08, "logits/chosen": -3.0938668251037598, "logits/rejected": -3.0504050254821777, "logps/chosen": -54.33452224731445, "logps/rejected": -49.48955535888672, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.00012349920871201903, "rewards/margins": 0.0002427975705359131, "rewards/rejected": -0.00036629679379984736, "step": 920 }, { "epoch": 0.16023432115782218, "grad_norm": 2.5764107704162598, "learning_rate": 1.6020671834625323e-08, "logits/chosen": -3.155245780944824, "logits/rejected": -3.1392102241516113, "logps/chosen": -52.4669189453125, "logps/rejected": -54.50859832763672, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0001693625090410933, "rewards/margins": 0.0003471010422799736, "rewards/rejected": -0.00017773854779079556, "step": 930 }, { "epoch": 0.16195727084769124, "grad_norm": 2.1913082599639893, "learning_rate": 1.6192937123169683e-08, "logits/chosen": -3.1352171897888184, "logits/rejected": -3.097580671310425, "logps/chosen": -60.212013244628906, "logps/rejected": -54.92094802856445, "loss": 0.693, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -6.684206891804934e-05, "rewards/margins": 0.00029070020536892116, "rewards/rejected": -0.0003575422742869705, "step": 940 }, { "epoch": 0.16368022053756032, "grad_norm": 2.1950390338897705, "learning_rate": 1.636520241171404e-08, "logits/chosen": -2.9277501106262207, "logits/rejected": -2.910341739654541, "logps/chosen": -55.29688262939453, "logps/rejected": -55.66237258911133, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.00025500665651634336, "rewards/margins": 5.84846711717546e-05, "rewards/rejected": -0.00031349132768809795, "step": 950 }, { "epoch": 0.16540317022742937, "grad_norm": 2.2826006412506104, "learning_rate": 1.6537467700258398e-08, "logits/chosen": -2.8967947959899902, "logits/rejected": -2.8999621868133545, "logps/chosen": -50.75765609741211, "logps/rejected": -55.64331817626953, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0001634789223317057, "rewards/margins": 0.000268703734036535, "rewards/rejected": -0.0004321826563682407, "step": 960 }, { "epoch": 0.16712611991729842, "grad_norm": 2.3014798164367676, "learning_rate": 1.6709732988802757e-08, "logits/chosen": -3.0674057006835938, "logits/rejected": -3.0318312644958496, "logps/chosen": -60.67380905151367, "logps/rejected": -52.45458221435547, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.0001028587794280611, "rewards/margins": -4.980314315616852e-06, "rewards/rejected": -9.787843737285584e-05, "step": 970 }, { "epoch": 0.16884906960716747, "grad_norm": 3.0363972187042236, "learning_rate": 1.6881998277347116e-08, "logits/chosen": -3.1694116592407227, "logits/rejected": -3.149758815765381, "logps/chosen": -56.066307067871094, "logps/rejected": -54.85820388793945, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.00017563713481649756, "rewards/margins": 7.07071740180254e-06, "rewards/rejected": -0.00018270780856255442, "step": 980 }, { "epoch": 0.17057201929703653, "grad_norm": 2.4502651691436768, "learning_rate": 1.7054263565891472e-08, "logits/chosen": -3.036480665206909, "logits/rejected": -3.013042449951172, "logps/chosen": -54.58531951904297, "logps/rejected": -52.66328048706055, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.00016072092694230378, "rewards/margins": 0.0002869007585104555, "rewards/rejected": -0.0004476216563489288, "step": 990 }, { "epoch": 0.17229496898690558, "grad_norm": 2.1972694396972656, "learning_rate": 1.722652885443583e-08, "logits/chosen": -2.9883952140808105, "logits/rejected": -2.9606387615203857, "logps/chosen": -56.881141662597656, "logps/rejected": -51.748878479003906, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0001291980588575825, "rewards/margins": -6.897748971823603e-05, "rewards/rejected": -6.022059096721932e-05, "step": 1000 }, { "epoch": 0.17229496898690558, "eval_logits/chosen": -3.1630799770355225, "eval_logits/rejected": -3.157437562942505, "eval_logps/chosen": -58.69733810424805, "eval_logps/rejected": -63.16838073730469, "eval_loss": 0.6931337118148804, "eval_rewards/accuracies": 0.4948884844779968, "eval_rewards/chosen": 0.0001455719320802018, "eval_rewards/margins": 2.818080065480899e-05, "eval_rewards/rejected": 0.00011739113688236102, "eval_runtime": 383.2811, "eval_samples_per_second": 11.229, "eval_steps_per_second": 1.404, "step": 1000 }, { "epoch": 0.17401791867677463, "grad_norm": 2.363290309906006, "learning_rate": 1.739879414298019e-08, "logits/chosen": -2.9239678382873535, "logits/rejected": -2.934494972229004, "logps/chosen": -53.7801399230957, "logps/rejected": -57.93299102783203, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.00022317534603644162, "rewards/margins": 0.0002450795436743647, "rewards/rejected": -0.000468254933366552, "step": 1010 }, { "epoch": 0.17574086836664368, "grad_norm": 2.214242458343506, "learning_rate": 1.7571059431524546e-08, "logits/chosen": -3.1074411869049072, "logits/rejected": -3.0718629360198975, "logps/chosen": -57.12139892578125, "logps/rejected": -54.26393508911133, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.00018779639503918588, "rewards/margins": 0.0001408702664775774, "rewards/rejected": -0.00032866670517250896, "step": 1020 }, { "epoch": 0.17746381805651276, "grad_norm": 2.365598201751709, "learning_rate": 1.774332472006891e-08, "logits/chosen": -3.1569809913635254, "logits/rejected": -3.129786729812622, "logps/chosen": -53.72365188598633, "logps/rejected": -51.948936462402344, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -4.860901754000224e-05, "rewards/margins": 0.0003156032762490213, "rewards/rejected": -0.00036421226104721427, "step": 1030 }, { "epoch": 0.17918676774638181, "grad_norm": 2.4483449459075928, "learning_rate": 1.7915590008613264e-08, "logits/chosen": -3.0721476078033447, "logits/rejected": -3.0619688034057617, "logps/chosen": -53.41161346435547, "logps/rejected": -55.581390380859375, "loss": 0.693, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.00013245883747003973, "rewards/margins": 0.00030615561990998685, "rewards/rejected": -0.0004386144573800266, "step": 1040 }, { "epoch": 0.18090971743625087, "grad_norm": 2.4459915161132812, "learning_rate": 1.8087855297157624e-08, "logits/chosen": -2.99334716796875, "logits/rejected": -2.9578709602355957, "logps/chosen": -57.15093231201172, "logps/rejected": -50.6357307434082, "loss": 0.6929, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.875073060858995e-05, "rewards/margins": 0.0005473211058415473, "rewards/rejected": -0.000626071821898222, "step": 1050 }, { "epoch": 0.18263266712611992, "grad_norm": 2.437818765640259, "learning_rate": 1.8260120585701983e-08, "logits/chosen": -3.002201795578003, "logits/rejected": -2.982909917831421, "logps/chosen": -56.52927780151367, "logps/rejected": -54.999961853027344, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0002802074304781854, "rewards/margins": -4.4669039198197424e-05, "rewards/rejected": -0.0002355383476242423, "step": 1060 }, { "epoch": 0.18435561681598897, "grad_norm": 2.2553157806396484, "learning_rate": 1.843238587424634e-08, "logits/chosen": -3.146183490753174, "logits/rejected": -3.112220287322998, "logps/chosen": -55.97260665893555, "logps/rejected": -53.28546905517578, "loss": 0.6932, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.000413569767260924, "rewards/margins": -4.205874938634224e-05, "rewards/rejected": -0.0003715109487529844, "step": 1070 }, { "epoch": 0.18607856650585802, "grad_norm": 2.253410816192627, "learning_rate": 1.8604651162790698e-08, "logits/chosen": -3.123624086380005, "logits/rejected": -3.100271463394165, "logps/chosen": -55.81609344482422, "logps/rejected": -50.781005859375, "loss": 0.693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0003030496300198138, "rewards/margins": 0.00020159417181275785, "rewards/rejected": -0.0005046438309364021, "step": 1080 }, { "epoch": 0.18780151619572708, "grad_norm": 2.353863477706909, "learning_rate": 1.8776916451335057e-08, "logits/chosen": -3.000624656677246, "logits/rejected": -2.9921963214874268, "logps/chosen": -52.26164627075195, "logps/rejected": -52.68159866333008, "loss": 0.6931, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0002996811526827514, "rewards/margins": 0.00017872537137009203, "rewards/rejected": -0.000478406494949013, "step": 1090 }, { "epoch": 0.18952446588559613, "grad_norm": 2.1127638816833496, "learning_rate": 1.8949181739879416e-08, "logits/chosen": -3.0667271614074707, "logits/rejected": -3.0638859272003174, "logps/chosen": -50.94106674194336, "logps/rejected": -54.64240646362305, "loss": 0.6931, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0003363923169672489, "rewards/margins": 1.02457470347872e-05, "rewards/rejected": -0.0003466380585450679, "step": 1100 }, { "epoch": 0.18952446588559613, "eval_logits/chosen": -3.162712812423706, "eval_logits/rejected": -3.1570920944213867, "eval_logps/chosen": -58.683189392089844, "eval_logps/rejected": -63.15974426269531, "eval_loss": 0.6931062936782837, "eval_rewards/accuracies": 0.515566885471344, "eval_rewards/chosen": 0.0002870217140298337, "eval_rewards/margins": 8.319580228999257e-05, "eval_rewards/rejected": 0.0002038259117398411, "eval_runtime": 383.1432, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 1100 }, { "epoch": 0.1912474155754652, "grad_norm": 2.639432668685913, "learning_rate": 1.9121447028423772e-08, "logits/chosen": -3.064542055130005, "logits/rejected": -3.0769705772399902, "logps/chosen": -52.93410110473633, "logps/rejected": -56.49309539794922, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.00038267840864136815, "rewards/margins": 2.7892394427908584e-05, "rewards/rejected": -0.00041057082125917077, "step": 1110 }, { "epoch": 0.19297036526533426, "grad_norm": 2.38193941116333, "learning_rate": 1.929371231696813e-08, "logits/chosen": -3.0945816040039062, "logits/rejected": -3.072700262069702, "logps/chosen": -56.38603973388672, "logps/rejected": -53.792144775390625, "loss": 0.693, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00015010975766927004, "rewards/margins": 0.0003310061583761126, "rewards/rejected": -0.0004811159451492131, "step": 1120 }, { "epoch": 0.1946933149552033, "grad_norm": 2.2352566719055176, "learning_rate": 1.946597760551249e-08, "logits/chosen": -3.1612792015075684, "logits/rejected": -3.1366684436798096, "logps/chosen": -52.0703125, "logps/rejected": -54.243202209472656, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.00037932602572254837, "rewards/margins": 0.0001295518159167841, "rewards/rejected": -0.0005088779143989086, "step": 1130 }, { "epoch": 0.19641626464507236, "grad_norm": 2.4157116413116455, "learning_rate": 1.963824289405685e-08, "logits/chosen": -3.0681800842285156, "logits/rejected": -3.0344736576080322, "logps/chosen": -56.911476135253906, "logps/rejected": -53.044189453125, "loss": 0.693, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.00030110584339126945, "rewards/margins": 0.0002923770225606859, "rewards/rejected": -0.0005934828659519553, "step": 1140 }, { "epoch": 0.19813921433494142, "grad_norm": 2.3486196994781494, "learning_rate": 1.9810508182601205e-08, "logits/chosen": -3.007859706878662, "logits/rejected": -2.9886839389801025, "logps/chosen": -53.38232421875, "logps/rejected": -54.55126953125, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.00025944452499970794, "rewards/margins": 0.000397385039832443, "rewards/rejected": -0.0006568295648321509, "step": 1150 }, { "epoch": 0.19986216402481047, "grad_norm": 2.2162816524505615, "learning_rate": 1.9982773471145565e-08, "logits/chosen": -3.1045355796813965, "logits/rejected": -3.095886707305908, "logps/chosen": -54.0484619140625, "logps/rejected": -54.620849609375, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0002283039502799511, "rewards/margins": 0.0005001203389838338, "rewards/rejected": -0.0007284242892637849, "step": 1160 }, { "epoch": 0.20158511371467952, "grad_norm": 2.117940902709961, "learning_rate": 1.9999963375532916e-08, "logits/chosen": -2.9765725135803223, "logits/rejected": -2.9637269973754883, "logps/chosen": -52.097686767578125, "logps/rejected": -54.725807189941406, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00018397597887087613, "rewards/margins": 0.00034015963319689035, "rewards/rejected": -0.0005241355393081903, "step": 1170 }, { "epoch": 0.2033080634045486, "grad_norm": 2.5803020000457764, "learning_rate": 1.9999836772781233e-08, "logits/chosen": -2.948564291000366, "logits/rejected": -2.917327642440796, "logps/chosen": -52.84502410888672, "logps/rejected": -51.07862091064453, "loss": 0.693, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00043227747664786875, "rewards/margins": 0.000333493750076741, "rewards/rejected": -0.0007657711976207793, "step": 1180 }, { "epoch": 0.20503101309441765, "grad_norm": 2.5155222415924072, "learning_rate": 1.9999619740735644e-08, "logits/chosen": -3.1464123725891113, "logits/rejected": -3.111192226409912, "logps/chosen": -58.7384147644043, "logps/rejected": -50.48283386230469, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0003894003457389772, "rewards/margins": 0.0004400517209433019, "rewards/rejected": -0.0008294520666822791, "step": 1190 }, { "epoch": 0.2067539627842867, "grad_norm": 2.093356132507324, "learning_rate": 1.999931228135879e-08, "logits/chosen": -2.978224515914917, "logits/rejected": -2.9631943702697754, "logps/chosen": -53.191490173339844, "logps/rejected": -51.32482147216797, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0002702943456824869, "rewards/margins": 0.0002604659821372479, "rewards/rejected": -0.0005307603860273957, "step": 1200 }, { "epoch": 0.2067539627842867, "eval_logits/chosen": -3.1626968383789062, "eval_logits/rejected": -3.157034158706665, "eval_logps/chosen": -58.677303314208984, "eval_logps/rejected": -63.153099060058594, "eval_loss": 0.6931099891662598, "eval_rewards/accuracies": 0.5153345465660095, "eval_rewards/chosen": 0.0003459024301264435, "eval_rewards/margins": 7.563854160252959e-05, "eval_rewards/rejected": 0.00027026390307582915, "eval_runtime": 383.167, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 1200 }, { "epoch": 0.20847691247415576, "grad_norm": 2.264655590057373, "learning_rate": 1.999891439743105e-08, "logits/chosen": -3.0778591632843018, "logits/rejected": -3.0437171459198, "logps/chosen": -53.799530029296875, "logps/rejected": -53.07401657104492, "loss": 0.6929, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.00011281321349088103, "rewards/margins": 0.0004704711027443409, "rewards/rejected": -0.0005832842434756458, "step": 1210 }, { "epoch": 0.2101998621640248, "grad_norm": 2.1167376041412354, "learning_rate": 1.9998426092550514e-08, "logits/chosen": -3.089040994644165, "logits/rejected": -3.061993360519409, "logps/chosen": -53.419349670410156, "logps/rejected": -52.35260772705078, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": -0.00038046788540668786, "rewards/margins": 0.0003108192759100348, "rewards/rejected": -0.0006912872195243835, "step": 1220 }, { "epoch": 0.21192281185389386, "grad_norm": 2.5345876216888428, "learning_rate": 1.999784737113296e-08, "logits/chosen": -3.1593711376190186, "logits/rejected": -3.117147922515869, "logps/chosen": -55.76252365112305, "logps/rejected": -52.289398193359375, "loss": 0.6929, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.00011659580195555463, "rewards/margins": 0.0005294819129630923, "rewards/rejected": -0.0006460777367465198, "step": 1230 }, { "epoch": 0.2136457615437629, "grad_norm": 2.0671513080596924, "learning_rate": 1.999717823841182e-08, "logits/chosen": -3.0202252864837646, "logits/rejected": -3.0034167766571045, "logps/chosen": -52.499916076660156, "logps/rejected": -52.897300720214844, "loss": 0.6931, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0006282069953158498, "rewards/margins": 0.0001435217709513381, "rewards/rejected": -0.0007717286935076118, "step": 1240 }, { "epoch": 0.21536871123363197, "grad_norm": 2.48652720451355, "learning_rate": 1.99964187004381e-08, "logits/chosen": -3.1409010887145996, "logits/rejected": -3.1046009063720703, "logps/chosen": -53.774330139160156, "logps/rejected": -51.2626953125, "loss": 0.6931, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0007029867265373468, "rewards/margins": 5.009355299989693e-05, "rewards/rejected": -0.000753080181311816, "step": 1250 }, { "epoch": 0.21709166092350105, "grad_norm": 2.1381783485412598, "learning_rate": 1.999556876408037e-08, "logits/chosen": -2.9890284538269043, "logits/rejected": -2.9575300216674805, "logps/chosen": -53.025726318359375, "logps/rejected": -50.35152816772461, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0005038737435825169, "rewards/margins": 0.00016801193123683333, "rewards/rejected": -0.0006718856748193502, "step": 1260 }, { "epoch": 0.2188146106133701, "grad_norm": 2.4455270767211914, "learning_rate": 1.9994628437024666e-08, "logits/chosen": -3.049938678741455, "logits/rejected": -3.043519973754883, "logps/chosen": -51.91309356689453, "logps/rejected": -54.79779815673828, "loss": 0.6929, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0003726807772181928, "rewards/margins": 0.0005565760657191277, "rewards/rejected": -0.0009292567847296596, "step": 1270 }, { "epoch": 0.22053756030323915, "grad_norm": 2.275219678878784, "learning_rate": 1.9993597727774438e-08, "logits/chosen": -3.1218161582946777, "logits/rejected": -3.1289162635803223, "logps/chosen": -51.21726608276367, "logps/rejected": -59.98350143432617, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -0.00045720464549958706, "rewards/margins": 0.00024440709967166185, "rewards/rejected": -0.000701611686963588, "step": 1280 }, { "epoch": 0.2222605099931082, "grad_norm": 2.1652839183807373, "learning_rate": 1.999247664565047e-08, "logits/chosen": -3.0220205783843994, "logits/rejected": -2.99029278755188, "logps/chosen": -54.62388229370117, "logps/rejected": -50.48380661010742, "loss": 0.6929, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.00022962580260355026, "rewards/margins": 0.0004112176247872412, "rewards/rejected": -0.0006408434128388762, "step": 1290 }, { "epoch": 0.22398345968297725, "grad_norm": 2.1271426677703857, "learning_rate": 1.9991265200790797e-08, "logits/chosen": -3.1019668579101562, "logits/rejected": -3.089221715927124, "logps/chosen": -50.25926971435547, "logps/rejected": -53.80472946166992, "loss": 0.693, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0004433942958712578, "rewards/margins": 0.0003799190162681043, "rewards/rejected": -0.000823313370347023, "step": 1300 }, { "epoch": 0.22398345968297725, "eval_logits/chosen": -3.16233229637146, "eval_logits/rejected": -3.1566648483276367, "eval_logps/chosen": -58.66947555541992, "eval_logps/rejected": -63.150753021240234, "eval_loss": 0.6930826306343079, "eval_rewards/accuracies": 0.5174256563186646, "eval_rewards/chosen": 0.0004242155991960317, "eval_rewards/margins": 0.00013055592717137188, "eval_rewards/rejected": 0.00029365968657657504, "eval_runtime": 383.1603, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 1300 }, { "epoch": 0.2257064093728463, "grad_norm": 1.916630506515503, "learning_rate": 1.99899634041506e-08, "logits/chosen": -3.1023964881896973, "logits/rejected": -3.066821575164795, "logps/chosen": -53.39400100708008, "logps/rejected": -49.54502487182617, "loss": 0.6928, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0005346934776753187, "rewards/margins": 0.0007152494508773088, "rewards/rejected": -0.0012499429285526276, "step": 1310 }, { "epoch": 0.22742935906271536, "grad_norm": 2.2478299140930176, "learning_rate": 1.9988571267502137e-08, "logits/chosen": -3.075390338897705, "logits/rejected": -3.049856662750244, "logps/chosen": -55.4655647277832, "logps/rejected": -50.82777404785156, "loss": 0.6929, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.00045771937584504485, "rewards/margins": 0.0004580998793244362, "rewards/rejected": -0.000915819313377142, "step": 1320 }, { "epoch": 0.22915230875258444, "grad_norm": 2.4180092811584473, "learning_rate": 1.9987088803434594e-08, "logits/chosen": -3.1548409461975098, "logits/rejected": -3.1240522861480713, "logps/chosen": -55.25426483154297, "logps/rejected": -50.222694396972656, "loss": 0.6929, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.00036666609230451286, "rewards/margins": 0.0004698067787103355, "rewards/rejected": -0.0008364729583263397, "step": 1330 }, { "epoch": 0.2308752584424535, "grad_norm": 2.302684783935547, "learning_rate": 1.9985516025354018e-08, "logits/chosen": -2.9962856769561768, "logits/rejected": -2.974853515625, "logps/chosen": -55.167320251464844, "logps/rejected": -52.973182678222656, "loss": 0.6928, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.00027919316198676825, "rewards/margins": 0.0007761950837448239, "rewards/rejected": -0.0010553881293162704, "step": 1340 }, { "epoch": 0.23259820813232254, "grad_norm": 2.2061896324157715, "learning_rate": 1.9983852947483158e-08, "logits/chosen": -3.0621368885040283, "logits/rejected": -3.037357807159424, "logps/chosen": -54.15734100341797, "logps/rejected": -53.328369140625, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0005606129416264594, "rewards/margins": 0.00038336380384862423, "rewards/rejected": -0.0009439766290597618, "step": 1350 }, { "epoch": 0.2343211578221916, "grad_norm": 2.3170135021209717, "learning_rate": 1.9982099584861356e-08, "logits/chosen": -3.0194990634918213, "logits/rejected": -3.0071558952331543, "logps/chosen": -55.27729034423828, "logps/rejected": -56.09923553466797, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0007478878833353519, "rewards/margins": -4.510110102273757e-06, "rewards/rejected": -0.0007433776627294719, "step": 1360 }, { "epoch": 0.23604410751206065, "grad_norm": 2.3610928058624268, "learning_rate": 1.9980255953344406e-08, "logits/chosen": -3.133690595626831, "logits/rejected": -3.1139533519744873, "logps/chosen": -52.73761749267578, "logps/rejected": -52.88722610473633, "loss": 0.693, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0006277118809521198, "rewards/margins": 0.0003715711645781994, "rewards/rejected": -0.000999283161945641, "step": 1370 }, { "epoch": 0.2377670572019297, "grad_norm": 2.0576012134552, "learning_rate": 1.9978322069604412e-08, "logits/chosen": -3.025172710418701, "logits/rejected": -3.0068202018737793, "logps/chosen": -54.36109161376953, "logps/rejected": -52.417701721191406, "loss": 0.6928, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.00028318905970081687, "rewards/margins": 0.0007199858082458377, "rewards/rejected": -0.0010031748097389936, "step": 1380 }, { "epoch": 0.23949000689179875, "grad_norm": 2.3447775840759277, "learning_rate": 1.9976297951129625e-08, "logits/chosen": -3.175565242767334, "logits/rejected": -3.147883892059326, "logps/chosen": -56.92084503173828, "logps/rejected": -53.5631103515625, "loss": 0.6928, "rewards/accuracies": 0.65625, "rewards/chosen": -0.00027007010066881776, "rewards/margins": 0.0007962186937220395, "rewards/rejected": -0.0010662887943908572, "step": 1390 }, { "epoch": 0.2412129565816678, "grad_norm": 2.098050832748413, "learning_rate": 1.9974183616224314e-08, "logits/chosen": -2.9952683448791504, "logits/rejected": -2.9694926738739014, "logps/chosen": -55.85969161987305, "logps/rejected": -52.920196533203125, "loss": 0.6928, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0005089627811685205, "rewards/margins": 0.0006413789233192801, "rewards/rejected": -0.0011503419373184443, "step": 1400 }, { "epoch": 0.2412129565816678, "eval_logits/chosen": -3.1622049808502197, "eval_logits/rejected": -3.1565866470336914, "eval_logps/chosen": -58.66376876831055, "eval_logps/rejected": -63.146671295166016, "eval_loss": 0.693074643611908, "eval_rewards/accuracies": 0.5130111575126648, "eval_rewards/chosen": 0.0004812688275706023, "eval_rewards/margins": 0.00014677205763291568, "eval_rewards/rejected": 0.0003344967553857714, "eval_runtime": 383.5374, "eval_samples_per_second": 11.222, "eval_steps_per_second": 1.403, "step": 1400 }, { "epoch": 0.24293590627153688, "grad_norm": 2.009655475616455, "learning_rate": 1.9971979084008567e-08, "logits/chosen": -3.0599923133850098, "logits/rejected": -3.047905445098877, "logps/chosen": -54.46318435668945, "logps/rejected": -53.312583923339844, "loss": 0.693, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.0008952345815487206, "rewards/margins": 0.0002275546285090968, "rewards/rejected": -0.0011227892246097326, "step": 1410 }, { "epoch": 0.24465885596140594, "grad_norm": 2.1165614128112793, "learning_rate": 1.9969684374418137e-08, "logits/chosen": -2.977726697921753, "logits/rejected": -2.983685255050659, "logps/chosen": -50.83002471923828, "logps/rejected": -54.78386306762695, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0007586570573039353, "rewards/margins": 0.00024669113918207586, "rewards/rejected": -0.0010053481673821807, "step": 1420 }, { "epoch": 0.246381805651275, "grad_norm": 2.3889706134796143, "learning_rate": 1.9967299508204266e-08, "logits/chosen": -3.097414016723633, "logits/rejected": -3.085890531539917, "logps/chosen": -53.62162399291992, "logps/rejected": -56.60115432739258, "loss": 0.6928, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00036977732088416815, "rewards/margins": 0.0007430274854414165, "rewards/rejected": -0.0011128047481179237, "step": 1430 }, { "epoch": 0.24810475534114404, "grad_norm": 2.135133981704712, "learning_rate": 1.996482450693348e-08, "logits/chosen": -3.0247979164123535, "logits/rejected": -2.993356704711914, "logps/chosen": -50.533348083496094, "logps/rejected": -48.5119743347168, "loss": 0.6928, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0007086361292749643, "rewards/margins": 0.000766909564845264, "rewards/rejected": -0.0014755458105355501, "step": 1440 }, { "epoch": 0.2498277050310131, "grad_norm": 2.7499895095825195, "learning_rate": 1.9962259392987405e-08, "logits/chosen": -3.0386104583740234, "logits/rejected": -3.0010645389556885, "logps/chosen": -55.5077018737793, "logps/rejected": -52.076683044433594, "loss": 0.6925, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0003259262884967029, "rewards/margins": 0.0012405237648636103, "rewards/rejected": -0.0015664503443986177, "step": 1450 }, { "epoch": 0.25155065472088217, "grad_norm": 2.2248263359069824, "learning_rate": 1.995960418956256e-08, "logits/chosen": -3.041229248046875, "logits/rejected": -3.008714437484741, "logps/chosen": -51.79349899291992, "logps/rejected": -50.624542236328125, "loss": 0.6928, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0009388105827383697, "rewards/margins": 0.0006548297824338078, "rewards/rejected": -0.0015936403069645166, "step": 1460 }, { "epoch": 0.2532736044107512, "grad_norm": 2.4349653720855713, "learning_rate": 1.9956858920670163e-08, "logits/chosen": -3.2067325115203857, "logits/rejected": -3.1706488132476807, "logps/chosen": -57.066429138183594, "logps/rejected": -54.8380241394043, "loss": 0.6926, "rewards/accuracies": 0.625, "rewards/chosen": -0.0004277366679161787, "rewards/margins": 0.0011886181309819221, "rewards/rejected": -0.0016163547988981009, "step": 1470 }, { "epoch": 0.2549965541006203, "grad_norm": 2.150383472442627, "learning_rate": 1.9954023611135885e-08, "logits/chosen": -3.173959970474243, "logits/rejected": -3.1377644538879395, "logps/chosen": -51.36625289916992, "logps/rejected": -51.846641540527344, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0005552283255383372, "rewards/margins": 0.0005073813372291625, "rewards/rejected": -0.0010626097209751606, "step": 1480 }, { "epoch": 0.2567195037904893, "grad_norm": 2.321821928024292, "learning_rate": 1.995109828659965e-08, "logits/chosen": -3.105658769607544, "logits/rejected": -3.0809221267700195, "logps/chosen": -57.459449768066406, "logps/rejected": -53.830970764160156, "loss": 0.6927, "rewards/accuracies": 0.625, "rewards/chosen": -0.000757657631766051, "rewards/margins": 0.0008538023685105145, "rewards/rejected": -0.0016114600002765656, "step": 1490 }, { "epoch": 0.2584424534803584, "grad_norm": 2.2787139415740967, "learning_rate": 1.9948082973515395e-08, "logits/chosen": -3.014392614364624, "logits/rejected": -3.0117087364196777, "logps/chosen": -50.05243682861328, "logps/rejected": -52.818359375, "loss": 0.6927, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0006372680654749274, "rewards/margins": 0.0008813614840619266, "rewards/rejected": -0.0015186297241598368, "step": 1500 }, { "epoch": 0.2584424534803584, "eval_logits/chosen": -3.1620614528656006, "eval_logits/rejected": -3.1564226150512695, "eval_logps/chosen": -58.660011291503906, "eval_logps/rejected": -63.144344329833984, "eval_loss": 0.6930677890777588, "eval_rewards/accuracies": 0.5206784605979919, "eval_rewards/chosen": 0.0005188515642657876, "eval_rewards/margins": 0.0001610520266694948, "eval_rewards/rejected": 0.00035779952304437757, "eval_runtime": 383.1959, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 1500 }, { "epoch": 0.2601654031702274, "grad_norm": 2.407702684402466, "learning_rate": 1.9944977699150825e-08, "logits/chosen": -2.9633898735046387, "logits/rejected": -2.927666425704956, "logps/chosen": -58.6661262512207, "logps/rejected": -52.97742462158203, "loss": 0.6927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0006018796702846885, "rewards/margins": 0.0009065620834007859, "rewards/rejected": -0.0015084416372701526, "step": 1510 }, { "epoch": 0.2618883528600965, "grad_norm": 2.5089685916900635, "learning_rate": 1.9941782491587175e-08, "logits/chosen": -3.0775656700134277, "logits/rejected": -3.0695385932922363, "logps/chosen": -52.35771560668945, "logps/rejected": -51.60698318481445, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0010706728789955378, "rewards/margins": 2.0194147509755567e-05, "rewards/rejected": -0.0010908670956268907, "step": 1520 }, { "epoch": 0.26361130254996556, "grad_norm": 2.536482334136963, "learning_rate": 1.993849737971896e-08, "logits/chosen": -2.96296763420105, "logits/rejected": -2.9462900161743164, "logps/chosen": -51.77375030517578, "logps/rejected": -50.902069091796875, "loss": 0.6928, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.0008512031054124236, "rewards/margins": 0.000627328990958631, "rewards/rejected": -0.0014785320963710546, "step": 1530 }, { "epoch": 0.2653342522398346, "grad_norm": 2.257230043411255, "learning_rate": 1.9935122393253692e-08, "logits/chosen": -3.09374737739563, "logits/rejected": -3.0615248680114746, "logps/chosen": -55.999359130859375, "logps/rejected": -51.18012237548828, "loss": 0.6927, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0009388996404595673, "rewards/margins": 0.0008283822098746896, "rewards/rejected": -0.0017672820249572396, "step": 1540 }, { "epoch": 0.26705720192970367, "grad_norm": 2.0905497074127197, "learning_rate": 1.9931657562711637e-08, "logits/chosen": -3.0119576454162598, "logits/rejected": -2.9918830394744873, "logps/chosen": -53.45903396606445, "logps/rejected": -50.73937225341797, "loss": 0.6926, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0008969675982370973, "rewards/margins": 0.0010906046954914927, "rewards/rejected": -0.00198757229372859, "step": 1550 }, { "epoch": 0.2687801516195727, "grad_norm": 2.2873430252075195, "learning_rate": 1.9928102919425526e-08, "logits/chosen": -3.026608943939209, "logits/rejected": -3.0091512203216553, "logps/chosen": -50.974910736083984, "logps/rejected": -50.85519027709961, "loss": 0.6928, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0012713803444057703, "rewards/margins": 0.0006782411364838481, "rewards/rejected": -0.0019496215973049402, "step": 1560 }, { "epoch": 0.2705031013094418, "grad_norm": 2.2136454582214355, "learning_rate": 1.9924458495540268e-08, "logits/chosen": -3.0785794258117676, "logits/rejected": -3.0804882049560547, "logps/chosen": -51.9824104309082, "logps/rejected": -56.40099334716797, "loss": 0.6926, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0006633226876147091, "rewards/margins": 0.0010450478876009583, "rewards/rejected": -0.0017083704005926847, "step": 1570 }, { "epoch": 0.2722260509993108, "grad_norm": 2.671224355697632, "learning_rate": 1.992072432401267e-08, "logits/chosen": -3.013683557510376, "logits/rejected": -3.0035407543182373, "logps/chosen": -53.052703857421875, "logps/rejected": -54.06764602661133, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0013744436437264085, "rewards/margins": 0.0003955420688726008, "rewards/rejected": -0.0017699853051453829, "step": 1580 }, { "epoch": 0.2739490006891799, "grad_norm": 2.372692584991455, "learning_rate": 1.991690043861113e-08, "logits/chosen": -3.0596675872802734, "logits/rejected": -3.0450618267059326, "logps/chosen": -54.20551681518555, "logps/rejected": -54.7797966003418, "loss": 0.6928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0007508709677495062, "rewards/margins": 0.0006255079642869532, "rewards/rejected": -0.0013763790484517813, "step": 1590 }, { "epoch": 0.27567195037904896, "grad_norm": 2.3430051803588867, "learning_rate": 1.9912986873915344e-08, "logits/chosen": -3.051609992980957, "logits/rejected": -3.0133116245269775, "logps/chosen": -52.2765998840332, "logps/rejected": -51.117454528808594, "loss": 0.6928, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0009734455379657447, "rewards/margins": 0.000701104465406388, "rewards/rejected": -0.001674549886956811, "step": 1600 }, { "epoch": 0.27567195037904896, "eval_logits/chosen": -3.161592483520508, "eval_logits/rejected": -3.155942678451538, "eval_logps/chosen": -58.649715423583984, "eval_logps/rejected": -63.13517379760742, "eval_loss": 0.6930622458457947, "eval_rewards/accuracies": 0.515566885471344, "eval_rewards/chosen": 0.0006218124181032181, "eval_rewards/margins": 0.00017230722005479038, "eval_rewards/rejected": 0.0004495051980484277, "eval_runtime": 383.4363, "eval_samples_per_second": 11.225, "eval_steps_per_second": 1.403, "step": 1600 }, { "epoch": 0.277394900068918, "grad_norm": 2.1574578285217285, "learning_rate": 1.9908983665315976e-08, "logits/chosen": -3.090954065322876, "logits/rejected": -3.0656261444091797, "logps/chosen": -56.01924514770508, "logps/rejected": -57.913787841796875, "loss": 0.6927, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0012514353729784489, "rewards/margins": 0.0009009768255054951, "rewards/rejected": -0.0021524124313145876, "step": 1610 }, { "epoch": 0.27911784975878706, "grad_norm": 2.0792019367218018, "learning_rate": 1.990489084901435e-08, "logits/chosen": -3.012794017791748, "logits/rejected": -2.990811347961426, "logps/chosen": -51.55316925048828, "logps/rejected": -54.24964141845703, "loss": 0.6928, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0011311458656564355, "rewards/margins": 0.000712685112375766, "rewards/rejected": -0.0018438309198245406, "step": 1620 }, { "epoch": 0.2808407994486561, "grad_norm": 2.3274433612823486, "learning_rate": 1.990070846202212e-08, "logits/chosen": -3.0834336280822754, "logits/rejected": -3.0569839477539062, "logps/chosen": -55.578704833984375, "logps/rejected": -51.934776306152344, "loss": 0.6925, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.000689486158080399, "rewards/margins": 0.001395157421939075, "rewards/rejected": -0.0020846438128501177, "step": 1630 }, { "epoch": 0.28256374913852517, "grad_norm": 2.333423614501953, "learning_rate": 1.989643654216093e-08, "logits/chosen": -3.1261088848114014, "logits/rejected": -3.0887601375579834, "logps/chosen": -56.98992156982422, "logps/rejected": -51.56941604614258, "loss": 0.6922, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0003556433948688209, "rewards/margins": 0.001908325357362628, "rewards/rejected": -0.00226396881043911, "step": 1640 }, { "epoch": 0.2842866988283942, "grad_norm": 2.48422908782959, "learning_rate": 1.9892075128062082e-08, "logits/chosen": -3.0957982540130615, "logits/rejected": -3.0681676864624023, "logps/chosen": -57.281890869140625, "logps/rejected": -55.1717414855957, "loss": 0.6924, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0004545752890408039, "rewards/margins": 0.0014074406353756785, "rewards/rejected": -0.0018620159244164824, "step": 1650 }, { "epoch": 0.28600964851826327, "grad_norm": 2.484135627746582, "learning_rate": 1.988762425916618e-08, "logits/chosen": -3.131685256958008, "logits/rejected": -3.084357500076294, "logps/chosen": -55.02167892456055, "logps/rejected": -49.93708038330078, "loss": 0.6923, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0005012772744521499, "rewards/margins": 0.0017122188583016396, "rewards/rejected": -0.0022134962491691113, "step": 1660 }, { "epoch": 0.2877325982081323, "grad_norm": 2.482999324798584, "learning_rate": 1.9883083975722772e-08, "logits/chosen": -3.1197116374969482, "logits/rejected": -3.0976428985595703, "logps/chosen": -54.56291961669922, "logps/rejected": -55.3899040222168, "loss": 0.6925, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00048305519158020616, "rewards/margins": 0.001361898030154407, "rewards/rejected": -0.0018449531635269523, "step": 1670 }, { "epoch": 0.2894555478980014, "grad_norm": 2.213054895401001, "learning_rate": 1.987845431879e-08, "logits/chosen": -3.077357530593872, "logits/rejected": -3.051088809967041, "logps/chosen": -55.71039962768555, "logps/rejected": -54.652687072753906, "loss": 0.6924, "rewards/accuracies": 0.625, "rewards/chosen": -0.0010206119623035192, "rewards/margins": 0.0014232432004064322, "rewards/rejected": -0.0024438551627099514, "step": 1680 }, { "epoch": 0.29117849758787046, "grad_norm": 2.3456971645355225, "learning_rate": 1.9873735330234196e-08, "logits/chosen": -3.0565898418426514, "logits/rejected": -3.0409586429595947, "logps/chosen": -55.663421630859375, "logps/rejected": -52.42894744873047, "loss": 0.693, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.001181212835945189, "rewards/margins": 0.00037458629230968654, "rewards/rejected": -0.001555799157358706, "step": 1690 }, { "epoch": 0.2929014472777395, "grad_norm": 2.397773265838623, "learning_rate": 1.986892705272954e-08, "logits/chosen": -2.9650046825408936, "logits/rejected": -2.9699463844299316, "logps/chosen": -49.70085906982422, "logps/rejected": -55.52735137939453, "loss": 0.6928, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0013781094457954168, "rewards/margins": 0.0007676255772821605, "rewards/rejected": -0.0021457350812852383, "step": 1700 }, { "epoch": 0.2929014472777395, "eval_logits/chosen": -3.1609573364257812, "eval_logits/rejected": -3.1553165912628174, "eval_logps/chosen": -58.63572311401367, "eval_logps/rejected": -63.1287841796875, "eval_loss": 0.6930245757102966, "eval_rewards/accuracies": 0.5290427803993225, "eval_rewards/chosen": 0.0007616986404173076, "eval_rewards/margins": 0.00024828972527757287, "eval_rewards/rejected": 0.0005134089151397347, "eval_runtime": 383.8785, "eval_samples_per_second": 11.212, "eval_steps_per_second": 1.401, "step": 1700 }, { "epoch": 0.29462439696760856, "grad_norm": 2.4840126037597656, "learning_rate": 1.986402952975766e-08, "logits/chosen": -3.097963809967041, "logits/rejected": -3.057067394256592, "logps/chosen": -58.48628616333008, "logps/rejected": -54.680519104003906, "loss": 0.6925, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.000765048258472234, "rewards/margins": 0.0012052144156768918, "rewards/rejected": -0.001970262499526143, "step": 1710 }, { "epoch": 0.2963473466574776, "grad_norm": 2.4435672760009766, "learning_rate": 1.985904280560723e-08, "logits/chosen": -3.107060432434082, "logits/rejected": -3.077709674835205, "logps/chosen": -56.552574157714844, "logps/rejected": -51.2525749206543, "loss": 0.6928, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0012668697163462639, "rewards/margins": 0.0007003343780525029, "rewards/rejected": -0.001967204036191106, "step": 1720 }, { "epoch": 0.29807029634734666, "grad_norm": 2.1897132396698, "learning_rate": 1.9853966925373585e-08, "logits/chosen": -3.1019604206085205, "logits/rejected": -3.0847129821777344, "logps/chosen": -54.06304168701172, "logps/rejected": -52.69061279296875, "loss": 0.6929, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0012872053775936365, "rewards/margins": 0.0005615145200863481, "rewards/rejected": -0.0018487200140953064, "step": 1730 }, { "epoch": 0.2997932460372157, "grad_norm": 2.3099875450134277, "learning_rate": 1.9848801934958293e-08, "logits/chosen": -3.0509893894195557, "logits/rejected": -3.041018009185791, "logps/chosen": -52.259376525878906, "logps/rejected": -54.26354217529297, "loss": 0.6929, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.001581055112183094, "rewards/margins": 0.0004412340931594372, "rewards/rejected": -0.002022289205342531, "step": 1740 }, { "epoch": 0.30151619572708477, "grad_norm": 2.471845865249634, "learning_rate": 1.9843547881068763e-08, "logits/chosen": -3.0957400798797607, "logits/rejected": -3.0844783782958984, "logps/chosen": -55.27183151245117, "logps/rejected": -56.19903564453125, "loss": 0.6927, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0008580518770031631, "rewards/margins": 0.0008658823790028691, "rewards/rejected": -0.0017239341977983713, "step": 1750 }, { "epoch": 0.30323914541695385, "grad_norm": 2.350327253341675, "learning_rate": 1.983820481121781e-08, "logits/chosen": -3.0834641456604004, "logits/rejected": -3.046614408493042, "logps/chosen": -56.12206268310547, "logps/rejected": -53.220245361328125, "loss": 0.6923, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0009891155641525984, "rewards/margins": 0.0017277583247050643, "rewards/rejected": -0.0027168740052729845, "step": 1760 }, { "epoch": 0.3049620951068229, "grad_norm": 1.9723304510116577, "learning_rate": 1.9832772773723228e-08, "logits/chosen": -3.110938310623169, "logits/rejected": -3.074063777923584, "logps/chosen": -55.12842559814453, "logps/rejected": -49.23452377319336, "loss": 0.6924, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0014672544784843922, "rewards/margins": 0.001455864286981523, "rewards/rejected": -0.002923118881881237, "step": 1770 }, { "epoch": 0.30668504479669195, "grad_norm": 2.564880609512329, "learning_rate": 1.9827251817707347e-08, "logits/chosen": -3.02485990524292, "logits/rejected": -3.0266916751861572, "logps/chosen": -54.67890548706055, "logps/rejected": -58.6965446472168, "loss": 0.6928, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0016034668078646064, "rewards/margins": 0.0007381063187494874, "rewards/rejected": -0.00234157289378345, "step": 1780 }, { "epoch": 0.308407994486561, "grad_norm": 2.5322887897491455, "learning_rate": 1.98216419930966e-08, "logits/chosen": -3.1726322174072266, "logits/rejected": -3.1501011848449707, "logps/chosen": -52.1053352355957, "logps/rejected": -52.75413131713867, "loss": 0.6926, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0010994903277605772, "rewards/margins": 0.0011364398524165154, "rewards/rejected": -0.002235929947346449, "step": 1790 }, { "epoch": 0.31013094417643006, "grad_norm": 2.2157833576202393, "learning_rate": 1.9815943350621065e-08, "logits/chosen": -3.1358418464660645, "logits/rejected": -3.1119844913482666, "logps/chosen": -51.9261589050293, "logps/rejected": -52.291221618652344, "loss": 0.6923, "rewards/accuracies": 0.625, "rewards/chosen": -0.001239894307218492, "rewards/margins": 0.0017638758290559053, "rewards/rejected": -0.0030037700198590755, "step": 1800 }, { "epoch": 0.31013094417643006, "eval_logits/chosen": -3.1606557369232178, "eval_logits/rejected": -3.154991626739502, "eval_logps/chosen": -58.635379791259766, "eval_logps/rejected": -63.13025665283203, "eval_loss": 0.6930158734321594, "eval_rewards/accuracies": 0.5394981503486633, "eval_rewards/chosen": 0.0007651591440662742, "eval_rewards/margins": 0.00026646017795428634, "eval_rewards/rejected": 0.0004986989079043269, "eval_runtime": 383.5622, "eval_samples_per_second": 11.221, "eval_steps_per_second": 1.403, "step": 1800 }, { "epoch": 0.3118538938662991, "grad_norm": 2.3355607986450195, "learning_rate": 1.9810155941813995e-08, "logits/chosen": -3.139138698577881, "logits/rejected": -3.1034657955169678, "logps/chosen": -56.69929885864258, "logps/rejected": -53.90766143798828, "loss": 0.6925, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0008215238340198994, "rewards/margins": 0.0013491030549630523, "rewards/rejected": -0.002170627238228917, "step": 1810 }, { "epoch": 0.31357684355616816, "grad_norm": 2.256990909576416, "learning_rate": 1.9804279819011383e-08, "logits/chosen": -3.110410690307617, "logits/rejected": -3.0756146907806396, "logps/chosen": -55.28718948364258, "logps/rejected": -50.91753005981445, "loss": 0.6926, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0012509961379691958, "rewards/margins": 0.00110354193020612, "rewards/rejected": -0.002354537835344672, "step": 1820 }, { "epoch": 0.31529979324603724, "grad_norm": 2.239717483520508, "learning_rate": 1.9798315035351457e-08, "logits/chosen": -3.004528522491455, "logits/rejected": -2.990546941757202, "logps/chosen": -55.655792236328125, "logps/rejected": -54.94952392578125, "loss": 0.6926, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0011249512899667025, "rewards/margins": 0.001042040647007525, "rewards/rejected": -0.0021669918205589056, "step": 1830 }, { "epoch": 0.31702274293590627, "grad_norm": 2.331162929534912, "learning_rate": 1.9792261644774218e-08, "logits/chosen": -3.215100049972534, "logits/rejected": -3.2110018730163574, "logps/chosen": -54.98247146606445, "logps/rejected": -55.251922607421875, "loss": 0.6929, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0016236340161412954, "rewards/margins": 0.0004331713425926864, "rewards/rejected": -0.0020568054169416428, "step": 1840 }, { "epoch": 0.31874569262577535, "grad_norm": 2.3451895713806152, "learning_rate": 1.9786119702020934e-08, "logits/chosen": -3.0801799297332764, "logits/rejected": -3.0801517963409424, "logps/chosen": -53.336570739746094, "logps/rejected": -54.976661682128906, "loss": 0.6924, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.00126861990429461, "rewards/margins": 0.0014225415652617812, "rewards/rejected": -0.002691161585971713, "step": 1850 }, { "epoch": 0.32046864231564437, "grad_norm": 2.414311647415161, "learning_rate": 1.9779889262633673e-08, "logits/chosen": -3.0122768878936768, "logits/rejected": -2.976020097732544, "logps/chosen": -57.33064651489258, "logps/rejected": -54.029083251953125, "loss": 0.6919, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0003869426145683974, "rewards/margins": 0.002549747470766306, "rewards/rejected": -0.0029366896487772465, "step": 1860 }, { "epoch": 0.32219159200551345, "grad_norm": 2.6407299041748047, "learning_rate": 1.9773570382954776e-08, "logits/chosen": -3.0591697692871094, "logits/rejected": -3.0335114002227783, "logps/chosen": -56.03263473510742, "logps/rejected": -54.18107986450195, "loss": 0.6916, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0008292980492115021, "rewards/margins": 0.0030972822569310665, "rewards/rejected": -0.003926579840481281, "step": 1870 }, { "epoch": 0.3239145416953825, "grad_norm": 2.2970800399780273, "learning_rate": 1.9767163120126365e-08, "logits/chosen": -3.1058273315429688, "logits/rejected": -3.0676872730255127, "logps/chosen": -52.84998321533203, "logps/rejected": -51.77935791015625, "loss": 0.6923, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0013415589928627014, "rewards/margins": 0.0017453646287322044, "rewards/rejected": -0.003086923388764262, "step": 1880 }, { "epoch": 0.32563749138525155, "grad_norm": 2.5498745441436768, "learning_rate": 1.97606675320898e-08, "logits/chosen": -3.1286263465881348, "logits/rejected": -3.1229748725891113, "logps/chosen": -53.16112518310547, "logps/rejected": -54.714317321777344, "loss": 0.6923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0010337254498153925, "rewards/margins": 0.0016309624770656228, "rewards/rejected": -0.002664688043296337, "step": 1890 }, { "epoch": 0.32736044107512063, "grad_norm": 2.452503204345703, "learning_rate": 1.975408367758519e-08, "logits/chosen": -3.0616016387939453, "logits/rejected": -3.0282962322235107, "logps/chosen": -57.5360107421875, "logps/rejected": -52.49907684326172, "loss": 0.6924, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0009620963828638196, "rewards/margins": 0.0014106438029557467, "rewards/rejected": -0.002372740302234888, "step": 1900 }, { "epoch": 0.32736044107512063, "eval_logits/chosen": -3.160248279571533, "eval_logits/rejected": -3.1546409130096436, "eval_logps/chosen": -58.629112243652344, "eval_logps/rejected": -63.12491226196289, "eval_loss": 0.6930115222930908, "eval_rewards/accuracies": 0.5223048329353333, "eval_rewards/chosen": 0.0008278373279608786, "eval_rewards/margins": 0.0002757786714937538, "eval_rewards/rejected": 0.0005520587437786162, "eval_runtime": 384.1694, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 1900 }, { "epoch": 0.32908339076498966, "grad_norm": 2.60693621635437, "learning_rate": 1.9747411616150837e-08, "logits/chosen": -2.960416316986084, "logits/rejected": -2.9253084659576416, "logps/chosen": -54.61848068237305, "logps/rejected": -53.64332962036133, "loss": 0.6922, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0015078171854838729, "rewards/margins": 0.0019502185750752687, "rewards/rejected": -0.00345803564414382, "step": 1910 }, { "epoch": 0.33080634045485874, "grad_norm": 2.3648858070373535, "learning_rate": 1.974065140812271e-08, "logits/chosen": -3.0746426582336426, "logits/rejected": -3.0466296672821045, "logps/chosen": -55.4968147277832, "logps/rejected": -54.046043395996094, "loss": 0.6921, "rewards/accuracies": 0.625, "rewards/chosen": -0.0012814325746148825, "rewards/margins": 0.0021591049153357744, "rewards/rejected": -0.0034405372571200132, "step": 1920 }, { "epoch": 0.33252929014472776, "grad_norm": 2.1132709980010986, "learning_rate": 1.973380311463389e-08, "logits/chosen": -3.0233805179595947, "logits/rejected": -2.986288070678711, "logps/chosen": -53.99530029296875, "logps/rejected": -54.19614791870117, "loss": 0.6919, "rewards/accuracies": 0.625, "rewards/chosen": -0.0010488248663023114, "rewards/margins": 0.002408870728686452, "rewards/rejected": -0.003457695245742798, "step": 1930 }, { "epoch": 0.33425223983459684, "grad_norm": 2.1448163986206055, "learning_rate": 1.9726866797614016e-08, "logits/chosen": -3.0474705696105957, "logits/rejected": -3.0288310050964355, "logps/chosen": -51.166717529296875, "logps/rejected": -50.376094818115234, "loss": 0.6925, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0022671956103295088, "rewards/margins": 0.0012547748629003763, "rewards/rejected": -0.0035219707060605288, "step": 1940 }, { "epoch": 0.33597518952446587, "grad_norm": 2.253101348876953, "learning_rate": 1.9719842519788743e-08, "logits/chosen": -3.053658962249756, "logits/rejected": -3.052180051803589, "logps/chosen": -52.709800720214844, "logps/rejected": -55.065162658691406, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": -0.0020216540433466434, "rewards/margins": 0.0008553097140975296, "rewards/rejected": -0.002876963932067156, "step": 1950 }, { "epoch": 0.33769813921433495, "grad_norm": 2.281599998474121, "learning_rate": 1.971273034467915e-08, "logits/chosen": -3.059936046600342, "logits/rejected": -3.0406055450439453, "logps/chosen": -54.529571533203125, "logps/rejected": -54.80952835083008, "loss": 0.6924, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0017005018889904022, "rewards/margins": 0.0014261369360610843, "rewards/rejected": -0.0031266387086361647, "step": 1960 }, { "epoch": 0.33942108890420397, "grad_norm": 2.1517691612243652, "learning_rate": 1.9705530336601192e-08, "logits/chosen": -3.105541706085205, "logits/rejected": -3.0760109424591064, "logps/chosen": -56.24677658081055, "logps/rejected": -52.138221740722656, "loss": 0.6922, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0010016715386882424, "rewards/margins": 0.0019247450400143862, "rewards/rejected": -0.0029264166951179504, "step": 1970 }, { "epoch": 0.34114403859407305, "grad_norm": 2.4106578826904297, "learning_rate": 1.969824256066509e-08, "logits/chosen": -3.010801315307617, "logits/rejected": -3.0054256916046143, "logps/chosen": -55.7724609375, "logps/rejected": -54.3217658996582, "loss": 0.6927, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0018466083565726876, "rewards/margins": 0.0009771850891411304, "rewards/rejected": -0.00282379356212914, "step": 1980 }, { "epoch": 0.34286698828394213, "grad_norm": 2.3051235675811768, "learning_rate": 1.9690867082774768e-08, "logits/chosen": -3.1406359672546387, "logits/rejected": -3.105625629425049, "logps/chosen": -51.02665710449219, "logps/rejected": -48.61326217651367, "loss": 0.6919, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0019225224386900663, "rewards/margins": 0.002514256862923503, "rewards/rejected": -0.004436778835952282, "step": 1990 }, { "epoch": 0.34458993797381116, "grad_norm": 2.5171186923980713, "learning_rate": 1.968340396962724e-08, "logits/chosen": -3.0621447563171387, "logits/rejected": -3.0685670375823975, "logps/chosen": -50.19961929321289, "logps/rejected": -57.98683547973633, "loss": 0.6925, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.002387142274528742, "rewards/margins": 0.0013573340838775039, "rewards/rejected": -0.003744476707652211, "step": 2000 }, { "epoch": 0.34458993797381116, "eval_logits/chosen": -3.160059928894043, "eval_logits/rejected": -3.154402017593384, "eval_logps/chosen": -58.621490478515625, "eval_logps/rejected": -63.13191223144531, "eval_loss": 0.6929388046264648, "eval_rewards/accuracies": 0.542286217212677, "eval_rewards/chosen": 0.0009040239383466542, "eval_rewards/margins": 0.0004219270485918969, "eval_rewards/rejected": 0.00048209683154709637, "eval_runtime": 383.2101, "eval_samples_per_second": 11.231, "eval_steps_per_second": 1.404, "step": 2000 }, { "epoch": 0.34631288766368024, "grad_norm": 2.4561853408813477, "learning_rate": 1.9675853288712007e-08, "logits/chosen": -3.0682225227355957, "logits/rejected": -3.039653778076172, "logps/chosen": -55.76312255859375, "logps/rejected": -52.18657302856445, "loss": 0.692, "rewards/accuracies": 0.625, "rewards/chosen": -0.0016124986577779055, "rewards/margins": 0.0022291613277047873, "rewards/rejected": -0.0038416602183133364, "step": 2010 }, { "epoch": 0.34803583735354926, "grad_norm": 2.3522374629974365, "learning_rate": 1.9668215108310464e-08, "logits/chosen": -3.0565547943115234, "logits/rejected": -3.034590244293213, "logps/chosen": -50.00489044189453, "logps/rejected": -55.06145095825195, "loss": 0.6921, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0017557486426085234, "rewards/margins": 0.002072775736451149, "rewards/rejected": -0.0038285241462290287, "step": 2020 }, { "epoch": 0.34975878704341834, "grad_norm": 2.365412950515747, "learning_rate": 1.9660489497495258e-08, "logits/chosen": -3.1443276405334473, "logits/rejected": -3.1226987838745117, "logps/chosen": -57.24473190307617, "logps/rejected": -55.498451232910156, "loss": 0.6925, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0019245322328060865, "rewards/margins": 0.0012448631459847093, "rewards/rejected": -0.0031693950295448303, "step": 2030 }, { "epoch": 0.35148173673328736, "grad_norm": 2.3502867221832275, "learning_rate": 1.965267652612969e-08, "logits/chosen": -3.007511615753174, "logits/rejected": -2.9947307109832764, "logps/chosen": -52.93482208251953, "logps/rejected": -54.30632781982422, "loss": 0.6925, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0021886585745960474, "rewards/margins": 0.0013397895963862538, "rewards/rejected": -0.003528448287397623, "step": 2040 }, { "epoch": 0.35320468642315644, "grad_norm": 2.388796329498291, "learning_rate": 1.964477626486706e-08, "logits/chosen": -3.1260132789611816, "logits/rejected": -3.100912094116211, "logps/chosen": -51.36164093017578, "logps/rejected": -54.71668243408203, "loss": 0.6923, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0013962425291538239, "rewards/margins": 0.0016656548250466585, "rewards/rejected": -0.0030618971213698387, "step": 2050 }, { "epoch": 0.3549276361130255, "grad_norm": 2.1621086597442627, "learning_rate": 1.9636788785150038e-08, "logits/chosen": -3.1135175228118896, "logits/rejected": -3.0747947692871094, "logps/chosen": -55.415794372558594, "logps/rejected": -51.89849090576172, "loss": 0.692, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0008633068646304309, "rewards/margins": 0.002299419604241848, "rewards/rejected": -0.0031627260614186525, "step": 2060 }, { "epoch": 0.35665058580289455, "grad_norm": 2.4029855728149414, "learning_rate": 1.962871415921001e-08, "logits/chosen": -3.131016969680786, "logits/rejected": -3.107579469680786, "logps/chosen": -55.44580078125, "logps/rejected": -55.06316375732422, "loss": 0.6925, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0015260230284184217, "rewards/margins": 0.0012810361804440618, "rewards/rejected": -0.0028070593252778053, "step": 2070 }, { "epoch": 0.35837353549276363, "grad_norm": 2.4090819358825684, "learning_rate": 1.9620552460066455e-08, "logits/chosen": -3.071326732635498, "logits/rejected": -3.0413899421691895, "logps/chosen": -52.11077880859375, "logps/rejected": -51.01923370361328, "loss": 0.6924, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0024296611081808805, "rewards/margins": 0.001453958684578538, "rewards/rejected": -0.003883620025590062, "step": 2080 }, { "epoch": 0.36009648518263265, "grad_norm": 2.2452566623687744, "learning_rate": 1.9612303761526236e-08, "logits/chosen": -3.095968246459961, "logits/rejected": -3.092430353164673, "logps/chosen": -54.56378173828125, "logps/rejected": -54.91994094848633, "loss": 0.6927, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0016902232309803367, "rewards/margins": 0.0009237364865839481, "rewards/rejected": -0.0026139598339796066, "step": 2090 }, { "epoch": 0.36181943487250173, "grad_norm": 2.5663130283355713, "learning_rate": 1.9603968138182974e-08, "logits/chosen": -3.045274257659912, "logits/rejected": -3.0230183601379395, "logps/chosen": -55.53925323486328, "logps/rejected": -51.63383102416992, "loss": 0.6922, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0019278887193650007, "rewards/margins": 0.0019202090334147215, "rewards/rejected": -0.0038480982184410095, "step": 2100 }, { "epoch": 0.36181943487250173, "eval_logits/chosen": -3.1595144271850586, "eval_logits/rejected": -3.1538989543914795, "eval_logps/chosen": -58.603946685791016, "eval_logps/rejected": -63.11533737182617, "eval_loss": 0.692934513092041, "eval_rewards/accuracies": 0.5511152148246765, "eval_rewards/chosen": 0.0010795381385833025, "eval_rewards/margins": 0.0004316373378969729, "eval_rewards/rejected": 0.0006479007424786687, "eval_runtime": 383.4209, "eval_samples_per_second": 11.225, "eval_steps_per_second": 1.403, "step": 2100 }, { "epoch": 0.36354238456237076, "grad_norm": 2.277183771133423, "learning_rate": 1.959554566541635e-08, "logits/chosen": -3.1013741493225098, "logits/rejected": -3.104506731033325, "logps/chosen": -49.140750885009766, "logps/rejected": -55.5598258972168, "loss": 0.6925, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0024113706313073635, "rewards/margins": 0.0012931081000715494, "rewards/rejected": -0.0037044784985482693, "step": 2110 }, { "epoch": 0.36526533425223984, "grad_norm": 2.345186471939087, "learning_rate": 1.9587036419391437e-08, "logits/chosen": -2.9661850929260254, "logits/rejected": -2.938690185546875, "logps/chosen": -53.660194396972656, "logps/rejected": -51.288002014160156, "loss": 0.6918, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0008453844930045307, "rewards/margins": 0.002739228308200836, "rewards/rejected": -0.0035846128594130278, "step": 2120 }, { "epoch": 0.3669882839421089, "grad_norm": 2.1597959995269775, "learning_rate": 1.9578440477057998e-08, "logits/chosen": -3.0039591789245605, "logits/rejected": -2.9852240085601807, "logps/chosen": -54.717498779296875, "logps/rejected": -52.96318817138672, "loss": 0.6923, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.002471815561875701, "rewards/margins": 0.0017909994348883629, "rewards/rejected": -0.0042628152295947075, "step": 2130 }, { "epoch": 0.36871123363197794, "grad_norm": 2.2517848014831543, "learning_rate": 1.9569757916149805e-08, "logits/chosen": -2.9810574054718018, "logits/rejected": -2.9728846549987793, "logps/chosen": -49.01877975463867, "logps/rejected": -54.04807662963867, "loss": 0.6925, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0036360882222652435, "rewards/margins": 0.001321372459642589, "rewards/rejected": -0.00495745986700058, "step": 2140 }, { "epoch": 0.370434183321847, "grad_norm": 2.262716054916382, "learning_rate": 1.956098881518392e-08, "logits/chosen": -3.0333046913146973, "logits/rejected": -2.9868977069854736, "logps/chosen": -54.80281448364258, "logps/rejected": -48.4425163269043, "loss": 0.6916, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.001142291584983468, "rewards/margins": 0.00319478427991271, "rewards/rejected": -0.004337075632065535, "step": 2150 }, { "epoch": 0.37215713301171605, "grad_norm": 2.2613468170166016, "learning_rate": 1.9552133253460006e-08, "logits/chosen": -2.999462366104126, "logits/rejected": -2.984152317047119, "logps/chosen": -54.93077850341797, "logps/rejected": -49.919715881347656, "loss": 0.6922, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0017375343013554811, "rewards/margins": 0.0019211741164326668, "rewards/rejected": -0.0036587081849575043, "step": 2160 }, { "epoch": 0.3738800827015851, "grad_norm": 2.3072938919067383, "learning_rate": 1.954319131105958e-08, "logits/chosen": -3.123497247695923, "logits/rejected": -3.1076152324676514, "logps/chosen": -54.589698791503906, "logps/rejected": -51.89857864379883, "loss": 0.6924, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0025846485514193773, "rewards/margins": 0.0015800563851371408, "rewards/rejected": -0.004164704121649265, "step": 2170 }, { "epoch": 0.37560303239145415, "grad_norm": 2.694261074066162, "learning_rate": 1.953416306884532e-08, "logits/chosen": -3.1719765663146973, "logits/rejected": -3.133807897567749, "logps/chosen": -58.207847595214844, "logps/rejected": -53.22174072265625, "loss": 0.6916, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0018373355269432068, "rewards/margins": 0.003037033136934042, "rewards/rejected": -0.0048743681982159615, "step": 2180 }, { "epoch": 0.37732598208132323, "grad_norm": 2.5449416637420654, "learning_rate": 1.952504860846032e-08, "logits/chosen": -3.235800266265869, "logits/rejected": -3.2293059825897217, "logps/chosen": -53.08635711669922, "logps/rejected": -54.184967041015625, "loss": 0.6927, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0024451869539916515, "rewards/margins": 0.0008518371032550931, "rewards/rejected": -0.0032970241736620665, "step": 2190 }, { "epoch": 0.37904893177119225, "grad_norm": 2.451497793197632, "learning_rate": 1.951584801232734e-08, "logits/chosen": -3.065656900405884, "logits/rejected": -3.0425562858581543, "logps/chosen": -52.4706916809082, "logps/rejected": -55.25239181518555, "loss": 0.6917, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0016196580836549401, "rewards/margins": 0.0029670994263142347, "rewards/rejected": -0.004586757160723209, "step": 2200 }, { "epoch": 0.37904893177119225, "eval_logits/chosen": -3.159013032913208, "eval_logits/rejected": -3.1533420085906982, "eval_logps/chosen": -58.596683502197266, "eval_logps/rejected": -63.1153450012207, "eval_loss": 0.6928985714912415, "eval_rewards/accuracies": 0.5378717184066772, "eval_rewards/chosen": 0.0011521215783432126, "eval_rewards/margins": 0.0005043414421379566, "eval_rewards/rejected": 0.0006477802526205778, "eval_runtime": 383.2871, "eval_samples_per_second": 11.229, "eval_steps_per_second": 1.404, "step": 2200 }, { "epoch": 0.38077188146106133, "grad_norm": 2.220224618911743, "learning_rate": 1.9506561363648082e-08, "logits/chosen": -3.1231906414031982, "logits/rejected": -3.1068406105041504, "logps/chosen": -54.8387451171875, "logps/rejected": -54.0641975402832, "loss": 0.6919, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0011239739833399653, "rewards/margins": 0.002422004472464323, "rewards/rejected": -0.00354597857221961, "step": 2210 }, { "epoch": 0.3824948311509304, "grad_norm": 2.312995433807373, "learning_rate": 1.9497188746402428e-08, "logits/chosen": -2.9493377208709717, "logits/rejected": -2.941772937774658, "logps/chosen": -54.039085388183594, "logps/rejected": -54.20904541015625, "loss": 0.6922, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0016108605777844787, "rewards/margins": 0.0020083982963114977, "rewards/rejected": -0.003619259223341942, "step": 2220 }, { "epoch": 0.38421778084079944, "grad_norm": 2.4227945804595947, "learning_rate": 1.948773024534767e-08, "logits/chosen": -3.1078758239746094, "logits/rejected": -3.075817584991455, "logps/chosen": -53.9468879699707, "logps/rejected": -51.3045654296875, "loss": 0.6917, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0026930735912173986, "rewards/margins": 0.002932693576440215, "rewards/rejected": -0.005625767167657614, "step": 2230 }, { "epoch": 0.3859407305306685, "grad_norm": 2.5007476806640625, "learning_rate": 1.9478185946017774e-08, "logits/chosen": -3.0606689453125, "logits/rejected": -3.0243403911590576, "logps/chosen": -57.04365921020508, "logps/rejected": -54.07979202270508, "loss": 0.6919, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.002340201986953616, "rewards/margins": 0.0025223747361451387, "rewards/rejected": -0.00486257579177618, "step": 2240 }, { "epoch": 0.38766368022053754, "grad_norm": 2.6046512126922607, "learning_rate": 1.946855593472256e-08, "logits/chosen": -3.0197103023529053, "logits/rejected": -2.9851489067077637, "logps/chosen": -55.551429748535156, "logps/rejected": -53.84883499145508, "loss": 0.6917, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0020156418904662132, "rewards/margins": 0.0030193165875971317, "rewards/rejected": -0.005034958478063345, "step": 2250 }, { "epoch": 0.3893866299104066, "grad_norm": 2.2071502208709717, "learning_rate": 1.945884029854697e-08, "logits/chosen": -2.9758896827697754, "logits/rejected": -2.94950270652771, "logps/chosen": -60.996726989746094, "logps/rejected": -57.81394577026367, "loss": 0.6919, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0018908934434875846, "rewards/margins": 0.0025968493428081274, "rewards/rejected": -0.00448774266988039, "step": 2260 }, { "epoch": 0.39110957960027565, "grad_norm": 2.4286463260650635, "learning_rate": 1.9449039125350245e-08, "logits/chosen": -2.9611337184906006, "logits/rejected": -2.927670478820801, "logps/chosen": -54.38924026489258, "logps/rejected": -52.332435607910156, "loss": 0.6917, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0032784740906208754, "rewards/margins": 0.0028812182135879993, "rewards/rejected": -0.006159692537039518, "step": 2270 }, { "epoch": 0.3928325292901447, "grad_norm": 2.2948031425476074, "learning_rate": 1.943915250376515e-08, "logits/chosen": -3.0407679080963135, "logits/rejected": -3.030714750289917, "logps/chosen": -53.62548828125, "logps/rejected": -55.8744010925293, "loss": 0.6923, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0018194593722000718, "rewards/margins": 0.00162306590937078, "rewards/rejected": -0.0034425253979861736, "step": 2280 }, { "epoch": 0.3945554789800138, "grad_norm": 2.675964593887329, "learning_rate": 1.9429180523197173e-08, "logits/chosen": -2.9342846870422363, "logits/rejected": -2.9056806564331055, "logps/chosen": -53.1612663269043, "logps/rejected": -54.345176696777344, "loss": 0.6916, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.002783877542242408, "rewards/margins": 0.00306482776068151, "rewards/rejected": -0.0058487048372626305, "step": 2290 }, { "epoch": 0.39627842866988283, "grad_norm": 2.2668910026550293, "learning_rate": 1.9419123273823692e-08, "logits/chosen": -3.121488571166992, "logits/rejected": -3.087228775024414, "logps/chosen": -56.041099548339844, "logps/rejected": -54.0965576171875, "loss": 0.6914, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0009602559730410576, "rewards/margins": 0.0034466232173144817, "rewards/rejected": -0.004406879656016827, "step": 2300 }, { "epoch": 0.39627842866988283, "eval_logits/chosen": -3.1587371826171875, "eval_logits/rejected": -3.153092622756958, "eval_logps/chosen": -58.58064270019531, "eval_logps/rejected": -63.10941696166992, "eval_loss": 0.6928492188453674, "eval_rewards/accuracies": 0.5480948090553284, "eval_rewards/chosen": 0.001312516164034605, "eval_rewards/margins": 0.0006054288824088871, "eval_rewards/rejected": 0.000707087223418057, "eval_runtime": 383.4093, "eval_samples_per_second": 11.226, "eval_steps_per_second": 1.403, "step": 2300 }, { "epoch": 0.3980013783597519, "grad_norm": 2.4323816299438477, "learning_rate": 1.940898084659319e-08, "logits/chosen": -3.022576093673706, "logits/rejected": -3.0067625045776367, "logps/chosen": -51.424705505371094, "logps/rejected": -51.964454650878906, "loss": 0.6922, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.003465785412117839, "rewards/margins": 0.0018342696130275726, "rewards/rejected": -0.005300055257976055, "step": 2310 }, { "epoch": 0.39972432804962094, "grad_norm": 2.3269340991973877, "learning_rate": 1.939875333322442e-08, "logits/chosen": -3.1188583374023438, "logits/rejected": -3.077375888824463, "logps/chosen": -57.623809814453125, "logps/rejected": -51.6974983215332, "loss": 0.6913, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0018242349615320563, "rewards/margins": 0.003677531611174345, "rewards/rejected": -0.005501766689121723, "step": 2320 }, { "epoch": 0.40144727773949, "grad_norm": 2.243847608566284, "learning_rate": 1.938844082620557e-08, "logits/chosen": -3.029540538787842, "logits/rejected": -3.007463216781616, "logps/chosen": -56.02894973754883, "logps/rejected": -52.85606002807617, "loss": 0.6913, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.002195675391703844, "rewards/margins": 0.0038120609242469072, "rewards/rejected": -0.006007737014442682, "step": 2330 }, { "epoch": 0.40317022742935904, "grad_norm": 2.2276456356048584, "learning_rate": 1.9378043418793438e-08, "logits/chosen": -3.0718817710876465, "logits/rejected": -3.0597751140594482, "logps/chosen": -52.65851974487305, "logps/rejected": -55.89642333984375, "loss": 0.6924, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.003569630440324545, "rewards/margins": 0.0016181267565116286, "rewards/rejected": -0.005187757313251495, "step": 2340 }, { "epoch": 0.4048931771192281, "grad_norm": 2.4954779148101807, "learning_rate": 1.936756120501258e-08, "logits/chosen": -3.0566139221191406, "logits/rejected": -3.0337390899658203, "logps/chosen": -58.45566940307617, "logps/rejected": -55.84288787841797, "loss": 0.6915, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0019436674192547798, "rewards/margins": 0.0033232986461371183, "rewards/rejected": -0.005266966298222542, "step": 2350 }, { "epoch": 0.4066161268090972, "grad_norm": 2.192545175552368, "learning_rate": 1.935699427965446e-08, "logits/chosen": -3.07662296295166, "logits/rejected": -3.0633997917175293, "logps/chosen": -50.66261291503906, "logps/rejected": -51.67461395263672, "loss": 0.6919, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0030053765513002872, "rewards/margins": 0.0024440365377813578, "rewards/rejected": -0.005449412856251001, "step": 2360 }, { "epoch": 0.4083390764989662, "grad_norm": 2.514657497406006, "learning_rate": 1.9346342738276593e-08, "logits/chosen": -3.0921218395233154, "logits/rejected": -3.0855157375335693, "logps/chosen": -54.12009811401367, "logps/rejected": -54.41124725341797, "loss": 0.6922, "rewards/accuracies": 0.625, "rewards/chosen": -0.0035813034046441317, "rewards/margins": 0.0018840819830074906, "rewards/rejected": -0.005465385504066944, "step": 2370 }, { "epoch": 0.4100620261888353, "grad_norm": 2.1899521350860596, "learning_rate": 1.93356066772017e-08, "logits/chosen": -3.0172626972198486, "logits/rejected": -2.9923696517944336, "logps/chosen": -54.54243850708008, "logps/rejected": -53.3748893737793, "loss": 0.6916, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0020027304999530315, "rewards/margins": 0.003100222907960415, "rewards/rejected": -0.005102953407913446, "step": 2380 }, { "epoch": 0.41178497587870433, "grad_norm": 2.286336660385132, "learning_rate": 1.9324786193516794e-08, "logits/chosen": -3.1032328605651855, "logits/rejected": -3.0758352279663086, "logps/chosen": -56.50042724609375, "logps/rejected": -53.74065017700195, "loss": 0.6913, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0026643401943147182, "rewards/margins": 0.0038009281270205975, "rewards/rejected": -0.006465268321335316, "step": 2390 }, { "epoch": 0.4135079255685734, "grad_norm": 2.1463067531585693, "learning_rate": 1.9313881385072357e-08, "logits/chosen": -3.151245355606079, "logits/rejected": -3.1303699016571045, "logps/chosen": -52.655494689941406, "logps/rejected": -53.832435607910156, "loss": 0.6921, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.003549426095560193, "rewards/margins": 0.002139916643500328, "rewards/rejected": -0.005689342971891165, "step": 2400 }, { "epoch": 0.4135079255685734, "eval_logits/chosen": -3.157942771911621, "eval_logits/rejected": -3.1523544788360596, "eval_logps/chosen": -58.57807159423828, "eval_logps/rejected": -63.113616943359375, "eval_loss": 0.6928165555000305, "eval_rewards/accuracies": 0.5499535202980042, "eval_rewards/chosen": 0.0013382199686020613, "eval_rewards/margins": 0.0006732027977705002, "eval_rewards/rejected": 0.0006650172872468829, "eval_runtime": 383.7102, "eval_samples_per_second": 11.217, "eval_steps_per_second": 1.402, "step": 2400 }, { "epoch": 0.41523087525844243, "grad_norm": 2.061424970626831, "learning_rate": 1.9302892350481398e-08, "logits/chosen": -3.1260993480682373, "logits/rejected": -3.0860531330108643, "logps/chosen": -54.02878952026367, "logps/rejected": -49.81685256958008, "loss": 0.6913, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0017845083493739367, "rewards/margins": 0.003701354144141078, "rewards/rejected": -0.005485862959176302, "step": 2410 }, { "epoch": 0.4169538249483115, "grad_norm": 2.0637965202331543, "learning_rate": 1.9291819189118608e-08, "logits/chosen": -3.1225945949554443, "logits/rejected": -3.0999155044555664, "logps/chosen": -56.397972106933594, "logps/rejected": -55.43430709838867, "loss": 0.6916, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0021585775539278984, "rewards/margins": 0.003159626852720976, "rewards/rejected": -0.005318204872310162, "step": 2420 }, { "epoch": 0.41867677463818054, "grad_norm": 2.2857463359832764, "learning_rate": 1.9280662001119444e-08, "logits/chosen": -3.085228443145752, "logits/rejected": -3.062695026397705, "logps/chosen": -55.07440948486328, "logps/rejected": -52.27777099609375, "loss": 0.6916, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.002314269542694092, "rewards/margins": 0.00313003221526742, "rewards/rejected": -0.005444302223622799, "step": 2430 }, { "epoch": 0.4203997243280496, "grad_norm": 2.347783327102661, "learning_rate": 1.9269420887379205e-08, "logits/chosen": -3.0803112983703613, "logits/rejected": -3.0668201446533203, "logps/chosen": -55.31931686401367, "logps/rejected": -54.99340057373047, "loss": 0.692, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.002624379238113761, "rewards/margins": 0.0023085675202310085, "rewards/rejected": -0.004932946525514126, "step": 2440 }, { "epoch": 0.4221226740179187, "grad_norm": 2.415978193283081, "learning_rate": 1.9258095949552154e-08, "logits/chosen": -3.0366785526275635, "logits/rejected": -3.013326406478882, "logps/chosen": -53.44938278198242, "logps/rejected": -52.85466766357422, "loss": 0.6921, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.003910476807504892, "rewards/margins": 0.0020568217150866985, "rewards/rejected": -0.005967298522591591, "step": 2450 }, { "epoch": 0.4238456237077877, "grad_norm": 2.3307220935821533, "learning_rate": 1.9246687290050577e-08, "logits/chosen": -3.020193576812744, "logits/rejected": -2.98730731010437, "logps/chosen": -56.455039978027344, "logps/rejected": -53.13280487060547, "loss": 0.6917, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.003136079292744398, "rewards/margins": 0.0028621095698326826, "rewards/rejected": -0.005998189095407724, "step": 2460 }, { "epoch": 0.4255685733976568, "grad_norm": 2.3056654930114746, "learning_rate": 1.923519501204386e-08, "logits/chosen": -3.1820156574249268, "logits/rejected": -3.160578966140747, "logps/chosen": -55.23564910888672, "logps/rejected": -53.224754333496094, "loss": 0.6915, "rewards/accuracies": 0.625, "rewards/chosen": -0.0019143400713801384, "rewards/margins": 0.003250572830438614, "rewards/rejected": -0.005164912901818752, "step": 2470 }, { "epoch": 0.4272915230875258, "grad_norm": 2.56144642829895, "learning_rate": 1.9223619219457556e-08, "logits/chosen": -3.0781826972961426, "logits/rejected": -3.046678066253662, "logps/chosen": -54.766204833984375, "logps/rejected": -51.062870025634766, "loss": 0.6919, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.003880419535562396, "rewards/margins": 0.00249149976298213, "rewards/rejected": -0.0063719190657138824, "step": 2480 }, { "epoch": 0.4290144727773949, "grad_norm": 2.0401649475097656, "learning_rate": 1.9211960016972447e-08, "logits/chosen": -3.061281681060791, "logits/rejected": -3.0525238513946533, "logps/chosen": -50.482879638671875, "logps/rejected": -52.11334228515625, "loss": 0.6928, "rewards/accuracies": 0.53125, "rewards/chosen": -0.004440463148057461, "rewards/margins": 0.0007436785381287336, "rewards/rejected": -0.005184141453355551, "step": 2490 }, { "epoch": 0.43073742246726393, "grad_norm": 2.23286509513855, "learning_rate": 1.9200217510023604e-08, "logits/chosen": -3.140615224838257, "logits/rejected": -3.116429090499878, "logps/chosen": -57.23732376098633, "logps/rejected": -57.09992218017578, "loss": 0.6922, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0034662075340747833, "rewards/margins": 0.0018366838339716196, "rewards/rejected": -0.005302890669554472, "step": 2500 }, { "epoch": 0.43073742246726393, "eval_logits/chosen": -3.1574504375457764, "eval_logits/rejected": -3.1518211364746094, "eval_logps/chosen": -58.564754486083984, "eval_logps/rejected": -63.113121032714844, "eval_loss": 0.6927535533905029, "eval_rewards/accuracies": 0.5601765513420105, "eval_rewards/chosen": 0.0014713724376633763, "eval_rewards/margins": 0.0008013962069526315, "eval_rewards/rejected": 0.0006699761725030839, "eval_runtime": 383.6708, "eval_samples_per_second": 11.218, "eval_steps_per_second": 1.402, "step": 2500 }, { "epoch": 0.432460372157133, "grad_norm": 2.3200342655181885, "learning_rate": 1.9188391804799416e-08, "logits/chosen": -2.9886715412139893, "logits/rejected": -2.978713274002075, "logps/chosen": -52.80669403076172, "logps/rejected": -52.61443328857422, "loss": 0.6921, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.004210265818983316, "rewards/margins": 0.002118567703291774, "rewards/rejected": -0.006328833755105734, "step": 2510 }, { "epoch": 0.4341833218470021, "grad_norm": 2.2182769775390625, "learning_rate": 1.9176483008240652e-08, "logits/chosen": -2.9787347316741943, "logits/rejected": -2.9508373737335205, "logps/chosen": -52.73773193359375, "logps/rejected": -49.517173767089844, "loss": 0.6919, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0026951669715344906, "rewards/margins": 0.0024579425808042288, "rewards/rejected": -0.005153109785169363, "step": 2520 }, { "epoch": 0.4359062715368711, "grad_norm": 2.1345114707946777, "learning_rate": 1.916449122803947e-08, "logits/chosen": -3.067570924758911, "logits/rejected": -3.0630366802215576, "logps/chosen": -51.600563049316406, "logps/rejected": -54.767860412597656, "loss": 0.6922, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.004309863317757845, "rewards/margins": 0.0018437877297401428, "rewards/rejected": -0.0061536505818367004, "step": 2530 }, { "epoch": 0.4376292212267402, "grad_norm": 2.2189536094665527, "learning_rate": 1.9152416572638466e-08, "logits/chosen": -3.1077988147735596, "logits/rejected": -3.0970089435577393, "logps/chosen": -54.01378631591797, "logps/rejected": -54.51206588745117, "loss": 0.6922, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.004733550362288952, "rewards/margins": 0.0019095508614555001, "rewards/rejected": -0.006643100641667843, "step": 2540 }, { "epoch": 0.4393521709166092, "grad_norm": 2.328441619873047, "learning_rate": 1.9140259151229674e-08, "logits/chosen": -3.0512402057647705, "logits/rejected": -3.018428325653076, "logps/chosen": -58.816741943359375, "logps/rejected": -54.53766632080078, "loss": 0.691, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.00133642612490803, "rewards/margins": 0.0043244450353085995, "rewards/rejected": -0.005660871509462595, "step": 2550 }, { "epoch": 0.4410751206064783, "grad_norm": 2.3890929222106934, "learning_rate": 1.9128019073753598e-08, "logits/chosen": -3.140418291091919, "logits/rejected": -3.1156299114227295, "logps/chosen": -55.52399826049805, "logps/rejected": -54.633018493652344, "loss": 0.6914, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0026364210061728954, "rewards/margins": 0.0034923895727843046, "rewards/rejected": -0.006128811277449131, "step": 2560 }, { "epoch": 0.4427980702963473, "grad_norm": 2.3404576778411865, "learning_rate": 1.9115696450898193e-08, "logits/chosen": -3.073380947113037, "logits/rejected": -3.0549139976501465, "logps/chosen": -58.83415985107422, "logps/rejected": -57.058067321777344, "loss": 0.6916, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.002324377652257681, "rewards/margins": 0.0032200622372329235, "rewards/rejected": -0.005544439889490604, "step": 2570 }, { "epoch": 0.4445210199862164, "grad_norm": 2.5390334129333496, "learning_rate": 1.9103291394097894e-08, "logits/chosen": -3.1005444526672363, "logits/rejected": -3.0815181732177734, "logps/chosen": -53.89350128173828, "logps/rejected": -53.769981384277344, "loss": 0.6928, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.005077004432678223, "rewards/margins": 0.0007953291060402989, "rewards/rejected": -0.0058723329566419125, "step": 2580 }, { "epoch": 0.4462439696760855, "grad_norm": 2.608225107192993, "learning_rate": 1.9090804015532585e-08, "logits/chosen": -3.0780763626098633, "logits/rejected": -3.040466785430908, "logps/chosen": -57.0039176940918, "logps/rejected": -52.00291061401367, "loss": 0.6909, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.003094766056165099, "rewards/margins": 0.004467259161174297, "rewards/rejected": -0.007562024984508753, "step": 2590 }, { "epoch": 0.4479669193659545, "grad_norm": 2.4454474449157715, "learning_rate": 1.9078234428126585e-08, "logits/chosen": -3.0497794151306152, "logits/rejected": -3.0013108253479004, "logps/chosen": -57.414405822753906, "logps/rejected": -49.24188995361328, "loss": 0.6909, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.003182294312864542, "rewards/margins": 0.004597029648721218, "rewards/rejected": -0.007779325358569622, "step": 2600 }, { "epoch": 0.4479669193659545, "eval_logits/chosen": -3.1568431854248047, "eval_logits/rejected": -3.1512274742126465, "eval_logps/chosen": -58.55170822143555, "eval_logps/rejected": -63.10794448852539, "eval_loss": 0.6927151679992676, "eval_rewards/accuracies": 0.5580855011940002, "eval_rewards/chosen": 0.0016019355971366167, "eval_rewards/margins": 0.0008801804506219923, "eval_rewards/rejected": 0.0007217551465146244, "eval_runtime": 383.6818, "eval_samples_per_second": 11.218, "eval_steps_per_second": 1.402, "step": 2600 }, { "epoch": 0.4496898690558236, "grad_norm": 2.163145065307617, "learning_rate": 1.9065582745547646e-08, "logits/chosen": -3.0288872718811035, "logits/rejected": -2.989107131958008, "logps/chosen": -58.72133255004883, "logps/rejected": -52.37580108642578, "loss": 0.6914, "rewards/accuracies": 0.625, "rewards/chosen": -0.0034227855503559113, "rewards/margins": 0.0036105778999626637, "rewards/rejected": -0.007033363915979862, "step": 2610 }, { "epoch": 0.4514128187456926, "grad_norm": 2.1911659240722656, "learning_rate": 1.9052849082205908e-08, "logits/chosen": -3.1205995082855225, "logits/rejected": -3.0899150371551514, "logps/chosen": -51.6873664855957, "logps/rejected": -51.2896614074707, "loss": 0.6911, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.004214108921587467, "rewards/margins": 0.004206494893878698, "rewards/rejected": -0.008420604281127453, "step": 2620 }, { "epoch": 0.4531357684355617, "grad_norm": 2.6254820823669434, "learning_rate": 1.9040033553252865e-08, "logits/chosen": -3.0019752979278564, "logits/rejected": -2.9693052768707275, "logps/chosen": -55.170799255371094, "logps/rejected": -53.92267990112305, "loss": 0.6911, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0035726067144423723, "rewards/margins": 0.004099712241441011, "rewards/rejected": -0.00767231872305274, "step": 2630 }, { "epoch": 0.4548587181254307, "grad_norm": 2.2283997535705566, "learning_rate": 1.9027136274580334e-08, "logits/chosen": -3.0246224403381348, "logits/rejected": -3.0031864643096924, "logps/chosen": -50.46599578857422, "logps/rejected": -49.7197151184082, "loss": 0.6914, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0037057590670883656, "rewards/margins": 0.0036159877199679613, "rewards/rejected": -0.007321746554225683, "step": 2640 }, { "epoch": 0.4565816678152998, "grad_norm": 2.218834638595581, "learning_rate": 1.90141573628194e-08, "logits/chosen": -2.9965646266937256, "logits/rejected": -2.97887921333313, "logps/chosen": -52.707855224609375, "logps/rejected": -54.880126953125, "loss": 0.6916, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0027702811639755964, "rewards/margins": 0.0031878617592155933, "rewards/rejected": -0.005958142690360546, "step": 2650 }, { "epoch": 0.4583046175051689, "grad_norm": 2.5716030597686768, "learning_rate": 1.9001096935339365e-08, "logits/chosen": -3.025402784347534, "logits/rejected": -2.982842445373535, "logps/chosen": -58.6676139831543, "logps/rejected": -53.14191436767578, "loss": 0.6917, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.003777928650379181, "rewards/margins": 0.0028960234485566616, "rewards/rejected": -0.006673953030258417, "step": 2660 }, { "epoch": 0.4600275671950379, "grad_norm": 2.253568172454834, "learning_rate": 1.898795511024667e-08, "logits/chosen": -2.997163772583008, "logits/rejected": -2.968554973602295, "logps/chosen": -54.23125076293945, "logps/rejected": -52.85374069213867, "loss": 0.6906, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0028527709655463696, "rewards/margins": 0.005068852566182613, "rewards/rejected": -0.007921623066067696, "step": 2670 }, { "epoch": 0.461750516884907, "grad_norm": 2.7172999382019043, "learning_rate": 1.8974732006383862e-08, "logits/chosen": -3.0443577766418457, "logits/rejected": -3.016765594482422, "logps/chosen": -58.591461181640625, "logps/rejected": -54.34435272216797, "loss": 0.6915, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0034392178058624268, "rewards/margins": 0.0033766250126063824, "rewards/rejected": -0.0068158432841300964, "step": 2680 }, { "epoch": 0.463473466574776, "grad_norm": 2.3720362186431885, "learning_rate": 1.8961427743328484e-08, "logits/chosen": -3.0178565979003906, "logits/rejected": -2.9950802326202393, "logps/chosen": -51.14887237548828, "logps/rejected": -50.86174774169922, "loss": 0.6908, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.005077795125544071, "rewards/margins": 0.0046742986887693405, "rewards/rejected": -0.009752093814313412, "step": 2690 }, { "epoch": 0.4651964162646451, "grad_norm": 2.2392497062683105, "learning_rate": 1.8948042441392008e-08, "logits/chosen": -3.0468106269836426, "logits/rejected": -3.0200257301330566, "logps/chosen": -53.548301696777344, "logps/rejected": -54.21494674682617, "loss": 0.6911, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.003175341058522463, "rewards/margins": 0.004210122860968113, "rewards/rejected": -0.0073854634538292885, "step": 2700 }, { "epoch": 0.4651964162646451, "eval_logits/chosen": -3.156172752380371, "eval_logits/rejected": -3.15053391456604, "eval_logps/chosen": -58.5521354675293, "eval_logps/rejected": -63.113643646240234, "eval_loss": 0.6926901340484619, "eval_rewards/accuracies": 0.5627323389053345, "eval_rewards/chosen": 0.001597628928720951, "eval_rewards/margins": 0.0009329087333753705, "eval_rewards/rejected": 0.0006647202535532415, "eval_runtime": 383.58, "eval_samples_per_second": 11.221, "eval_steps_per_second": 1.403, "step": 2700 }, { "epoch": 0.4669193659545141, "grad_norm": 2.4848990440368652, "learning_rate": 1.893457622161875e-08, "logits/chosen": -3.0706381797790527, "logits/rejected": -3.051726818084717, "logps/chosen": -59.67988967895508, "logps/rejected": -55.850067138671875, "loss": 0.6918, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0023324606008827686, "rewards/margins": 0.0026608449406921864, "rewards/rejected": -0.004993305075913668, "step": 2710 }, { "epoch": 0.4686423156443832, "grad_norm": 2.4114415645599365, "learning_rate": 1.8921029205784776e-08, "logits/chosen": -3.0598363876342773, "logits/rejected": -3.0630555152893066, "logps/chosen": -52.59183883666992, "logps/rejected": -54.42897415161133, "loss": 0.6926, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.005445893853902817, "rewards/margins": 0.0012161575723439455, "rewards/rejected": -0.006662050727754831, "step": 2720 }, { "epoch": 0.4703652653342522, "grad_norm": 2.2148189544677734, "learning_rate": 1.890740151639679e-08, "logits/chosen": -3.0595743656158447, "logits/rejected": -3.0394186973571777, "logps/chosen": -58.775352478027344, "logps/rejected": -55.89299392700195, "loss": 0.6909, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0020863967947661877, "rewards/margins": 0.004591117147356272, "rewards/rejected": -0.006677514407783747, "step": 2730 }, { "epoch": 0.4720882150241213, "grad_norm": 2.4539008140563965, "learning_rate": 1.8893693276691043e-08, "logits/chosen": -3.049955368041992, "logits/rejected": -3.028224468231201, "logps/chosen": -53.324928283691406, "logps/rejected": -50.06505584716797, "loss": 0.6921, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004656701814383268, "rewards/margins": 0.002166991587728262, "rewards/rejected": -0.00682369340211153, "step": 2740 }, { "epoch": 0.4738111647139904, "grad_norm": 2.2547569274902344, "learning_rate": 1.8879904610632196e-08, "logits/chosen": -2.9805612564086914, "logits/rejected": -2.975369930267334, "logps/chosen": -49.73875427246094, "logps/rejected": -54.2640495300293, "loss": 0.6914, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0047501143999397755, "rewards/margins": 0.003620445728302002, "rewards/rejected": -0.008370560593903065, "step": 2750 }, { "epoch": 0.4755341144038594, "grad_norm": 2.424126625061035, "learning_rate": 1.8866035642912217e-08, "logits/chosen": -3.030247211456299, "logits/rejected": -3.01254940032959, "logps/chosen": -54.67096710205078, "logps/rejected": -55.503089904785156, "loss": 0.6917, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.003959754481911659, "rewards/margins": 0.0030137132853269577, "rewards/rejected": -0.006973467767238617, "step": 2760 }, { "epoch": 0.4772570640937285, "grad_norm": 2.4591829776763916, "learning_rate": 1.885208649894925e-08, "logits/chosen": -3.1635355949401855, "logits/rejected": -3.133530616760254, "logps/chosen": -55.5648078918457, "logps/rejected": -53.63425827026367, "loss": 0.692, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0035004555247724056, "rewards/margins": 0.002456140238791704, "rewards/rejected": -0.00595659576356411, "step": 2770 }, { "epoch": 0.4789800137835975, "grad_norm": 2.2963922023773193, "learning_rate": 1.8838057304886483e-08, "logits/chosen": -2.9820423126220703, "logits/rejected": -2.951641082763672, "logps/chosen": -53.36084747314453, "logps/rejected": -51.40031814575195, "loss": 0.6919, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.005320178810507059, "rewards/margins": 0.002531546400859952, "rewards/rejected": -0.007851725444197655, "step": 2780 }, { "epoch": 0.4807029634734666, "grad_norm": 2.3971450328826904, "learning_rate": 1.8823948187590994e-08, "logits/chosen": -3.1007752418518066, "logits/rejected": -3.066845655441284, "logps/chosen": -51.98784255981445, "logps/rejected": -50.7843132019043, "loss": 0.6909, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.002275830367580056, "rewards/margins": 0.004438784904778004, "rewards/rejected": -0.006714615970849991, "step": 2790 }, { "epoch": 0.4824259131633356, "grad_norm": 2.691331148147583, "learning_rate": 1.8809759274652614e-08, "logits/chosen": -3.080821990966797, "logits/rejected": -3.055692195892334, "logps/chosen": -60.45173263549805, "logps/rejected": -57.539276123046875, "loss": 0.6917, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.004160616081207991, "rewards/margins": 0.002856313716620207, "rewards/rejected": -0.0070169297978281975, "step": 2800 }, { "epoch": 0.4824259131633356, "eval_logits/chosen": -3.1558778285980225, "eval_logits/rejected": -3.150256395339966, "eval_logps/chosen": -58.53827667236328, "eval_logps/rejected": -63.10443878173828, "eval_loss": 0.6926683783531189, "eval_rewards/accuracies": 0.5506505370140076, "eval_rewards/chosen": 0.0017361408099532127, "eval_rewards/margins": 0.0009792475029826164, "eval_rewards/rejected": 0.0007568933651782572, "eval_runtime": 383.2845, "eval_samples_per_second": 11.229, "eval_steps_per_second": 1.404, "step": 2800 }, { "epoch": 0.4841488628532047, "grad_norm": 2.172553300857544, "learning_rate": 1.8795490694382782e-08, "logits/chosen": -2.971768617630005, "logits/rejected": -2.9470152854919434, "logps/chosen": -55.66022872924805, "logps/rejected": -56.47652053833008, "loss": 0.6912, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0032442144583910704, "rewards/margins": 0.0038853243459016085, "rewards/rejected": -0.007129538804292679, "step": 2810 }, { "epoch": 0.48587181254307377, "grad_norm": 2.5765879154205322, "learning_rate": 1.8781142575813362e-08, "logits/chosen": -3.121720314025879, "logits/rejected": -3.109372615814209, "logps/chosen": -54.74897384643555, "logps/rejected": -53.752784729003906, "loss": 0.6916, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.00421771639958024, "rewards/margins": 0.003111992496997118, "rewards/rejected": -0.007329708896577358, "step": 2820 }, { "epoch": 0.4875947622329428, "grad_norm": 2.184145212173462, "learning_rate": 1.8766715048695498e-08, "logits/chosen": -2.9223580360412598, "logits/rejected": -2.9071013927459717, "logps/chosen": -55.8566780090332, "logps/rejected": -55.659027099609375, "loss": 0.6915, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.003561650635674596, "rewards/margins": 0.0033836769871413708, "rewards/rejected": -0.00694532785564661, "step": 2830 }, { "epoch": 0.48931771192281187, "grad_norm": 2.2509357929229736, "learning_rate": 1.875220824349843e-08, "logits/chosen": -3.0975735187530518, "logits/rejected": -3.086153507232666, "logps/chosen": -53.18671417236328, "logps/rejected": -52.27134323120117, "loss": 0.6915, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.005504312459379435, "rewards/margins": 0.0034088040702044964, "rewards/rejected": -0.008913116529583931, "step": 2840 }, { "epoch": 0.4910406616126809, "grad_norm": 2.483192205429077, "learning_rate": 1.873762229140831e-08, "logits/chosen": -3.0591354370117188, "logits/rejected": -3.039513111114502, "logps/chosen": -52.63386917114258, "logps/rejected": -55.8605842590332, "loss": 0.6901, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0038743845652788877, "rewards/margins": 0.006244526244699955, "rewards/rejected": -0.010118911042809486, "step": 2850 }, { "epoch": 0.49276361130255, "grad_norm": 2.017402172088623, "learning_rate": 1.872295732432703e-08, "logits/chosen": -3.0378129482269287, "logits/rejected": -3.0134940147399902, "logps/chosen": -55.37115478515625, "logps/rejected": -52.80224609375, "loss": 0.6911, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.004066367633640766, "rewards/margins": 0.004180265124887228, "rewards/rejected": -0.008246633224189281, "step": 2860 }, { "epoch": 0.494486560992419, "grad_norm": 2.3846540451049805, "learning_rate": 1.8708213474871015e-08, "logits/chosen": -3.0799076557159424, "logits/rejected": -3.055649757385254, "logps/chosen": -56.450767517089844, "logps/rejected": -53.93091583251953, "loss": 0.6904, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0033499475102871656, "rewards/margins": 0.005589387379586697, "rewards/rejected": -0.008939335122704506, "step": 2870 }, { "epoch": 0.4962095106822881, "grad_norm": 2.6427111625671387, "learning_rate": 1.8693390876370032e-08, "logits/chosen": -3.156221389770508, "logits/rejected": -3.129972219467163, "logps/chosen": -55.811683654785156, "logps/rejected": -51.90478515625, "loss": 0.6906, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0046365829184651375, "rewards/margins": 0.005148191004991531, "rewards/rejected": -0.009784774854779243, "step": 2880 }, { "epoch": 0.49793246037215716, "grad_norm": 2.230509042739868, "learning_rate": 1.867848966286598e-08, "logits/chosen": -3.1655468940734863, "logits/rejected": -3.160212993621826, "logps/chosen": -53.45560836791992, "logps/rejected": -53.241310119628906, "loss": 0.6921, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.004979230929166079, "rewards/margins": 0.0022403167095035315, "rewards/rejected": -0.007219547871500254, "step": 2890 }, { "epoch": 0.4996554100620262, "grad_norm": 2.3694610595703125, "learning_rate": 1.8663509969111677e-08, "logits/chosen": -3.084071636199951, "logits/rejected": -3.0744006633758545, "logps/chosen": -52.7962532043457, "logps/rejected": -52.0960578918457, "loss": 0.6919, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0030820216052234173, "rewards/margins": 0.002532669808715582, "rewards/rejected": -0.005614691413938999, "step": 2900 }, { "epoch": 0.4996554100620262, "eval_logits/chosen": -3.155229330062866, "eval_logits/rejected": -3.1495659351348877, "eval_logps/chosen": -58.53911209106445, "eval_logps/rejected": -63.11810302734375, "eval_loss": 0.6926056742668152, "eval_rewards/accuracies": 0.5608736276626587, "eval_rewards/chosen": 0.001727823168039322, "eval_rewards/margins": 0.0011076563969254494, "eval_rewards/rejected": 0.0006201668875291944, "eval_runtime": 383.253, "eval_samples_per_second": 11.23, "eval_steps_per_second": 1.404, "step": 2900 }, { "epoch": 0.5013783597518953, "grad_norm": 2.7788450717926025, "learning_rate": 1.8648451930569647e-08, "logits/chosen": -3.1230292320251465, "logits/rejected": -3.1110596656799316, "logps/chosen": -56.4962158203125, "logps/rejected": -56.69160842895508, "loss": 0.6912, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0038903437089174986, "rewards/margins": 0.004020698834210634, "rewards/rejected": -0.007911041378974915, "step": 2910 }, { "epoch": 0.5031013094417643, "grad_norm": 2.253162145614624, "learning_rate": 1.8633315683410898e-08, "logits/chosen": -3.0678889751434326, "logits/rejected": -3.0639030933380127, "logps/chosen": -54.0466194152832, "logps/rejected": -56.43535614013672, "loss": 0.6909, "rewards/accuracies": 0.65625, "rewards/chosen": -0.004764976445585489, "rewards/margins": 0.004528032150119543, "rewards/rejected": -0.009293009527027607, "step": 2920 }, { "epoch": 0.5048242591316333, "grad_norm": 2.342252254486084, "learning_rate": 1.8618101364513675e-08, "logits/chosen": -3.0404350757598877, "logits/rejected": -3.0122365951538086, "logps/chosen": -53.86481857299805, "logps/rejected": -52.56145477294922, "loss": 0.6908, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.004135873168706894, "rewards/margins": 0.00479888916015625, "rewards/rejected": -0.008934763260185719, "step": 2930 }, { "epoch": 0.5065472088215024, "grad_norm": 2.1050474643707275, "learning_rate": 1.8602809111462233e-08, "logits/chosen": -3.071373462677002, "logits/rejected": -3.0339467525482178, "logps/chosen": -51.91243362426758, "logps/rejected": -51.6020622253418, "loss": 0.6914, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.004520986694842577, "rewards/margins": 0.00364798866212368, "rewards/rejected": -0.00816897489130497, "step": 2940 }, { "epoch": 0.5082701585113715, "grad_norm": 2.170177936553955, "learning_rate": 1.8587439062545598e-08, "logits/chosen": -3.1068174839019775, "logits/rejected": -3.0881857872009277, "logps/chosen": -54.690635681152344, "logps/rejected": -55.344505310058594, "loss": 0.691, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0036951780784875154, "rewards/margins": 0.004416085779666901, "rewards/rejected": -0.008111263625323772, "step": 2950 }, { "epoch": 0.5099931082012406, "grad_norm": 2.382573366165161, "learning_rate": 1.8571991356756304e-08, "logits/chosen": -3.0656509399414062, "logits/rejected": -3.038205623626709, "logps/chosen": -54.41986083984375, "logps/rejected": -52.79435348510742, "loss": 0.6911, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0047190822660923, "rewards/margins": 0.004267274402081966, "rewards/rejected": -0.008986357599496841, "step": 2960 }, { "epoch": 0.5117160578911096, "grad_norm": 2.9030227661132812, "learning_rate": 1.8556466133789146e-08, "logits/chosen": -2.9982943534851074, "logits/rejected": -2.9701733589172363, "logps/chosen": -55.818634033203125, "logps/rejected": -53.390159606933594, "loss": 0.6912, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.005811252165585756, "rewards/margins": 0.00395150575786829, "rewards/rejected": -0.009762757457792759, "step": 2970 }, { "epoch": 0.5134390075809786, "grad_norm": 2.39404034614563, "learning_rate": 1.8540863534039903e-08, "logits/chosen": -2.9953501224517822, "logits/rejected": -2.9691390991210938, "logps/chosen": -54.135398864746094, "logps/rejected": -53.153411865234375, "loss": 0.6898, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.001961924834176898, "rewards/margins": 0.0067636966705322266, "rewards/rejected": -0.008725621737539768, "step": 2980 }, { "epoch": 0.5151619572708477, "grad_norm": 2.3001177310943604, "learning_rate": 1.8525183698604096e-08, "logits/chosen": -3.04237699508667, "logits/rejected": -3.014385938644409, "logps/chosen": -56.99365234375, "logps/rejected": -55.51732635498047, "loss": 0.6906, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0038381018675863743, "rewards/margins": 0.005188227631151676, "rewards/rejected": -0.009026329033076763, "step": 2990 }, { "epoch": 0.5168849069607168, "grad_norm": 1.937101125717163, "learning_rate": 1.8509426769275677e-08, "logits/chosen": -3.0544161796569824, "logits/rejected": -3.048501968383789, "logps/chosen": -52.471229553222656, "logps/rejected": -54.636680603027344, "loss": 0.6918, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.004588799085468054, "rewards/margins": 0.002834170823916793, "rewards/rejected": -0.00742297014221549, "step": 3000 }, { "epoch": 0.5168849069607168, "eval_logits/chosen": -3.154435634613037, "eval_logits/rejected": -3.148829460144043, "eval_logps/chosen": -58.526187896728516, "eval_logps/rejected": -63.12165832519531, "eval_loss": 0.6925256848335266, "eval_rewards/accuracies": 0.5606412887573242, "eval_rewards/chosen": 0.0018571042455732822, "eval_rewards/margins": 0.0012724484549835324, "eval_rewards/rejected": 0.0005846557905897498, "eval_runtime": 383.6244, "eval_samples_per_second": 11.219, "eval_steps_per_second": 1.402, "step": 3000 }, { "epoch": 0.5186078566505858, "grad_norm": 2.4951703548431396, "learning_rate": 1.8493592888545773e-08, "logits/chosen": -3.080109119415283, "logits/rejected": -3.0565946102142334, "logps/chosen": -55.65546798706055, "logps/rejected": -53.9165153503418, "loss": 0.6914, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004678068216890097, "rewards/margins": 0.0035416565369814634, "rewards/rejected": -0.008219725452363491, "step": 3010 }, { "epoch": 0.5203308063404548, "grad_norm": 2.145470142364502, "learning_rate": 1.8477682199601388e-08, "logits/chosen": -3.166893720626831, "logits/rejected": -3.134974956512451, "logps/chosen": -54.71784591674805, "logps/rejected": -51.342918395996094, "loss": 0.6905, "rewards/accuracies": 0.59375, "rewards/chosen": -0.004793129861354828, "rewards/margins": 0.0054800366051495075, "rewards/rejected": -0.010273166000843048, "step": 3020 }, { "epoch": 0.5220537560303239, "grad_norm": 2.2682783603668213, "learning_rate": 1.8461694846324108e-08, "logits/chosen": -3.0667412281036377, "logits/rejected": -3.044809341430664, "logps/chosen": -56.32188034057617, "logps/rejected": -55.66552734375, "loss": 0.6925, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.006469712592661381, "rewards/margins": 0.00129136280156672, "rewards/rejected": -0.00776107469573617, "step": 3030 }, { "epoch": 0.523776705720193, "grad_norm": 2.118988275527954, "learning_rate": 1.84456309732888e-08, "logits/chosen": -3.171809196472168, "logits/rejected": -3.1654465198516846, "logps/chosen": -51.78215789794922, "logps/rejected": -55.33644485473633, "loss": 0.6911, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.005943716503679752, "rewards/margins": 0.004264301620423794, "rewards/rejected": -0.010208018124103546, "step": 3040 }, { "epoch": 0.525499655410062, "grad_norm": 1.9871397018432617, "learning_rate": 1.84294907257623e-08, "logits/chosen": -3.030111789703369, "logits/rejected": -3.016605854034424, "logps/chosen": -54.00185012817383, "logps/rejected": -56.15374755859375, "loss": 0.6914, "rewards/accuracies": 0.625, "rewards/chosen": -0.004437591414898634, "rewards/margins": 0.0035246661864221096, "rewards/rejected": -0.007962257601320744, "step": 3050 }, { "epoch": 0.5272226050999311, "grad_norm": 2.395477533340454, "learning_rate": 1.8413274249702112e-08, "logits/chosen": -3.0659689903259277, "logits/rejected": -3.0472943782806396, "logps/chosen": -56.21149444580078, "logps/rejected": -55.056556701660156, "loss": 0.6924, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0053570386953651905, "rewards/margins": 0.0015337929362431169, "rewards/rejected": -0.006890831049531698, "step": 3060 }, { "epoch": 0.5289455547898001, "grad_norm": 2.4162089824676514, "learning_rate": 1.839698169175508e-08, "logits/chosen": -3.0253570079803467, "logits/rejected": -2.9994709491729736, "logps/chosen": -58.442176818847656, "logps/rejected": -54.90386199951172, "loss": 0.6909, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.004995794501155615, "rewards/margins": 0.004661207552999258, "rewards/rejected": -0.009657002054154873, "step": 3070 }, { "epoch": 0.5306685044796692, "grad_norm": 2.121013641357422, "learning_rate": 1.8380613199256057e-08, "logits/chosen": -2.9383175373077393, "logits/rejected": -2.921355724334717, "logps/chosen": -51.76934051513672, "logps/rejected": -55.88709259033203, "loss": 0.6923, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.009108955040574074, "rewards/margins": 0.0017499884124845266, "rewards/rejected": -0.01085894275456667, "step": 3080 }, { "epoch": 0.5323914541695383, "grad_norm": 2.1115143299102783, "learning_rate": 1.836416892022658e-08, "logits/chosen": -3.023068428039551, "logits/rejected": -3.0035014152526855, "logps/chosen": -54.106475830078125, "logps/rejected": -54.712646484375, "loss": 0.6906, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.004943912848830223, "rewards/margins": 0.005139966495335102, "rewards/rejected": -0.010083879344165325, "step": 3090 }, { "epoch": 0.5341144038594073, "grad_norm": 2.477569341659546, "learning_rate": 1.8347649003373534e-08, "logits/chosen": -3.061750888824463, "logits/rejected": -3.0320868492126465, "logps/chosen": -54.39912033081055, "logps/rejected": -52.48120880126953, "loss": 0.691, "rewards/accuracies": 0.625, "rewards/chosen": -0.00478705670684576, "rewards/margins": 0.004317191429436207, "rewards/rejected": -0.009104247204959393, "step": 3100 }, { "epoch": 0.5341144038594073, "eval_logits/chosen": -3.154158592224121, "eval_logits/rejected": -3.148547887802124, "eval_logps/chosen": -58.521949768066406, "eval_logps/rejected": -63.126922607421875, "eval_loss": 0.6924790740013123, "eval_rewards/accuracies": 0.5669144988059998, "eval_rewards/chosen": 0.0018994332058355212, "eval_rewards/margins": 0.0013674128567799926, "eval_rewards/rejected": 0.0005320201744325459, "eval_runtime": 383.3353, "eval_samples_per_second": 11.228, "eval_steps_per_second": 1.403, "step": 3100 }, { "epoch": 0.5358373535492763, "grad_norm": 2.5770890712738037, "learning_rate": 1.8331053598087794e-08, "logits/chosen": -3.0007739067077637, "logits/rejected": -3.010702133178711, "logps/chosen": -51.33858108520508, "logps/rejected": -55.857177734375, "loss": 0.6923, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.007260960526764393, "rewards/margins": 0.001832063077017665, "rewards/rejected": -0.009093021973967552, "step": 3110 }, { "epoch": 0.5375603032391454, "grad_norm": 2.4113028049468994, "learning_rate": 1.8314382854442894e-08, "logits/chosen": -3.049830198287964, "logits/rejected": -3.025566339492798, "logps/chosen": -56.523887634277344, "logps/rejected": -56.6429328918457, "loss": 0.6916, "rewards/accuracies": 0.625, "rewards/chosen": -0.005131120793521404, "rewards/margins": 0.003246209817007184, "rewards/rejected": -0.008377330377697945, "step": 3120 }, { "epoch": 0.5392832529290145, "grad_norm": 2.1546311378479004, "learning_rate": 1.8297636923193653e-08, "logits/chosen": -2.966508626937866, "logits/rejected": -2.9583373069763184, "logps/chosen": -52.656578063964844, "logps/rejected": -54.178009033203125, "loss": 0.6919, "rewards/accuracies": 0.59375, "rewards/chosen": -0.00575623381882906, "rewards/margins": 0.002625588094815612, "rewards/rejected": -0.008381822146475315, "step": 3130 }, { "epoch": 0.5410062026188835, "grad_norm": 2.1864230632781982, "learning_rate": 1.828081595577481e-08, "logits/chosen": -3.084721088409424, "logits/rejected": -3.0690340995788574, "logps/chosen": -52.35600662231445, "logps/rejected": -54.71647262573242, "loss": 0.6916, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006588843651115894, "rewards/margins": 0.003160575870424509, "rewards/rejected": -0.00974941998720169, "step": 3140 }, { "epoch": 0.5427291523087526, "grad_norm": 2.3320834636688232, "learning_rate": 1.8263920104299668e-08, "logits/chosen": -3.0859665870666504, "logits/rejected": -3.0568442344665527, "logps/chosen": -54.78154373168945, "logps/rejected": -54.6657829284668, "loss": 0.6899, "rewards/accuracies": 0.625, "rewards/chosen": -0.004125826992094517, "rewards/margins": 0.006509957369416952, "rewards/rejected": -0.010635784827172756, "step": 3150 }, { "epoch": 0.5444521019986216, "grad_norm": 2.330270767211914, "learning_rate": 1.824694952155872e-08, "logits/chosen": -3.0266149044036865, "logits/rejected": -2.997025728225708, "logps/chosen": -54.46602249145508, "logps/rejected": -53.62001419067383, "loss": 0.6899, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0031971274875104427, "rewards/margins": 0.006669213529676199, "rewards/rejected": -0.009866341017186642, "step": 3160 }, { "epoch": 0.5461750516884907, "grad_norm": 2.5265400409698486, "learning_rate": 1.822990436101825e-08, "logits/chosen": -3.052417516708374, "logits/rejected": -3.019367218017578, "logps/chosen": -56.2938346862793, "logps/rejected": -50.660343170166016, "loss": 0.6894, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.004044114612042904, "rewards/margins": 0.007699139416217804, "rewards/rejected": -0.011743253096938133, "step": 3170 }, { "epoch": 0.5478980013783598, "grad_norm": 2.0642013549804688, "learning_rate": 1.8212784776818955e-08, "logits/chosen": -3.1040587425231934, "logits/rejected": -3.0662312507629395, "logps/chosen": -53.784767150878906, "logps/rejected": -52.4791145324707, "loss": 0.6906, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005283757578581572, "rewards/margins": 0.005227072164416313, "rewards/rejected": -0.010510829277336597, "step": 3180 }, { "epoch": 0.5496209510682288, "grad_norm": 2.294727325439453, "learning_rate": 1.8195590923774554e-08, "logits/chosen": -3.1036376953125, "logits/rejected": -3.0982773303985596, "logps/chosen": -52.858741760253906, "logps/rejected": -57.697471618652344, "loss": 0.6908, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.005610528867691755, "rewards/margins": 0.004711043555289507, "rewards/rejected": -0.010321573354303837, "step": 3190 }, { "epoch": 0.5513439007580979, "grad_norm": 2.7669525146484375, "learning_rate": 1.8178322957370386e-08, "logits/chosen": -3.0280566215515137, "logits/rejected": -3.013529062271118, "logps/chosen": -55.881103515625, "logps/rejected": -52.9276008605957, "loss": 0.692, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.00552431819960475, "rewards/margins": 0.0024294995237141848, "rewards/rejected": -0.007953817956149578, "step": 3200 }, { "epoch": 0.5513439007580979, "eval_logits/chosen": -3.1533045768737793, "eval_logits/rejected": -3.1476891040802, "eval_logps/chosen": -58.526756286621094, "eval_logps/rejected": -63.130882263183594, "eval_loss": 0.6924848556518555, "eval_rewards/accuracies": 0.5606412887573242, "eval_rewards/chosen": 0.00185136660002172, "eval_rewards/margins": 0.0013589225709438324, "eval_rewards/rejected": 0.0004924440290778875, "eval_runtime": 383.5261, "eval_samples_per_second": 11.222, "eval_steps_per_second": 1.403, "step": 3200 }, { "epoch": 0.5530668504479669, "grad_norm": 2.6868748664855957, "learning_rate": 1.8160981033762e-08, "logits/chosen": -2.945176362991333, "logits/rejected": -2.9206013679504395, "logps/chosen": -54.455101013183594, "logps/rejected": -53.40840530395508, "loss": 0.6912, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.005037089344114065, "rewards/margins": 0.004074054770171642, "rewards/rejected": -0.00911114551126957, "step": 3210 }, { "epoch": 0.554789800137836, "grad_norm": 2.1434881687164307, "learning_rate": 1.8143565309773743e-08, "logits/chosen": -2.9813790321350098, "logits/rejected": -2.9762043952941895, "logps/chosen": -51.000160217285156, "logps/rejected": -53.827186584472656, "loss": 0.6918, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.006563312374055386, "rewards/margins": 0.002880085725337267, "rewards/rejected": -0.009443397633731365, "step": 3220 }, { "epoch": 0.556512749827705, "grad_norm": 2.1510605812072754, "learning_rate": 1.812607594289735e-08, "logits/chosen": -3.036472797393799, "logits/rejected": -3.0197601318359375, "logps/chosen": -54.75006866455078, "logps/rejected": -57.2293815612793, "loss": 0.6919, "rewards/accuracies": 0.5625, "rewards/chosen": -0.005926917772740126, "rewards/margins": 0.0025798422284424305, "rewards/rejected": -0.008506760001182556, "step": 3230 }, { "epoch": 0.5582356995175741, "grad_norm": 2.268118143081665, "learning_rate": 1.8108513091290518e-08, "logits/chosen": -3.157724380493164, "logits/rejected": -3.1337904930114746, "logps/chosen": -56.884422302246094, "logps/rejected": -53.697410583496094, "loss": 0.6907, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.00647754967212677, "rewards/margins": 0.005060004070401192, "rewards/rejected": -0.011537553742527962, "step": 3240 }, { "epoch": 0.5599586492074431, "grad_norm": 2.060023546218872, "learning_rate": 1.8090876913775457e-08, "logits/chosen": -3.052263021469116, "logits/rejected": -3.0307984352111816, "logps/chosen": -53.035064697265625, "logps/rejected": -54.42890548706055, "loss": 0.6899, "rewards/accuracies": 0.65625, "rewards/chosen": -0.004625025205314159, "rewards/margins": 0.006578563246876001, "rewards/rejected": -0.011203588917851448, "step": 3250 }, { "epoch": 0.5616815988973122, "grad_norm": 2.4339516162872314, "learning_rate": 1.8073167569837484e-08, "logits/chosen": -3.063772678375244, "logits/rejected": -3.046740770339966, "logps/chosen": -52.22735595703125, "logps/rejected": -53.87421417236328, "loss": 0.691, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006919285748153925, "rewards/margins": 0.004433135036379099, "rewards/rejected": -0.011352420784533024, "step": 3260 }, { "epoch": 0.5634045485871813, "grad_norm": 2.4903886318206787, "learning_rate": 1.8055385219623555e-08, "logits/chosen": -3.120856523513794, "logits/rejected": -3.116936445236206, "logps/chosen": -55.896339416503906, "logps/rejected": -59.06333541870117, "loss": 0.6912, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.007667438592761755, "rewards/margins": 0.004086242523044348, "rewards/rejected": -0.011753681115806103, "step": 3270 }, { "epoch": 0.5651274982770503, "grad_norm": 2.4499690532684326, "learning_rate": 1.8037530023940842e-08, "logits/chosen": -3.127962112426758, "logits/rejected": -3.117267370223999, "logps/chosen": -50.12376022338867, "logps/rejected": -53.41807174682617, "loss": 0.6916, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0070305257104337215, "rewards/margins": 0.003172159194946289, "rewards/rejected": -0.010202684439718723, "step": 3280 }, { "epoch": 0.5668504479669194, "grad_norm": 2.1593992710113525, "learning_rate": 1.8019602144255244e-08, "logits/chosen": -3.1752045154571533, "logits/rejected": -3.143965005874634, "logps/chosen": -53.74822235107422, "logps/rejected": -54.3244743347168, "loss": 0.6911, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005819928366690874, "rewards/margins": 0.00419646967202425, "rewards/rejected": -0.010016398504376411, "step": 3290 }, { "epoch": 0.5685733976567884, "grad_norm": 2.6347358226776123, "learning_rate": 1.800160174268996e-08, "logits/chosen": -3.050291061401367, "logits/rejected": -3.027971029281616, "logps/chosen": -55.40729904174805, "logps/rejected": -54.66904830932617, "loss": 0.6902, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.004908127710223198, "rewards/margins": 0.006011998746544123, "rewards/rejected": -0.010920126922428608, "step": 3300 }, { "epoch": 0.5685733976567884, "eval_logits/chosen": -3.152574300765991, "eval_logits/rejected": -3.146979331970215, "eval_logps/chosen": -58.527687072753906, "eval_logps/rejected": -63.1528434753418, "eval_loss": 0.6923813223838806, "eval_rewards/accuracies": 0.5604089498519897, "eval_rewards/chosen": 0.0018420711858198047, "eval_rewards/margins": 0.001569233019836247, "eval_rewards/rejected": 0.0002728381659835577, "eval_runtime": 383.5849, "eval_samples_per_second": 11.22, "eval_steps_per_second": 1.403, "step": 3300 }, { "epoch": 0.5702963473466575, "grad_norm": 2.300382137298584, "learning_rate": 1.7983528982024008e-08, "logits/chosen": -2.99222731590271, "logits/rejected": -2.96380615234375, "logps/chosen": -53.65486526489258, "logps/rejected": -51.8136100769043, "loss": 0.6904, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.005560653749853373, "rewards/margins": 0.00558196846395731, "rewards/rejected": -0.01114262267947197, "step": 3310 }, { "epoch": 0.5720192970365265, "grad_norm": 2.6217761039733887, "learning_rate": 1.796538402569076e-08, "logits/chosen": -3.049356698989868, "logits/rejected": -3.013167142868042, "logps/chosen": -54.18232345581055, "logps/rejected": -50.1631965637207, "loss": 0.6895, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.004746898077428341, "rewards/margins": 0.007434196770191193, "rewards/rejected": -0.012181093916296959, "step": 3320 }, { "epoch": 0.5737422467263956, "grad_norm": 2.4825403690338135, "learning_rate": 1.7947167037776444e-08, "logits/chosen": -3.1506247520446777, "logits/rejected": -3.1185247898101807, "logps/chosen": -54.65381622314453, "logps/rejected": -54.71440887451172, "loss": 0.6902, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.004404108040034771, "rewards/margins": 0.005980129353702068, "rewards/rejected": -0.01038423739373684, "step": 3330 }, { "epoch": 0.5754651964162646, "grad_norm": 2.2614500522613525, "learning_rate": 1.792887818301869e-08, "logits/chosen": -3.0857772827148438, "logits/rejected": -3.0680103302001953, "logps/chosen": -56.436622619628906, "logps/rejected": -55.13303756713867, "loss": 0.6901, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.005234843119978905, "rewards/margins": 0.006301610264927149, "rewards/rejected": -0.01153645385056734, "step": 3340 }, { "epoch": 0.5771881461061337, "grad_norm": 2.3960185050964355, "learning_rate": 1.791051762680502e-08, "logits/chosen": -2.980429172515869, "logits/rejected": -2.9654033184051514, "logps/chosen": -56.40067672729492, "logps/rejected": -58.61570358276367, "loss": 0.6899, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.006220079492777586, "rewards/margins": 0.006694138050079346, "rewards/rejected": -0.01291421614587307, "step": 3350 }, { "epoch": 0.5789110957960028, "grad_norm": 2.283350944519043, "learning_rate": 1.789208553517135e-08, "logits/chosen": -3.103543758392334, "logits/rejected": -3.068631649017334, "logps/chosen": -55.334007263183594, "logps/rejected": -53.504661560058594, "loss": 0.6895, "rewards/accuracies": 0.625, "rewards/chosen": -0.005187752656638622, "rewards/margins": 0.0075442553497850895, "rewards/rejected": -0.012732008472084999, "step": 3360 }, { "epoch": 0.5806340454858718, "grad_norm": 2.334491491317749, "learning_rate": 1.7873582074800518e-08, "logits/chosen": -3.034996509552002, "logits/rejected": -3.0251402854919434, "logps/chosen": -51.59055709838867, "logps/rejected": -55.11620330810547, "loss": 0.6909, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.00635263929143548, "rewards/margins": 0.004672903101891279, "rewards/rejected": -0.01102554239332676, "step": 3370 }, { "epoch": 0.5823569951757409, "grad_norm": 2.041323184967041, "learning_rate": 1.785500741302073e-08, "logits/chosen": -3.1335296630859375, "logits/rejected": -3.1007940769195557, "logps/chosen": -55.99006271362305, "logps/rejected": -51.322906494140625, "loss": 0.69, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.005291844252496958, "rewards/margins": 0.006499829702079296, "rewards/rejected": -0.011791672557592392, "step": 3380 }, { "epoch": 0.5840799448656099, "grad_norm": 1.9655297994613647, "learning_rate": 1.7836361717804083e-08, "logits/chosen": -3.056028366088867, "logits/rejected": -3.0294790267944336, "logps/chosen": -54.32973098754883, "logps/rejected": -52.4959716796875, "loss": 0.6909, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.00602052453905344, "rewards/margins": 0.0046034911647439, "rewards/rejected": -0.010624016635119915, "step": 3390 }, { "epoch": 0.585802894555479, "grad_norm": 2.251234531402588, "learning_rate": 1.7817645157765035e-08, "logits/chosen": -3.109879970550537, "logits/rejected": -3.0702693462371826, "logps/chosen": -53.470672607421875, "logps/rejected": -52.68741989135742, "loss": 0.6898, "rewards/accuracies": 0.625, "rewards/chosen": -0.006139514502137899, "rewards/margins": 0.006888681091368198, "rewards/rejected": -0.01302819512784481, "step": 3400 }, { "epoch": 0.585802894555479, "eval_logits/chosen": -3.151777982711792, "eval_logits/rejected": -3.1461565494537354, "eval_logps/chosen": -58.51345443725586, "eval_logps/rejected": -63.1519660949707, "eval_loss": 0.69231778383255, "eval_rewards/accuracies": 0.5601765513420105, "eval_rewards/chosen": 0.001984409289434552, "eval_rewards/margins": 0.0017028645379468799, "eval_rewards/rejected": 0.0002815446350723505, "eval_runtime": 383.5062, "eval_samples_per_second": 11.223, "eval_steps_per_second": 1.403, "step": 3400 }, { "epoch": 0.587525844245348, "grad_norm": 2.122176170349121, "learning_rate": 1.7798857902158887e-08, "logits/chosen": -2.9969916343688965, "logits/rejected": -2.967726230621338, "logps/chosen": -50.218971252441406, "logps/rejected": -48.38374710083008, "loss": 0.6899, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.006457244511693716, "rewards/margins": 0.006697861943393946, "rewards/rejected": -0.013155105523765087, "step": 3410 }, { "epoch": 0.5892487939352171, "grad_norm": 2.6450655460357666, "learning_rate": 1.7780000120880232e-08, "logits/chosen": -3.017392873764038, "logits/rejected": -2.9929404258728027, "logps/chosen": -53.33781051635742, "logps/rejected": -54.46907424926758, "loss": 0.6902, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.005704882554709911, "rewards/margins": 0.005997374653816223, "rewards/rejected": -0.01170225627720356, "step": 3420 }, { "epoch": 0.5909717436250862, "grad_norm": 1.989274024963379, "learning_rate": 1.7761071984461438e-08, "logits/chosen": -3.1018154621124268, "logits/rejected": -3.081432342529297, "logps/chosen": -51.916282653808594, "logps/rejected": -55.77009963989258, "loss": 0.6904, "rewards/accuracies": 0.65625, "rewards/chosen": -0.005753933917731047, "rewards/margins": 0.005630741361528635, "rewards/rejected": -0.011384674347937107, "step": 3430 }, { "epoch": 0.5926946933149552, "grad_norm": 2.477759838104248, "learning_rate": 1.7742073664071095e-08, "logits/chosen": -3.0237717628479004, "logits/rejected": -3.0075230598449707, "logps/chosen": -53.5707893371582, "logps/rejected": -54.196929931640625, "loss": 0.6912, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0099791893735528, "rewards/margins": 0.003979781176894903, "rewards/rejected": -0.01395897101610899, "step": 3440 }, { "epoch": 0.5944176430048242, "grad_norm": 2.5092668533325195, "learning_rate": 1.772300533151249e-08, "logits/chosen": -3.272890090942383, "logits/rejected": -3.2318217754364014, "logps/chosen": -59.927818298339844, "logps/rejected": -56.14251708984375, "loss": 0.69, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.00548345223069191, "rewards/margins": 0.006471781525760889, "rewards/rejected": -0.01195523515343666, "step": 3450 }, { "epoch": 0.5961405926946933, "grad_norm": 2.2644803524017334, "learning_rate": 1.7703867159222012e-08, "logits/chosen": -3.0503110885620117, "logits/rejected": -3.037585496902466, "logps/chosen": -52.72737503051758, "logps/rejected": -54.55059051513672, "loss": 0.6912, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.005479804240167141, "rewards/margins": 0.004024847410619259, "rewards/rejected": -0.009504652582108974, "step": 3460 }, { "epoch": 0.5978635423845624, "grad_norm": 1.9865294694900513, "learning_rate": 1.768465932026763e-08, "logits/chosen": -3.1293208599090576, "logits/rejected": -3.1130614280700684, "logps/chosen": -56.37324905395508, "logps/rejected": -55.44392013549805, "loss": 0.6905, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.005630441941320896, "rewards/margins": 0.005450661294162273, "rewards/rejected": -0.011081104166805744, "step": 3470 }, { "epoch": 0.5995864920744314, "grad_norm": 2.379979133605957, "learning_rate": 1.766538198834731e-08, "logits/chosen": -3.0560507774353027, "logits/rejected": -3.0245845317840576, "logps/chosen": -55.585777282714844, "logps/rejected": -53.092987060546875, "loss": 0.6895, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.004711954854428768, "rewards/margins": 0.007386787328869104, "rewards/rejected": -0.01209874078631401, "step": 3480 }, { "epoch": 0.6013094417643005, "grad_norm": 2.4309921264648438, "learning_rate": 1.7646035337787454e-08, "logits/chosen": -3.0650832653045654, "logits/rejected": -3.0358080863952637, "logps/chosen": -54.878990173339844, "logps/rejected": -55.72333908081055, "loss": 0.6903, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.006096069701015949, "rewards/margins": 0.005908667109906673, "rewards/rejected": -0.012004735879600048, "step": 3490 }, { "epoch": 0.6030323914541695, "grad_norm": 2.2524032592773438, "learning_rate": 1.7626619543541304e-08, "logits/chosen": -3.01064395904541, "logits/rejected": -3.0026509761810303, "logps/chosen": -52.7326774597168, "logps/rejected": -56.00465774536133, "loss": 0.6902, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0072999633848667145, "rewards/margins": 0.0061295004561543465, "rewards/rejected": -0.01342946570366621, "step": 3500 }, { "epoch": 0.6030323914541695, "eval_logits/chosen": -3.1511001586914062, "eval_logits/rejected": -3.145463228225708, "eval_logps/chosen": -58.5220947265625, "eval_logps/rejected": -63.16736602783203, "eval_loss": 0.6922861337661743, "eval_rewards/accuracies": 0.5532063245773315, "eval_rewards/chosen": 0.001898013986647129, "eval_rewards/margins": 0.0017704860074445605, "eval_rewards/rejected": 0.00012752779002767056, "eval_runtime": 383.5783, "eval_samples_per_second": 11.221, "eval_steps_per_second": 1.403, "step": 3500 }, { "epoch": 0.6047553411440386, "grad_norm": 2.344438076019287, "learning_rate": 1.760713478118739e-08, "logits/chosen": -2.9544014930725098, "logits/rejected": -2.928922414779663, "logps/chosen": -54.55061721801758, "logps/rejected": -52.584266662597656, "loss": 0.6909, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.00983361341059208, "rewards/margins": 0.004564939998090267, "rewards/rejected": -0.014398554340004921, "step": 3510 }, { "epoch": 0.6064782908339077, "grad_norm": 2.2484140396118164, "learning_rate": 1.758758122692791e-08, "logits/chosen": -2.988053321838379, "logits/rejected": -2.9679315090179443, "logps/chosen": -56.056297302246094, "logps/rejected": -53.2197265625, "loss": 0.6911, "rewards/accuracies": 0.625, "rewards/chosen": -0.007219976745545864, "rewards/margins": 0.004322125110775232, "rewards/rejected": -0.011542101390659809, "step": 3520 }, { "epoch": 0.6082012405237767, "grad_norm": 2.396355152130127, "learning_rate": 1.756795905758717e-08, "logits/chosen": -3.1577303409576416, "logits/rejected": -3.1141624450683594, "logps/chosen": -56.49724578857422, "logps/rejected": -52.197303771972656, "loss": 0.6895, "rewards/accuracies": 0.65625, "rewards/chosen": -0.005397644359618425, "rewards/margins": 0.007435487117618322, "rewards/rejected": -0.012833130545914173, "step": 3530 }, { "epoch": 0.6099241902136457, "grad_norm": 2.5322744846343994, "learning_rate": 1.754826845060995e-08, "logits/chosen": -2.97660493850708, "logits/rejected": -2.951645851135254, "logps/chosen": -54.54118728637695, "logps/rejected": -54.915924072265625, "loss": 0.6896, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006284528411924839, "rewards/margins": 0.007204174064099789, "rewards/rejected": -0.013488702476024628, "step": 3540 }, { "epoch": 0.6116471399035148, "grad_norm": 2.43735671043396, "learning_rate": 1.752850958405993e-08, "logits/chosen": -3.067127227783203, "logits/rejected": -3.0576090812683105, "logps/chosen": -55.544654846191406, "logps/rejected": -53.7586784362793, "loss": 0.6921, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.009815131314098835, "rewards/margins": 0.0022593550384044647, "rewards/rejected": -0.0120744863525033, "step": 3550 }, { "epoch": 0.6133700895933839, "grad_norm": 2.1326327323913574, "learning_rate": 1.7508682636618058e-08, "logits/chosen": -3.0019993782043457, "logits/rejected": -2.9740567207336426, "logps/chosen": -53.49755096435547, "logps/rejected": -53.9032096862793, "loss": 0.6894, "rewards/accuracies": 0.65625, "rewards/chosen": -0.005975979380309582, "rewards/margins": 0.007574207149446011, "rewards/rejected": -0.013550187461078167, "step": 3560 }, { "epoch": 0.6150930392832529, "grad_norm": 2.697509288787842, "learning_rate": 1.7488787787580952e-08, "logits/chosen": -3.1488282680511475, "logits/rejected": -3.113802194595337, "logps/chosen": -57.558563232421875, "logps/rejected": -54.47620391845703, "loss": 0.6904, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.006858462933450937, "rewards/margins": 0.005681387148797512, "rewards/rejected": -0.012539848685264587, "step": 3570 }, { "epoch": 0.616815988973122, "grad_norm": 2.5877275466918945, "learning_rate": 1.746882521685926e-08, "logits/chosen": -3.0800981521606445, "logits/rejected": -3.0567736625671387, "logps/chosen": -58.673004150390625, "logps/rejected": -58.826385498046875, "loss": 0.6888, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.004782065749168396, "rewards/margins": 0.008766787126660347, "rewards/rejected": -0.013548852875828743, "step": 3580 }, { "epoch": 0.618538938662991, "grad_norm": 2.1750683784484863, "learning_rate": 1.7448795104976046e-08, "logits/chosen": -3.068450450897217, "logits/rejected": -3.0522539615631104, "logps/chosen": -55.2479133605957, "logps/rejected": -55.32890701293945, "loss": 0.6906, "rewards/accuracies": 0.625, "rewards/chosen": -0.008229101076722145, "rewards/margins": 0.005200072657316923, "rewards/rejected": -0.013429174199700356, "step": 3590 }, { "epoch": 0.6202618883528601, "grad_norm": 2.4608330726623535, "learning_rate": 1.7428697633065155e-08, "logits/chosen": -3.0483107566833496, "logits/rejected": -3.0238070487976074, "logps/chosen": -55.4265251159668, "logps/rejected": -52.878150939941406, "loss": 0.6905, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0077946423552930355, "rewards/margins": 0.005378765985369682, "rewards/rejected": -0.013173406943678856, "step": 3600 }, { "epoch": 0.6202618883528601, "eval_logits/chosen": -3.15022611618042, "eval_logits/rejected": -3.144578218460083, "eval_logps/chosen": -58.52936935424805, "eval_logps/rejected": -63.18168258666992, "eval_loss": 0.6922528743743896, "eval_rewards/accuracies": 0.5697026252746582, "eval_rewards/chosen": 0.0018253130838274956, "eval_rewards/margins": 0.0018409350886940956, "eval_rewards/rejected": -1.5622024875483476e-05, "eval_runtime": 383.6418, "eval_samples_per_second": 11.219, "eval_steps_per_second": 1.402, "step": 3600 }, { "epoch": 0.6219848380427292, "grad_norm": 1.8933156728744507, "learning_rate": 1.7408532982869573e-08, "logits/chosen": -3.015439510345459, "logits/rejected": -2.9991238117218018, "logps/chosen": -52.52251052856445, "logps/rejected": -50.87234115600586, "loss": 0.6912, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.01029304601252079, "rewards/margins": 0.004010824952274561, "rewards/rejected": -0.014303868636488914, "step": 3610 }, { "epoch": 0.6237077877325982, "grad_norm": 2.2738919258117676, "learning_rate": 1.7388301336739784e-08, "logits/chosen": -2.9741413593292236, "logits/rejected": -2.9491591453552246, "logps/chosen": -55.929954528808594, "logps/rejected": -54.73632049560547, "loss": 0.6897, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.004220867063850164, "rewards/margins": 0.0071715391241014, "rewards/rejected": -0.011392408050596714, "step": 3620 }, { "epoch": 0.6254307374224672, "grad_norm": 2.115520715713501, "learning_rate": 1.736800287763212e-08, "logits/chosen": -3.1313865184783936, "logits/rejected": -3.1247239112854004, "logps/chosen": -51.954750061035156, "logps/rejected": -57.22846221923828, "loss": 0.6896, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.008140425197780132, "rewards/margins": 0.007292865309864283, "rewards/rejected": -0.015433289110660553, "step": 3630 }, { "epoch": 0.6271536871123363, "grad_norm": 2.4922537803649902, "learning_rate": 1.7347637789107115e-08, "logits/chosen": -3.081291437149048, "logits/rejected": -3.0664103031158447, "logps/chosen": -55.62919998168945, "logps/rejected": -57.412513732910156, "loss": 0.69, "rewards/accuracies": 0.625, "rewards/chosen": -0.005860840901732445, "rewards/margins": 0.0064689526334404945, "rewards/rejected": -0.01232979353517294, "step": 3640 }, { "epoch": 0.6288766368022054, "grad_norm": 1.9521703720092773, "learning_rate": 1.7327206255327825e-08, "logits/chosen": -3.031919002532959, "logits/rejected": -3.0375239849090576, "logps/chosen": -51.86109161376953, "logps/rejected": -55.51893997192383, "loss": 0.6922, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0077956244349479675, "rewards/margins": 0.0020641677547246218, "rewards/rejected": -0.00985979288816452, "step": 3650 }, { "epoch": 0.6305995864920745, "grad_norm": 2.2140133380889893, "learning_rate": 1.730670846105819e-08, "logits/chosen": -3.091434955596924, "logits/rejected": -3.0613198280334473, "logps/chosen": -54.4542236328125, "logps/rejected": -54.953651428222656, "loss": 0.6891, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.005954755935817957, "rewards/margins": 0.008188659325242043, "rewards/rejected": -0.014143416658043861, "step": 3660 }, { "epoch": 0.6323225361819435, "grad_norm": 2.279637336730957, "learning_rate": 1.7286144591661338e-08, "logits/chosen": -3.048067569732666, "logits/rejected": -3.019761562347412, "logps/chosen": -53.457313537597656, "logps/rejected": -53.82349395751953, "loss": 0.6899, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.006061294116079807, "rewards/margins": 0.006697321776300669, "rewards/rejected": -0.012758615426719189, "step": 3670 }, { "epoch": 0.6340454858718125, "grad_norm": 2.239985704421997, "learning_rate": 1.7265514833097923e-08, "logits/chosen": -3.0665507316589355, "logits/rejected": -3.0252902507781982, "logps/chosen": -55.358665466308594, "logps/rejected": -51.566871643066406, "loss": 0.6893, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0068284133449196815, "rewards/margins": 0.007831481285393238, "rewards/rejected": -0.01465989463031292, "step": 3680 }, { "epoch": 0.6357684355616816, "grad_norm": 2.6258862018585205, "learning_rate": 1.724481937192444e-08, "logits/chosen": -3.000847339630127, "logits/rejected": -3.009695529937744, "logps/chosen": -52.84056854248047, "logps/rejected": -60.19794464111328, "loss": 0.6915, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.007468428462743759, "rewards/margins": 0.0035681980662047863, "rewards/rejected": -0.011036626063287258, "step": 3690 }, { "epoch": 0.6374913852515507, "grad_norm": 2.4174911975860596, "learning_rate": 1.7224058395291544e-08, "logits/chosen": -3.04533052444458, "logits/rejected": -3.0184431076049805, "logps/chosen": -58.0278205871582, "logps/rejected": -60.93510055541992, "loss": 0.6877, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0043028187938034534, "rewards/margins": 0.011179441586136818, "rewards/rejected": -0.015482261776924133, "step": 3700 }, { "epoch": 0.6374913852515507, "eval_logits/chosen": -3.149414539337158, "eval_logits/rejected": -3.1437571048736572, "eval_logps/chosen": -58.51812744140625, "eval_logps/rejected": -63.18494415283203, "eval_loss": 0.6921834945678711, "eval_rewards/accuracies": 0.574117124080658, "eval_rewards/chosen": 0.0019376871641725302, "eval_rewards/margins": 0.0019858963787555695, "eval_rewards/rejected": -4.820928006665781e-05, "eval_runtime": 383.5204, "eval_samples_per_second": 11.222, "eval_steps_per_second": 1.403, "step": 3700 }, { "epoch": 0.6392143349414197, "grad_norm": 2.4837942123413086, "learning_rate": 1.7203232090942337e-08, "logits/chosen": -3.080009937286377, "logits/rejected": -3.046736240386963, "logps/chosen": -54.5723762512207, "logps/rejected": -52.51799392700195, "loss": 0.6893, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.007036884780973196, "rewards/margins": 0.007810269482433796, "rewards/rejected": -0.01484715472906828, "step": 3710 }, { "epoch": 0.6409372846312887, "grad_norm": 2.339407205581665, "learning_rate": 1.7182340647210696e-08, "logits/chosen": -3.0932888984680176, "logits/rejected": -3.0756850242614746, "logps/chosen": -52.47334671020508, "logps/rejected": -54.8570556640625, "loss": 0.6888, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.005765864159911871, "rewards/margins": 0.008951535448431969, "rewards/rejected": -0.014717401936650276, "step": 3720 }, { "epoch": 0.6426602343211578, "grad_norm": 2.783074378967285, "learning_rate": 1.7161384253019558e-08, "logits/chosen": -3.0552356243133545, "logits/rejected": -3.0320777893066406, "logps/chosen": -55.36650848388672, "logps/rejected": -52.16785430908203, "loss": 0.6902, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.008488858118653297, "rewards/margins": 0.0061333938501775265, "rewards/rejected": -0.014622251503169537, "step": 3730 }, { "epoch": 0.6443831840110269, "grad_norm": 2.47733473777771, "learning_rate": 1.7140363097879206e-08, "logits/chosen": -3.0521280765533447, "logits/rejected": -3.0277328491210938, "logps/chosen": -54.155914306640625, "logps/rejected": -57.747825622558594, "loss": 0.6902, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.007208681665360928, "rewards/margins": 0.006174913141876459, "rewards/rejected": -0.0133835943415761, "step": 3740 }, { "epoch": 0.646106133700896, "grad_norm": 2.5525078773498535, "learning_rate": 1.7119277371885565e-08, "logits/chosen": -3.1250386238098145, "logits/rejected": -3.096428394317627, "logps/chosen": -53.9883918762207, "logps/rejected": -54.46564865112305, "loss": 0.6897, "rewards/accuracies": 0.625, "rewards/chosen": -0.006697263568639755, "rewards/margins": 0.007167118135839701, "rewards/rejected": -0.013864380307495594, "step": 3750 }, { "epoch": 0.647829083390765, "grad_norm": 2.3697919845581055, "learning_rate": 1.709812726571848e-08, "logits/chosen": -3.0818190574645996, "logits/rejected": -3.078629732131958, "logps/chosen": -56.815521240234375, "logps/rejected": -57.3564338684082, "loss": 0.6915, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.009418672882020473, "rewards/margins": 0.0035493075847625732, "rewards/rejected": -0.012967979535460472, "step": 3760 }, { "epoch": 0.649552033080634, "grad_norm": 2.4745736122131348, "learning_rate": 1.707691297063999e-08, "logits/chosen": -3.04585337638855, "logits/rejected": -3.025336742401123, "logps/chosen": -54.28108596801758, "logps/rejected": -55.88178253173828, "loss": 0.6903, "rewards/accuracies": 0.625, "rewards/chosen": -0.007393890526145697, "rewards/margins": 0.00592414103448391, "rewards/rejected": -0.013318032026290894, "step": 3770 }, { "epoch": 0.6512749827705031, "grad_norm": 2.3536436557769775, "learning_rate": 1.7055634678492594e-08, "logits/chosen": -3.0982823371887207, "logits/rejected": -3.058579206466675, "logps/chosen": -54.78825759887695, "logps/rejected": -50.85487365722656, "loss": 0.6895, "rewards/accuracies": 0.625, "rewards/chosen": -0.007096471730619669, "rewards/margins": 0.007441540714353323, "rewards/rejected": -0.014538010582327843, "step": 3780 }, { "epoch": 0.6529979324603722, "grad_norm": 2.3600430488586426, "learning_rate": 1.7034292581697533e-08, "logits/chosen": -3.0493836402893066, "logits/rejected": -3.019918203353882, "logps/chosen": -54.718711853027344, "logps/rejected": -52.50199508666992, "loss": 0.6893, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.007305916398763657, "rewards/margins": 0.007807619869709015, "rewards/rejected": -0.015113537199795246, "step": 3790 }, { "epoch": 0.6547208821502413, "grad_norm": 2.946199893951416, "learning_rate": 1.701288687325303e-08, "logits/chosen": -3.066213369369507, "logits/rejected": -3.033952236175537, "logps/chosen": -58.41338348388672, "logps/rejected": -54.03316116333008, "loss": 0.691, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007201158907264471, "rewards/margins": 0.0044540828093886375, "rewards/rejected": -0.01165524311363697, "step": 3800 }, { "epoch": 0.6547208821502413, "eval_logits/chosen": -3.1486449241638184, "eval_logits/rejected": -3.143003225326538, "eval_logps/chosen": -58.519344329833984, "eval_logps/rejected": -63.19419479370117, "eval_loss": 0.6921459436416626, "eval_rewards/accuracies": 0.5676115155220032, "eval_rewards/chosen": 0.0019254968501627445, "eval_rewards/margins": 0.0020662089809775352, "eval_rewards/rejected": -0.0001407122181262821, "eval_runtime": 383.5365, "eval_samples_per_second": 11.222, "eval_steps_per_second": 1.403, "step": 3800 }, { "epoch": 0.6564438318401102, "grad_norm": 2.0473248958587646, "learning_rate": 1.699141774673255e-08, "logits/chosen": -3.0938503742218018, "logits/rejected": -3.0491273403167725, "logps/chosen": -58.0572509765625, "logps/rejected": -53.581703186035156, "loss": 0.6886, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.005891457665711641, "rewards/margins": 0.009286518208682537, "rewards/rejected": -0.015177974477410316, "step": 3810 }, { "epoch": 0.6581667815299793, "grad_norm": 2.378174304962158, "learning_rate": 1.696988539628306e-08, "logits/chosen": -3.0598483085632324, "logits/rejected": -3.0423831939697266, "logps/chosen": -54.69315719604492, "logps/rejected": -54.343544006347656, "loss": 0.6899, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.00594948697835207, "rewards/margins": 0.006609291769564152, "rewards/rejected": -0.012558777816593647, "step": 3820 }, { "epoch": 0.6598897312198484, "grad_norm": 2.2672643661499023, "learning_rate": 1.6948290016623267e-08, "logits/chosen": -3.106572151184082, "logits/rejected": -3.0605735778808594, "logps/chosen": -59.068077087402344, "logps/rejected": -52.881927490234375, "loss": 0.6876, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.003909780643880367, "rewards/margins": 0.011306321248412132, "rewards/rejected": -0.0152161018922925, "step": 3830 }, { "epoch": 0.6616126809097175, "grad_norm": 2.378580331802368, "learning_rate": 1.6926631803041846e-08, "logits/chosen": -3.029109477996826, "logits/rejected": -3.015733242034912, "logps/chosen": -54.55817413330078, "logps/rejected": -60.2359619140625, "loss": 0.6896, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.004717822652310133, "rewards/margins": 0.0073250653222203255, "rewards/rejected": -0.01204288937151432, "step": 3840 }, { "epoch": 0.6633356305995864, "grad_norm": 2.3673667907714844, "learning_rate": 1.690491095139569e-08, "logits/chosen": -3.157423496246338, "logits/rejected": -3.1420907974243164, "logps/chosen": -58.070655822753906, "logps/rejected": -54.941322326660156, "loss": 0.6912, "rewards/accuracies": 0.5625, "rewards/chosen": -0.007695481181144714, "rewards/margins": 0.00405865628272295, "rewards/rejected": -0.011754137463867664, "step": 3850 }, { "epoch": 0.6650585802894555, "grad_norm": 2.7017436027526855, "learning_rate": 1.688312765810814e-08, "logits/chosen": -3.1684272289276123, "logits/rejected": -3.1415889263153076, "logps/chosen": -55.018394470214844, "logps/rejected": -54.544158935546875, "loss": 0.6894, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.008379653096199036, "rewards/margins": 0.0076343291439116, "rewards/rejected": -0.01601398177444935, "step": 3860 }, { "epoch": 0.6667815299793246, "grad_norm": 3.0593020915985107, "learning_rate": 1.6861282120167186e-08, "logits/chosen": -3.058176040649414, "logits/rejected": -3.0332953929901123, "logps/chosen": -59.28391647338867, "logps/rejected": -56.78931427001953, "loss": 0.6898, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.004310892894864082, "rewards/margins": 0.006942380219697952, "rewards/rejected": -0.011253273114562035, "step": 3870 }, { "epoch": 0.6685044796691937, "grad_norm": 2.492913246154785, "learning_rate": 1.6839374535123718e-08, "logits/chosen": -3.087202548980713, "logits/rejected": -3.0917575359344482, "logps/chosen": -52.11659622192383, "logps/rejected": -54.79553985595703, "loss": 0.6929, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.012375886552035809, "rewards/margins": 0.0006632144213654101, "rewards/rejected": -0.013039101846516132, "step": 3880 }, { "epoch": 0.6702274293590628, "grad_norm": 2.2675533294677734, "learning_rate": 1.6817405101089707e-08, "logits/chosen": -3.136599063873291, "logits/rejected": -3.108509063720703, "logps/chosen": -54.788795471191406, "logps/rejected": -53.5931510925293, "loss": 0.689, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.006023922935128212, "rewards/margins": 0.008436523377895355, "rewards/rejected": -0.014460447244346142, "step": 3890 }, { "epoch": 0.6719503790489317, "grad_norm": 2.471661329269409, "learning_rate": 1.679537401673644e-08, "logits/chosen": -3.106663227081299, "logits/rejected": -3.050253391265869, "logps/chosen": -57.400482177734375, "logps/rejected": -51.8661003112793, "loss": 0.6881, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.005719014443457127, "rewards/margins": 0.010319101624190807, "rewards/rejected": -0.016038116067647934, "step": 3900 }, { "epoch": 0.6719503790489317, "eval_logits/chosen": -3.1476454734802246, "eval_logits/rejected": -3.1420326232910156, "eval_logps/chosen": -58.53263473510742, "eval_logps/rejected": -63.2182502746582, "eval_loss": 0.6920942068099976, "eval_rewards/accuracies": 0.5638940334320068, "eval_rewards/chosen": 0.0017926108557730913, "eval_rewards/margins": 0.002173854038119316, "eval_rewards/rejected": -0.000381243386073038, "eval_runtime": 383.5692, "eval_samples_per_second": 11.221, "eval_steps_per_second": 1.403, "step": 3900 }, { "epoch": 0.6736733287388008, "grad_norm": 2.3394501209259033, "learning_rate": 1.6773281481292708e-08, "logits/chosen": -3.083597421646118, "logits/rejected": -3.067528009414673, "logps/chosen": -56.4756965637207, "logps/rejected": -56.2137336730957, "loss": 0.6894, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.00834328681230545, "rewards/margins": 0.007735086139291525, "rewards/rejected": -0.016078371554613113, "step": 3910 }, { "epoch": 0.6753962784286699, "grad_norm": 2.520785331726074, "learning_rate": 1.6751127694543012e-08, "logits/chosen": -3.1565117835998535, "logits/rejected": -3.1098198890686035, "logps/chosen": -59.21630096435547, "logps/rejected": -51.98161697387695, "loss": 0.6884, "rewards/accuracies": 0.625, "rewards/chosen": -0.005468897521495819, "rewards/margins": 0.009867525659501553, "rewards/rejected": -0.015336424112319946, "step": 3920 }, { "epoch": 0.677119228118539, "grad_norm": 2.314502716064453, "learning_rate": 1.6728912856825752e-08, "logits/chosen": -3.1580872535705566, "logits/rejected": -3.1532976627349854, "logps/chosen": -52.03582000732422, "logps/rejected": -54.057891845703125, "loss": 0.6907, "rewards/accuracies": 0.5625, "rewards/chosen": -0.007780077867209911, "rewards/margins": 0.004977663513273001, "rewards/rejected": -0.012757742777466774, "step": 3930 }, { "epoch": 0.6788421778084079, "grad_norm": 2.5148684978485107, "learning_rate": 1.6706637169031412e-08, "logits/chosen": -2.9767425060272217, "logits/rejected": -2.9571597576141357, "logps/chosen": -54.011024475097656, "logps/rejected": -55.11913299560547, "loss": 0.6887, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.005110305733978748, "rewards/margins": 0.00910019502043724, "rewards/rejected": -0.014210501685738564, "step": 3940 }, { "epoch": 0.680565127498277, "grad_norm": 2.334064483642578, "learning_rate": 1.6684300832600752e-08, "logits/chosen": -2.980774164199829, "logits/rejected": -2.9327759742736816, "logps/chosen": -56.086326599121094, "logps/rejected": -52.7585563659668, "loss": 0.6872, "rewards/accuracies": 0.71875, "rewards/chosen": -0.006129746790975332, "rewards/margins": 0.011988668702542782, "rewards/rejected": -0.018118415027856827, "step": 3950 }, { "epoch": 0.6822880771881461, "grad_norm": 2.4945201873779297, "learning_rate": 1.6661904049522985e-08, "logits/chosen": -3.0870680809020996, "logits/rejected": -3.089069128036499, "logps/chosen": -53.5789794921875, "logps/rejected": -60.57684326171875, "loss": 0.6917, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.008758168667554855, "rewards/margins": 0.0030608586966991425, "rewards/rejected": -0.011819026432931423, "step": 3960 }, { "epoch": 0.6840110268780152, "grad_norm": 2.3911170959472656, "learning_rate": 1.663944702233395e-08, "logits/chosen": -3.0987579822540283, "logits/rejected": -3.0742454528808594, "logps/chosen": -54.832969665527344, "logps/rejected": -54.51105880737305, "loss": 0.6895, "rewards/accuracies": 0.625, "rewards/chosen": -0.007600725628435612, "rewards/margins": 0.007472677621990442, "rewards/rejected": -0.015073401853442192, "step": 3970 }, { "epoch": 0.6857339765678843, "grad_norm": 2.2145421504974365, "learning_rate": 1.6616929954114263e-08, "logits/chosen": -2.976238965988159, "logits/rejected": -2.9565927982330322, "logps/chosen": -54.43210983276367, "logps/rejected": -54.672218322753906, "loss": 0.6913, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.010925976559519768, "rewards/margins": 0.003819962264969945, "rewards/rejected": -0.014745938591659069, "step": 3980 }, { "epoch": 0.6874569262577532, "grad_norm": 2.3381612300872803, "learning_rate": 1.659435304848751e-08, "logits/chosen": -3.0916008949279785, "logits/rejected": -3.0771727561950684, "logps/chosen": -53.97601318359375, "logps/rejected": -56.700538635253906, "loss": 0.6899, "rewards/accuracies": 0.625, "rewards/chosen": -0.007593357469886541, "rewards/margins": 0.0067450194619596004, "rewards/rejected": -0.014338378794491291, "step": 3990 }, { "epoch": 0.6891798759476223, "grad_norm": 2.3654754161834717, "learning_rate": 1.6571716509618385e-08, "logits/chosen": -3.156153440475464, "logits/rejected": -3.1207895278930664, "logps/chosen": -57.9157600402832, "logps/rejected": -54.3126335144043, "loss": 0.6891, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0071152858436107635, "rewards/margins": 0.008385889232158661, "rewards/rejected": -0.015501176007091999, "step": 4000 }, { "epoch": 0.6891798759476223, "eval_logits/chosen": -3.1464505195617676, "eval_logits/rejected": -3.140815258026123, "eval_logps/chosen": -58.53483200073242, "eval_logps/rejected": -63.2358283996582, "eval_loss": 0.6920202374458313, "eval_rewards/accuracies": 0.5727230310440063, "eval_rewards/chosen": 0.0017705905484035611, "eval_rewards/margins": 0.0023276114370673895, "eval_rewards/rejected": -0.000557021121494472, "eval_runtime": 383.5374, "eval_samples_per_second": 11.222, "eval_steps_per_second": 1.403, "step": 4000 }, { "epoch": 0.6909028256374914, "grad_norm": 2.257662773132324, "learning_rate": 1.6549020542210858e-08, "logits/chosen": -3.0420236587524414, "logits/rejected": -3.0328893661499023, "logps/chosen": -48.96164321899414, "logps/rejected": -54.50152587890625, "loss": 0.6905, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.009422359988093376, "rewards/margins": 0.005418227985501289, "rewards/rejected": -0.014840586110949516, "step": 4010 }, { "epoch": 0.6926257753273605, "grad_norm": 2.3167202472686768, "learning_rate": 1.6526265351506302e-08, "logits/chosen": -3.0502841472625732, "logits/rejected": -3.0362343788146973, "logps/chosen": -53.02349853515625, "logps/rejected": -54.821495056152344, "loss": 0.689, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.008379830047488213, "rewards/margins": 0.008577833883464336, "rewards/rejected": -0.016957664862275124, "step": 4020 }, { "epoch": 0.6943487250172296, "grad_norm": 2.379713535308838, "learning_rate": 1.6503451143281665e-08, "logits/chosen": -3.0468204021453857, "logits/rejected": -3.0248570442199707, "logps/chosen": -56.70612335205078, "logps/rejected": -57.12060546875, "loss": 0.6899, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.006521374918520451, "rewards/margins": 0.006764276418834925, "rewards/rejected": -0.013285649940371513, "step": 4030 }, { "epoch": 0.6960716747070985, "grad_norm": 2.7433881759643555, "learning_rate": 1.6480578123847584e-08, "logits/chosen": -3.1389083862304688, "logits/rejected": -3.126983642578125, "logps/chosen": -57.09613800048828, "logps/rejected": -55.38570022583008, "loss": 0.6911, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009039239026606083, "rewards/margins": 0.00432856660336256, "rewards/rejected": -0.013367804698646069, "step": 4040 }, { "epoch": 0.6977946243969676, "grad_norm": 2.4082491397857666, "learning_rate": 1.6457646500046536e-08, "logits/chosen": -3.0439090728759766, "logits/rejected": -3.026900053024292, "logps/chosen": -51.30855178833008, "logps/rejected": -54.536094665527344, "loss": 0.6884, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007431273814290762, "rewards/margins": 0.009696152061223984, "rewards/rejected": -0.017127424478530884, "step": 4050 }, { "epoch": 0.6995175740868367, "grad_norm": 2.4375593662261963, "learning_rate": 1.643465647925096e-08, "logits/chosen": -3.0453343391418457, "logits/rejected": -3.0121757984161377, "logps/chosen": -57.543304443359375, "logps/rejected": -54.09287643432617, "loss": 0.6895, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.011836276389658451, "rewards/margins": 0.007690160069614649, "rewards/rejected": -0.01952643319964409, "step": 4060 }, { "epoch": 0.7012405237767058, "grad_norm": 2.122170925140381, "learning_rate": 1.6411608269361393e-08, "logits/chosen": -3.05137300491333, "logits/rejected": -3.028947353363037, "logps/chosen": -55.06595993041992, "logps/rejected": -54.700538635253906, "loss": 0.6903, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.012340652756392956, "rewards/margins": 0.005884417332708836, "rewards/rejected": -0.01822507008910179, "step": 4070 }, { "epoch": 0.7029634734665747, "grad_norm": 2.809166669845581, "learning_rate": 1.638850207880456e-08, "logits/chosen": -3.0260517597198486, "logits/rejected": -3.018556594848633, "logps/chosen": -52.71803665161133, "logps/rejected": -55.4732666015625, "loss": 0.6901, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.008182106539607048, "rewards/margins": 0.006329337600618601, "rewards/rejected": -0.014511443674564362, "step": 4080 }, { "epoch": 0.7046864231564438, "grad_norm": 2.3355367183685303, "learning_rate": 1.6365338116531524e-08, "logits/chosen": -3.100362777709961, "logits/rejected": -3.0696163177490234, "logps/chosen": -56.38579177856445, "logps/rejected": -56.2817497253418, "loss": 0.689, "rewards/accuracies": 0.65625, "rewards/chosen": -0.009085236117243767, "rewards/margins": 0.008473975583910942, "rewards/rejected": -0.01755921170115471, "step": 4090 }, { "epoch": 0.7064093728463129, "grad_norm": 2.1862552165985107, "learning_rate": 1.6342116592015784e-08, "logits/chosen": -3.0656888484954834, "logits/rejected": -3.036451816558838, "logps/chosen": -53.445716857910156, "logps/rejected": -50.91322708129883, "loss": 0.688, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0060226572677493095, "rewards/margins": 0.010662797838449478, "rewards/rejected": -0.016685456037521362, "step": 4100 }, { "epoch": 0.7064093728463129, "eval_logits/chosen": -3.1452784538269043, "eval_logits/rejected": -3.139664888381958, "eval_logps/chosen": -58.53339767456055, "eval_logps/rejected": -63.248939514160156, "eval_loss": 0.6919506788253784, "eval_rewards/accuracies": 0.5694702863693237, "eval_rewards/chosen": 0.0017849754076451063, "eval_rewards/margins": 0.0024732158053666353, "eval_rewards/rejected": -0.0006882402813062072, "eval_runtime": 383.3008, "eval_samples_per_second": 11.229, "eval_steps_per_second": 1.404, "step": 4100 }, { "epoch": 0.708132322536182, "grad_norm": 2.4810664653778076, "learning_rate": 1.631883771525137e-08, "logits/chosen": -3.0311903953552246, "logits/rejected": -3.019279956817627, "logps/chosen": -57.1802864074707, "logps/rejected": -53.1908073425293, "loss": 0.6904, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.011178601533174515, "rewards/margins": 0.005697404034435749, "rewards/rejected": -0.01687600649893284, "step": 4110 }, { "epoch": 0.709855272226051, "grad_norm": 2.73134183883667, "learning_rate": 1.6295501696750958e-08, "logits/chosen": -2.9880242347717285, "logits/rejected": -2.9914722442626953, "logps/chosen": -51.12857437133789, "logps/rejected": -56.45573806762695, "loss": 0.6908, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.01163684856146574, "rewards/margins": 0.005046077072620392, "rewards/rejected": -0.016682926565408707, "step": 4120 }, { "epoch": 0.71157822191592, "grad_norm": 2.448148727416992, "learning_rate": 1.6272108747543964e-08, "logits/chosen": -3.089820384979248, "logits/rejected": -3.066537618637085, "logps/chosen": -54.83890914916992, "logps/rejected": -54.20136260986328, "loss": 0.6893, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.008709724061191082, "rewards/margins": 0.0079735042527318, "rewards/rejected": -0.016683228313922882, "step": 4130 }, { "epoch": 0.7133011716057891, "grad_norm": 2.247288942337036, "learning_rate": 1.6248659079174624e-08, "logits/chosen": -3.0946431159973145, "logits/rejected": -3.066021680831909, "logps/chosen": -56.27561569213867, "logps/rejected": -52.206207275390625, "loss": 0.6895, "rewards/accuracies": 0.65625, "rewards/chosen": -0.007499805651605129, "rewards/margins": 0.0075314841233193874, "rewards/rejected": -0.015031290240585804, "step": 4140 }, { "epoch": 0.7150241212956582, "grad_norm": 2.0884854793548584, "learning_rate": 1.6225152903700093e-08, "logits/chosen": -3.1154227256774902, "logits/rejected": -3.088381052017212, "logps/chosen": -58.83393478393555, "logps/rejected": -53.803932189941406, "loss": 0.6884, "rewards/accuracies": 0.65625, "rewards/chosen": -0.009922737255692482, "rewards/margins": 0.009877922013401985, "rewards/rejected": -0.019800657406449318, "step": 4150 }, { "epoch": 0.7167470709855273, "grad_norm": 2.234891653060913, "learning_rate": 1.6201590433688532e-08, "logits/chosen": -3.0078649520874023, "logits/rejected": -2.9760546684265137, "logps/chosen": -52.66447830200195, "logps/rejected": -51.263343811035156, "loss": 0.6875, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.009862758219242096, "rewards/margins": 0.011652237735688686, "rewards/rejected": -0.021514996886253357, "step": 4160 }, { "epoch": 0.7184700206753962, "grad_norm": 2.4819843769073486, "learning_rate": 1.617797188221717e-08, "logits/chosen": -3.06020188331604, "logits/rejected": -3.0522642135620117, "logps/chosen": -52.52134323120117, "logps/rejected": -54.958274841308594, "loss": 0.692, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.011253321543335915, "rewards/margins": 0.0024759075604379177, "rewards/rejected": -0.01372922956943512, "step": 4170 }, { "epoch": 0.7201929703652653, "grad_norm": 2.3897159099578857, "learning_rate": 1.6154297462870378e-08, "logits/chosen": -3.0610311031341553, "logits/rejected": -3.0461132526397705, "logps/chosen": -56.082275390625, "logps/rejected": -57.79096221923828, "loss": 0.6895, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.007366887293756008, "rewards/margins": 0.007653279695659876, "rewards/rejected": -0.015020167455077171, "step": 4180 }, { "epoch": 0.7219159200551344, "grad_norm": 2.1994800567626953, "learning_rate": 1.6130567389737767e-08, "logits/chosen": -3.041149616241455, "logits/rejected": -3.0273966789245605, "logps/chosen": -51.95328903198242, "logps/rejected": -54.303009033203125, "loss": 0.6888, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.008851468563079834, "rewards/margins": 0.008974619209766388, "rewards/rejected": -0.017826087772846222, "step": 4190 }, { "epoch": 0.7236388697450035, "grad_norm": 2.5199713706970215, "learning_rate": 1.6106781877412207e-08, "logits/chosen": -3.092040538787842, "logits/rejected": -3.0750718116760254, "logps/chosen": -56.37440872192383, "logps/rejected": -54.04875564575195, "loss": 0.6893, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006679283920675516, "rewards/margins": 0.007919451221823692, "rewards/rejected": -0.014598734676837921, "step": 4200 }, { "epoch": 0.7236388697450035, "eval_logits/chosen": -3.1445956230163574, "eval_logits/rejected": -3.1389715671539307, "eval_logps/chosen": -58.557437896728516, "eval_logps/rejected": -63.27348709106445, "eval_loss": 0.6919510364532471, "eval_rewards/accuracies": 0.5685408711433411, "eval_rewards/chosen": 0.0015445526223629713, "eval_rewards/margins": 0.0024782144464552402, "eval_rewards/rejected": -0.0009336618822999299, "eval_runtime": 382.6488, "eval_samples_per_second": 11.248, "eval_steps_per_second": 1.406, "step": 4200 }, { "epoch": 0.7253618194348725, "grad_norm": 2.560533046722412, "learning_rate": 1.6082941140987916e-08, "logits/chosen": -3.0729806423187256, "logits/rejected": -3.049401044845581, "logps/chosen": -56.40735626220703, "logps/rejected": -54.78361892700195, "loss": 0.6888, "rewards/accuracies": 0.625, "rewards/chosen": -0.011106094345450401, "rewards/margins": 0.00898011215031147, "rewards/rejected": -0.020086204633116722, "step": 4210 }, { "epoch": 0.7270847691247415, "grad_norm": 2.338407039642334, "learning_rate": 1.6059045396058517e-08, "logits/chosen": -2.9267725944519043, "logits/rejected": -2.907881498336792, "logps/chosen": -53.28325271606445, "logps/rejected": -54.56683349609375, "loss": 0.6893, "rewards/accuracies": 0.625, "rewards/chosen": -0.010645734146237373, "rewards/margins": 0.00797154288738966, "rewards/rejected": -0.018617277964949608, "step": 4220 }, { "epoch": 0.7288077188146106, "grad_norm": 2.2222158908843994, "learning_rate": 1.603509485871506e-08, "logits/chosen": -3.1128008365631104, "logits/rejected": -3.0919594764709473, "logps/chosen": -56.30914306640625, "logps/rejected": -55.95566940307617, "loss": 0.689, "rewards/accuracies": 0.59375, "rewards/chosen": -0.010030779056251049, "rewards/margins": 0.008529409766197205, "rewards/rejected": -0.018560189753770828, "step": 4230 }, { "epoch": 0.7305306685044797, "grad_norm": 2.292361259460449, "learning_rate": 1.601108974554411e-08, "logits/chosen": -3.0141403675079346, "logits/rejected": -2.9893059730529785, "logps/chosen": -55.5947151184082, "logps/rejected": -58.04345703125, "loss": 0.6871, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.00850595161318779, "rewards/margins": 0.012388696894049644, "rewards/rejected": -0.020894650369882584, "step": 4240 }, { "epoch": 0.7322536181943488, "grad_norm": 2.362673044204712, "learning_rate": 1.5987030273625747e-08, "logits/chosen": -3.03678297996521, "logits/rejected": -3.020172595977783, "logps/chosen": -56.116294860839844, "logps/rejected": -54.58025360107422, "loss": 0.6891, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.01067496370524168, "rewards/margins": 0.008443078026175499, "rewards/rejected": -0.019118044525384903, "step": 4250 }, { "epoch": 0.7339765678842178, "grad_norm": 2.441284656524658, "learning_rate": 1.596291666053163e-08, "logits/chosen": -2.9980833530426025, "logits/rejected": -2.9824585914611816, "logps/chosen": -55.41083908081055, "logps/rejected": -56.44206619262695, "loss": 0.6898, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.008826795034110546, "rewards/margins": 0.0070566474460065365, "rewards/rejected": -0.015883442014455795, "step": 4260 }, { "epoch": 0.7356995175740868, "grad_norm": 2.271895170211792, "learning_rate": 1.5938749124323017e-08, "logits/chosen": -3.0179824829101562, "logits/rejected": -2.9922022819519043, "logps/chosen": -52.23370361328125, "logps/rejected": -52.155426025390625, "loss": 0.6884, "rewards/accuracies": 0.65625, "rewards/chosen": -0.00625257333740592, "rewards/margins": 0.009830532595515251, "rewards/rejected": -0.01608310639858246, "step": 4270 }, { "epoch": 0.7374224672639559, "grad_norm": 2.187901496887207, "learning_rate": 1.5914527883548804e-08, "logits/chosen": -2.9919097423553467, "logits/rejected": -2.9648940563201904, "logps/chosen": -52.847511291503906, "logps/rejected": -54.09342575073242, "loss": 0.6887, "rewards/accuracies": 0.625, "rewards/chosen": -0.011299841105937958, "rewards/margins": 0.009080270305275917, "rewards/rejected": -0.020380113273859024, "step": 4280 }, { "epoch": 0.739145416953825, "grad_norm": 2.2882347106933594, "learning_rate": 1.5890253157243527e-08, "logits/chosen": -3.045542001724243, "logits/rejected": -3.0208449363708496, "logps/chosen": -56.22462844848633, "logps/rejected": -53.41437911987305, "loss": 0.6892, "rewards/accuracies": 0.625, "rewards/chosen": -0.009847553446888924, "rewards/margins": 0.008151333779096603, "rewards/rejected": -0.017998887225985527, "step": 4290 }, { "epoch": 0.740868366643694, "grad_norm": 2.245983839035034, "learning_rate": 1.5865925164925415e-08, "logits/chosen": -3.059497117996216, "logits/rejected": -3.0487923622131348, "logps/chosen": -53.6651496887207, "logps/rejected": -54.894622802734375, "loss": 0.6897, "rewards/accuracies": 0.625, "rewards/chosen": -0.011088307946920395, "rewards/margins": 0.0072076646611094475, "rewards/rejected": -0.018295975401997566, "step": 4300 }, { "epoch": 0.740868366643694, "eval_logits/chosen": -3.143899440765381, "eval_logits/rejected": -3.1382787227630615, "eval_logps/chosen": -58.56075668334961, "eval_logps/rejected": -63.2966423034668, "eval_loss": 0.691855251789093, "eval_rewards/accuracies": 0.5748141407966614, "eval_rewards/chosen": 0.001511368784122169, "eval_rewards/margins": 0.0026765998918563128, "eval_rewards/rejected": -0.0011652313405647874, "eval_runtime": 383.1116, "eval_samples_per_second": 11.234, "eval_steps_per_second": 1.404, "step": 4300 }, { "epoch": 0.742591316333563, "grad_norm": 2.5445520877838135, "learning_rate": 1.5841544126594372e-08, "logits/chosen": -3.0100371837615967, "logits/rejected": -2.9904136657714844, "logps/chosen": -54.983856201171875, "logps/rejected": -53.272987365722656, "loss": 0.6895, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.009319139644503593, "rewards/margins": 0.007537718862295151, "rewards/rejected": -0.016856860369443893, "step": 4310 }, { "epoch": 0.7443142660234321, "grad_norm": 2.088826894760132, "learning_rate": 1.581711026273e-08, "logits/chosen": -3.0851564407348633, "logits/rejected": -3.05715012550354, "logps/chosen": -59.47802734375, "logps/rejected": -56.493492126464844, "loss": 0.6884, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.008360291831195354, "rewards/margins": 0.009802700951695442, "rewards/rejected": -0.018162991851568222, "step": 4320 }, { "epoch": 0.7460372157133012, "grad_norm": 2.4881863594055176, "learning_rate": 1.579262379428962e-08, "logits/chosen": -2.991785764694214, "logits/rejected": -2.9598193168640137, "logps/chosen": -56.84458541870117, "logps/rejected": -51.7958869934082, "loss": 0.69, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.013537961058318615, "rewards/margins": 0.006494508590549231, "rewards/rejected": -0.020032469183206558, "step": 4330 }, { "epoch": 0.7477601654031703, "grad_norm": 2.1231768131256104, "learning_rate": 1.5768084942706245e-08, "logits/chosen": -2.9963862895965576, "logits/rejected": -2.9773504734039307, "logps/chosen": -50.41028594970703, "logps/rejected": -52.8140754699707, "loss": 0.6888, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.014139607548713684, "rewards/margins": 0.00901852734386921, "rewards/rejected": -0.023158136755228043, "step": 4340 }, { "epoch": 0.7494831150930393, "grad_norm": 2.1576578617095947, "learning_rate": 1.5743493929886602e-08, "logits/chosen": -3.0088493824005127, "logits/rejected": -2.994492530822754, "logps/chosen": -51.812225341796875, "logps/rejected": -54.839447021484375, "loss": 0.6891, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.008023882284760475, "rewards/margins": 0.008411007933318615, "rewards/rejected": -0.016434891149401665, "step": 4350 }, { "epoch": 0.7512060647829083, "grad_norm": 2.4116005897521973, "learning_rate": 1.5718850978209113e-08, "logits/chosen": -3.0605316162109375, "logits/rejected": -3.038637638092041, "logps/chosen": -55.0179328918457, "logps/rejected": -53.5815544128418, "loss": 0.6904, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.01346288900822401, "rewards/margins": 0.005852080415934324, "rewards/rejected": -0.019314970821142197, "step": 4360 }, { "epoch": 0.7529290144727774, "grad_norm": 2.157750368118286, "learning_rate": 1.5694156310521886e-08, "logits/chosen": -3.0732007026672363, "logits/rejected": -3.0582518577575684, "logps/chosen": -55.9976921081543, "logps/rejected": -56.61333084106445, "loss": 0.6894, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.010390983894467354, "rewards/margins": 0.007754699792712927, "rewards/rejected": -0.018145684152841568, "step": 4370 }, { "epoch": 0.7546519641626465, "grad_norm": 2.26912260055542, "learning_rate": 1.5669410150140707e-08, "logits/chosen": -3.0933172702789307, "logits/rejected": -3.069687604904175, "logps/chosen": -55.12248611450195, "logps/rejected": -53.76482391357422, "loss": 0.6894, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.008189195767045021, "rewards/margins": 0.007664737291634083, "rewards/rejected": -0.01585393212735653, "step": 4380 }, { "epoch": 0.7563749138525155, "grad_norm": 2.0165369510650635, "learning_rate": 1.5644612720847002e-08, "logits/chosen": -3.018369197845459, "logits/rejected": -3.0045838356018066, "logps/chosen": -54.61598587036133, "logps/rejected": -55.37036895751953, "loss": 0.6908, "rewards/accuracies": 0.5625, "rewards/chosen": -0.013793488033115864, "rewards/margins": 0.0049360147677361965, "rewards/rejected": -0.018729500472545624, "step": 4390 }, { "epoch": 0.7580978635423845, "grad_norm": 2.3872110843658447, "learning_rate": 1.5619764246885842e-08, "logits/chosen": -3.0393893718719482, "logits/rejected": -3.0217764377593994, "logps/chosen": -57.86515426635742, "logps/rejected": -56.952545166015625, "loss": 0.6904, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.009598755277693272, "rewards/margins": 0.005718299653381109, "rewards/rejected": -0.015317055396735668, "step": 4400 }, { "epoch": 0.7580978635423845, "eval_logits/chosen": -3.1430068016052246, "eval_logits/rejected": -3.137352466583252, "eval_logps/chosen": -58.58723449707031, "eval_logps/rejected": -63.3355598449707, "eval_loss": 0.691795825958252, "eval_rewards/accuracies": 0.571096658706665, "eval_rewards/chosen": 0.001246647210791707, "eval_rewards/margins": 0.0028010986279696226, "eval_rewards/rejected": -0.00155445106793195, "eval_runtime": 383.1918, "eval_samples_per_second": 11.232, "eval_steps_per_second": 1.404, "step": 4400 }, { "epoch": 0.7598208132322536, "grad_norm": 2.294480800628662, "learning_rate": 1.5594864952963885e-08, "logits/chosen": -3.0046401023864746, "logits/rejected": -2.98811411857605, "logps/chosen": -56.121124267578125, "logps/rejected": -57.08354568481445, "loss": 0.6892, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.012294664047658443, "rewards/margins": 0.008234361186623573, "rewards/rejected": -0.02052902616560459, "step": 4410 }, { "epoch": 0.7615437629221227, "grad_norm": 2.1563189029693604, "learning_rate": 1.5569915064247365e-08, "logits/chosen": -3.1245226860046387, "logits/rejected": -3.0795650482177734, "logps/chosen": -55.503578186035156, "logps/rejected": -52.61016845703125, "loss": 0.6861, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0068757846020162106, "rewards/margins": 0.01452686171978712, "rewards/rejected": -0.021402645856142044, "step": 4420 }, { "epoch": 0.7632667126119917, "grad_norm": 1.9755196571350098, "learning_rate": 1.5544914806360043e-08, "logits/chosen": -3.0168216228485107, "logits/rejected": -2.985079288482666, "logps/chosen": -55.10762405395508, "logps/rejected": -53.182945251464844, "loss": 0.6906, "rewards/accuracies": 0.5625, "rewards/chosen": -0.013249601237475872, "rewards/margins": 0.005370932165533304, "rewards/rejected": -0.018620532006025314, "step": 4430 }, { "epoch": 0.7649896623018608, "grad_norm": 2.288050889968872, "learning_rate": 1.5519864405381183e-08, "logits/chosen": -3.083771228790283, "logits/rejected": -3.0520148277282715, "logps/chosen": -60.474449157714844, "logps/rejected": -56.98888397216797, "loss": 0.6888, "rewards/accuracies": 0.625, "rewards/chosen": -0.010724170133471489, "rewards/margins": 0.008934767916798592, "rewards/rejected": -0.01965893805027008, "step": 4440 }, { "epoch": 0.7667126119917298, "grad_norm": 2.294389486312866, "learning_rate": 1.5494764087843482e-08, "logits/chosen": -2.9569332599639893, "logits/rejected": -2.944169521331787, "logps/chosen": -56.9466552734375, "logps/rejected": -55.96522903442383, "loss": 0.6906, "rewards/accuracies": 0.59375, "rewards/chosen": -0.012313870713114738, "rewards/margins": 0.005467808805406094, "rewards/rejected": -0.017781678587198257, "step": 4450 }, { "epoch": 0.7684355616815989, "grad_norm": 2.432860851287842, "learning_rate": 1.5469614080731053e-08, "logits/chosen": -3.069728136062622, "logits/rejected": -3.061816692352295, "logps/chosen": -54.156227111816406, "logps/rejected": -54.633888244628906, "loss": 0.6906, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0142423827201128, "rewards/margins": 0.005499526392668486, "rewards/rejected": -0.019741909578442574, "step": 4460 }, { "epoch": 0.770158511371468, "grad_norm": 2.1848111152648926, "learning_rate": 1.544441461147734e-08, "logits/chosen": -3.105034351348877, "logits/rejected": -3.074484348297119, "logps/chosen": -57.09598922729492, "logps/rejected": -52.336395263671875, "loss": 0.6887, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.011739978566765785, "rewards/margins": 0.009237302467226982, "rewards/rejected": -0.020977279171347618, "step": 4470 }, { "epoch": 0.771881461061337, "grad_norm": 2.3228683471679688, "learning_rate": 1.5419165907963085e-08, "logits/chosen": -3.133382797241211, "logits/rejected": -3.126979351043701, "logps/chosen": -52.30253219604492, "logps/rejected": -54.35466766357422, "loss": 0.6892, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01291743665933609, "rewards/margins": 0.00816321186721325, "rewards/rejected": -0.02108065038919449, "step": 4480 }, { "epoch": 0.7736044107512061, "grad_norm": 2.3531270027160645, "learning_rate": 1.5393868198514258e-08, "logits/chosen": -3.023160457611084, "logits/rejected": -2.9955711364746094, "logps/chosen": -52.2995719909668, "logps/rejected": -51.951698303222656, "loss": 0.6885, "rewards/accuracies": 0.625, "rewards/chosen": -0.011587420478463173, "rewards/margins": 0.009556153789162636, "rewards/rejected": -0.02114357426762581, "step": 4490 }, { "epoch": 0.7753273604410751, "grad_norm": 2.6675174236297607, "learning_rate": 1.5368521711899994e-08, "logits/chosen": -3.038281202316284, "logits/rejected": -3.0235447883605957, "logps/chosen": -57.523460388183594, "logps/rejected": -54.69983673095703, "loss": 0.6905, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.011881532147526741, "rewards/margins": 0.005483219865709543, "rewards/rejected": -0.017364751547574997, "step": 4500 }, { "epoch": 0.7753273604410751, "eval_logits/chosen": -3.142470598220825, "eval_logits/rejected": -3.136869430541992, "eval_logps/chosen": -58.58580780029297, "eval_logps/rejected": -63.34257507324219, "eval_loss": 0.6917560696601868, "eval_rewards/accuracies": 0.5850371718406677, "eval_rewards/chosen": 0.0012609114637598395, "eval_rewards/margins": 0.0028854578267782927, "eval_rewards/rejected": -0.0016245462466031313, "eval_runtime": 383.2687, "eval_samples_per_second": 11.23, "eval_steps_per_second": 1.404, "step": 4500 }, { "epoch": 0.7770503101309442, "grad_norm": 2.6927618980407715, "learning_rate": 1.5343126677330526e-08, "logits/chosen": -3.154592990875244, "logits/rejected": -3.126269817352295, "logps/chosen": -54.230247497558594, "logps/rejected": -57.4571418762207, "loss": 0.6881, "rewards/accuracies": 0.625, "rewards/chosen": -0.00797832477837801, "rewards/margins": 0.010417604818940163, "rewards/rejected": -0.018395930528640747, "step": 4510 }, { "epoch": 0.7787732598208132, "grad_norm": 2.582130193710327, "learning_rate": 1.5317683324455104e-08, "logits/chosen": -3.07936429977417, "logits/rejected": -3.0577187538146973, "logps/chosen": -54.154029846191406, "logps/rejected": -52.17194747924805, "loss": 0.6906, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.015559755265712738, "rewards/margins": 0.005351835396140814, "rewards/rejected": -0.02091159299015999, "step": 4520 }, { "epoch": 0.7804962095106823, "grad_norm": 2.4246294498443604, "learning_rate": 1.5292191883359924e-08, "logits/chosen": -3.0756242275238037, "logits/rejected": -3.066664695739746, "logps/chosen": -55.20574188232422, "logps/rejected": -55.2439079284668, "loss": 0.6916, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.01431342214345932, "rewards/margins": 0.0033062086440622807, "rewards/rejected": -0.017619632184505463, "step": 4530 }, { "epoch": 0.7822191592005513, "grad_norm": 2.6444544792175293, "learning_rate": 1.5266652584566056e-08, "logits/chosen": -2.9866771697998047, "logits/rejected": -2.9747414588928223, "logps/chosen": -53.885520935058594, "logps/rejected": -53.13618850708008, "loss": 0.6881, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.011537250131368637, "rewards/margins": 0.010475369170308113, "rewards/rejected": -0.0220126211643219, "step": 4540 }, { "epoch": 0.7839421088904204, "grad_norm": 2.567601442337036, "learning_rate": 1.5241065659027345e-08, "logits/chosen": -3.092729091644287, "logits/rejected": -3.0693888664245605, "logps/chosen": -57.39257049560547, "logps/rejected": -55.48276901245117, "loss": 0.688, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.010340984910726547, "rewards/margins": 0.010571248829364777, "rewards/rejected": -0.020912233740091324, "step": 4550 }, { "epoch": 0.7856650585802895, "grad_norm": 2.5469632148742676, "learning_rate": 1.5215431338128326e-08, "logits/chosen": -2.986642360687256, "logits/rejected": -2.949063777923584, "logps/chosen": -56.87488555908203, "logps/rejected": -54.90007781982422, "loss": 0.6875, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.006578305270522833, "rewards/margins": 0.011532245203852654, "rewards/rejected": -0.018110549077391624, "step": 4560 }, { "epoch": 0.7873880082701585, "grad_norm": 2.250145435333252, "learning_rate": 1.5189749853682138e-08, "logits/chosen": -3.014840602874756, "logits/rejected": -2.9897007942199707, "logps/chosen": -52.40979766845703, "logps/rejected": -53.9889030456543, "loss": 0.6887, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0131943728774786, "rewards/margins": 0.009161809459328651, "rewards/rejected": -0.02235618606209755, "step": 4570 }, { "epoch": 0.7891109579600276, "grad_norm": 2.483966827392578, "learning_rate": 1.5164021437928424e-08, "logits/chosen": -2.990060567855835, "logits/rejected": -2.9630320072174072, "logps/chosen": -55.05839920043945, "logps/rejected": -53.426841735839844, "loss": 0.6889, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.007987894117832184, "rewards/margins": 0.008664386346936226, "rewards/rejected": -0.01665228046476841, "step": 4580 }, { "epoch": 0.7908339076498966, "grad_norm": 2.7395739555358887, "learning_rate": 1.5138246323531224e-08, "logits/chosen": -3.0871968269348145, "logits/rejected": -3.0587024688720703, "logps/chosen": -56.29444122314453, "logps/rejected": -54.183837890625, "loss": 0.6905, "rewards/accuracies": 0.59375, "rewards/chosen": -0.014101634733378887, "rewards/margins": 0.005672593601047993, "rewards/rejected": -0.01977423205971718, "step": 4590 }, { "epoch": 0.7925568573397657, "grad_norm": 2.2159547805786133, "learning_rate": 1.5112424743576885e-08, "logits/chosen": -2.964893102645874, "logits/rejected": -2.945138931274414, "logps/chosen": -54.72895431518555, "logps/rejected": -56.71080780029297, "loss": 0.6883, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.013638514094054699, "rewards/margins": 0.009959360584616661, "rewards/rejected": -0.023597871884703636, "step": 4600 }, { "epoch": 0.7925568573397657, "eval_logits/chosen": -3.141345262527466, "eval_logits/rejected": -3.1357266902923584, "eval_logps/chosen": -58.60505294799805, "eval_logps/rejected": -63.36589431762695, "eval_loss": 0.6917376518249512, "eval_rewards/accuracies": 0.5787639617919922, "eval_rewards/chosen": 0.0010684671578928828, "eval_rewards/margins": 0.0029261959716677666, "eval_rewards/rejected": -0.0018577290466055274, "eval_runtime": 383.4911, "eval_samples_per_second": 11.223, "eval_steps_per_second": 1.403, "step": 4600 }, { "epoch": 0.7942798070296347, "grad_norm": 2.280055046081543, "learning_rate": 1.5086556931571946e-08, "logits/chosen": -2.999232292175293, "logits/rejected": -2.988621711730957, "logps/chosen": -52.64866256713867, "logps/rejected": -53.27161407470703, "loss": 0.6892, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.015107172541320324, "rewards/margins": 0.008137322030961514, "rewards/rejected": -0.023244492709636688, "step": 4610 }, { "epoch": 0.7960027567195038, "grad_norm": 2.420598030090332, "learning_rate": 1.5060643121441017e-08, "logits/chosen": -3.0278878211975098, "logits/rejected": -3.004408836364746, "logps/chosen": -56.786781311035156, "logps/rejected": -57.46771240234375, "loss": 0.6889, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.01068052463233471, "rewards/margins": 0.0088728042319417, "rewards/rejected": -0.019553329795598984, "step": 4620 }, { "epoch": 0.7977257064093728, "grad_norm": 2.28881573677063, "learning_rate": 1.503468354752468e-08, "logits/chosen": -3.0719847679138184, "logits/rejected": -3.0378081798553467, "logps/chosen": -58.28291702270508, "logps/rejected": -58.635284423828125, "loss": 0.6881, "rewards/accuracies": 0.625, "rewards/chosen": -0.009992343373596668, "rewards/margins": 0.010409261099994183, "rewards/rejected": -0.0204016026109457, "step": 4630 }, { "epoch": 0.7994486560992419, "grad_norm": 2.432915449142456, "learning_rate": 1.5008678444577368e-08, "logits/chosen": -3.0757288932800293, "logits/rejected": -3.060554265975952, "logps/chosen": -56.30283737182617, "logps/rejected": -57.6944694519043, "loss": 0.6903, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.01548223476856947, "rewards/margins": 0.005951850675046444, "rewards/rejected": -0.021434085443615913, "step": 4640 }, { "epoch": 0.801171605789111, "grad_norm": 2.3404457569122314, "learning_rate": 1.4982628047765213e-08, "logits/chosen": -3.073173999786377, "logits/rejected": -3.0398802757263184, "logps/chosen": -55.227325439453125, "logps/rejected": -52.969627380371094, "loss": 0.6877, "rewards/accuracies": 0.65625, "rewards/chosen": -0.010765276849269867, "rewards/margins": 0.011142291128635406, "rewards/rejected": -0.021907567977905273, "step": 4650 }, { "epoch": 0.80289455547898, "grad_norm": 2.2700555324554443, "learning_rate": 1.495653259266398e-08, "logits/chosen": -3.0625596046447754, "logits/rejected": -3.0436787605285645, "logps/chosen": -55.32490158081055, "logps/rejected": -56.171669006347656, "loss": 0.6889, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.010176388546824455, "rewards/margins": 0.008753242902457714, "rewards/rejected": -0.018929632380604744, "step": 4660 }, { "epoch": 0.8046175051688491, "grad_norm": 2.1048812866210938, "learning_rate": 1.493039231525686e-08, "logits/chosen": -3.0612926483154297, "logits/rejected": -3.036155939102173, "logps/chosen": -53.7221565246582, "logps/rejected": -51.17532730102539, "loss": 0.6894, "rewards/accuracies": 0.59375, "rewards/chosen": -0.012948019430041313, "rewards/margins": 0.00783008337020874, "rewards/rejected": -0.020778100937604904, "step": 4670 }, { "epoch": 0.8063404548587181, "grad_norm": 2.1982691287994385, "learning_rate": 1.4904207451932403e-08, "logits/chosen": -3.0380797386169434, "logits/rejected": -3.0065929889678955, "logps/chosen": -53.59992218017578, "logps/rejected": -53.799102783203125, "loss": 0.6883, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.011093436740338802, "rewards/margins": 0.009997878223657608, "rewards/rejected": -0.021091314032673836, "step": 4680 }, { "epoch": 0.8080634045485872, "grad_norm": 2.743126630783081, "learning_rate": 1.4877978239482345e-08, "logits/chosen": -3.0848495960235596, "logits/rejected": -3.062772750854492, "logps/chosen": -56.99140167236328, "logps/rejected": -56.22808837890625, "loss": 0.6898, "rewards/accuracies": 0.59375, "rewards/chosen": -0.012282797135412693, "rewards/margins": 0.006996271200478077, "rewards/rejected": -0.01927906833589077, "step": 4690 }, { "epoch": 0.8097863542384562, "grad_norm": 2.436103105545044, "learning_rate": 1.4851704915099474e-08, "logits/chosen": -3.060211420059204, "logits/rejected": -3.0525429248809814, "logps/chosen": -53.092872619628906, "logps/rejected": -57.76226806640625, "loss": 0.6897, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.013585333712399006, "rewards/margins": 0.0071394420228898525, "rewards/rejected": -0.02072477713227272, "step": 4700 }, { "epoch": 0.8097863542384562, "eval_logits/chosen": -3.1409170627593994, "eval_logits/rejected": -3.135312557220459, "eval_logps/chosen": -58.612998962402344, "eval_logps/rejected": -63.39475631713867, "eval_loss": 0.6916364431381226, "eval_rewards/accuracies": 0.574117124080658, "eval_rewards/chosen": 0.0009889440843835473, "eval_rewards/margins": 0.0031352676451206207, "eval_rewards/rejected": -0.0021463236771523952, "eval_runtime": 383.168, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 4700 }, { "epoch": 0.8115093039283253, "grad_norm": 2.3831241130828857, "learning_rate": 1.482538771637548e-08, "logits/chosen": -3.1120047569274902, "logits/rejected": -3.073157787322998, "logps/chosen": -59.07976150512695, "logps/rejected": -54.81146240234375, "loss": 0.6869, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.011325079016387463, "rewards/margins": 0.012960456311702728, "rewards/rejected": -0.024285534396767616, "step": 4710 }, { "epoch": 0.8132322536181944, "grad_norm": 2.280550241470337, "learning_rate": 1.4799026881298825e-08, "logits/chosen": -2.985053539276123, "logits/rejected": -2.953970432281494, "logps/chosen": -57.7352294921875, "logps/rejected": -54.659141540527344, "loss": 0.6869, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.009329278953373432, "rewards/margins": 0.012923975475132465, "rewards/rejected": -0.02225325256586075, "step": 4720 }, { "epoch": 0.8149552033080634, "grad_norm": 2.3526196479797363, "learning_rate": 1.4772622648252565e-08, "logits/chosen": -2.976370096206665, "logits/rejected": -2.9506354331970215, "logps/chosen": -55.18701934814453, "logps/rejected": -53.34539031982422, "loss": 0.6899, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.014407898299396038, "rewards/margins": 0.006814153399318457, "rewards/rejected": -0.021222051233053207, "step": 4730 }, { "epoch": 0.8166781529979324, "grad_norm": 2.533383846282959, "learning_rate": 1.4746175256012212e-08, "logits/chosen": -3.0675313472747803, "logits/rejected": -3.056377410888672, "logps/chosen": -55.36817169189453, "logps/rejected": -55.598167419433594, "loss": 0.6893, "rewards/accuracies": 0.625, "rewards/chosen": -0.009266135282814503, "rewards/margins": 0.008008824661374092, "rewards/rejected": -0.01727495715022087, "step": 4740 }, { "epoch": 0.8184011026878015, "grad_norm": 2.477605104446411, "learning_rate": 1.4719684943743575e-08, "logits/chosen": -3.036411762237549, "logits/rejected": -3.016793966293335, "logps/chosen": -56.173614501953125, "logps/rejected": -56.49778366088867, "loss": 0.6877, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.007815537974238396, "rewards/margins": 0.011227364651858807, "rewards/rejected": -0.019042903557419777, "step": 4750 }, { "epoch": 0.8201240523776706, "grad_norm": 2.4582791328430176, "learning_rate": 1.4693151951000583e-08, "logits/chosen": -3.0335280895233154, "logits/rejected": -3.011873245239258, "logps/chosen": -53.954620361328125, "logps/rejected": -53.71638870239258, "loss": 0.6899, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.014825527556240559, "rewards/margins": 0.006758248899132013, "rewards/rejected": -0.02158377692103386, "step": 4760 }, { "epoch": 0.8218470020675396, "grad_norm": 2.687391519546509, "learning_rate": 1.4666576517723136e-08, "logits/chosen": -3.0979321002960205, "logits/rejected": -3.0764148235321045, "logps/chosen": -60.7008056640625, "logps/rejected": -56.8785400390625, "loss": 0.6904, "rewards/accuracies": 0.625, "rewards/chosen": -0.011106926016509533, "rewards/margins": 0.005686748772859573, "rewards/rejected": -0.01679367572069168, "step": 4770 }, { "epoch": 0.8235699517574087, "grad_norm": 2.817535400390625, "learning_rate": 1.4639958884234921e-08, "logits/chosen": -2.9717934131622314, "logits/rejected": -2.953327178955078, "logps/chosen": -52.80674362182617, "logps/rejected": -56.35276412963867, "loss": 0.688, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.01336643099784851, "rewards/margins": 0.01072400901466608, "rewards/rejected": -0.024090442806482315, "step": 4780 }, { "epoch": 0.8252929014472777, "grad_norm": 2.144557476043701, "learning_rate": 1.4613299291241247e-08, "logits/chosen": -3.074733257293701, "logits/rejected": -3.048335552215576, "logps/chosen": -59.228851318359375, "logps/rejected": -54.978431701660156, "loss": 0.6896, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.014375475235283375, "rewards/margins": 0.007547792978584766, "rewards/rejected": -0.021923266351222992, "step": 4790 }, { "epoch": 0.8270158511371468, "grad_norm": 2.7041478157043457, "learning_rate": 1.458659797982687e-08, "logits/chosen": -2.9979748725891113, "logits/rejected": -2.9825730323791504, "logps/chosen": -53.27587127685547, "logps/rejected": -52.17603302001953, "loss": 0.6905, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.012802990153431892, "rewards/margins": 0.00566075649112463, "rewards/rejected": -0.018463749438524246, "step": 4800 }, { "epoch": 0.8270158511371468, "eval_logits/chosen": -3.14040207862854, "eval_logits/rejected": -3.1347618103027344, "eval_logps/chosen": -58.63165283203125, "eval_logps/rejected": -63.4159049987793, "eval_loss": 0.6916272640228271, "eval_rewards/accuracies": 0.5748141407966614, "eval_rewards/chosen": 0.0008023965056054294, "eval_rewards/margins": 0.003160183085128665, "eval_rewards/rejected": -0.0023577865213155746, "eval_runtime": 383.65, "eval_samples_per_second": 11.219, "eval_steps_per_second": 1.402, "step": 4800 }, { "epoch": 0.8287388008270159, "grad_norm": 2.4247682094573975, "learning_rate": 1.45598551914538e-08, "logits/chosen": -3.009613513946533, "logits/rejected": -2.974290370941162, "logps/chosen": -58.334434509277344, "logps/rejected": -54.773475646972656, "loss": 0.687, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.011181493289768696, "rewards/margins": 0.012812025845050812, "rewards/rejected": -0.023993518203496933, "step": 4810 }, { "epoch": 0.8304617505168849, "grad_norm": 2.3951213359832764, "learning_rate": 1.453307116795913e-08, "logits/chosen": -3.044926404953003, "logits/rejected": -3.035269260406494, "logps/chosen": -52.56196212768555, "logps/rejected": -54.002899169921875, "loss": 0.6894, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.014045966789126396, "rewards/margins": 0.007807609625160694, "rewards/rejected": -0.021853577345609665, "step": 4820 }, { "epoch": 0.832184700206754, "grad_norm": 2.170640230178833, "learning_rate": 1.4506246151552857e-08, "logits/chosen": -3.054115056991577, "logits/rejected": -3.0311291217803955, "logps/chosen": -54.9075813293457, "logps/rejected": -54.56113815307617, "loss": 0.6903, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.013306808657944202, "rewards/margins": 0.006027604453265667, "rewards/rejected": -0.01933441497385502, "step": 4830 }, { "epoch": 0.833907649896623, "grad_norm": 2.660403251647949, "learning_rate": 1.447938038481566e-08, "logits/chosen": -3.012082576751709, "logits/rejected": -2.9815993309020996, "logps/chosen": -57.541893005371094, "logps/rejected": -54.8646240234375, "loss": 0.6879, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.013715410605072975, "rewards/margins": 0.01079997606575489, "rewards/rejected": -0.024515386670827866, "step": 4840 }, { "epoch": 0.8356305995864921, "grad_norm": 2.3371341228485107, "learning_rate": 1.4452474110696739e-08, "logits/chosen": -3.090893268585205, "logits/rejected": -3.0729451179504395, "logps/chosen": -55.47370147705078, "logps/rejected": -55.9431266784668, "loss": 0.6884, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.013396288268268108, "rewards/margins": 0.009913275018334389, "rewards/rejected": -0.023309562355279922, "step": 4850 }, { "epoch": 0.8373535492763611, "grad_norm": 2.4334421157836914, "learning_rate": 1.4425527572511602e-08, "logits/chosen": -3.095104932785034, "logits/rejected": -3.0613903999328613, "logps/chosen": -60.0928840637207, "logps/rejected": -52.79871368408203, "loss": 0.6904, "rewards/accuracies": 0.625, "rewards/chosen": -0.01635749265551567, "rewards/margins": 0.0058675603941082954, "rewards/rejected": -0.02222505584359169, "step": 4860 }, { "epoch": 0.8390764989662302, "grad_norm": 2.731940507888794, "learning_rate": 1.4398541013939869e-08, "logits/chosen": -3.11464524269104, "logits/rejected": -3.0886950492858887, "logps/chosen": -58.54974365234375, "logps/rejected": -53.7392463684082, "loss": 0.6875, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.010801544412970543, "rewards/margins": 0.011670037172734737, "rewards/rejected": -0.022471582517027855, "step": 4870 }, { "epoch": 0.8407994486560992, "grad_norm": 2.099777936935425, "learning_rate": 1.4371514679023067e-08, "logits/chosen": -2.9958763122558594, "logits/rejected": -2.959946870803833, "logps/chosen": -54.3371696472168, "logps/rejected": -52.98571014404297, "loss": 0.6873, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.011563587002456188, "rewards/margins": 0.01208436954766512, "rewards/rejected": -0.023647956550121307, "step": 4880 }, { "epoch": 0.8425223983459683, "grad_norm": 2.4163782596588135, "learning_rate": 1.4344448812162429e-08, "logits/chosen": -3.053267002105713, "logits/rejected": -3.0314173698425293, "logps/chosen": -56.69382858276367, "logps/rejected": -55.77009201049805, "loss": 0.6886, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.012732396833598614, "rewards/margins": 0.00963148195296526, "rewards/rejected": -0.022363876923918724, "step": 4890 }, { "epoch": 0.8442453480358374, "grad_norm": 2.075690507888794, "learning_rate": 1.4317343658116666e-08, "logits/chosen": -3.020969867706299, "logits/rejected": -2.987406015396118, "logps/chosen": -54.62163162231445, "logps/rejected": -52.847877502441406, "loss": 0.6875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.014192362315952778, "rewards/margins": 0.01184868160635233, "rewards/rejected": -0.026041042059659958, "step": 4900 }, { "epoch": 0.8442453480358374, "eval_logits/chosen": -3.139631509780884, "eval_logits/rejected": -3.1339633464813232, "eval_logps/chosen": -58.65803527832031, "eval_logps/rejected": -63.45630645751953, "eval_loss": 0.6915606260299683, "eval_rewards/accuracies": 0.5773698687553406, "eval_rewards/chosen": 0.0005385760450735688, "eval_rewards/margins": 0.003300320589914918, "eval_rewards/rejected": -0.0027617441955953836, "eval_runtime": 384.2265, "eval_samples_per_second": 11.202, "eval_steps_per_second": 1.4, "step": 4900 }, { "epoch": 0.8459682977257064, "grad_norm": 2.4822423458099365, "learning_rate": 1.4290199461999776e-08, "logits/chosen": -3.165937900543213, "logits/rejected": -3.1338515281677246, "logps/chosen": -57.39813232421875, "logps/rejected": -54.81819534301758, "loss": 0.6867, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.010589056648314, "rewards/margins": 0.013249260373413563, "rewards/rejected": -0.023838315159082413, "step": 4910 }, { "epoch": 0.8476912474155754, "grad_norm": 2.523733615875244, "learning_rate": 1.4263016469278812e-08, "logits/chosen": -2.9909253120422363, "logits/rejected": -2.9729251861572266, "logps/chosen": -55.359779357910156, "logps/rejected": -56.16680145263672, "loss": 0.6874, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.015199231915175915, "rewards/margins": 0.011813652701675892, "rewards/rejected": -0.027012884616851807, "step": 4920 }, { "epoch": 0.8494141971054445, "grad_norm": 2.3042807579040527, "learning_rate": 1.4235794925771672e-08, "logits/chosen": -3.1721012592315674, "logits/rejected": -3.160557746887207, "logps/chosen": -54.805763244628906, "logps/rejected": -60.66535186767578, "loss": 0.6884, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.016626281663775444, "rewards/margins": 0.009994568303227425, "rewards/rejected": -0.02662084996700287, "step": 4930 }, { "epoch": 0.8511371467953136, "grad_norm": 2.6536920070648193, "learning_rate": 1.420853507764487e-08, "logits/chosen": -3.094923496246338, "logits/rejected": -3.064950704574585, "logps/chosen": -57.57648849487305, "logps/rejected": -55.926292419433594, "loss": 0.6881, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.016546418890357018, "rewards/margins": 0.010448496788740158, "rewards/rejected": -0.026994913816452026, "step": 4940 }, { "epoch": 0.8528600964851827, "grad_norm": 2.630887508392334, "learning_rate": 1.4181237171411314e-08, "logits/chosen": -2.8983561992645264, "logits/rejected": -2.88315749168396, "logps/chosen": -55.97765350341797, "logps/rejected": -56.04487228393555, "loss": 0.6895, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.01150443498045206, "rewards/margins": 0.0077610015869140625, "rewards/rejected": -0.019265437498688698, "step": 4950 }, { "epoch": 0.8545830461750517, "grad_norm": 2.179489850997925, "learning_rate": 1.4153901453928069e-08, "logits/chosen": -3.0133819580078125, "logits/rejected": -3.0152859687805176, "logps/chosen": -54.046836853027344, "logps/rejected": -58.50445556640625, "loss": 0.6918, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.017930559813976288, "rewards/margins": 0.002923080464825034, "rewards/rejected": -0.020853638648986816, "step": 4960 }, { "epoch": 0.8563059958649207, "grad_norm": 2.817776679992676, "learning_rate": 1.4126528172394132e-08, "logits/chosen": -3.0035629272460938, "logits/rejected": -2.993034839630127, "logps/chosen": -53.23530960083008, "logps/rejected": -54.26251983642578, "loss": 0.6906, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.018618527799844742, "rewards/margins": 0.005449257791042328, "rewards/rejected": -0.02406778372824192, "step": 4970 }, { "epoch": 0.8580289455547898, "grad_norm": 2.0556752681732178, "learning_rate": 1.40991175743482e-08, "logits/chosen": -3.0782289505004883, "logits/rejected": -3.0726828575134277, "logps/chosen": -54.139183044433594, "logps/rejected": -56.74431228637695, "loss": 0.6904, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.012816699221730232, "rewards/margins": 0.005773487966507673, "rewards/rejected": -0.018590185791254044, "step": 4980 }, { "epoch": 0.8597518952446589, "grad_norm": 2.202556848526001, "learning_rate": 1.4071669907666415e-08, "logits/chosen": -2.9854166507720947, "logits/rejected": -2.9775357246398926, "logps/chosen": -53.2756462097168, "logps/rejected": -57.1760139465332, "loss": 0.6894, "rewards/accuracies": 0.59375, "rewards/chosen": -0.014912595972418785, "rewards/margins": 0.007976751774549484, "rewards/rejected": -0.02288934774696827, "step": 4990 }, { "epoch": 0.8614748449345279, "grad_norm": 2.384925603866577, "learning_rate": 1.4044185420560144e-08, "logits/chosen": -3.0189459323883057, "logits/rejected": -2.9987950325012207, "logps/chosen": -57.17552947998047, "logps/rejected": -54.762413024902344, "loss": 0.6899, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.013715590350329876, "rewards/margins": 0.006854689680039883, "rewards/rejected": -0.02057028003036976, "step": 5000 }, { "epoch": 0.8614748449345279, "eval_logits/chosen": -3.138385534286499, "eval_logits/rejected": -3.1327359676361084, "eval_logps/chosen": -58.66404724121094, "eval_logps/rejected": -63.465179443359375, "eval_loss": 0.6915493011474609, "eval_rewards/accuracies": 0.5769051909446716, "eval_rewards/chosen": 0.0004785024793818593, "eval_rewards/margins": 0.0033290009014308453, "eval_rewards/rejected": -0.002850498305633664, "eval_runtime": 384.5021, "eval_samples_per_second": 11.194, "eval_steps_per_second": 1.399, "step": 5000 }, { "epoch": 0.8631977946243969, "grad_norm": 2.560671091079712, "learning_rate": 1.4016664361573723e-08, "logits/chosen": -3.0151009559631348, "logits/rejected": -2.9882473945617676, "logps/chosen": -54.525062561035156, "logps/rejected": -54.10304641723633, "loss": 0.6858, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.014742674306035042, "rewards/margins": 0.015148031525313854, "rewards/rejected": -0.02989070676267147, "step": 5010 }, { "epoch": 0.864920744314266, "grad_norm": 2.4198813438415527, "learning_rate": 1.3989106979582206e-08, "logits/chosen": -3.0126874446868896, "logits/rejected": -2.9866137504577637, "logps/chosen": -54.9820671081543, "logps/rejected": -54.1724967956543, "loss": 0.6899, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.014826476573944092, "rewards/margins": 0.00693404208868742, "rewards/rejected": -0.021760519593954086, "step": 5020 }, { "epoch": 0.8666436940041351, "grad_norm": 2.369276523590088, "learning_rate": 1.3961513523789117e-08, "logits/chosen": -2.9598562717437744, "logits/rejected": -2.9502501487731934, "logps/chosen": -54.18781661987305, "logps/rejected": -56.25947952270508, "loss": 0.6897, "rewards/accuracies": 0.59375, "rewards/chosen": -0.015706423670053482, "rewards/margins": 0.007301941514015198, "rewards/rejected": -0.02300836518406868, "step": 5030 }, { "epoch": 0.8683666436940042, "grad_norm": 2.4504623413085938, "learning_rate": 1.3933884243724207e-08, "logits/chosen": -3.1910994052886963, "logits/rejected": -3.148806095123291, "logps/chosen": -58.43614959716797, "logps/rejected": -52.756126403808594, "loss": 0.6882, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.012786917388439178, "rewards/margins": 0.010306078009307384, "rewards/rejected": -0.02309299446642399, "step": 5040 }, { "epoch": 0.8700895933838731, "grad_norm": 2.484468936920166, "learning_rate": 1.3906219389241175e-08, "logits/chosen": -3.059654712677002, "logits/rejected": -3.035466194152832, "logps/chosen": -56.96913528442383, "logps/rejected": -57.09128952026367, "loss": 0.6883, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.013648102059960365, "rewards/margins": 0.010198203846812248, "rewards/rejected": -0.023846307769417763, "step": 5050 }, { "epoch": 0.8718125430737422, "grad_norm": 2.1926894187927246, "learning_rate": 1.3878519210515435e-08, "logits/chosen": -2.9194982051849365, "logits/rejected": -2.9135661125183105, "logps/chosen": -52.278892517089844, "logps/rejected": -54.48607635498047, "loss": 0.6901, "rewards/accuracies": 0.5625, "rewards/chosen": -0.018070057034492493, "rewards/margins": 0.006437377072870731, "rewards/rejected": -0.024507436901330948, "step": 5060 }, { "epoch": 0.8735354927636113, "grad_norm": 2.3249335289001465, "learning_rate": 1.3850783958041834e-08, "logits/chosen": -3.032139778137207, "logits/rejected": -3.0054965019226074, "logps/chosen": -54.1693229675293, "logps/rejected": -50.52428436279297, "loss": 0.6884, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.014210036024451256, "rewards/margins": 0.009761094115674496, "rewards/rejected": -0.023971129208803177, "step": 5070 }, { "epoch": 0.8752584424534804, "grad_norm": 2.3921754360198975, "learning_rate": 1.38230138826324e-08, "logits/chosen": -3.223688840866089, "logits/rejected": -3.1855413913726807, "logps/chosen": -58.59296798706055, "logps/rejected": -53.28046417236328, "loss": 0.6869, "rewards/accuracies": 0.65625, "rewards/chosen": -0.010885847732424736, "rewards/margins": 0.012997889891266823, "rewards/rejected": -0.023883739486336708, "step": 5080 }, { "epoch": 0.8769813921433495, "grad_norm": 2.5459678173065186, "learning_rate": 1.379520923541406e-08, "logits/chosen": -3.0902998447418213, "logits/rejected": -3.05825138092041, "logps/chosen": -55.68738555908203, "logps/rejected": -54.36516189575195, "loss": 0.686, "rewards/accuracies": 0.65625, "rewards/chosen": -0.01396668516099453, "rewards/margins": 0.014680743217468262, "rewards/rejected": -0.02864742837846279, "step": 5090 }, { "epoch": 0.8787043418332184, "grad_norm": 2.2755610942840576, "learning_rate": 1.376737026782638e-08, "logits/chosen": -2.925417900085449, "logits/rejected": -2.889483690261841, "logps/chosen": -56.4536247253418, "logps/rejected": -55.12589645385742, "loss": 0.6864, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.013867422938346863, "rewards/margins": 0.014166000299155712, "rewards/rejected": -0.02803342044353485, "step": 5100 }, { "epoch": 0.8787043418332184, "eval_logits/chosen": -3.1375465393066406, "eval_logits/rejected": -3.131910562515259, "eval_logps/chosen": -58.683929443359375, "eval_logps/rejected": -63.488773345947266, "eval_loss": 0.6915342807769775, "eval_rewards/accuracies": 0.5683085322380066, "eval_rewards/chosen": 0.0002796630433294922, "eval_rewards/margins": 0.0033661844208836555, "eval_rewards/rejected": -0.0030865215230733156, "eval_runtime": 384.6471, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 5100 }, { "epoch": 0.8804272915230875, "grad_norm": 2.5277225971221924, "learning_rate": 1.373949723161929e-08, "logits/chosen": -3.0719993114471436, "logits/rejected": -3.0354537963867188, "logps/chosen": -59.154884338378906, "logps/rejected": -56.359588623046875, "loss": 0.6871, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.012820270843803883, "rewards/margins": 0.012380128726363182, "rewards/rejected": -0.02520040050148964, "step": 5110 }, { "epoch": 0.8821502412129566, "grad_norm": 2.3103935718536377, "learning_rate": 1.3711590378850797e-08, "logits/chosen": -3.1260557174682617, "logits/rejected": -3.0945322513580322, "logps/chosen": -55.463409423828125, "logps/rejected": -52.56806564331055, "loss": 0.6899, "rewards/accuracies": 0.65625, "rewards/chosen": -0.015900244936347008, "rewards/margins": 0.006916800979524851, "rewards/rejected": -0.02281704545021057, "step": 5120 }, { "epoch": 0.8838731909028257, "grad_norm": 2.3789403438568115, "learning_rate": 1.3683649961884723e-08, "logits/chosen": -3.146933078765869, "logits/rejected": -3.112955093383789, "logps/chosen": -54.97711944580078, "logps/rejected": -52.694297790527344, "loss": 0.6868, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.014110149815678596, "rewards/margins": 0.013062229380011559, "rewards/rejected": -0.027172381058335304, "step": 5130 }, { "epoch": 0.8855961405926946, "grad_norm": 2.3451788425445557, "learning_rate": 1.365567623338841e-08, "logits/chosen": -3.054654359817505, "logits/rejected": -3.0173521041870117, "logps/chosen": -58.7443962097168, "logps/rejected": -55.3597297668457, "loss": 0.686, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.009260999038815498, "rewards/margins": 0.014625395648181438, "rewards/rejected": -0.023886393755674362, "step": 5140 }, { "epoch": 0.8873190902825637, "grad_norm": 2.1813039779663086, "learning_rate": 1.362766944633044e-08, "logits/chosen": -3.0976474285125732, "logits/rejected": -3.088894844055176, "logps/chosen": -54.575950622558594, "logps/rejected": -57.33098220825195, "loss": 0.6883, "rewards/accuracies": 0.625, "rewards/chosen": -0.014461587183177471, "rewards/margins": 0.010113890282809734, "rewards/rejected": -0.024575477465987206, "step": 5150 }, { "epoch": 0.8890420399724328, "grad_norm": 2.3312623500823975, "learning_rate": 1.3599629853978341e-08, "logits/chosen": -3.06577205657959, "logits/rejected": -3.029139757156372, "logps/chosen": -53.272178649902344, "logps/rejected": -50.97618865966797, "loss": 0.6878, "rewards/accuracies": 0.59375, "rewards/chosen": -0.01855347864329815, "rewards/margins": 0.011086962185800076, "rewards/rejected": -0.02964043989777565, "step": 5160 }, { "epoch": 0.8907649896623019, "grad_norm": 2.2718193531036377, "learning_rate": 1.357155770989631e-08, "logits/chosen": -3.05924129486084, "logits/rejected": -3.0357818603515625, "logps/chosen": -56.48089599609375, "logps/rejected": -53.92249298095703, "loss": 0.6889, "rewards/accuracies": 0.625, "rewards/chosen": -0.011843241751194, "rewards/margins": 0.008908586576581001, "rewards/rejected": -0.020751826465129852, "step": 5170 }, { "epoch": 0.892487939352171, "grad_norm": 2.8206381797790527, "learning_rate": 1.3543453267942905e-08, "logits/chosen": -3.149730682373047, "logits/rejected": -3.1397929191589355, "logps/chosen": -56.902320861816406, "logps/rejected": -56.74182891845703, "loss": 0.6897, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.014993062242865562, "rewards/margins": 0.0073389047756791115, "rewards/rejected": -0.0223319660872221, "step": 5180 }, { "epoch": 0.8942108890420399, "grad_norm": 2.555413007736206, "learning_rate": 1.3515316782268756e-08, "logits/chosen": -3.028125047683716, "logits/rejected": -3.0172619819641113, "logps/chosen": -54.56239700317383, "logps/rejected": -56.15587615966797, "loss": 0.6906, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.017221994698047638, "rewards/margins": 0.005434014368802309, "rewards/rejected": -0.02265600860118866, "step": 5190 }, { "epoch": 0.895933838731909, "grad_norm": 2.364535093307495, "learning_rate": 1.3487148507314273e-08, "logits/chosen": -3.088724136352539, "logits/rejected": -3.0658886432647705, "logps/chosen": -53.71836471557617, "logps/rejected": -57.087257385253906, "loss": 0.6865, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.01639769785106182, "rewards/margins": 0.01384388841688633, "rewards/rejected": -0.030241584405303, "step": 5200 }, { "epoch": 0.895933838731909, "eval_logits/chosen": -3.1371073722839355, "eval_logits/rejected": -3.131441116333008, "eval_logps/chosen": -58.70653533935547, "eval_logps/rejected": -63.53398895263672, "eval_loss": 0.6914243698120117, "eval_rewards/accuracies": 0.5734200477600098, "eval_rewards/chosen": 5.36102379555814e-05, "eval_rewards/margins": 0.00359228253364563, "eval_rewards/rejected": -0.003538672346621752, "eval_runtime": 384.4039, "eval_samples_per_second": 11.197, "eval_steps_per_second": 1.4, "step": 5200 }, { "epoch": 0.8976567884217781, "grad_norm": 2.1429319381713867, "learning_rate": 1.3458948697807336e-08, "logits/chosen": -3.0572848320007324, "logits/rejected": -3.042999744415283, "logps/chosen": -53.3560791015625, "logps/rejected": -52.3207893371582, "loss": 0.6903, "rewards/accuracies": 0.59375, "rewards/chosen": -0.017168376594781876, "rewards/margins": 0.006034437101334333, "rewards/rejected": -0.023202812299132347, "step": 5210 }, { "epoch": 0.8993797381116472, "grad_norm": 2.7378227710723877, "learning_rate": 1.3430717608760991e-08, "logits/chosen": -3.082184314727783, "logits/rejected": -3.0691020488739014, "logps/chosen": -54.03383255004883, "logps/rejected": -58.534889221191406, "loss": 0.6877, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.014227539300918579, "rewards/margins": 0.011310763657093048, "rewards/rejected": -0.025538304820656776, "step": 5220 }, { "epoch": 0.9011026878015161, "grad_norm": 2.590153217315674, "learning_rate": 1.3402455495471153e-08, "logits/chosen": -3.0239996910095215, "logits/rejected": -3.012888193130493, "logps/chosen": -56.472503662109375, "logps/rejected": -58.494102478027344, "loss": 0.6875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.012096828781068325, "rewards/margins": 0.011721945367753506, "rewards/rejected": -0.02381877228617668, "step": 5230 }, { "epoch": 0.9028256374913852, "grad_norm": 2.373455047607422, "learning_rate": 1.3374162613514285e-08, "logits/chosen": -3.0243659019470215, "logits/rejected": -3.001631498336792, "logps/chosen": -54.14702606201172, "logps/rejected": -55.911949157714844, "loss": 0.6874, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.013740400783717632, "rewards/margins": 0.011903162114322186, "rewards/rejected": -0.025643562898039818, "step": 5240 }, { "epoch": 0.9045485871812543, "grad_norm": 2.0493698120117188, "learning_rate": 1.3345839218745101e-08, "logits/chosen": -2.9678304195404053, "logits/rejected": -2.953367233276367, "logps/chosen": -52.9145393371582, "logps/rejected": -53.31523895263672, "loss": 0.6909, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.019368145614862442, "rewards/margins": 0.004988770931959152, "rewards/rejected": -0.024356918409466743, "step": 5250 }, { "epoch": 0.9062715368711234, "grad_norm": 2.544506311416626, "learning_rate": 1.3317485567294238e-08, "logits/chosen": -3.062175750732422, "logits/rejected": -3.0207974910736084, "logps/chosen": -58.17012405395508, "logps/rejected": -54.205718994140625, "loss": 0.6868, "rewards/accuracies": 0.65625, "rewards/chosen": -0.013847528025507927, "rewards/margins": 0.013102750293910503, "rewards/rejected": -0.026950281113386154, "step": 5260 }, { "epoch": 0.9079944865609925, "grad_norm": 2.3220739364624023, "learning_rate": 1.3289101915565951e-08, "logits/chosen": -3.1599390506744385, "logits/rejected": -3.14455246925354, "logps/chosen": -53.253761291503906, "logps/rejected": -53.868431091308594, "loss": 0.6883, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.016995316371321678, "rewards/margins": 0.010229960083961487, "rewards/rejected": -0.027225274592638016, "step": 5270 }, { "epoch": 0.9097174362508614, "grad_norm": 2.4071993827819824, "learning_rate": 1.3260688520235785e-08, "logits/chosen": -3.0632424354553223, "logits/rejected": -3.032189130783081, "logps/chosen": -56.436180114746094, "logps/rejected": -53.795684814453125, "loss": 0.6882, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.020667383447289467, "rewards/margins": 0.010586929507553577, "rewards/rejected": -0.03125431388616562, "step": 5280 }, { "epoch": 0.9114403859407305, "grad_norm": 2.3140807151794434, "learning_rate": 1.3232245638248262e-08, "logits/chosen": -3.0068345069885254, "logits/rejected": -2.96124529838562, "logps/chosen": -56.988670349121094, "logps/rejected": -54.59224319458008, "loss": 0.6863, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.011329452507197857, "rewards/margins": 0.01419984083622694, "rewards/rejected": -0.025529295206069946, "step": 5290 }, { "epoch": 0.9131633356305996, "grad_norm": 2.47336745262146, "learning_rate": 1.3203773526814558e-08, "logits/chosen": -3.0152244567871094, "logits/rejected": -3.0013296604156494, "logps/chosen": -56.43061065673828, "logps/rejected": -57.6225471496582, "loss": 0.6877, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.013324270024895668, "rewards/margins": 0.011425389908254147, "rewards/rejected": -0.02474966086447239, "step": 5300 }, { "epoch": 0.9131633356305996, "eval_logits/chosen": -3.136502265930176, "eval_logits/rejected": -3.1308670043945312, "eval_logps/chosen": -58.71966552734375, "eval_logps/rejected": -63.566688537597656, "eval_loss": 0.691329836845398, "eval_rewards/accuracies": 0.5736523866653442, "eval_rewards/chosen": -7.772997923893854e-05, "eval_rewards/margins": 0.003787950612604618, "eval_rewards/rejected": -0.003865680657327175, "eval_runtime": 384.1169, "eval_samples_per_second": 11.205, "eval_steps_per_second": 1.401, "step": 5300 }, { "epoch": 0.9148862853204687, "grad_norm": 2.492147922515869, "learning_rate": 1.3175272443410165e-08, "logits/chosen": -3.148709774017334, "logits/rejected": -3.1363625526428223, "logps/chosen": -59.37726974487305, "logps/rejected": -54.50934600830078, "loss": 0.689, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.012435202486813068, "rewards/margins": 0.008691241964697838, "rewards/rejected": -0.02112644538283348, "step": 5310 }, { "epoch": 0.9166092350103378, "grad_norm": 2.2255496978759766, "learning_rate": 1.3146742645772576e-08, "logits/chosen": -3.0580496788024902, "logits/rejected": -3.0039610862731934, "logps/chosen": -58.974510192871094, "logps/rejected": -53.189048767089844, "loss": 0.6858, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.014206658117473125, "rewards/margins": 0.015097076073288918, "rewards/rejected": -0.02930373325943947, "step": 5320 }, { "epoch": 0.9183321847002067, "grad_norm": 2.464592695236206, "learning_rate": 1.311818439189895e-08, "logits/chosen": -2.996553421020508, "logits/rejected": -3.0002400875091553, "logps/chosen": -52.74189376831055, "logps/rejected": -55.937255859375, "loss": 0.6903, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.018881823867559433, "rewards/margins": 0.006236328277736902, "rewards/rejected": -0.025118151679635048, "step": 5330 }, { "epoch": 0.9200551343900758, "grad_norm": 2.629176139831543, "learning_rate": 1.3089597940043773e-08, "logits/chosen": -3.035466432571411, "logits/rejected": -2.9821648597717285, "logps/chosen": -57.51744842529297, "logps/rejected": -54.27968215942383, "loss": 0.6847, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.011555047705769539, "rewards/margins": 0.017491517588496208, "rewards/rejected": -0.02904656156897545, "step": 5340 }, { "epoch": 0.9217780840799449, "grad_norm": 2.4826347827911377, "learning_rate": 1.3060983548716533e-08, "logits/chosen": -2.9875071048736572, "logits/rejected": -2.9504916667938232, "logps/chosen": -54.7081184387207, "logps/rejected": -55.256614685058594, "loss": 0.6873, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.020656753331422806, "rewards/margins": 0.012195435352623463, "rewards/rejected": -0.032852184027433395, "step": 5350 }, { "epoch": 0.923501033769814, "grad_norm": 2.558962345123291, "learning_rate": 1.3032341476679368e-08, "logits/chosen": -3.1059539318084717, "logits/rejected": -3.0991828441619873, "logps/chosen": -54.63978958129883, "logps/rejected": -58.72779083251953, "loss": 0.6893, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.014256837777793407, "rewards/margins": 0.008205600082874298, "rewards/rejected": -0.02246243879199028, "step": 5360 }, { "epoch": 0.9252239834596829, "grad_norm": 2.460012674331665, "learning_rate": 1.3003671982944747e-08, "logits/chosen": -3.004504680633545, "logits/rejected": -2.9867444038391113, "logps/chosen": -53.31267166137695, "logps/rejected": -55.93389129638672, "loss": 0.6877, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.013879385776817799, "rewards/margins": 0.011418366804718971, "rewards/rejected": -0.025297755375504494, "step": 5370 }, { "epoch": 0.926946933149552, "grad_norm": 2.2511675357818604, "learning_rate": 1.2974975326773106e-08, "logits/chosen": -3.1150755882263184, "logits/rejected": -3.075209617614746, "logps/chosen": -58.38878631591797, "logps/rejected": -54.537193298339844, "loss": 0.6838, "rewards/accuracies": 0.6875, "rewards/chosen": -0.009906591847538948, "rewards/margins": 0.019162429496645927, "rewards/rejected": -0.029069025069475174, "step": 5380 }, { "epoch": 0.9286698828394211, "grad_norm": 2.384746551513672, "learning_rate": 1.2946251767670519e-08, "logits/chosen": -3.032280445098877, "logits/rejected": -2.9894490242004395, "logps/chosen": -60.061767578125, "logps/rejected": -54.47789764404297, "loss": 0.6865, "rewards/accuracies": 0.65625, "rewards/chosen": -0.014944732189178467, "rewards/margins": 0.013701597228646278, "rewards/rejected": -0.028646331280469894, "step": 5390 }, { "epoch": 0.9303928325292902, "grad_norm": 2.1436920166015625, "learning_rate": 1.2917501565386343e-08, "logits/chosen": -3.0589873790740967, "logits/rejected": -3.041414976119995, "logps/chosen": -52.97267532348633, "logps/rejected": -56.66045379638672, "loss": 0.6889, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.018663279712200165, "rewards/margins": 0.009023270569741726, "rewards/rejected": -0.027686551213264465, "step": 5400 }, { "epoch": 0.9303928325292902, "eval_logits/chosen": -3.1357803344726562, "eval_logits/rejected": -3.130146026611328, "eval_logps/chosen": -58.737403869628906, "eval_logps/rejected": -63.59604263305664, "eval_loss": 0.6912763714790344, "eval_rewards/accuracies": 0.5759758353233337, "eval_rewards/chosen": -0.00025504513178020716, "eval_rewards/margins": 0.0039042264688760042, "eval_rewards/rejected": -0.004159271717071533, "eval_runtime": 384.3865, "eval_samples_per_second": 11.197, "eval_steps_per_second": 1.4, "step": 5400 }, { "epoch": 0.9321157822191593, "grad_norm": 2.2527265548706055, "learning_rate": 1.2888724979910867e-08, "logits/chosen": -3.071611166000366, "logits/rejected": -3.059826374053955, "logps/chosen": -58.34067916870117, "logps/rejected": -56.316001892089844, "loss": 0.6908, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.017013002187013626, "rewards/margins": 0.005270760972052813, "rewards/rejected": -0.022283760830760002, "step": 5410 }, { "epoch": 0.9338387319090282, "grad_norm": 2.507885456085205, "learning_rate": 1.2859922271472968e-08, "logits/chosen": -3.124504566192627, "logits/rejected": -3.101989984512329, "logps/chosen": -55.222572326660156, "logps/rejected": -56.50590133666992, "loss": 0.6871, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.015043241903185844, "rewards/margins": 0.01249424833804369, "rewards/rejected": -0.02753749117255211, "step": 5420 }, { "epoch": 0.9355616815988973, "grad_norm": 2.130812644958496, "learning_rate": 1.2831093700537764e-08, "logits/chosen": -3.0423922538757324, "logits/rejected": -3.0314245223999023, "logps/chosen": -55.79705810546875, "logps/rejected": -54.9379768371582, "loss": 0.6884, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.021989721804857254, "rewards/margins": 0.009882936254143715, "rewards/rejected": -0.03187265619635582, "step": 5430 }, { "epoch": 0.9372846312887664, "grad_norm": 2.3937151432037354, "learning_rate": 1.2802239527804237e-08, "logits/chosen": -3.056756019592285, "logits/rejected": -3.0396242141723633, "logps/chosen": -57.8969841003418, "logps/rejected": -58.131507873535156, "loss": 0.6875, "rewards/accuracies": 0.625, "rewards/chosen": -0.016955677419900894, "rewards/margins": 0.011915793642401695, "rewards/rejected": -0.028871476650238037, "step": 5440 }, { "epoch": 0.9390075809786355, "grad_norm": 2.4363255500793457, "learning_rate": 1.2773360014202888e-08, "logits/chosen": -2.983828067779541, "logits/rejected": -2.9721152782440186, "logps/chosen": -57.15266799926758, "logps/rejected": -56.26232147216797, "loss": 0.6874, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0157354436814785, "rewards/margins": 0.012112580239772797, "rewards/rejected": -0.027848023921251297, "step": 5450 }, { "epoch": 0.9407305306685044, "grad_norm": 2.7043750286102295, "learning_rate": 1.2744455420893392e-08, "logits/chosen": -3.0711090564727783, "logits/rejected": -3.0553476810455322, "logps/chosen": -55.708465576171875, "logps/rejected": -54.60699462890625, "loss": 0.6897, "rewards/accuracies": 0.5625, "rewards/chosen": -0.018650764599442482, "rewards/margins": 0.007309816777706146, "rewards/rejected": -0.025960583239793777, "step": 5460 }, { "epoch": 0.9424534803583735, "grad_norm": 2.703200578689575, "learning_rate": 1.2715526009262208e-08, "logits/chosen": -3.0238287448883057, "logits/rejected": -3.003737211227417, "logps/chosen": -53.43670654296875, "logps/rejected": -53.303321838378906, "loss": 0.6897, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.020780716091394424, "rewards/margins": 0.007408404257148504, "rewards/rejected": -0.028189118951559067, "step": 5470 }, { "epoch": 0.9441764300482426, "grad_norm": 2.4181151390075684, "learning_rate": 1.268657204092023e-08, "logits/chosen": -3.0498404502868652, "logits/rejected": -3.0245485305786133, "logps/chosen": -54.807106018066406, "logps/rejected": -54.9182014465332, "loss": 0.6864, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.018639078363776207, "rewards/margins": 0.014108446426689625, "rewards/rejected": -0.032747525721788406, "step": 5480 }, { "epoch": 0.9458993797381117, "grad_norm": 2.315063238143921, "learning_rate": 1.2657593777700424e-08, "logits/chosen": -3.0332398414611816, "logits/rejected": -3.0053722858428955, "logps/chosen": -57.3248405456543, "logps/rejected": -54.80067825317383, "loss": 0.6892, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.017597036436200142, "rewards/margins": 0.008288295939564705, "rewards/rejected": -0.025885334238409996, "step": 5490 }, { "epoch": 0.9476223294279807, "grad_norm": 2.1240878105163574, "learning_rate": 1.2628591481655457e-08, "logits/chosen": -3.1073358058929443, "logits/rejected": -3.091235637664795, "logps/chosen": -55.46843719482422, "logps/rejected": -57.39680862426758, "loss": 0.688, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.01657593622803688, "rewards/margins": 0.010735424235463142, "rewards/rejected": -0.027311360463500023, "step": 5500 }, { "epoch": 0.9476223294279807, "eval_logits/chosen": -3.1351189613342285, "eval_logits/rejected": -3.1294331550598145, "eval_logps/chosen": -58.75161361694336, "eval_logps/rejected": -63.61307907104492, "eval_loss": 0.6912639737129211, "eval_rewards/accuracies": 0.5659851431846619, "eval_rewards/chosen": -0.0003971691185142845, "eval_rewards/margins": 0.003932336810976267, "eval_rewards/rejected": -0.004329506773501635, "eval_runtime": 383.8331, "eval_samples_per_second": 11.213, "eval_steps_per_second": 1.402, "step": 5500 }, { "epoch": 0.9493452791178497, "grad_norm": 2.3148903846740723, "learning_rate": 1.2599565415055328e-08, "logits/chosen": -3.0147597789764404, "logits/rejected": -2.9981188774108887, "logps/chosen": -54.94285202026367, "logps/rejected": -55.143272399902344, "loss": 0.6899, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.01715438812971115, "rewards/margins": 0.0069595216773450375, "rewards/rejected": -0.024113908410072327, "step": 5510 }, { "epoch": 0.9510682288077188, "grad_norm": 2.79364013671875, "learning_rate": 1.2570515840384984e-08, "logits/chosen": -3.0089077949523926, "logits/rejected": -2.9828317165374756, "logps/chosen": -57.03174591064453, "logps/rejected": -54.21330642700195, "loss": 0.6861, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.01207288634032011, "rewards/margins": 0.014618280343711376, "rewards/rejected": -0.026691168546676636, "step": 5520 }, { "epoch": 0.9527911784975879, "grad_norm": 2.2197608947753906, "learning_rate": 1.2541443020341975e-08, "logits/chosen": -3.067873477935791, "logits/rejected": -3.032580852508545, "logps/chosen": -60.96553421020508, "logps/rejected": -55.5814094543457, "loss": 0.6861, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01058810856193304, "rewards/margins": 0.014492152258753777, "rewards/rejected": -0.025080259889364243, "step": 5530 }, { "epoch": 0.954514128187457, "grad_norm": 2.2949342727661133, "learning_rate": 1.2512347217834042e-08, "logits/chosen": -3.0294876098632812, "logits/rejected": -3.0352654457092285, "logps/chosen": -53.901390075683594, "logps/rejected": -59.26971435546875, "loss": 0.6883, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.015541985630989075, "rewards/margins": 0.010141951031982899, "rewards/rejected": -0.025683939456939697, "step": 5540 }, { "epoch": 0.956237077877326, "grad_norm": 2.465346336364746, "learning_rate": 1.2483228695976776e-08, "logits/chosen": -2.912057876586914, "logits/rejected": -2.89572811126709, "logps/chosen": -54.469642639160156, "logps/rejected": -56.864906311035156, "loss": 0.6888, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.01851799711585045, "rewards/margins": 0.009365186095237732, "rewards/rejected": -0.02788318134844303, "step": 5550 }, { "epoch": 0.957960027567195, "grad_norm": 2.667961835861206, "learning_rate": 1.2454087718091208e-08, "logits/chosen": -3.0371270179748535, "logits/rejected": -3.0104763507843018, "logps/chosen": -53.86272048950195, "logps/rejected": -52.576141357421875, "loss": 0.6879, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.019759749993681908, "rewards/margins": 0.011099927127361298, "rewards/rejected": -0.030859675258398056, "step": 5560 }, { "epoch": 0.9596829772570641, "grad_norm": 2.3370180130004883, "learning_rate": 1.2424924547701442e-08, "logits/chosen": -3.081906795501709, "logits/rejected": -3.0722689628601074, "logps/chosen": -52.50896453857422, "logps/rejected": -59.51946258544922, "loss": 0.6898, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.018424561247229576, "rewards/margins": 0.0074028694070875645, "rewards/rejected": -0.025827426463365555, "step": 5570 }, { "epoch": 0.9614059269469332, "grad_norm": 2.0538763999938965, "learning_rate": 1.239573944853228e-08, "logits/chosen": -2.9546151161193848, "logits/rejected": -2.94108247756958, "logps/chosen": -55.21215057373047, "logps/rejected": -56.68113327026367, "loss": 0.6891, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.02034585550427437, "rewards/margins": 0.00860525481402874, "rewards/rejected": -0.02895110473036766, "step": 5580 }, { "epoch": 0.9631288766368022, "grad_norm": 2.415463447570801, "learning_rate": 1.2366532684506815e-08, "logits/chosen": -3.1263980865478516, "logits/rejected": -3.0816597938537598, "logps/chosen": -57.99749755859375, "logps/rejected": -58.05116653442383, "loss": 0.6848, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.011663533747196198, "rewards/margins": 0.017326349392533302, "rewards/rejected": -0.02898988500237465, "step": 5590 }, { "epoch": 0.9648518263266712, "grad_norm": 2.2387213706970215, "learning_rate": 1.2337304519744066e-08, "logits/chosen": -3.0889410972595215, "logits/rejected": -3.086632013320923, "logps/chosen": -54.502777099609375, "logps/rejected": -60.0045051574707, "loss": 0.6899, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.019988704472780228, "rewards/margins": 0.0069748698733747005, "rewards/rejected": -0.026963572949171066, "step": 5600 }, { "epoch": 0.9648518263266712, "eval_logits/chosen": -3.134329080581665, "eval_logits/rejected": -3.128695011138916, "eval_logps/chosen": -58.77082443237305, "eval_logps/rejected": -63.63037109375, "eval_loss": 0.6912762522697449, "eval_rewards/accuracies": 0.5745818018913269, "eval_rewards/chosen": -0.0005893177003599703, "eval_rewards/margins": 0.0039131660014390945, "eval_rewards/rejected": -0.004502483177930117, "eval_runtime": 384.5923, "eval_samples_per_second": 11.191, "eval_steps_per_second": 1.399, "step": 5600 }, { "epoch": 0.9665747760165403, "grad_norm": 2.2621214389801025, "learning_rate": 1.2308055218556577e-08, "logits/chosen": -3.098680019378662, "logits/rejected": -3.080854892730713, "logps/chosen": -53.97863006591797, "logps/rejected": -59.511802673339844, "loss": 0.6865, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.015781071037054062, "rewards/margins": 0.013751788064837456, "rewards/rejected": -0.02953285537660122, "step": 5610 }, { "epoch": 0.9682977257064094, "grad_norm": 2.7723278999328613, "learning_rate": 1.2278785045448034e-08, "logits/chosen": -3.0287671089172363, "logits/rejected": -3.006072998046875, "logps/chosen": -56.87371826171875, "logps/rejected": -53.5859260559082, "loss": 0.6874, "rewards/accuracies": 0.59375, "rewards/chosen": -0.018540427088737488, "rewards/margins": 0.012070056982338428, "rewards/rejected": -0.03061048686504364, "step": 5620 }, { "epoch": 0.9700206753962785, "grad_norm": 2.5379652976989746, "learning_rate": 1.2249494265110862e-08, "logits/chosen": -3.07861065864563, "logits/rejected": -3.0483670234680176, "logps/chosen": -57.4000129699707, "logps/rejected": -53.848663330078125, "loss": 0.6874, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.014923269860446453, "rewards/margins": 0.012017721310257912, "rewards/rejected": -0.02694099023938179, "step": 5630 }, { "epoch": 0.9717436250861475, "grad_norm": 2.558870792388916, "learning_rate": 1.222018314242384e-08, "logits/chosen": -3.0515987873077393, "logits/rejected": -3.038257122039795, "logps/chosen": -55.17596435546875, "logps/rejected": -58.388694763183594, "loss": 0.6881, "rewards/accuracies": 0.59375, "rewards/chosen": -0.012300265952944756, "rewards/margins": 0.010648714378476143, "rewards/rejected": -0.02294897846877575, "step": 5640 }, { "epoch": 0.9734665747760165, "grad_norm": 2.609940528869629, "learning_rate": 1.2190851942449712e-08, "logits/chosen": -2.959313154220581, "logits/rejected": -2.9530630111694336, "logps/chosen": -54.18489456176758, "logps/rejected": -55.13713455200195, "loss": 0.6896, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0217437781393528, "rewards/margins": 0.00759219890460372, "rewards/rejected": -0.029335975646972656, "step": 5650 }, { "epoch": 0.9751895244658856, "grad_norm": 2.476722002029419, "learning_rate": 1.2161500930432778e-08, "logits/chosen": -2.9682631492614746, "logits/rejected": -2.965312957763672, "logps/chosen": -54.02378463745117, "logps/rejected": -53.51348876953125, "loss": 0.6917, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.021599246188998222, "rewards/margins": 0.0035273456014692783, "rewards/rejected": -0.025126595050096512, "step": 5660 }, { "epoch": 0.9769124741557547, "grad_norm": 2.6098222732543945, "learning_rate": 1.2132130371796499e-08, "logits/chosen": -3.014533519744873, "logits/rejected": -2.9931082725524902, "logps/chosen": -55.737632751464844, "logps/rejected": -55.68427276611328, "loss": 0.6877, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.016952062025666237, "rewards/margins": 0.011545151472091675, "rewards/rejected": -0.02849721349775791, "step": 5670 }, { "epoch": 0.9786354238456237, "grad_norm": 2.264927387237549, "learning_rate": 1.2102740532141101e-08, "logits/chosen": -3.0722849369049072, "logits/rejected": -3.0306591987609863, "logps/chosen": -56.12786102294922, "logps/rejected": -54.975341796875, "loss": 0.6855, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.017493674531579018, "rewards/margins": 0.015774184837937355, "rewards/rejected": -0.033267855644226074, "step": 5680 }, { "epoch": 0.9803583735354927, "grad_norm": 2.7673423290252686, "learning_rate": 1.207333167724116e-08, "logits/chosen": -3.095992088317871, "logits/rejected": -3.0646653175354004, "logps/chosen": -59.16156005859375, "logps/rejected": -54.520599365234375, "loss": 0.6849, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.011630335822701454, "rewards/margins": 0.01691516861319542, "rewards/rejected": -0.028545504435896873, "step": 5690 }, { "epoch": 0.9820813232253618, "grad_norm": 2.370692729949951, "learning_rate": 1.2043904073043222e-08, "logits/chosen": -2.9002363681793213, "logits/rejected": -2.881931781768799, "logps/chosen": -59.330894470214844, "logps/rejected": -56.132362365722656, "loss": 0.687, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.01545041799545288, "rewards/margins": 0.012800877913832664, "rewards/rejected": -0.028251299634575844, "step": 5700 }, { "epoch": 0.9820813232253618, "eval_logits/chosen": -3.1337404251098633, "eval_logits/rejected": -3.128058910369873, "eval_logps/chosen": -58.77233123779297, "eval_logps/rejected": -63.66284942626953, "eval_loss": 0.691124677658081, "eval_rewards/accuracies": 0.5787639617919922, "eval_rewards/chosen": -0.000604349363129586, "eval_rewards/margins": 0.004222996532917023, "eval_rewards/rejected": -0.004827346187084913, "eval_runtime": 384.0163, "eval_samples_per_second": 11.208, "eval_steps_per_second": 1.401, "step": 5700 }, { "epoch": 0.9838042729152309, "grad_norm": 2.2526307106018066, "learning_rate": 1.2014457985663371e-08, "logits/chosen": -3.086958885192871, "logits/rejected": -3.057468891143799, "logps/chosen": -56.86411666870117, "logps/rejected": -55.9247932434082, "loss": 0.6855, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.017509115859866142, "rewards/margins": 0.015910452231764793, "rewards/rejected": -0.033419571816921234, "step": 5710 }, { "epoch": 0.9855272226051, "grad_norm": 2.3755736351013184, "learning_rate": 1.1984993681384845e-08, "logits/chosen": -3.0592103004455566, "logits/rejected": -3.035421133041382, "logps/chosen": -53.76625442504883, "logps/rejected": -53.1986083984375, "loss": 0.6854, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.01754259690642357, "rewards/margins": 0.016087189316749573, "rewards/rejected": -0.03362978622317314, "step": 5720 }, { "epoch": 0.987250172294969, "grad_norm": 2.365424394607544, "learning_rate": 1.1955511426655622e-08, "logits/chosen": -3.1162123680114746, "logits/rejected": -3.104675769805908, "logps/chosen": -53.531227111816406, "logps/rejected": -54.61872482299805, "loss": 0.6887, "rewards/accuracies": 0.625, "rewards/chosen": -0.01806630939245224, "rewards/margins": 0.009289136156439781, "rewards/rejected": -0.02735544741153717, "step": 5730 }, { "epoch": 0.988973121984838, "grad_norm": 2.421537399291992, "learning_rate": 1.1926011488085994e-08, "logits/chosen": -3.0126664638519287, "logits/rejected": -2.9804635047912598, "logps/chosen": -61.017852783203125, "logps/rejected": -57.800559997558594, "loss": 0.6857, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.018092989921569824, "rewards/margins": 0.015476164408028126, "rewards/rejected": -0.033569153398275375, "step": 5740 }, { "epoch": 0.9906960716747071, "grad_norm": 2.609236001968384, "learning_rate": 1.189649413244618e-08, "logits/chosen": -3.150195360183716, "logits/rejected": -3.1196980476379395, "logps/chosen": -58.42035675048828, "logps/rejected": -53.55390167236328, "loss": 0.6858, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.01756112650036812, "rewards/margins": 0.015267576090991497, "rewards/rejected": -0.03282870352268219, "step": 5750 }, { "epoch": 0.9924190213645762, "grad_norm": 2.3347179889678955, "learning_rate": 1.1866959626663902e-08, "logits/chosen": -3.0212275981903076, "logits/rejected": -2.9948441982269287, "logps/chosen": -58.001365661621094, "logps/rejected": -58.6787109375, "loss": 0.6859, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.01591596007347107, "rewards/margins": 0.015181079506874084, "rewards/rejected": -0.031097035855054855, "step": 5760 }, { "epoch": 0.9941419710544452, "grad_norm": 2.569519281387329, "learning_rate": 1.183740823782197e-08, "logits/chosen": -2.986656665802002, "logits/rejected": -2.9676034450531006, "logps/chosen": -53.551719665527344, "logps/rejected": -56.345191955566406, "loss": 0.6903, "rewards/accuracies": 0.59375, "rewards/chosen": -0.016970595344901085, "rewards/margins": 0.00601046672090888, "rewards/rejected": -0.022981060668826103, "step": 5770 }, { "epoch": 0.9958649207443143, "grad_norm": 2.448632001876831, "learning_rate": 1.1807840233155862e-08, "logits/chosen": -3.0169689655303955, "logits/rejected": -2.9971630573272705, "logps/chosen": -53.976890563964844, "logps/rejected": -56.612152099609375, "loss": 0.6878, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.01513325423002243, "rewards/margins": 0.011307798326015472, "rewards/rejected": -0.026441048830747604, "step": 5780 }, { "epoch": 0.9975878704341833, "grad_norm": 2.341364860534668, "learning_rate": 1.1778255880051325e-08, "logits/chosen": -2.9556498527526855, "logits/rejected": -2.921391725540161, "logps/chosen": -53.52280807495117, "logps/rejected": -56.76100540161133, "loss": 0.6896, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.020487099885940552, "rewards/margins": 0.007790186908096075, "rewards/rejected": -0.028277289122343063, "step": 5790 }, { "epoch": 0.9993108201240524, "grad_norm": 2.5893890857696533, "learning_rate": 1.1748655446041944e-08, "logits/chosen": -3.0420236587524414, "logits/rejected": -3.0131123065948486, "logps/chosen": -52.47263717651367, "logps/rejected": -55.4683952331543, "loss": 0.6857, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.017217550426721573, "rewards/margins": 0.015593932941555977, "rewards/rejected": -0.0328114852309227, "step": 5800 }, { "epoch": 0.9993108201240524, "eval_logits/chosen": -3.1334035396575928, "eval_logits/rejected": -3.127761125564575, "eval_logps/chosen": -58.7999267578125, "eval_logps/rejected": -63.68791198730469, "eval_loss": 0.6911415457725525, "eval_rewards/accuracies": 0.5713289976119995, "eval_rewards/chosen": -0.0008803335367701948, "eval_rewards/margins": 0.004197545349597931, "eval_rewards/rejected": -0.005077878944575787, "eval_runtime": 383.943, "eval_samples_per_second": 11.21, "eval_steps_per_second": 1.401, "step": 5800 }, { "epoch": 1.0010337698139213, "grad_norm": 2.1232244968414307, "learning_rate": 1.171903919880672e-08, "logits/chosen": -3.1008832454681396, "logits/rejected": -3.0849082469940186, "logps/chosen": -55.886863708496094, "logps/rejected": -56.52936553955078, "loss": 0.6878, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.018158026039600372, "rewards/margins": 0.011155323125422001, "rewards/rejected": -0.029313350096344948, "step": 5810 }, { "epoch": 1.0027567195037905, "grad_norm": 2.475459098815918, "learning_rate": 1.1689407406167661e-08, "logits/chosen": -3.106663465499878, "logits/rejected": -3.072197675704956, "logps/chosen": -54.98704147338867, "logps/rejected": -52.68647384643555, "loss": 0.6864, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.019610222429037094, "rewards/margins": 0.013955632224678993, "rewards/rejected": -0.03356585651636124, "step": 5820 }, { "epoch": 1.0044796691936595, "grad_norm": 2.4394052028656006, "learning_rate": 1.1659760336087344e-08, "logits/chosen": -2.99074649810791, "logits/rejected": -2.958397150039673, "logps/chosen": -54.87736129760742, "logps/rejected": -55.85693359375, "loss": 0.6854, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.013808014802634716, "rewards/margins": 0.01611264795064926, "rewards/rejected": -0.029920663684606552, "step": 5830 }, { "epoch": 1.0062026188835287, "grad_norm": 2.4401886463165283, "learning_rate": 1.1630098256666513e-08, "logits/chosen": -3.0222973823547363, "logits/rejected": -3.0010902881622314, "logps/chosen": -54.44184494018555, "logps/rejected": -57.95893478393555, "loss": 0.6868, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02053814008831978, "rewards/margins": 0.013348013162612915, "rewards/rejected": -0.03388615697622299, "step": 5840 }, { "epoch": 1.0079255685733977, "grad_norm": 2.449007272720337, "learning_rate": 1.160042143614163e-08, "logits/chosen": -3.091636896133423, "logits/rejected": -3.070950508117676, "logps/chosen": -52.696617126464844, "logps/rejected": -57.5743408203125, "loss": 0.687, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.017818491905927658, "rewards/margins": 0.012944323010742664, "rewards/rejected": -0.030762815847992897, "step": 5850 }, { "epoch": 1.0096485182632666, "grad_norm": 2.2463791370391846, "learning_rate": 1.157073014288247e-08, "logits/chosen": -2.9557888507843018, "logits/rejected": -2.9458224773406982, "logps/chosen": -53.57053756713867, "logps/rejected": -56.65851974487305, "loss": 0.6853, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.017971809953451157, "rewards/margins": 0.01625947281718254, "rewards/rejected": -0.0342312827706337, "step": 5860 }, { "epoch": 1.0113714679531358, "grad_norm": 2.7064640522003174, "learning_rate": 1.1541024645389687e-08, "logits/chosen": -2.9869303703308105, "logits/rejected": -2.9656178951263428, "logps/chosen": -58.08588790893555, "logps/rejected": -58.493927001953125, "loss": 0.6857, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.01905050501227379, "rewards/margins": 0.015620703808963299, "rewards/rejected": -0.03467120975255966, "step": 5870 }, { "epoch": 1.0130944176430048, "grad_norm": 2.561938524246216, "learning_rate": 1.1511305212292376e-08, "logits/chosen": -3.068009853363037, "logits/rejected": -3.0549399852752686, "logps/chosen": -56.194244384765625, "logps/rejected": -55.732421875, "loss": 0.6876, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.014522912912070751, "rewards/margins": 0.011498660780489445, "rewards/rejected": -0.026021573692560196, "step": 5880 }, { "epoch": 1.014817367332874, "grad_norm": 2.3319783210754395, "learning_rate": 1.1481572112345666e-08, "logits/chosen": -3.092992067337036, "logits/rejected": -3.0787858963012695, "logps/chosen": -56.552085876464844, "logps/rejected": -60.93707275390625, "loss": 0.6867, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.01386711560189724, "rewards/margins": 0.01341196894645691, "rewards/rejected": -0.02727908454835415, "step": 5890 }, { "epoch": 1.016540317022743, "grad_norm": 2.4611449241638184, "learning_rate": 1.1451825614428266e-08, "logits/chosen": -2.9957923889160156, "logits/rejected": -2.975466251373291, "logps/chosen": -57.44199752807617, "logps/rejected": -57.857276916503906, "loss": 0.6864, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.01775974966585636, "rewards/margins": 0.014119607396423817, "rewards/rejected": -0.03187935799360275, "step": 5900 }, { "epoch": 1.016540317022743, "eval_logits/chosen": -3.1327173709869385, "eval_logits/rejected": -3.1270527839660645, "eval_logps/chosen": -58.829933166503906, "eval_logps/rejected": -63.7348747253418, "eval_loss": 0.6910606026649475, "eval_rewards/accuracies": 0.5787639617919922, "eval_rewards/chosen": -0.001180329010821879, "eval_rewards/margins": 0.004367194604128599, "eval_rewards/rejected": -0.005547523498535156, "eval_runtime": 384.3077, "eval_samples_per_second": 11.199, "eval_steps_per_second": 1.4, "step": 5900 }, { "epoch": 1.018263266712612, "grad_norm": 2.4447977542877197, "learning_rate": 1.1422065987540045e-08, "logits/chosen": -2.992640972137451, "logits/rejected": -2.9968771934509277, "logps/chosen": -54.487457275390625, "logps/rejected": -60.301239013671875, "loss": 0.6904, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.01974978856742382, "rewards/margins": 0.006094303913414478, "rewards/rejected": -0.025844091549515724, "step": 5910 }, { "epoch": 1.019986216402481, "grad_norm": 2.307249069213867, "learning_rate": 1.1392293500799604e-08, "logits/chosen": -3.047121047973633, "logits/rejected": -3.0284416675567627, "logps/chosen": -53.3023681640625, "logps/rejected": -56.572227478027344, "loss": 0.6872, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.018186841160058975, "rewards/margins": 0.012462997809052467, "rewards/rejected": -0.030649837106466293, "step": 5920 }, { "epoch": 1.02170916609235, "grad_norm": 2.171379566192627, "learning_rate": 1.1362508423441831e-08, "logits/chosen": -3.050178289413452, "logits/rejected": -3.0439326763153076, "logps/chosen": -52.685142517089844, "logps/rejected": -54.45989227294922, "loss": 0.6892, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.024569392204284668, "rewards/margins": 0.008357712998986244, "rewards/rejected": -0.03292710706591606, "step": 5930 }, { "epoch": 1.0234321157822193, "grad_norm": 2.4548449516296387, "learning_rate": 1.1332711024815471e-08, "logits/chosen": -3.0522618293762207, "logits/rejected": -3.0109293460845947, "logps/chosen": -55.6926155090332, "logps/rejected": -56.47785186767578, "loss": 0.6837, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.014554401859641075, "rewards/margins": 0.019560344517230988, "rewards/rejected": -0.03411474451422691, "step": 5940 }, { "epoch": 1.0251550654720882, "grad_norm": 2.706624746322632, "learning_rate": 1.1302901574380701e-08, "logits/chosen": -2.9206995964050293, "logits/rejected": -2.917583465576172, "logps/chosen": -55.23699951171875, "logps/rejected": -56.68768310546875, "loss": 0.6874, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.015318076126277447, "rewards/margins": 0.011847314424812794, "rewards/rejected": -0.02716539241373539, "step": 5950 }, { "epoch": 1.0268780151619572, "grad_norm": 2.4179177284240723, "learning_rate": 1.1273080341706672e-08, "logits/chosen": -2.978672742843628, "logits/rejected": -2.9258780479431152, "logps/chosen": -59.678466796875, "logps/rejected": -54.84111404418945, "loss": 0.685, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.013551495969295502, "rewards/margins": 0.016849249601364136, "rewards/rejected": -0.030400747433304787, "step": 5960 }, { "epoch": 1.0286009648518264, "grad_norm": 3.122251033782959, "learning_rate": 1.1243247596469087e-08, "logits/chosen": -2.9761157035827637, "logits/rejected": -2.958291530609131, "logps/chosen": -54.1254997253418, "logps/rejected": -53.59160614013672, "loss": 0.6861, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.022281497716903687, "rewards/margins": 0.014488357119262218, "rewards/rejected": -0.03676985576748848, "step": 5970 }, { "epoch": 1.0303239145416954, "grad_norm": 2.5445525646209717, "learning_rate": 1.1213403608447758e-08, "logits/chosen": -2.977578639984131, "logits/rejected": -2.977252244949341, "logps/chosen": -54.674766540527344, "logps/rejected": -59.23362350463867, "loss": 0.6904, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0210475604981184, "rewards/margins": 0.0061415620148181915, "rewards/rejected": -0.02718912623822689, "step": 5980 }, { "epoch": 1.0320468642315643, "grad_norm": 2.410888910293579, "learning_rate": 1.1183548647524173e-08, "logits/chosen": -3.0514755249023438, "logits/rejected": -3.0170586109161377, "logps/chosen": -57.01215744018555, "logps/rejected": -55.18632125854492, "loss": 0.6872, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.01939382217824459, "rewards/margins": 0.012516381219029427, "rewards/rejected": -0.03191020339727402, "step": 5990 }, { "epoch": 1.0337698139214335, "grad_norm": 1.9471714496612549, "learning_rate": 1.1153682983679035e-08, "logits/chosen": -3.0283169746398926, "logits/rejected": -3.0108890533447266, "logps/chosen": -54.46110916137695, "logps/rejected": -55.54545211791992, "loss": 0.6888, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.023570090532302856, "rewards/margins": 0.00914863869547844, "rewards/rejected": -0.032718725502491, "step": 6000 }, { "epoch": 1.0337698139214335, "eval_logits/chosen": -3.1315500736236572, "eval_logits/rejected": -3.1258907318115234, "eval_logps/chosen": -58.85401916503906, "eval_logps/rejected": -63.76580047607422, "eval_loss": 0.6910278797149658, "eval_rewards/accuracies": 0.5789963006973267, "eval_rewards/chosen": -0.0014212463283911347, "eval_rewards/margins": 0.004435404669493437, "eval_rewards/rejected": -0.005856651347130537, "eval_runtime": 383.555, "eval_samples_per_second": 11.221, "eval_steps_per_second": 1.403, "step": 6000 }, { "epoch": 1.0354927636113025, "grad_norm": 2.3859405517578125, "learning_rate": 1.1123806886989844e-08, "logits/chosen": -3.02860426902771, "logits/rejected": -3.010640859603882, "logps/chosen": -55.074371337890625, "logps/rejected": -54.42387771606445, "loss": 0.6885, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.020391542464494705, "rewards/margins": 0.009787985123693943, "rewards/rejected": -0.030179524794220924, "step": 6010 }, { "epoch": 1.0372157133011717, "grad_norm": 2.363797664642334, "learning_rate": 1.1093920627628442e-08, "logits/chosen": -3.2017006874084473, "logits/rejected": -3.1685476303100586, "logps/chosen": -56.03865432739258, "logps/rejected": -53.24171829223633, "loss": 0.6851, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.016513699665665627, "rewards/margins": 0.01690778322517872, "rewards/rejected": -0.033421482890844345, "step": 6020 }, { "epoch": 1.0389386629910407, "grad_norm": 2.2829158306121826, "learning_rate": 1.1064024475858577e-08, "logits/chosen": -2.854118824005127, "logits/rejected": -2.8442769050598145, "logps/chosen": -52.7369384765625, "logps/rejected": -53.722251892089844, "loss": 0.691, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.02898944541811943, "rewards/margins": 0.005069796461611986, "rewards/rejected": -0.03405924141407013, "step": 6030 }, { "epoch": 1.0406616126809096, "grad_norm": 2.114177703857422, "learning_rate": 1.1034118702033446e-08, "logits/chosen": -3.052936553955078, "logits/rejected": -3.0325207710266113, "logps/chosen": -55.636146545410156, "logps/rejected": -55.01488494873047, "loss": 0.6864, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01927899941802025, "rewards/margins": 0.014106673188507557, "rewards/rejected": -0.03338567540049553, "step": 6040 }, { "epoch": 1.0423845623707788, "grad_norm": 2.4536662101745605, "learning_rate": 1.1004203576593268e-08, "logits/chosen": -2.949414014816284, "logits/rejected": -2.9219253063201904, "logps/chosen": -61.292701721191406, "logps/rejected": -57.094200134277344, "loss": 0.6862, "rewards/accuracies": 0.6875, "rewards/chosen": -0.019505826756358147, "rewards/margins": 0.014441567473113537, "rewards/rejected": -0.03394739329814911, "step": 6050 }, { "epoch": 1.0441075120606478, "grad_norm": 2.0977792739868164, "learning_rate": 1.0974279370062827e-08, "logits/chosen": -3.0255239009857178, "logits/rejected": -3.002349853515625, "logps/chosen": -55.601539611816406, "logps/rejected": -55.09684371948242, "loss": 0.6883, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.021067507565021515, "rewards/margins": 0.010256089270114899, "rewards/rejected": -0.031323596835136414, "step": 6060 }, { "epoch": 1.045830461750517, "grad_norm": 2.427433729171753, "learning_rate": 1.0944346353049023e-08, "logits/chosen": -3.0318515300750732, "logits/rejected": -3.010066509246826, "logps/chosen": -55.89380645751953, "logps/rejected": -55.183677673339844, "loss": 0.6859, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.019514724612236023, "rewards/margins": 0.015096470713615417, "rewards/rejected": -0.03461119160056114, "step": 6070 }, { "epoch": 1.047553411440386, "grad_norm": 2.407802104949951, "learning_rate": 1.0914404796238437e-08, "logits/chosen": -2.998624801635742, "logits/rejected": -2.977358102798462, "logps/chosen": -60.79319381713867, "logps/rejected": -56.703033447265625, "loss": 0.6864, "rewards/accuracies": 0.625, "rewards/chosen": -0.017175931483507156, "rewards/margins": 0.01408243179321289, "rewards/rejected": -0.03125835955142975, "step": 6080 }, { "epoch": 1.049276361130255, "grad_norm": 2.5689022541046143, "learning_rate": 1.088445497039487e-08, "logits/chosen": -3.0697948932647705, "logits/rejected": -3.0521798133850098, "logps/chosen": -53.399330139160156, "logps/rejected": -54.737457275390625, "loss": 0.6865, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.019912278279662132, "rewards/margins": 0.013947946950793266, "rewards/rejected": -0.0338602289557457, "step": 6090 }, { "epoch": 1.050999310820124, "grad_norm": 2.353585720062256, "learning_rate": 1.0854497146356908e-08, "logits/chosen": -3.035552978515625, "logits/rejected": -3.032215118408203, "logps/chosen": -58.44176483154297, "logps/rejected": -57.085479736328125, "loss": 0.6857, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.016585614532232285, "rewards/margins": 0.01556225586682558, "rewards/rejected": -0.03214786946773529, "step": 6100 }, { "epoch": 1.050999310820124, "eval_logits/chosen": -3.131176471710205, "eval_logits/rejected": -3.1255154609680176, "eval_logps/chosen": -58.87300109863281, "eval_logps/rejected": -63.80307388305664, "eval_loss": 0.690942108631134, "eval_rewards/accuracies": 0.5794609785079956, "eval_rewards/chosen": -0.0016110517317429185, "eval_rewards/margins": 0.004618438426405191, "eval_rewards/rejected": -0.006229490041732788, "eval_runtime": 383.3806, "eval_samples_per_second": 11.226, "eval_steps_per_second": 1.403, "step": 6100 }, { "epoch": 1.052722260509993, "grad_norm": 2.3778581619262695, "learning_rate": 1.0824531595035451e-08, "logits/chosen": -3.0413858890533447, "logits/rejected": -3.0262722969055176, "logps/chosen": -53.48925018310547, "logps/rejected": -56.27558135986328, "loss": 0.6884, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.01790313422679901, "rewards/margins": 0.009967513382434845, "rewards/rejected": -0.027870649471879005, "step": 6110 }, { "epoch": 1.0544452101998623, "grad_norm": 2.639493465423584, "learning_rate": 1.0794558587411295e-08, "logits/chosen": -3.1496880054473877, "logits/rejected": -3.1011807918548584, "logps/chosen": -55.5837516784668, "logps/rejected": -56.8000373840332, "loss": 0.6833, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.012620387598872185, "rewards/margins": 0.020604535937309265, "rewards/rejected": -0.0332249216735363, "step": 6120 }, { "epoch": 1.0561681598897312, "grad_norm": 2.5370519161224365, "learning_rate": 1.0764578394532654e-08, "logits/chosen": -3.0457675457000732, "logits/rejected": -3.020132541656494, "logps/chosen": -56.27012252807617, "logps/rejected": -57.2618408203125, "loss": 0.6862, "rewards/accuracies": 0.59375, "rewards/chosen": -0.018784884363412857, "rewards/margins": 0.014378098770976067, "rewards/rejected": -0.033162981271743774, "step": 6130 }, { "epoch": 1.0578911095796002, "grad_norm": 2.4076688289642334, "learning_rate": 1.0734591287512721e-08, "logits/chosen": -3.081873893737793, "logits/rejected": -3.0696253776550293, "logps/chosen": -55.290489196777344, "logps/rejected": -55.50571823120117, "loss": 0.6894, "rewards/accuracies": 0.59375, "rewards/chosen": -0.022115176543593407, "rewards/margins": 0.00814732350409031, "rewards/rejected": -0.030262500047683716, "step": 6140 }, { "epoch": 1.0596140592694694, "grad_norm": 2.379582166671753, "learning_rate": 1.0704597537527212e-08, "logits/chosen": -2.981661319732666, "logits/rejected": -2.966520071029663, "logps/chosen": -55.33698654174805, "logps/rejected": -53.2189826965332, "loss": 0.6889, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02329077199101448, "rewards/margins": 0.009010384790599346, "rewards/rejected": -0.0323011577129364, "step": 6150 }, { "epoch": 1.0613370089593384, "grad_norm": 2.3428122997283936, "learning_rate": 1.067459741581192e-08, "logits/chosen": -2.9435131549835205, "logits/rejected": -2.9327847957611084, "logps/chosen": -52.50434494018555, "logps/rejected": -55.611785888671875, "loss": 0.6858, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.015285499393939972, "rewards/margins": 0.015342341735959053, "rewards/rejected": -0.030627842992544174, "step": 6160 }, { "epoch": 1.0630599586492075, "grad_norm": 2.60860538482666, "learning_rate": 1.0644591193660252e-08, "logits/chosen": -3.0668625831604004, "logits/rejected": -3.0502090454101562, "logps/chosen": -61.2022819519043, "logps/rejected": -59.66276168823242, "loss": 0.6862, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.01741643063724041, "rewards/margins": 0.01449237484484911, "rewards/rejected": -0.031908802688121796, "step": 6170 }, { "epoch": 1.0647829083390765, "grad_norm": 2.6758952140808105, "learning_rate": 1.0614579142420786e-08, "logits/chosen": -3.082418918609619, "logits/rejected": -3.0575006008148193, "logps/chosen": -59.830848693847656, "logps/rejected": -56.3403434753418, "loss": 0.6851, "rewards/accuracies": 0.59375, "rewards/chosen": -0.016757013276219368, "rewards/margins": 0.016851117834448814, "rewards/rejected": -0.03360813111066818, "step": 6180 }, { "epoch": 1.0665058580289455, "grad_norm": 2.2758195400238037, "learning_rate": 1.0584561533494817e-08, "logits/chosen": -3.04453706741333, "logits/rejected": -3.0198044776916504, "logps/chosen": -59.79877471923828, "logps/rejected": -56.4383544921875, "loss": 0.6844, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.016228018328547478, "rewards/margins": 0.018129851669073105, "rewards/rejected": -0.03435786813497543, "step": 6190 }, { "epoch": 1.0682288077188147, "grad_norm": 2.3243348598480225, "learning_rate": 1.0554538638333888e-08, "logits/chosen": -2.933687925338745, "logits/rejected": -2.9139504432678223, "logps/chosen": -56.769676208496094, "logps/rejected": -58.42494583129883, "loss": 0.6889, "rewards/accuracies": 0.5625, "rewards/chosen": -0.022551879286766052, "rewards/margins": 0.009325524792075157, "rewards/rejected": -0.03187740594148636, "step": 6200 }, { "epoch": 1.0682288077188147, "eval_logits/chosen": -3.1305487155914307, "eval_logits/rejected": -3.124844789505005, "eval_logps/chosen": -58.903194427490234, "eval_logps/rejected": -63.83762741088867, "eval_loss": 0.6909228563308716, "eval_rewards/accuracies": 0.5764405131340027, "eval_rewards/chosen": -0.0019130135187879205, "eval_rewards/margins": 0.00466204434633255, "eval_rewards/rejected": -0.006575057283043861, "eval_runtime": 384.0941, "eval_samples_per_second": 11.206, "eval_steps_per_second": 1.401, "step": 6200 }, { "epoch": 1.0699517574086836, "grad_norm": 2.357759952545166, "learning_rate": 1.0524510728437354e-08, "logits/chosen": -3.0520224571228027, "logits/rejected": -3.0220112800598145, "logps/chosen": -56.74927520751953, "logps/rejected": -56.363197326660156, "loss": 0.6858, "rewards/accuracies": 0.625, "rewards/chosen": -0.019240161404013634, "rewards/margins": 0.015431523323059082, "rewards/rejected": -0.034671682864427567, "step": 6210 }, { "epoch": 1.0716747070985528, "grad_norm": 2.284916877746582, "learning_rate": 1.049447807534992e-08, "logits/chosen": -3.0356788635253906, "logits/rejected": -3.0147993564605713, "logps/chosen": -54.05214309692383, "logps/rejected": -55.98185348510742, "loss": 0.6859, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.019131021574139595, "rewards/margins": 0.015217870473861694, "rewards/rejected": -0.03434889391064644, "step": 6220 }, { "epoch": 1.0733976567884218, "grad_norm": 2.3051514625549316, "learning_rate": 1.0464440950659173e-08, "logits/chosen": -3.175098419189453, "logits/rejected": -3.156182289123535, "logps/chosen": -60.29032516479492, "logps/rejected": -59.4665641784668, "loss": 0.6862, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.014576122164726257, "rewards/margins": 0.014566306956112385, "rewards/rejected": -0.029142428189516068, "step": 6230 }, { "epoch": 1.0751206064782908, "grad_norm": 2.666886568069458, "learning_rate": 1.043439962599315e-08, "logits/chosen": -2.989060640335083, "logits/rejected": -2.956812620162964, "logps/chosen": -58.37445831298828, "logps/rejected": -56.85335159301758, "loss": 0.6877, "rewards/accuracies": 0.625, "rewards/chosen": -0.019876528531312943, "rewards/margins": 0.011631477624177933, "rewards/rejected": -0.031508006155490875, "step": 6240 }, { "epoch": 1.07684355616816, "grad_norm": 2.515204429626465, "learning_rate": 1.0404354373017859e-08, "logits/chosen": -3.07537841796875, "logits/rejected": -3.052424907684326, "logps/chosen": -57.4842529296875, "logps/rejected": -56.97715377807617, "loss": 0.6868, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01756465807557106, "rewards/margins": 0.01326189748942852, "rewards/rejected": -0.03082655929028988, "step": 6250 }, { "epoch": 1.078566505858029, "grad_norm": 2.4106087684631348, "learning_rate": 1.037430546343484e-08, "logits/chosen": -2.953723192214966, "logits/rejected": -2.932874917984009, "logps/chosen": -56.1456413269043, "logps/rejected": -54.07300567626953, "loss": 0.6862, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.02221234142780304, "rewards/margins": 0.014708032831549644, "rewards/rejected": -0.03692037612199783, "step": 6260 }, { "epoch": 1.080289455547898, "grad_norm": 2.4935922622680664, "learning_rate": 1.0344253168978695e-08, "logits/chosen": -3.2191452980041504, "logits/rejected": -3.218228816986084, "logps/chosen": -54.688499450683594, "logps/rejected": -57.696754455566406, "loss": 0.6871, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01787818782031536, "rewards/margins": 0.012707440182566643, "rewards/rejected": -0.030585628002882004, "step": 6270 }, { "epoch": 1.082012405237767, "grad_norm": 2.4286909103393555, "learning_rate": 1.0314197761414636e-08, "logits/chosen": -2.9016292095184326, "logits/rejected": -2.876361846923828, "logps/chosen": -56.48183059692383, "logps/rejected": -56.19874954223633, "loss": 0.6861, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.020695989951491356, "rewards/margins": 0.014859400689601898, "rewards/rejected": -0.0355553925037384, "step": 6280 }, { "epoch": 1.083735354927636, "grad_norm": 2.36930775642395, "learning_rate": 1.0284139512536028e-08, "logits/chosen": -2.980339765548706, "logits/rejected": -2.9492430686950684, "logps/chosen": -53.1446418762207, "logps/rejected": -56.06641387939453, "loss": 0.6838, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.017662178725004196, "rewards/margins": 0.019239947199821472, "rewards/rejected": -0.03690212219953537, "step": 6290 }, { "epoch": 1.0854583046175053, "grad_norm": 2.4871668815612793, "learning_rate": 1.0254078694161929e-08, "logits/chosen": -3.057957887649536, "logits/rejected": -3.0294573307037354, "logps/chosen": -54.77833938598633, "logps/rejected": -55.77891159057617, "loss": 0.6865, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.023420926183462143, "rewards/margins": 0.013916957192122936, "rewards/rejected": -0.037337884306907654, "step": 6300 }, { "epoch": 1.0854583046175053, "eval_logits/chosen": -3.1302549839019775, "eval_logits/rejected": -3.1245484352111816, "eval_logps/chosen": -58.927452087402344, "eval_logps/rejected": -63.87955856323242, "eval_loss": 0.6908382177352905, "eval_rewards/accuracies": 0.5787639617919922, "eval_rewards/chosen": -0.0021555284038186073, "eval_rewards/margins": 0.0048387921415269375, "eval_rewards/rejected": -0.006994321011006832, "eval_runtime": 383.6447, "eval_samples_per_second": 11.219, "eval_steps_per_second": 1.402, "step": 6300 }, { "epoch": 1.0871812543073742, "grad_norm": 2.4470012187957764, "learning_rate": 1.0224015578134633e-08, "logits/chosen": -3.0433483123779297, "logits/rejected": -3.012185573577881, "logps/chosen": -52.22063446044922, "logps/rejected": -54.65047073364258, "loss": 0.6867, "rewards/accuracies": 0.625, "rewards/chosen": -0.01763722486793995, "rewards/margins": 0.013348887674510479, "rewards/rejected": -0.030986111611127853, "step": 6310 }, { "epoch": 1.0889042039972432, "grad_norm": 2.340951442718506, "learning_rate": 1.019395043631722e-08, "logits/chosen": -2.988467216491699, "logits/rejected": -2.9664227962493896, "logps/chosen": -56.86391067504883, "logps/rejected": -56.512306213378906, "loss": 0.6882, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.021419556811451912, "rewards/margins": 0.0104674668982625, "rewards/rejected": -0.03188702464103699, "step": 6320 }, { "epoch": 1.0906271536871124, "grad_norm": 2.3158609867095947, "learning_rate": 1.0163883540591075e-08, "logits/chosen": -3.0004405975341797, "logits/rejected": -2.9805476665496826, "logps/chosen": -55.39896774291992, "logps/rejected": -57.84352493286133, "loss": 0.6829, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.021089566871523857, "rewards/margins": 0.021264133974909782, "rewards/rejected": -0.04235369712114334, "step": 6330 }, { "epoch": 1.0923501033769814, "grad_norm": 2.5407843589782715, "learning_rate": 1.0133815162853452e-08, "logits/chosen": -3.0075252056121826, "logits/rejected": -2.98484468460083, "logps/chosen": -57.53838348388672, "logps/rejected": -55.88555908203125, "loss": 0.6881, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.019510172307491302, "rewards/margins": 0.010738825425505638, "rewards/rejected": -0.03024899959564209, "step": 6340 }, { "epoch": 1.0940730530668505, "grad_norm": 2.110309362411499, "learning_rate": 1.010374557501501e-08, "logits/chosen": -2.989995241165161, "logits/rejected": -2.9829235076904297, "logps/chosen": -57.7088508605957, "logps/rejected": -59.38560104370117, "loss": 0.6891, "rewards/accuracies": 0.5625, "rewards/chosen": -0.02194030024111271, "rewards/margins": 0.008906031958758831, "rewards/rejected": -0.030846333131194115, "step": 6350 }, { "epoch": 1.0957960027567195, "grad_norm": 2.4076998233795166, "learning_rate": 1.0073675048997344e-08, "logits/chosen": -3.0212626457214355, "logits/rejected": -3.0192079544067383, "logps/chosen": -55.68696212768555, "logps/rejected": -57.60249710083008, "loss": 0.6894, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02218361385166645, "rewards/margins": 0.008003572933375835, "rewards/rejected": -0.03018718585371971, "step": 6360 }, { "epoch": 1.0975189524465885, "grad_norm": 2.4917333126068115, "learning_rate": 1.004360385673054e-08, "logits/chosen": -3.0473618507385254, "logits/rejected": -3.0467488765716553, "logps/chosen": -55.46171188354492, "logps/rejected": -57.114402770996094, "loss": 0.6917, "rewards/accuracies": 0.53125, "rewards/chosen": -0.023173917084932327, "rewards/margins": 0.0035490102600306273, "rewards/rejected": -0.026722926646471024, "step": 6370 }, { "epoch": 1.0992419021364577, "grad_norm": 2.7534730434417725, "learning_rate": 1.0013532270150699e-08, "logits/chosen": -3.0758779048919678, "logits/rejected": -3.064692974090576, "logps/chosen": -54.99663543701172, "logps/rejected": -59.77294158935547, "loss": 0.6885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.022621020674705505, "rewards/margins": 0.009905163198709488, "rewards/rejected": -0.03252618387341499, "step": 6380 }, { "epoch": 1.1009648518263266, "grad_norm": 2.4464833736419678, "learning_rate": 9.983460561197496e-09, "logits/chosen": -3.1145012378692627, "logits/rejected": -3.0785086154937744, "logps/chosen": -57.59659957885742, "logps/rejected": -54.70061492919922, "loss": 0.6826, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.015128199942409992, "rewards/margins": 0.021856555715203285, "rewards/rejected": -0.03698475658893585, "step": 6390 }, { "epoch": 1.1026878015161956, "grad_norm": 2.506380319595337, "learning_rate": 9.953389001811714e-09, "logits/chosen": -3.0488944053649902, "logits/rejected": -3.024442434310913, "logps/chosen": -59.581787109375, "logps/rejected": -59.423805236816406, "loss": 0.6884, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.018883857876062393, "rewards/margins": 0.009957622736692429, "rewards/rejected": -0.028841480612754822, "step": 6400 }, { "epoch": 1.1026878015161956, "eval_logits/chosen": -3.1286780834198, "eval_logits/rejected": -3.1230478286743164, "eval_logps/chosen": -58.95232391357422, "eval_logps/rejected": -63.89405059814453, "eval_loss": 0.6908935308456421, "eval_rewards/accuracies": 0.5748141407966614, "eval_rewards/chosen": -0.002404270227998495, "eval_rewards/margins": 0.00473501393571496, "eval_rewards/rejected": -0.007139283698052168, "eval_runtime": 383.8443, "eval_samples_per_second": 11.213, "eval_steps_per_second": 1.402, "step": 6400 }, { "epoch": 1.1044107512060648, "grad_norm": 2.3390676975250244, "learning_rate": 9.923317863932776e-09, "logits/chosen": -3.1495254039764404, "logits/rejected": -3.1230266094207764, "logps/chosen": -56.2658576965332, "logps/rejected": -55.08391571044922, "loss": 0.6862, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.019014541059732437, "rewards/margins": 0.014427746646106243, "rewards/rejected": -0.033442284911870956, "step": 6410 }, { "epoch": 1.1061337008959338, "grad_norm": 2.3376822471618652, "learning_rate": 9.8932474194963e-09, "logits/chosen": -3.0909719467163086, "logits/rejected": -3.0730559825897217, "logps/chosen": -56.99043655395508, "logps/rejected": -57.393707275390625, "loss": 0.6872, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.021999698132276535, "rewards/margins": 0.012688428163528442, "rewards/rejected": -0.03468812629580498, "step": 6420 }, { "epoch": 1.107856650585803, "grad_norm": 2.4933223724365234, "learning_rate": 9.863177940431631e-09, "logits/chosen": -3.000549793243408, "logits/rejected": -2.986180543899536, "logps/chosen": -52.349082946777344, "logps/rejected": -51.32086181640625, "loss": 0.6879, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.01957535557448864, "rewards/margins": 0.01128177996724844, "rewards/rejected": -0.030857134610414505, "step": 6430 }, { "epoch": 1.109579600275672, "grad_norm": 2.119002103805542, "learning_rate": 9.83310969865938e-09, "logits/chosen": -2.9967103004455566, "logits/rejected": -2.978527784347534, "logps/chosen": -55.7023811340332, "logps/rejected": -56.34955978393555, "loss": 0.6899, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.024433055892586708, "rewards/margins": 0.007135553751140833, "rewards/rejected": -0.031568609178066254, "step": 6440 }, { "epoch": 1.111302549965541, "grad_norm": 2.5312013626098633, "learning_rate": 9.803042966088975e-09, "logits/chosen": -3.0206658840179443, "logits/rejected": -2.986959218978882, "logps/chosen": -58.1439208984375, "logps/rejected": -55.67827224731445, "loss": 0.6853, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.01819276250898838, "rewards/margins": 0.016308221966028214, "rewards/rejected": -0.034500982612371445, "step": 6450 }, { "epoch": 1.11302549965541, "grad_norm": 2.3651509284973145, "learning_rate": 9.77297801461619e-09, "logits/chosen": -2.984637498855591, "logits/rejected": -2.98801851272583, "logps/chosen": -53.89622116088867, "logps/rejected": -61.230064392089844, "loss": 0.687, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.020203817635774612, "rewards/margins": 0.0128660649061203, "rewards/rejected": -0.03306988254189491, "step": 6460 }, { "epoch": 1.114748449345279, "grad_norm": 2.3542640209198, "learning_rate": 9.742915116120702e-09, "logits/chosen": -2.943131685256958, "logits/rejected": -2.920292377471924, "logps/chosen": -54.866676330566406, "logps/rejected": -55.22333908081055, "loss": 0.6866, "rewards/accuracies": 0.625, "rewards/chosen": -0.021034687757492065, "rewards/margins": 0.013601104728877544, "rewards/rejected": -0.03463580086827278, "step": 6470 }, { "epoch": 1.1164713990351482, "grad_norm": 2.671103000640869, "learning_rate": 9.71285454246361e-09, "logits/chosen": -3.0327906608581543, "logits/rejected": -2.995478868484497, "logps/chosen": -58.048858642578125, "logps/rejected": -53.625083923339844, "loss": 0.685, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.014800253324210644, "rewards/margins": 0.016874242573976517, "rewards/rejected": -0.031674496829509735, "step": 6480 }, { "epoch": 1.1181943487250172, "grad_norm": 2.5911669731140137, "learning_rate": 9.682796565485007e-09, "logits/chosen": -3.2142341136932373, "logits/rejected": -3.2014338970184326, "logps/chosen": -55.8908576965332, "logps/rejected": -57.41044998168945, "loss": 0.6885, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.022705888375639915, "rewards/margins": 0.010103853419423103, "rewards/rejected": -0.03280974552035332, "step": 6490 }, { "epoch": 1.1199172984148862, "grad_norm": 2.531972885131836, "learning_rate": 9.65274145700148e-09, "logits/chosen": -3.047877550125122, "logits/rejected": -3.04103946685791, "logps/chosen": -58.6346549987793, "logps/rejected": -55.967201232910156, "loss": 0.6893, "rewards/accuracies": 0.53125, "rewards/chosen": -0.024279167875647545, "rewards/margins": 0.008274078369140625, "rewards/rejected": -0.03255324810743332, "step": 6500 }, { "epoch": 1.1199172984148862, "eval_logits/chosen": -3.1286838054656982, "eval_logits/rejected": -3.123046875, "eval_logps/chosen": -58.967567443847656, "eval_logps/rejected": -63.92682647705078, "eval_loss": 0.6908088326454163, "eval_rewards/accuracies": 0.5813196897506714, "eval_rewards/chosen": -0.0025567305274307728, "eval_rewards/margins": 0.004910381976515055, "eval_rewards/rejected": -0.00746711203828454, "eval_runtime": 383.4046, "eval_samples_per_second": 11.226, "eval_steps_per_second": 1.403, "step": 6500 }, { "epoch": 1.1216402481047554, "grad_norm": 2.5683422088623047, "learning_rate": 9.622689488803698e-09, "logits/chosen": -3.073943614959717, "logits/rejected": -3.0271260738372803, "logps/chosen": -58.1553955078125, "logps/rejected": -56.480018615722656, "loss": 0.6829, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.01371106505393982, "rewards/margins": 0.02100132405757904, "rewards/rejected": -0.03471238911151886, "step": 6510 }, { "epoch": 1.1233631977946243, "grad_norm": 2.425645112991333, "learning_rate": 9.592640932653922e-09, "logits/chosen": -3.009979248046875, "logits/rejected": -2.9836318492889404, "logps/chosen": -55.428565979003906, "logps/rejected": -54.707984924316406, "loss": 0.6863, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0181711558252573, "rewards/margins": 0.014331887476146221, "rewards/rejected": -0.03250304237008095, "step": 6520 }, { "epoch": 1.1250861474844935, "grad_norm": 2.4266066551208496, "learning_rate": 9.562596060283558e-09, "logits/chosen": -2.986335515975952, "logits/rejected": -2.9570603370666504, "logps/chosen": -55.52927780151367, "logps/rejected": -55.080787658691406, "loss": 0.6848, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01905311644077301, "rewards/margins": 0.017313417047262192, "rewards/rejected": -0.0363665409386158, "step": 6530 }, { "epoch": 1.1268090971743625, "grad_norm": 2.459099054336548, "learning_rate": 9.532555143390696e-09, "logits/chosen": -3.044667959213257, "logits/rejected": -3.0172994136810303, "logps/chosen": -59.32032012939453, "logps/rejected": -59.080108642578125, "loss": 0.6893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01945970579981804, "rewards/margins": 0.008291837759315968, "rewards/rejected": -0.02775154635310173, "step": 6540 }, { "epoch": 1.1285320468642315, "grad_norm": 2.381169080734253, "learning_rate": 9.502518453637671e-09, "logits/chosen": -2.93902850151062, "logits/rejected": -2.9150500297546387, "logps/chosen": -54.03925323486328, "logps/rejected": -55.1716194152832, "loss": 0.6885, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02493315376341343, "rewards/margins": 0.00981233362108469, "rewards/rejected": -0.034745484590530396, "step": 6550 }, { "epoch": 1.1302549965541007, "grad_norm": 2.125992774963379, "learning_rate": 9.472486262648568e-09, "logits/chosen": -3.074824810028076, "logits/rejected": -3.032547950744629, "logps/chosen": -57.7005729675293, "logps/rejected": -55.79645538330078, "loss": 0.6823, "rewards/accuracies": 0.65625, "rewards/chosen": -0.016963675618171692, "rewards/margins": 0.02236434444785118, "rewards/rejected": -0.03932802379131317, "step": 6560 }, { "epoch": 1.1319779462439696, "grad_norm": 2.3872108459472656, "learning_rate": 9.442458842006816e-09, "logits/chosen": -3.052938938140869, "logits/rejected": -3.0232064723968506, "logps/chosen": -55.656700134277344, "logps/rejected": -56.49686813354492, "loss": 0.685, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.020926889032125473, "rewards/margins": 0.017068570479750633, "rewards/rejected": -0.037995465099811554, "step": 6570 }, { "epoch": 1.1337008959338388, "grad_norm": 2.2589311599731445, "learning_rate": 9.412436463252682e-09, "logits/chosen": -3.0725693702697754, "logits/rejected": -3.0329666137695312, "logps/chosen": -56.81243896484375, "logps/rejected": -53.0896110534668, "loss": 0.6869, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0178411602973938, "rewards/margins": 0.013036638498306274, "rewards/rejected": -0.030877795070409775, "step": 6580 }, { "epoch": 1.1354238456237078, "grad_norm": 2.367614507675171, "learning_rate": 9.382419397880853e-09, "logits/chosen": -3.015820026397705, "logits/rejected": -2.992987871170044, "logps/chosen": -55.18305206298828, "logps/rejected": -56.52252197265625, "loss": 0.6899, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.02343105897307396, "rewards/margins": 0.006984441541135311, "rewards/rejected": -0.030415501445531845, "step": 6590 }, { "epoch": 1.1371467953135768, "grad_norm": 2.300924777984619, "learning_rate": 9.35240791733796e-09, "logits/chosen": -3.112875461578369, "logits/rejected": -3.0967743396759033, "logps/chosen": -58.0617561340332, "logps/rejected": -57.01247024536133, "loss": 0.6886, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0242941714823246, "rewards/margins": 0.009952034801244736, "rewards/rejected": -0.03424619883298874, "step": 6600 }, { "epoch": 1.1371467953135768, "eval_logits/chosen": -3.127255916595459, "eval_logits/rejected": -3.1215784549713135, "eval_logps/chosen": -59.00900650024414, "eval_logps/rejected": -63.97233581542969, "eval_loss": 0.6907920241355896, "eval_rewards/accuracies": 0.5748141407966614, "eval_rewards/chosen": -0.0029711031820625067, "eval_rewards/margins": 0.004950948059558868, "eval_rewards/rejected": -0.007922051474452019, "eval_runtime": 383.9688, "eval_samples_per_second": 11.209, "eval_steps_per_second": 1.401, "step": 6600 }, { "epoch": 1.138869745003446, "grad_norm": 2.4028170108795166, "learning_rate": 9.322402293020136e-09, "logits/chosen": -3.066715717315674, "logits/rejected": -3.046630382537842, "logps/chosen": -54.22076416015625, "logps/rejected": -55.5111083984375, "loss": 0.6859, "rewards/accuracies": 0.6875, "rewards/chosen": -0.022545434534549713, "rewards/margins": 0.015024189837276936, "rewards/rejected": -0.037569623440504074, "step": 6610 }, { "epoch": 1.140592694693315, "grad_norm": 2.5848283767700195, "learning_rate": 9.292402796270548e-09, "logits/chosen": -3.031874179840088, "logits/rejected": -3.0092434883117676, "logps/chosen": -55.2226448059082, "logps/rejected": -55.86328125, "loss": 0.6891, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.028317946940660477, "rewards/margins": 0.008727455511689186, "rewards/rejected": -0.037045400589704514, "step": 6620 }, { "epoch": 1.1423156443831841, "grad_norm": 2.1752817630767822, "learning_rate": 9.262409698376958e-09, "logits/chosen": -3.014620780944824, "logits/rejected": -2.9869723320007324, "logps/chosen": -55.143951416015625, "logps/rejected": -52.85613250732422, "loss": 0.6843, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01832455024123192, "rewards/margins": 0.018313277512788773, "rewards/rejected": -0.03663782775402069, "step": 6630 }, { "epoch": 1.144038594073053, "grad_norm": 2.098524332046509, "learning_rate": 9.23242327056926e-09, "logits/chosen": -3.063933849334717, "logits/rejected": -3.0252301692962646, "logps/chosen": -55.5810432434082, "logps/rejected": -53.812339782714844, "loss": 0.6849, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02475198730826378, "rewards/margins": 0.017390215769410133, "rewards/rejected": -0.04214220121502876, "step": 6640 }, { "epoch": 1.145761543762922, "grad_norm": 2.544950246810913, "learning_rate": 9.202443784017025e-09, "logits/chosen": -3.094503879547119, "logits/rejected": -3.0721499919891357, "logps/chosen": -54.26481246948242, "logps/rejected": -57.11799240112305, "loss": 0.6854, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.021512877196073532, "rewards/margins": 0.016131866723299026, "rewards/rejected": -0.03764474391937256, "step": 6650 }, { "epoch": 1.1474844934527912, "grad_norm": 2.321054220199585, "learning_rate": 9.172471509827065e-09, "logits/chosen": -3.13427472114563, "logits/rejected": -3.114708423614502, "logps/chosen": -53.75007247924805, "logps/rejected": -57.930686950683594, "loss": 0.6846, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.019893547520041466, "rewards/margins": 0.017774097621440887, "rewards/rejected": -0.0376676470041275, "step": 6660 }, { "epoch": 1.1492074431426602, "grad_norm": 2.4262032508850098, "learning_rate": 9.142506719040958e-09, "logits/chosen": -3.0133087635040283, "logits/rejected": -3.0175650119781494, "logps/chosen": -52.8214111328125, "logps/rejected": -58.95885467529297, "loss": 0.6898, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.029145341366529465, "rewards/margins": 0.007524232380092144, "rewards/rejected": -0.03666957467794418, "step": 6670 }, { "epoch": 1.1509303928325294, "grad_norm": 2.388805389404297, "learning_rate": 9.112549682632617e-09, "logits/chosen": -3.0323078632354736, "logits/rejected": -3.0010530948638916, "logps/chosen": -55.79850387573242, "logps/rejected": -55.65300750732422, "loss": 0.6854, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.019732866436243057, "rewards/margins": 0.0161424670368433, "rewards/rejected": -0.03587533161044121, "step": 6680 }, { "epoch": 1.1526533425223984, "grad_norm": 2.479785203933716, "learning_rate": 9.082600671505824e-09, "logits/chosen": -3.018188953399658, "logits/rejected": -2.996375799179077, "logps/chosen": -59.262962341308594, "logps/rejected": -59.838768005371094, "loss": 0.684, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.021032914519309998, "rewards/margins": 0.019005469977855682, "rewards/rejected": -0.04003838449716568, "step": 6690 }, { "epoch": 1.1543762922122673, "grad_norm": 2.6877920627593994, "learning_rate": 9.052659956491801e-09, "logits/chosen": -3.037140369415283, "logits/rejected": -3.02573561668396, "logps/chosen": -57.954002380371094, "logps/rejected": -59.13984298706055, "loss": 0.6865, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01825765147805214, "rewards/margins": 0.014165830798447132, "rewards/rejected": -0.032423485070466995, "step": 6700 }, { "epoch": 1.1543762922122673, "eval_logits/chosen": -3.127453088760376, "eval_logits/rejected": -3.1217737197875977, "eval_logps/chosen": -59.034568786621094, "eval_logps/rejected": -64.0009994506836, "eval_loss": 0.6907802224159241, "eval_rewards/accuracies": 0.5803903341293335, "eval_rewards/chosen": -0.003226715140044689, "eval_rewards/margins": 0.00498205004259944, "eval_rewards/rejected": -0.008208764716982841, "eval_runtime": 383.9771, "eval_samples_per_second": 11.209, "eval_steps_per_second": 1.401, "step": 6700 }, { "epoch": 1.1560992419021365, "grad_norm": 2.5995121002197266, "learning_rate": 9.02272780834673e-09, "logits/chosen": -3.029447078704834, "logits/rejected": -3.0037617683410645, "logps/chosen": -58.503143310546875, "logps/rejected": -57.007843017578125, "loss": 0.6841, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.01577948033809662, "rewards/margins": 0.018815193325281143, "rewards/rejected": -0.03459467366337776, "step": 6710 }, { "epoch": 1.1578221915920055, "grad_norm": 2.3430838584899902, "learning_rate": 8.992804497749343e-09, "logits/chosen": -3.024191379547119, "logits/rejected": -3.0123202800750732, "logps/chosen": -53.38079833984375, "logps/rejected": -55.08259201049805, "loss": 0.6857, "rewards/accuracies": 0.625, "rewards/chosen": -0.01965912990272045, "rewards/margins": 0.01567569002509117, "rewards/rejected": -0.03533482179045677, "step": 6720 }, { "epoch": 1.1595451412818747, "grad_norm": 2.730286121368408, "learning_rate": 8.96289029529843e-09, "logits/chosen": -2.9752981662750244, "logits/rejected": -2.959228754043579, "logps/chosen": -58.8833122253418, "logps/rejected": -58.656455993652344, "loss": 0.6876, "rewards/accuracies": 0.59375, "rewards/chosen": -0.021824505180120468, "rewards/margins": 0.011972033418715, "rewards/rejected": -0.033796537667512894, "step": 6730 }, { "epoch": 1.1612680909717437, "grad_norm": 2.4725494384765625, "learning_rate": 8.932985471510436e-09, "logits/chosen": -3.0475223064422607, "logits/rejected": -3.0392818450927734, "logps/chosen": -56.54149627685547, "logps/rejected": -59.19397735595703, "loss": 0.6885, "rewards/accuracies": 0.5625, "rewards/chosen": -0.02108325995504856, "rewards/margins": 0.009946262463927269, "rewards/rejected": -0.03102952241897583, "step": 6740 }, { "epoch": 1.1629910406616126, "grad_norm": 2.4905450344085693, "learning_rate": 8.903090296816975e-09, "logits/chosen": -3.088040828704834, "logits/rejected": -3.0729029178619385, "logps/chosen": -55.22167205810547, "logps/rejected": -57.266319274902344, "loss": 0.6894, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.027635056525468826, "rewards/margins": 0.00815325602889061, "rewards/rejected": -0.03578830882906914, "step": 6750 }, { "epoch": 1.1647139903514818, "grad_norm": 2.5701723098754883, "learning_rate": 8.873205041562426e-09, "logits/chosen": -3.0328335762023926, "logits/rejected": -3.0046257972717285, "logps/chosen": -54.555458068847656, "logps/rejected": -52.52616500854492, "loss": 0.6871, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.025020277127623558, "rewards/margins": 0.01287126075476408, "rewards/rejected": -0.037891536951065063, "step": 6760 }, { "epoch": 1.1664369400413508, "grad_norm": 2.2472474575042725, "learning_rate": 8.843329976001443e-09, "logits/chosen": -3.045372486114502, "logits/rejected": -3.039490222930908, "logps/chosen": -55.7740364074707, "logps/rejected": -58.88713455200195, "loss": 0.6869, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.019391192123293877, "rewards/margins": 0.013176657259464264, "rewards/rejected": -0.03256785124540329, "step": 6770 }, { "epoch": 1.1681598897312198, "grad_norm": 2.712940216064453, "learning_rate": 8.813465370296552e-09, "logits/chosen": -3.0446479320526123, "logits/rejected": -3.01298189163208, "logps/chosen": -59.089202880859375, "logps/rejected": -57.0594482421875, "loss": 0.6878, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02587306872010231, "rewards/margins": 0.011457438580691814, "rewards/rejected": -0.0373305082321167, "step": 6780 }, { "epoch": 1.169882839421089, "grad_norm": 2.4926705360412598, "learning_rate": 8.783611494515675e-09, "logits/chosen": -3.0511624813079834, "logits/rejected": -3.035194158554077, "logps/chosen": -53.268836975097656, "logps/rejected": -57.05846405029297, "loss": 0.6875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.027448957785964012, "rewards/margins": 0.011941693723201752, "rewards/rejected": -0.03939065709710121, "step": 6790 }, { "epoch": 1.171605789110958, "grad_norm": 2.2441673278808594, "learning_rate": 8.753768618629716e-09, "logits/chosen": -2.967897891998291, "logits/rejected": -2.9408040046691895, "logps/chosen": -55.0565185546875, "logps/rejected": -54.068275451660156, "loss": 0.6868, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.026381511241197586, "rewards/margins": 0.013342035934329033, "rewards/rejected": -0.03972354903817177, "step": 6800 }, { "epoch": 1.171605789110958, "eval_logits/chosen": -3.126094102859497, "eval_logits/rejected": -3.1204111576080322, "eval_logps/chosen": -59.046142578125, "eval_logps/rejected": -64.0239028930664, "eval_loss": 0.6907273530960083, "eval_rewards/accuracies": 0.5836431384086609, "eval_rewards/chosen": -0.0033424401190131903, "eval_rewards/margins": 0.005095373373478651, "eval_rewards/rejected": -0.008437813259661198, "eval_runtime": 383.2996, "eval_samples_per_second": 11.229, "eval_steps_per_second": 1.404, "step": 6800 }, { "epoch": 1.173328738800827, "grad_norm": 2.347003936767578, "learning_rate": 8.723937012510093e-09, "logits/chosen": -3.0215110778808594, "logits/rejected": -3.0046989917755127, "logps/chosen": -57.366981506347656, "logps/rejected": -55.16851806640625, "loss": 0.6847, "rewards/accuracies": 0.625, "rewards/chosen": -0.01663302257657051, "rewards/margins": 0.01758481189608574, "rewards/rejected": -0.03421783447265625, "step": 6810 }, { "epoch": 1.175051688490696, "grad_norm": 2.6514339447021484, "learning_rate": 8.694116945926324e-09, "logits/chosen": -3.1206183433532715, "logits/rejected": -3.089256763458252, "logps/chosen": -56.93294143676758, "logps/rejected": -57.380699157714844, "loss": 0.6813, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.01909453049302101, "rewards/margins": 0.02453770488500595, "rewards/rejected": -0.043632231652736664, "step": 6820 }, { "epoch": 1.176774638180565, "grad_norm": 2.3778793811798096, "learning_rate": 8.66430868854356e-09, "logits/chosen": -3.127379894256592, "logits/rejected": -3.1207189559936523, "logps/chosen": -56.305023193359375, "logps/rejected": -56.054443359375, "loss": 0.6903, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.024015625938773155, "rewards/margins": 0.006403226405382156, "rewards/rejected": -0.03041885420680046, "step": 6830 }, { "epoch": 1.1784975878704342, "grad_norm": 2.765775442123413, "learning_rate": 8.634512509920175e-09, "logits/chosen": -3.090207576751709, "logits/rejected": -3.064380407333374, "logps/chosen": -58.844017028808594, "logps/rejected": -58.45856857299805, "loss": 0.6877, "rewards/accuracies": 0.625, "rewards/chosen": -0.01913430169224739, "rewards/margins": 0.011462162248790264, "rewards/rejected": -0.03059646487236023, "step": 6840 }, { "epoch": 1.1802205375603032, "grad_norm": 2.216803550720215, "learning_rate": 8.604728679505301e-09, "logits/chosen": -2.9346718788146973, "logits/rejected": -2.9027819633483887, "logps/chosen": -56.310997009277344, "logps/rejected": -57.774322509765625, "loss": 0.6836, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.021466486155986786, "rewards/margins": 0.019867608323693275, "rewards/rejected": -0.04133410006761551, "step": 6850 }, { "epoch": 1.1819434872501722, "grad_norm": 2.47184157371521, "learning_rate": 8.574957466636408e-09, "logits/chosen": -3.0480828285217285, "logits/rejected": -3.010148525238037, "logps/chosen": -60.95463943481445, "logps/rejected": -56.18805694580078, "loss": 0.6835, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.018399445340037346, "rewards/margins": 0.020264366641640663, "rewards/rejected": -0.03866381198167801, "step": 6860 }, { "epoch": 1.1836664369400414, "grad_norm": 2.0314888954162598, "learning_rate": 8.545199140536875e-09, "logits/chosen": -2.967435359954834, "logits/rejected": -2.956160545349121, "logps/chosen": -54.10022735595703, "logps/rejected": -55.425926208496094, "loss": 0.6903, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.030147861689329147, "rewards/margins": 0.006414397154003382, "rewards/rejected": -0.03656225651502609, "step": 6870 }, { "epoch": 1.1853893866299103, "grad_norm": 2.3606247901916504, "learning_rate": 8.515453970313526e-09, "logits/chosen": -3.0911428928375244, "logits/rejected": -3.0608937740325928, "logps/chosen": -56.398406982421875, "logps/rejected": -55.0906867980957, "loss": 0.6846, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02135491371154785, "rewards/margins": 0.01793890818953514, "rewards/rejected": -0.03929382562637329, "step": 6880 }, { "epoch": 1.1871123363197795, "grad_norm": 2.494527816772461, "learning_rate": 8.485722224954237e-09, "logits/chosen": -2.9907267093658447, "logits/rejected": -2.96384859085083, "logps/chosen": -54.1063346862793, "logps/rejected": -56.15788650512695, "loss": 0.6837, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01796005479991436, "rewards/margins": 0.019725376740098, "rewards/rejected": -0.03768543154001236, "step": 6890 }, { "epoch": 1.1888352860096485, "grad_norm": 2.524660348892212, "learning_rate": 8.456004173325458e-09, "logits/chosen": -3.0648722648620605, "logits/rejected": -3.043199300765991, "logps/chosen": -55.88935089111328, "logps/rejected": -56.82563400268555, "loss": 0.6882, "rewards/accuracies": 0.625, "rewards/chosen": -0.023347685113549232, "rewards/margins": 0.010543139651417732, "rewards/rejected": -0.033890821039676666, "step": 6900 }, { "epoch": 1.1888352860096485, "eval_logits/chosen": -3.1254613399505615, "eval_logits/rejected": -3.119804859161377, "eval_logps/chosen": -59.08448028564453, "eval_logps/rejected": -64.06681060791016, "eval_loss": 0.6907079815864563, "eval_rewards/accuracies": 0.5810873508453369, "eval_rewards/chosen": -0.0037258744705468416, "eval_rewards/margins": 0.0051410011947155, "eval_rewards/rejected": -0.008866875432431698, "eval_runtime": 384.0766, "eval_samples_per_second": 11.206, "eval_steps_per_second": 1.401, "step": 6900 }, { "epoch": 1.1905582356995175, "grad_norm": 2.2871227264404297, "learning_rate": 8.42630008416983e-09, "logits/chosen": -3.098883867263794, "logits/rejected": -3.0734448432922363, "logps/chosen": -59.03523635864258, "logps/rejected": -58.82707595825195, "loss": 0.6861, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.02149212174117565, "rewards/margins": 0.014713278040289879, "rewards/rejected": -0.03620540350675583, "step": 6910 }, { "epoch": 1.1922811853893867, "grad_norm": 2.33981990814209, "learning_rate": 8.396610226103705e-09, "logits/chosen": -3.1584877967834473, "logits/rejected": -3.127455472946167, "logps/chosen": -58.35736846923828, "logps/rejected": -57.65404510498047, "loss": 0.6854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.022533800452947617, "rewards/margins": 0.01630527526140213, "rewards/rejected": -0.038839079439640045, "step": 6920 }, { "epoch": 1.1940041350792556, "grad_norm": 2.3817827701568604, "learning_rate": 8.366934867614771e-09, "logits/chosen": -2.964742422103882, "logits/rejected": -2.9373018741607666, "logps/chosen": -59.388710021972656, "logps/rejected": -57.4041633605957, "loss": 0.6865, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.01754842884838581, "rewards/margins": 0.013974052853882313, "rewards/rejected": -0.0315224826335907, "step": 6930 }, { "epoch": 1.1957270847691248, "grad_norm": 2.468682050704956, "learning_rate": 8.337274277059565e-09, "logits/chosen": -2.956246852874756, "logits/rejected": -2.9091484546661377, "logps/chosen": -59.93339157104492, "logps/rejected": -54.3348274230957, "loss": 0.6846, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0197045486420393, "rewards/margins": 0.017733246088027954, "rewards/rejected": -0.037437792867422104, "step": 6940 }, { "epoch": 1.1974500344589938, "grad_norm": 2.5347440242767334, "learning_rate": 8.307628722661104e-09, "logits/chosen": -3.024871349334717, "logits/rejected": -3.0062155723571777, "logps/chosen": -55.86821746826172, "logps/rejected": -56.56782913208008, "loss": 0.6854, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.020114390179514885, "rewards/margins": 0.016181785613298416, "rewards/rejected": -0.03629617765545845, "step": 6950 }, { "epoch": 1.1991729841488628, "grad_norm": 2.37861704826355, "learning_rate": 8.277998472506412e-09, "logits/chosen": -3.1921026706695557, "logits/rejected": -3.151654005050659, "logps/chosen": -57.934608459472656, "logps/rejected": -51.96177291870117, "loss": 0.6857, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.026161780580878258, "rewards/margins": 0.01564834825694561, "rewards/rejected": -0.04181012138724327, "step": 6960 }, { "epoch": 1.200895933838732, "grad_norm": 2.133126735687256, "learning_rate": 8.248383794544126e-09, "logits/chosen": -3.103128433227539, "logits/rejected": -3.068071126937866, "logps/chosen": -56.252784729003906, "logps/rejected": -54.36753463745117, "loss": 0.6823, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.017109837383031845, "rewards/margins": 0.022667525336146355, "rewards/rejected": -0.03977736085653305, "step": 6970 }, { "epoch": 1.202618883528601, "grad_norm": 2.298814058303833, "learning_rate": 8.218784956582052e-09, "logits/chosen": -3.0041556358337402, "logits/rejected": -2.973860502243042, "logps/chosen": -56.36700439453125, "logps/rejected": -56.94713592529297, "loss": 0.6845, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.021416518837213516, "rewards/margins": 0.017982427030801773, "rewards/rejected": -0.03939894586801529, "step": 6980 }, { "epoch": 1.20434183321847, "grad_norm": 2.5716981887817383, "learning_rate": 8.18920222628477e-09, "logits/chosen": -2.923178195953369, "logits/rejected": -2.9134066104888916, "logps/chosen": -59.46036911010742, "logps/rejected": -57.71564865112305, "loss": 0.6886, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.024769535288214684, "rewards/margins": 0.00976070761680603, "rewards/rejected": -0.034530241042375565, "step": 6990 }, { "epoch": 1.206064782908339, "grad_norm": 2.42403244972229, "learning_rate": 8.15963587117118e-09, "logits/chosen": -3.2224109172821045, "logits/rejected": -3.1947338581085205, "logps/chosen": -58.848487854003906, "logps/rejected": -58.787452697753906, "loss": 0.6859, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.025541281327605247, "rewards/margins": 0.015429399907588959, "rewards/rejected": -0.04097067937254906, "step": 7000 }, { "epoch": 1.206064782908339, "eval_logits/chosen": -3.1261141300201416, "eval_logits/rejected": -3.120389461517334, "eval_logps/chosen": -59.12325668334961, "eval_logps/rejected": -64.10928344726562, "eval_loss": 0.6906958222389221, "eval_rewards/accuracies": 0.5796933174133301, "eval_rewards/chosen": -0.004113591741770506, "eval_rewards/margins": 0.00517794955521822, "eval_rewards/rejected": -0.009291541762650013, "eval_runtime": 383.792, "eval_samples_per_second": 11.214, "eval_steps_per_second": 1.402, "step": 7000 }, { "epoch": 1.207787732598208, "grad_norm": 2.603954315185547, "learning_rate": 8.130086158612116e-09, "logits/chosen": -3.0315804481506348, "logits/rejected": -3.0040690898895264, "logps/chosen": -58.850257873535156, "logps/rejected": -57.16217803955078, "loss": 0.6842, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.024250095710158348, "rewards/margins": 0.01874765008687973, "rewards/rejected": -0.04299774393439293, "step": 7010 }, { "epoch": 1.2095106822880772, "grad_norm": 2.428187847137451, "learning_rate": 8.100553355827896e-09, "logits/chosen": -3.019529342651367, "logits/rejected": -2.9929378032684326, "logps/chosen": -53.589073181152344, "logps/rejected": -56.43495559692383, "loss": 0.6844, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.022542601451277733, "rewards/margins": 0.018220920115709305, "rewards/rejected": -0.040763527154922485, "step": 7020 }, { "epoch": 1.2112336319779462, "grad_norm": 2.319085121154785, "learning_rate": 8.071037729885937e-09, "logits/chosen": -3.0574421882629395, "logits/rejected": -3.0418524742126465, "logps/chosen": -55.7623405456543, "logps/rejected": -58.62355422973633, "loss": 0.6892, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.026621636003255844, "rewards/margins": 0.00847454834729433, "rewards/rejected": -0.0350961834192276, "step": 7030 }, { "epoch": 1.2129565816678154, "grad_norm": 2.6634323596954346, "learning_rate": 8.041539547698307e-09, "logits/chosen": -2.998723268508911, "logits/rejected": -2.977909564971924, "logps/chosen": -57.95904541015625, "logps/rejected": -60.370140075683594, "loss": 0.6836, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.018822792917490005, "rewards/margins": 0.01990441419184208, "rewards/rejected": -0.038727205246686935, "step": 7040 }, { "epoch": 1.2146795313576844, "grad_norm": 2.6832826137542725, "learning_rate": 8.01205907601935e-09, "logits/chosen": -2.9719934463500977, "logits/rejected": -2.9498307704925537, "logps/chosen": -53.96051788330078, "logps/rejected": -57.45134735107422, "loss": 0.6856, "rewards/accuracies": 0.625, "rewards/chosen": -0.025047386065125465, "rewards/margins": 0.015683867037296295, "rewards/rejected": -0.04073125496506691, "step": 7050 }, { "epoch": 1.2164024810475533, "grad_norm": 2.773050308227539, "learning_rate": 7.982596581443237e-09, "logits/chosen": -3.0882060527801514, "logits/rejected": -3.0739293098449707, "logps/chosen": -55.9157829284668, "logps/rejected": -57.05158615112305, "loss": 0.6864, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.027055267244577408, "rewards/margins": 0.014333643019199371, "rewards/rejected": -0.04138891398906708, "step": 7060 }, { "epoch": 1.2181254307374225, "grad_norm": 2.327251672744751, "learning_rate": 7.953152330401568e-09, "logits/chosen": -3.051583766937256, "logits/rejected": -3.023996114730835, "logps/chosen": -56.63190841674805, "logps/rejected": -56.517967224121094, "loss": 0.6873, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.020288683474063873, "rewards/margins": 0.012412209063768387, "rewards/rejected": -0.03270088881254196, "step": 7070 }, { "epoch": 1.2198483804272915, "grad_norm": 2.5216662883758545, "learning_rate": 7.923726589160985e-09, "logits/chosen": -3.1125307083129883, "logits/rejected": -3.0817744731903076, "logps/chosen": -56.150787353515625, "logps/rejected": -58.22904586791992, "loss": 0.6856, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.02682647481560707, "rewards/margins": 0.015813734382390976, "rewards/rejected": -0.04264020919799805, "step": 7080 }, { "epoch": 1.2215713301171607, "grad_norm": 2.3830454349517822, "learning_rate": 7.894319623820721e-09, "logits/chosen": -3.141756296157837, "logits/rejected": -3.13077712059021, "logps/chosen": -56.7417106628418, "logps/rejected": -56.47627639770508, "loss": 0.6883, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03223857656121254, "rewards/margins": 0.010424691252410412, "rewards/rejected": -0.042663268744945526, "step": 7090 }, { "epoch": 1.2232942798070296, "grad_norm": 2.4901022911071777, "learning_rate": 7.864931700310235e-09, "logits/chosen": -3.0033276081085205, "logits/rejected": -2.9819869995117188, "logps/chosen": -59.11650466918945, "logps/rejected": -60.6047248840332, "loss": 0.685, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.020432157441973686, "rewards/margins": 0.0171308945864439, "rewards/rejected": -0.037563055753707886, "step": 7100 }, { "epoch": 1.2232942798070296, "eval_logits/chosen": -3.123748540878296, "eval_logits/rejected": -3.1180479526519775, "eval_logps/chosen": -59.15984344482422, "eval_logps/rejected": -64.1565170288086, "eval_loss": 0.6906439065933228, "eval_rewards/accuracies": 0.5796933174133301, "eval_rewards/chosen": -0.004479521419852972, "eval_rewards/margins": 0.005284461192786694, "eval_rewards/rejected": -0.009763982146978378, "eval_runtime": 384.0905, "eval_samples_per_second": 11.206, "eval_steps_per_second": 1.401, "step": 7100 }, { "epoch": 1.2250172294968986, "grad_norm": 2.255007266998291, "learning_rate": 7.835563084386777e-09, "logits/chosen": -3.0600950717926025, "logits/rejected": -3.0214476585388184, "logps/chosen": -56.0528678894043, "logps/rejected": -54.752723693847656, "loss": 0.6875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.025800343602895737, "rewards/margins": 0.011899485252797604, "rewards/rejected": -0.037699829787015915, "step": 7110 }, { "epoch": 1.2267401791867678, "grad_norm": 2.3676490783691406, "learning_rate": 7.806214041633009e-09, "logits/chosen": -3.045886278152466, "logits/rejected": -3.00185489654541, "logps/chosen": -63.60943603515625, "logps/rejected": -56.4583854675293, "loss": 0.6821, "rewards/accuracies": 0.6875, "rewards/chosen": -0.016231974586844444, "rewards/margins": 0.022754736244678497, "rewards/rejected": -0.03898671269416809, "step": 7120 }, { "epoch": 1.2284631288766368, "grad_norm": 2.478337049484253, "learning_rate": 7.776884837454573e-09, "logits/chosen": -2.924165964126587, "logits/rejected": -2.8969979286193848, "logps/chosen": -58.44608688354492, "logps/rejected": -56.87275314331055, "loss": 0.6856, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.02104850485920906, "rewards/margins": 0.015894543379545212, "rewards/rejected": -0.03694305196404457, "step": 7130 }, { "epoch": 1.230186078566506, "grad_norm": 2.278496742248535, "learning_rate": 7.747575737077732e-09, "logits/chosen": -3.0394952297210693, "logits/rejected": -3.0347137451171875, "logps/chosen": -54.013336181640625, "logps/rejected": -55.49956512451172, "loss": 0.6893, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.026956558227539062, "rewards/margins": 0.008593494072556496, "rewards/rejected": -0.03555005416274071, "step": 7140 }, { "epoch": 1.231909028256375, "grad_norm": 2.5096144676208496, "learning_rate": 7.71828700554693e-09, "logits/chosen": -3.113610029220581, "logits/rejected": -3.0898070335388184, "logps/chosen": -58.9386100769043, "logps/rejected": -58.98060989379883, "loss": 0.6869, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.025510985404253006, "rewards/margins": 0.013395940884947777, "rewards/rejected": -0.038906924426555634, "step": 7150 }, { "epoch": 1.233631977946244, "grad_norm": 2.4192092418670654, "learning_rate": 7.689018907722429e-09, "logits/chosen": -3.0234131813049316, "logits/rejected": -2.98473858833313, "logps/chosen": -55.985321044921875, "logps/rejected": -58.2840461730957, "loss": 0.6841, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02323567308485508, "rewards/margins": 0.018828080967068672, "rewards/rejected": -0.04206375032663345, "step": 7160 }, { "epoch": 1.235354927636113, "grad_norm": 2.607609748840332, "learning_rate": 7.659771708277883e-09, "logits/chosen": -2.985973834991455, "logits/rejected": -2.9590420722961426, "logps/chosen": -58.46117401123047, "logps/rejected": -57.82086181640625, "loss": 0.6879, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02894872985780239, "rewards/margins": 0.011171415448188782, "rewards/rejected": -0.04012014716863632, "step": 7170 }, { "epoch": 1.237077877325982, "grad_norm": 2.5801990032196045, "learning_rate": 7.630545671697975e-09, "logits/chosen": -3.1181468963623047, "logits/rejected": -3.10412335395813, "logps/chosen": -57.78798294067383, "logps/rejected": -58.09895706176758, "loss": 0.6878, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.024857694283127785, "rewards/margins": 0.011409392580389977, "rewards/rejected": -0.03626708686351776, "step": 7180 }, { "epoch": 1.2388008270158513, "grad_norm": 2.26977276802063, "learning_rate": 7.601341062275997e-09, "logits/chosen": -2.966801643371582, "logits/rejected": -2.942594051361084, "logps/chosen": -55.58942413330078, "logps/rejected": -57.827667236328125, "loss": 0.6872, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02918897196650505, "rewards/margins": 0.012870155274868011, "rewards/rejected": -0.04205913096666336, "step": 7190 }, { "epoch": 1.2405237767057202, "grad_norm": 2.7323532104492188, "learning_rate": 7.57215814411149e-09, "logits/chosen": -2.9832592010498047, "logits/rejected": -2.9655752182006836, "logps/chosen": -55.5822639465332, "logps/rejected": -60.1941032409668, "loss": 0.6858, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02484903857111931, "rewards/margins": 0.015351449139416218, "rewards/rejected": -0.04020049050450325, "step": 7200 }, { "epoch": 1.2405237767057202, "eval_logits/chosen": -3.124135971069336, "eval_logits/rejected": -3.1184306144714355, "eval_logps/chosen": -59.1701774597168, "eval_logps/rejected": -64.19103240966797, "eval_loss": 0.6905274391174316, "eval_rewards/accuracies": 0.5820167064666748, "eval_rewards/chosen": -0.004582811146974564, "eval_rewards/margins": 0.005526235792785883, "eval_rewards/rejected": -0.010109047405421734, "eval_runtime": 383.8514, "eval_samples_per_second": 11.213, "eval_steps_per_second": 1.402, "step": 7200 }, { "epoch": 1.2422467263955892, "grad_norm": 2.333134651184082, "learning_rate": 7.54299718110782e-09, "logits/chosen": -3.1046881675720215, "logits/rejected": -3.0822250843048096, "logps/chosen": -59.1954460144043, "logps/rejected": -56.4441032409668, "loss": 0.6873, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02166753076016903, "rewards/margins": 0.012477634474635124, "rewards/rejected": -0.03414516523480415, "step": 7210 }, { "epoch": 1.2439696760854584, "grad_norm": 2.4416794776916504, "learning_rate": 7.51385843696983e-09, "logits/chosen": -2.9671475887298584, "logits/rejected": -2.931851387023926, "logps/chosen": -57.426719665527344, "logps/rejected": -55.91858673095703, "loss": 0.6867, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.028967272490262985, "rewards/margins": 0.013575015589594841, "rewards/rejected": -0.04254228621721268, "step": 7220 }, { "epoch": 1.2456926257753274, "grad_norm": 2.446187973022461, "learning_rate": 7.484742175201417e-09, "logits/chosen": -3.0023951530456543, "logits/rejected": -2.9816293716430664, "logps/chosen": -56.86414337158203, "logps/rejected": -56.91185760498047, "loss": 0.6871, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.026427680626511574, "rewards/margins": 0.012890076264739037, "rewards/rejected": -0.03931775689125061, "step": 7230 }, { "epoch": 1.2474155754651963, "grad_norm": 2.3552913665771484, "learning_rate": 7.455648659103191e-09, "logits/chosen": -3.0364127159118652, "logits/rejected": -3.0254123210906982, "logps/chosen": -60.1931037902832, "logps/rejected": -58.56923294067383, "loss": 0.6883, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02746332809329033, "rewards/margins": 0.010513190180063248, "rewards/rejected": -0.03797651827335358, "step": 7240 }, { "epoch": 1.2491385251550655, "grad_norm": 2.817966938018799, "learning_rate": 7.426578151770047e-09, "logits/chosen": -3.0863070487976074, "logits/rejected": -3.054816722869873, "logps/chosen": -57.13684844970703, "logps/rejected": -56.9018440246582, "loss": 0.6861, "rewards/accuracies": 0.65625, "rewards/chosen": -0.025691917166113853, "rewards/margins": 0.014780798926949501, "rewards/rejected": -0.04047270864248276, "step": 7250 }, { "epoch": 1.2508614748449345, "grad_norm": 2.2241945266723633, "learning_rate": 7.397530916088828e-09, "logits/chosen": -3.081535816192627, "logits/rejected": -3.047715663909912, "logps/chosen": -55.83784866333008, "logps/rejected": -56.54887008666992, "loss": 0.6834, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02152073010802269, "rewards/margins": 0.020243234932422638, "rewards/rejected": -0.04176396131515503, "step": 7260 }, { "epoch": 1.2525844245348035, "grad_norm": 2.390984058380127, "learning_rate": 7.36850721473592e-09, "logits/chosen": -3.097338914871216, "logits/rejected": -3.0662691593170166, "logps/chosen": -56.4255256652832, "logps/rejected": -56.92626953125, "loss": 0.6826, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.021098647266626358, "rewards/margins": 0.02181319333612919, "rewards/rejected": -0.0429118387401104, "step": 7270 }, { "epoch": 1.2543073742246726, "grad_norm": 2.712801218032837, "learning_rate": 7.339507310174884e-09, "logits/chosen": -3.126049041748047, "logits/rejected": -3.091052770614624, "logps/chosen": -59.09241485595703, "logps/rejected": -57.259437561035156, "loss": 0.6844, "rewards/accuracies": 0.6875, "rewards/chosen": -0.025226742029190063, "rewards/margins": 0.018264181911945343, "rewards/rejected": -0.043490923941135406, "step": 7280 }, { "epoch": 1.2560303239145416, "grad_norm": 2.2661290168762207, "learning_rate": 7.3105314646541e-09, "logits/chosen": -2.9083609580993652, "logits/rejected": -2.8951635360717773, "logps/chosen": -57.48980712890625, "logps/rejected": -59.64112091064453, "loss": 0.6881, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.026984844356775284, "rewards/margins": 0.010795495472848415, "rewards/rejected": -0.037780337035655975, "step": 7290 }, { "epoch": 1.2577532736044108, "grad_norm": 2.620009183883667, "learning_rate": 7.281579940204361e-09, "logits/chosen": -2.8676297664642334, "logits/rejected": -2.8505687713623047, "logps/chosen": -53.25835037231445, "logps/rejected": -55.6039924621582, "loss": 0.6905, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.029975971207022667, "rewards/margins": 0.005989503115415573, "rewards/rejected": -0.03596547618508339, "step": 7300 }, { "epoch": 1.2577532736044108, "eval_logits/chosen": -3.1238722801208496, "eval_logits/rejected": -3.1181890964508057, "eval_logps/chosen": -59.20159912109375, "eval_logps/rejected": -64.22035217285156, "eval_loss": 0.6905403137207031, "eval_rewards/accuracies": 0.5803903341293335, "eval_rewards/chosen": -0.004897048696875572, "eval_rewards/margins": 0.005505240522325039, "eval_rewards/rejected": -0.010402288287878036, "eval_runtime": 383.6551, "eval_samples_per_second": 11.218, "eval_steps_per_second": 1.402, "step": 7300 }, { "epoch": 1.2594762232942798, "grad_norm": 2.822331190109253, "learning_rate": 7.25265299863654e-09, "logits/chosen": -3.069945812225342, "logits/rejected": -3.046442747116089, "logps/chosen": -57.70795440673828, "logps/rejected": -55.729942321777344, "loss": 0.6876, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.02648942545056343, "rewards/margins": 0.01195268053561449, "rewards/rejected": -0.03844210132956505, "step": 7310 }, { "epoch": 1.2611991729841487, "grad_norm": 2.468334674835205, "learning_rate": 7.22375090153919e-09, "logits/chosen": -3.0881543159484863, "logits/rejected": -3.061934471130371, "logps/chosen": -57.00069046020508, "logps/rejected": -55.181861877441406, "loss": 0.6868, "rewards/accuracies": 0.59375, "rewards/chosen": -0.026471266523003578, "rewards/margins": 0.013788128271698952, "rewards/rejected": -0.04025938734412193, "step": 7320 }, { "epoch": 1.262922122674018, "grad_norm": 2.297668933868408, "learning_rate": 7.194873910276204e-09, "logits/chosen": -3.0014100074768066, "logits/rejected": -2.9748117923736572, "logps/chosen": -55.19213104248047, "logps/rejected": -55.83556365966797, "loss": 0.6829, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02129554934799671, "rewards/margins": 0.021344564855098724, "rewards/rejected": -0.04264011234045029, "step": 7330 }, { "epoch": 1.264645072363887, "grad_norm": 2.6001136302948, "learning_rate": 7.166022285984437e-09, "logits/chosen": -3.061702251434326, "logits/rejected": -3.0339019298553467, "logps/chosen": -57.834259033203125, "logps/rejected": -58.05975341796875, "loss": 0.6851, "rewards/accuracies": 0.625, "rewards/chosen": -0.024531777948141098, "rewards/margins": 0.016910618171095848, "rewards/rejected": -0.0414423942565918, "step": 7340 }, { "epoch": 1.266368022053756, "grad_norm": 2.4237160682678223, "learning_rate": 7.13719628957135e-09, "logits/chosen": -3.1121439933776855, "logits/rejected": -3.0826239585876465, "logps/chosen": -58.967552185058594, "logps/rejected": -55.390892028808594, "loss": 0.6857, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.020481975749135017, "rewards/margins": 0.01573684811592102, "rewards/rejected": -0.03621882572770119, "step": 7350 }, { "epoch": 1.268090971743625, "grad_norm": 2.4601356983184814, "learning_rate": 7.108396181712643e-09, "logits/chosen": -3.018209934234619, "logits/rejected": -2.997499704360962, "logps/chosen": -56.858489990234375, "logps/rejected": -56.35799026489258, "loss": 0.6855, "rewards/accuracies": 0.625, "rewards/chosen": -0.02560586854815483, "rewards/margins": 0.01612265780568123, "rewards/rejected": -0.04172852635383606, "step": 7360 }, { "epoch": 1.269813921433494, "grad_norm": 2.7204806804656982, "learning_rate": 7.079622222849917e-09, "logits/chosen": -2.9083404541015625, "logits/rejected": -2.8869762420654297, "logps/chosen": -55.68623733520508, "logps/rejected": -55.17841339111328, "loss": 0.6883, "rewards/accuracies": 0.625, "rewards/chosen": -0.028797179460525513, "rewards/margins": 0.010576505213975906, "rewards/rejected": -0.03937368467450142, "step": 7370 }, { "epoch": 1.2715368711233632, "grad_norm": 2.232482433319092, "learning_rate": 7.05087467318829e-09, "logits/chosen": -3.0181708335876465, "logits/rejected": -2.996931552886963, "logps/chosen": -55.63871383666992, "logps/rejected": -57.555015563964844, "loss": 0.6865, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.024709437042474747, "rewards/margins": 0.013970698229968548, "rewards/rejected": -0.03868013620376587, "step": 7380 }, { "epoch": 1.2732598208132322, "grad_norm": 2.4573562145233154, "learning_rate": 7.022153792694073e-09, "logits/chosen": -2.982329845428467, "logits/rejected": -2.9620840549468994, "logps/chosen": -54.878746032714844, "logps/rejected": -57.192298889160156, "loss": 0.686, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02665008045732975, "rewards/margins": 0.015081726014614105, "rewards/rejected": -0.041731808334589005, "step": 7390 }, { "epoch": 1.2749827705031014, "grad_norm": 2.2985384464263916, "learning_rate": 6.993459841092396e-09, "logits/chosen": -2.9905054569244385, "logits/rejected": -2.9512736797332764, "logps/chosen": -58.15850067138672, "logps/rejected": -54.942726135253906, "loss": 0.6852, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0270744226872921, "rewards/margins": 0.016645211726427078, "rewards/rejected": -0.04371963441371918, "step": 7400 }, { "epoch": 1.2749827705031014, "eval_logits/chosen": -3.123706579208374, "eval_logits/rejected": -3.1180148124694824, "eval_logps/chosen": -59.22597885131836, "eval_logps/rejected": -64.24317169189453, "eval_loss": 0.6905500888824463, "eval_rewards/accuracies": 0.5789963006973267, "eval_rewards/chosen": -0.005140796769410372, "eval_rewards/margins": 0.005489727016538382, "eval_rewards/rejected": -0.010630524717271328, "eval_runtime": 384.0969, "eval_samples_per_second": 11.206, "eval_steps_per_second": 1.401, "step": 7400 }, { "epoch": 1.2767057201929704, "grad_norm": 2.598616361618042, "learning_rate": 6.964793077864876e-09, "logits/chosen": -2.9563305377960205, "logits/rejected": -2.9255309104919434, "logps/chosen": -56.2224006652832, "logps/rejected": -57.120323181152344, "loss": 0.6854, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.02466033585369587, "rewards/margins": 0.01632571592926979, "rewards/rejected": -0.04098604992032051, "step": 7410 }, { "epoch": 1.2784286698828393, "grad_norm": 2.6292760372161865, "learning_rate": 6.936153762247254e-09, "logits/chosen": -2.8979477882385254, "logits/rejected": -2.8714406490325928, "logps/chosen": -58.17467498779297, "logps/rejected": -56.69647216796875, "loss": 0.6855, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.020067289471626282, "rewards/margins": 0.01622116006910801, "rewards/rejected": -0.03628844767808914, "step": 7420 }, { "epoch": 1.2801516195727085, "grad_norm": 2.500096321105957, "learning_rate": 6.907542153227073e-09, "logits/chosen": -2.9541492462158203, "logits/rejected": -2.9250648021698, "logps/chosen": -56.53348922729492, "logps/rejected": -56.31572341918945, "loss": 0.6869, "rewards/accuracies": 0.59375, "rewards/chosen": -0.031292449682950974, "rewards/margins": 0.013092470355331898, "rewards/rejected": -0.044384922832250595, "step": 7430 }, { "epoch": 1.2818745692625775, "grad_norm": 2.392944812774658, "learning_rate": 6.878958509541311e-09, "logits/chosen": -3.0708324909210205, "logits/rejected": -3.0418925285339355, "logps/chosen": -58.69022750854492, "logps/rejected": -58.29036331176758, "loss": 0.6824, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.026854073628783226, "rewards/margins": 0.02240685746073723, "rewards/rejected": -0.049260932952165604, "step": 7440 }, { "epoch": 1.2835975189524467, "grad_norm": 2.7048799991607666, "learning_rate": 6.850403089674067e-09, "logits/chosen": -3.134178876876831, "logits/rejected": -3.1060986518859863, "logps/chosen": -58.184654235839844, "logps/rejected": -56.81584930419922, "loss": 0.6839, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.02258859947323799, "rewards/margins": 0.01950298808515072, "rewards/rejected": -0.04209158942103386, "step": 7450 }, { "epoch": 1.2853204686423156, "grad_norm": 2.316319704055786, "learning_rate": 6.8218761518541916e-09, "logits/chosen": -2.9367308616638184, "logits/rejected": -2.9404594898223877, "logps/chosen": -53.09021759033203, "logps/rejected": -57.01831817626953, "loss": 0.688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02927972748875618, "rewards/margins": 0.011012665927410126, "rewards/rejected": -0.040292393416166306, "step": 7460 }, { "epoch": 1.2870434183321846, "grad_norm": 2.487236499786377, "learning_rate": 6.793377954052989e-09, "logits/chosen": -3.029714584350586, "logits/rejected": -3.012195587158203, "logps/chosen": -57.121620178222656, "logps/rejected": -55.000083923339844, "loss": 0.6875, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.02466987445950508, "rewards/margins": 0.012196965515613556, "rewards/rejected": -0.03686683997511864, "step": 7470 }, { "epoch": 1.2887663680220538, "grad_norm": 2.2682740688323975, "learning_rate": 6.764908753981844e-09, "logits/chosen": -2.9855077266693115, "logits/rejected": -2.9578702449798584, "logps/chosen": -57.35419845581055, "logps/rejected": -53.1810417175293, "loss": 0.6873, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.027218962088227272, "rewards/margins": 0.0126141756772995, "rewards/rejected": -0.03983313590288162, "step": 7480 }, { "epoch": 1.2904893177119228, "grad_norm": 2.6248104572296143, "learning_rate": 6.7364688090899395e-09, "logits/chosen": -2.955197811126709, "logits/rejected": -2.941403865814209, "logps/chosen": -55.307899475097656, "logps/rejected": -57.485992431640625, "loss": 0.6856, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.027474090456962585, "rewards/margins": 0.015908868983387947, "rewards/rejected": -0.04338296130299568, "step": 7490 }, { "epoch": 1.292212267401792, "grad_norm": 2.2009220123291016, "learning_rate": 6.708058376561879e-09, "logits/chosen": -2.9785373210906982, "logits/rejected": -2.9540696144104004, "logps/chosen": -54.05745315551758, "logps/rejected": -55.798377990722656, "loss": 0.6873, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.025832027196884155, "rewards/margins": 0.012369804084300995, "rewards/rejected": -0.03820183128118515, "step": 7500 }, { "epoch": 1.292212267401792, "eval_logits/chosen": -3.123053789138794, "eval_logits/rejected": -3.1173923015594482, "eval_logps/chosen": -59.25996017456055, "eval_logps/rejected": -64.27454376220703, "eval_loss": 0.6905677318572998, "eval_rewards/accuracies": 0.5859665274620056, "eval_rewards/chosen": -0.0054806615225970745, "eval_rewards/margins": 0.005463543813675642, "eval_rewards/rejected": -0.010944206267595291, "eval_runtime": 384.1014, "eval_samples_per_second": 11.205, "eval_steps_per_second": 1.401, "step": 7500 }, { "epoch": 1.293935217091661, "grad_norm": 2.403428554534912, "learning_rate": 6.6796777133153885e-09, "logits/chosen": -3.0138533115386963, "logits/rejected": -2.9797167778015137, "logps/chosen": -59.300132751464844, "logps/rejected": -55.43413162231445, "loss": 0.6828, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.020390545949339867, "rewards/margins": 0.021740619093179703, "rewards/rejected": -0.04213116317987442, "step": 7510 }, { "epoch": 1.29565816678153, "grad_norm": 2.5295517444610596, "learning_rate": 6.651327075999e-09, "logits/chosen": -2.9840564727783203, "logits/rejected": -2.963972330093384, "logps/chosen": -55.64642333984375, "logps/rejected": -57.811431884765625, "loss": 0.6874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02718658745288849, "rewards/margins": 0.01235284935683012, "rewards/rejected": -0.039539434015750885, "step": 7520 }, { "epoch": 1.297381116471399, "grad_norm": 2.5545949935913086, "learning_rate": 6.623006720989699e-09, "logits/chosen": -2.927830934524536, "logits/rejected": -2.926427125930786, "logps/chosen": -54.93427276611328, "logps/rejected": -57.257972717285156, "loss": 0.6881, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02716946043074131, "rewards/margins": 0.011054610833525658, "rewards/rejected": -0.03822406753897667, "step": 7530 }, { "epoch": 1.299104066161268, "grad_norm": 2.114025592803955, "learning_rate": 6.594716904390648e-09, "logits/chosen": -3.1067121028900146, "logits/rejected": -3.099963903427124, "logps/chosen": -53.91367721557617, "logps/rejected": -56.411949157714844, "loss": 0.6866, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.02373148314654827, "rewards/margins": 0.013630990870296955, "rewards/rejected": -0.0373624712228775, "step": 7540 }, { "epoch": 1.3008270158511372, "grad_norm": 2.553532838821411, "learning_rate": 6.566457882028829e-09, "logits/chosen": -3.0206680297851562, "logits/rejected": -3.0002551078796387, "logps/chosen": -54.42189407348633, "logps/rejected": -56.299537658691406, "loss": 0.6843, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.026311367750167847, "rewards/margins": 0.018551740795373917, "rewards/rejected": -0.04486311227083206, "step": 7550 }, { "epoch": 1.3025499655410062, "grad_norm": 2.7336137294769287, "learning_rate": 6.5382299094527595e-09, "logits/chosen": -3.0959246158599854, "logits/rejected": -3.076599359512329, "logps/chosen": -54.67638397216797, "logps/rejected": -59.42815399169922, "loss": 0.6837, "rewards/accuracies": 0.65625, "rewards/chosen": -0.02101433463394642, "rewards/margins": 0.019638245925307274, "rewards/rejected": -0.04065258055925369, "step": 7560 }, { "epoch": 1.3042729152308752, "grad_norm": 2.2857635021209717, "learning_rate": 6.510033241930166e-09, "logits/chosen": -3.0978853702545166, "logits/rejected": -3.079770565032959, "logps/chosen": -59.495140075683594, "logps/rejected": -56.71014404296875, "loss": 0.6849, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0269808117300272, "rewards/margins": 0.017586104571819305, "rewards/rejected": -0.04456691816449165, "step": 7570 }, { "epoch": 1.3059958649207444, "grad_norm": 2.645477533340454, "learning_rate": 6.48186813444569e-09, "logits/chosen": -3.0349011421203613, "logits/rejected": -3.0162124633789062, "logps/chosen": -57.695037841796875, "logps/rejected": -58.27811813354492, "loss": 0.6838, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.016813058406114578, "rewards/margins": 0.019382277503609657, "rewards/rejected": -0.036195337772369385, "step": 7580 }, { "epoch": 1.3077188146106133, "grad_norm": 2.9784748554229736, "learning_rate": 6.4537348416985586e-09, "logits/chosen": -3.082174777984619, "logits/rejected": -3.0377144813537598, "logps/chosen": -60.8484992980957, "logps/rejected": -56.35919189453125, "loss": 0.6816, "rewards/accuracies": 0.65625, "rewards/chosen": -0.025256508961319923, "rewards/margins": 0.02421494387090206, "rewards/rejected": -0.04947146028280258, "step": 7590 }, { "epoch": 1.3094417643004825, "grad_norm": 2.295466184616089, "learning_rate": 6.425633618100315e-09, "logits/chosen": -3.045626163482666, "logits/rejected": -3.0147712230682373, "logps/chosen": -55.79792404174805, "logps/rejected": -53.15778350830078, "loss": 0.6871, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03260722756385803, "rewards/margins": 0.013006548397243023, "rewards/rejected": -0.04561377689242363, "step": 7600 }, { "epoch": 1.3094417643004825, "eval_logits/chosen": -3.122342586517334, "eval_logits/rejected": -3.1166090965270996, "eval_logps/chosen": -59.26426696777344, "eval_logps/rejected": -64.30005645751953, "eval_loss": 0.6904650926589966, "eval_rewards/accuracies": 0.5829461216926575, "eval_rewards/chosen": -0.005523705389350653, "eval_rewards/margins": 0.005675605032593012, "eval_rewards/rejected": -0.011199310421943665, "eval_runtime": 383.9608, "eval_samples_per_second": 11.209, "eval_steps_per_second": 1.401, "step": 7600 }, { "epoch": 1.3111647139903515, "grad_norm": 2.2035515308380127, "learning_rate": 6.397564717772479e-09, "logits/chosen": -2.997570753097534, "logits/rejected": -2.9652457237243652, "logps/chosen": -55.637603759765625, "logps/rejected": -54.816688537597656, "loss": 0.6842, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.027262642979621887, "rewards/margins": 0.018677236512303352, "rewards/rejected": -0.04593988507986069, "step": 7610 }, { "epoch": 1.3128876636802205, "grad_norm": 2.3824641704559326, "learning_rate": 6.369528394544282e-09, "logits/chosen": -3.0388424396514893, "logits/rejected": -3.010739803314209, "logps/chosen": -59.684471130371094, "logps/rejected": -57.155296325683594, "loss": 0.6876, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02171112224459648, "rewards/margins": 0.01191165205091238, "rewards/rejected": -0.033622775226831436, "step": 7620 }, { "epoch": 1.3146106133700897, "grad_norm": 2.6214728355407715, "learning_rate": 6.341524901950352e-09, "logits/chosen": -2.972496509552002, "logits/rejected": -2.986039400100708, "logps/chosen": -52.67688751220703, "logps/rejected": -57.54160690307617, "loss": 0.6902, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.028438786044716835, "rewards/margins": 0.0066140033304691315, "rewards/rejected": -0.03505278751254082, "step": 7630 }, { "epoch": 1.3163335630599586, "grad_norm": 2.7644217014312744, "learning_rate": 6.3135544932284304e-09, "logits/chosen": -2.9409775733947754, "logits/rejected": -2.917985677719116, "logps/chosen": -60.52558135986328, "logps/rejected": -56.77556610107422, "loss": 0.6848, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02607950009405613, "rewards/margins": 0.01758510060608387, "rewards/rejected": -0.0436645969748497, "step": 7640 }, { "epoch": 1.3180565127498278, "grad_norm": 2.5142641067504883, "learning_rate": 6.2856174213170735e-09, "logits/chosen": -3.073223114013672, "logits/rejected": -3.048750162124634, "logps/chosen": -61.158958435058594, "logps/rejected": -60.50822067260742, "loss": 0.6838, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.027626430615782738, "rewards/margins": 0.019644903019070625, "rewards/rejected": -0.04727133363485336, "step": 7650 }, { "epoch": 1.3197794624396968, "grad_norm": 2.8654909133911133, "learning_rate": 6.25771393885338e-09, "logits/chosen": -3.0456790924072266, "logits/rejected": -3.005376100540161, "logps/chosen": -59.436973571777344, "logps/rejected": -59.17437744140625, "loss": 0.6836, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.024066317826509476, "rewards/margins": 0.019956592470407486, "rewards/rejected": -0.04402291029691696, "step": 7660 }, { "epoch": 1.3215024121295658, "grad_norm": 2.4010403156280518, "learning_rate": 6.229844298170681e-09, "logits/chosen": -2.9991161823272705, "logits/rejected": -2.9706664085388184, "logps/chosen": -60.234466552734375, "logps/rejected": -57.38881301879883, "loss": 0.6849, "rewards/accuracies": 0.59375, "rewards/chosen": -0.026901472359895706, "rewards/margins": 0.017608607187867165, "rewards/rejected": -0.04451008141040802, "step": 7670 }, { "epoch": 1.323225361819435, "grad_norm": 2.2777979373931885, "learning_rate": 6.202008751296293e-09, "logits/chosen": -3.074632167816162, "logits/rejected": -3.056203842163086, "logps/chosen": -53.30518341064453, "logps/rejected": -53.90324783325195, "loss": 0.6859, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.026688244193792343, "rewards/margins": 0.015266289934515953, "rewards/rejected": -0.041954535990953445, "step": 7680 }, { "epoch": 1.324948311509304, "grad_norm": 2.355191230773926, "learning_rate": 6.174207549949205e-09, "logits/chosen": -3.1611850261688232, "logits/rejected": -3.1127707958221436, "logps/chosen": -56.6767463684082, "logps/rejected": -52.805686950683594, "loss": 0.6833, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.016922805458307266, "rewards/margins": 0.020582620054483414, "rewards/rejected": -0.03750542551279068, "step": 7690 }, { "epoch": 1.3266712611991731, "grad_norm": 2.563619613647461, "learning_rate": 6.146440945537821e-09, "logits/chosen": -3.015319585800171, "logits/rejected": -2.9848389625549316, "logps/chosen": -56.29510498046875, "logps/rejected": -57.7125244140625, "loss": 0.6865, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02584684267640114, "rewards/margins": 0.01425609178841114, "rewards/rejected": -0.04010293632745743, "step": 7700 }, { "epoch": 1.3266712611991731, "eval_logits/chosen": -3.121819496154785, "eval_logits/rejected": -3.1161410808563232, "eval_logps/chosen": -59.28498458862305, "eval_logps/rejected": -64.3291015625, "eval_loss": 0.690426766872406, "eval_rewards/accuracies": 0.5845724940299988, "eval_rewards/chosen": -0.005730922799557447, "eval_rewards/margins": 0.005758913233876228, "eval_rewards/rejected": -0.011489835567772388, "eval_runtime": 384.1336, "eval_samples_per_second": 11.204, "eval_steps_per_second": 1.401, "step": 7700 }, { "epoch": 1.328394210889042, "grad_norm": 2.249321699142456, "learning_rate": 6.1187091891576855e-09, "logits/chosen": -3.061793565750122, "logits/rejected": -3.0465798377990723, "logps/chosen": -54.94254684448242, "logps/rejected": -55.14897918701172, "loss": 0.6893, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.033700019121170044, "rewards/margins": 0.008526310324668884, "rewards/rejected": -0.04222633317112923, "step": 7710 }, { "epoch": 1.330117160578911, "grad_norm": 2.452054023742676, "learning_rate": 6.091012531589198e-09, "logits/chosen": -3.036054849624634, "logits/rejected": -2.9991581439971924, "logps/chosen": -59.057716369628906, "logps/rejected": -55.31732940673828, "loss": 0.6857, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.029151711612939835, "rewards/margins": 0.01590500771999359, "rewards/rejected": -0.045056719332933426, "step": 7720 }, { "epoch": 1.33184011026878, "grad_norm": 2.3000831604003906, "learning_rate": 6.063351223295377e-09, "logits/chosen": -2.992392063140869, "logits/rejected": -2.9694530963897705, "logps/chosen": -56.845069885253906, "logps/rejected": -56.6755256652832, "loss": 0.6861, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.029227161779999733, "rewards/margins": 0.014832262881100178, "rewards/rejected": -0.044059425592422485, "step": 7730 }, { "epoch": 1.3335630599586492, "grad_norm": 2.3779852390289307, "learning_rate": 6.035725514419554e-09, "logits/chosen": -2.9875688552856445, "logits/rejected": -2.971534013748169, "logps/chosen": -54.39215087890625, "logps/rejected": -58.52937698364258, "loss": 0.6833, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.02687232568860054, "rewards/margins": 0.020616920664906502, "rewards/rejected": -0.047489240765571594, "step": 7740 }, { "epoch": 1.3352860096485184, "grad_norm": 2.277998447418213, "learning_rate": 6.008135654783151e-09, "logits/chosen": -3.0029854774475098, "logits/rejected": -2.968958854675293, "logps/chosen": -56.569854736328125, "logps/rejected": -57.06427001953125, "loss": 0.6825, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.026270601898431778, "rewards/margins": 0.02241623029112816, "rewards/rejected": -0.04868683964014053, "step": 7750 }, { "epoch": 1.3370089593383874, "grad_norm": 2.386118173599243, "learning_rate": 5.980581893883383e-09, "logits/chosen": -2.9068267345428467, "logits/rejected": -2.8905227184295654, "logps/chosen": -53.168365478515625, "logps/rejected": -55.826568603515625, "loss": 0.6862, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.026616433635354042, "rewards/margins": 0.01475044060498476, "rewards/rejected": -0.04136687144637108, "step": 7760 }, { "epoch": 1.3387319090282563, "grad_norm": 2.473367214202881, "learning_rate": 5.95306448089104e-09, "logits/chosen": -2.953372001647949, "logits/rejected": -2.928014039993286, "logps/chosen": -54.80479049682617, "logps/rejected": -56.73512649536133, "loss": 0.6858, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.026885952800512314, "rewards/margins": 0.015338996425271034, "rewards/rejected": -0.0422249510884285, "step": 7770 }, { "epoch": 1.3404548587181253, "grad_norm": 2.2375106811523438, "learning_rate": 5.925583664648201e-09, "logits/chosen": -3.116488456726074, "logits/rejected": -3.0995609760284424, "logps/chosen": -54.901405334472656, "logps/rejected": -58.4135856628418, "loss": 0.6878, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.033624183386564255, "rewards/margins": 0.011648926883935928, "rewards/rejected": -0.04527311399579048, "step": 7780 }, { "epoch": 1.3421778084079945, "grad_norm": 2.5394904613494873, "learning_rate": 5.898139693666007e-09, "logits/chosen": -3.045893907546997, "logits/rejected": -3.030491352081299, "logps/chosen": -54.684173583984375, "logps/rejected": -56.65555953979492, "loss": 0.6853, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.031137654557824135, "rewards/margins": 0.016448350623250008, "rewards/rejected": -0.047586001455783844, "step": 7790 }, { "epoch": 1.3439007580978635, "grad_norm": 2.456921100616455, "learning_rate": 5.870732816122394e-09, "logits/chosen": -3.0814521312713623, "logits/rejected": -3.0761094093322754, "logps/chosen": -57.13507843017578, "logps/rejected": -58.82529830932617, "loss": 0.6888, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03154601901769638, "rewards/margins": 0.009393574669957161, "rewards/rejected": -0.040939588099718094, "step": 7800 }, { "epoch": 1.3439007580978635, "eval_logits/chosen": -3.1217851638793945, "eval_logits/rejected": -3.1161201000213623, "eval_logps/chosen": -59.319210052490234, "eval_logps/rejected": -64.35895538330078, "eval_loss": 0.6904501914978027, "eval_rewards/accuracies": 0.5820167064666748, "eval_rewards/chosen": -0.006073120515793562, "eval_rewards/margins": 0.005715163890272379, "eval_rewards/rejected": -0.01178828440606594, "eval_runtime": 383.7253, "eval_samples_per_second": 11.216, "eval_steps_per_second": 1.402, "step": 7800 }, { "epoch": 1.3456237077877327, "grad_norm": 2.6416800022125244, "learning_rate": 5.843363279859875e-09, "logits/chosen": -3.0954296588897705, "logits/rejected": -3.067091464996338, "logps/chosen": -61.6202507019043, "logps/rejected": -58.48479080200195, "loss": 0.6879, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.026026686653494835, "rewards/margins": 0.011260555125772953, "rewards/rejected": -0.03728724271059036, "step": 7810 }, { "epoch": 1.3473466574776016, "grad_norm": 2.322619676589966, "learning_rate": 5.816031332383267e-09, "logits/chosen": -3.060593605041504, "logits/rejected": -3.0286638736724854, "logps/chosen": -58.142189025878906, "logps/rejected": -59.22846603393555, "loss": 0.6829, "rewards/accuracies": 0.65625, "rewards/chosen": -0.01776704005897045, "rewards/margins": 0.02139130048453808, "rewards/rejected": -0.03915834426879883, "step": 7820 }, { "epoch": 1.3490696071674706, "grad_norm": 2.551548957824707, "learning_rate": 5.788737220857479e-09, "logits/chosen": -2.9640679359436035, "logits/rejected": -2.9546852111816406, "logps/chosen": -52.7246208190918, "logps/rejected": -58.02629852294922, "loss": 0.6852, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.029245715588331223, "rewards/margins": 0.016737982630729675, "rewards/rejected": -0.045983701944351196, "step": 7830 }, { "epoch": 1.3507925568573398, "grad_norm": 2.1704752445220947, "learning_rate": 5.76148119210526e-09, "logits/chosen": -2.958341360092163, "logits/rejected": -2.936945676803589, "logps/chosen": -56.746986389160156, "logps/rejected": -58.57280731201172, "loss": 0.6878, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.031794894486665726, "rewards/margins": 0.011606425978243351, "rewards/rejected": -0.0434013195335865, "step": 7840 }, { "epoch": 1.3525155065472088, "grad_norm": 2.541645050048828, "learning_rate": 5.734263492604981e-09, "logits/chosen": -2.9532153606414795, "logits/rejected": -2.91933012008667, "logps/chosen": -58.54957962036133, "logps/rejected": -52.94072341918945, "loss": 0.6856, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.029725080356001854, "rewards/margins": 0.01591680757701397, "rewards/rejected": -0.04564188793301582, "step": 7850 }, { "epoch": 1.354238456237078, "grad_norm": 2.3668036460876465, "learning_rate": 5.70708436848839e-09, "logits/chosen": -2.987510919570923, "logits/rejected": -2.9515819549560547, "logps/chosen": -55.26435089111328, "logps/rejected": -57.63298416137695, "loss": 0.6805, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.023679155856370926, "rewards/margins": 0.02612263336777687, "rewards/rejected": -0.0498017817735672, "step": 7860 }, { "epoch": 1.355961405926947, "grad_norm": 2.4281444549560547, "learning_rate": 5.679944065538403e-09, "logits/chosen": -3.0099899768829346, "logits/rejected": -2.9925038814544678, "logps/chosen": -58.16576385498047, "logps/rejected": -58.61614227294922, "loss": 0.6847, "rewards/accuracies": 0.625, "rewards/chosen": -0.02491823211312294, "rewards/margins": 0.01760999858379364, "rewards/rejected": -0.04252823442220688, "step": 7870 }, { "epoch": 1.3576843556168159, "grad_norm": 2.4042530059814453, "learning_rate": 5.652842829186866e-09, "logits/chosen": -3.0708887577056885, "logits/rejected": -3.0550119876861572, "logps/chosen": -55.79817581176758, "logps/rejected": -56.93279266357422, "loss": 0.685, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.026845788583159447, "rewards/margins": 0.01735367253422737, "rewards/rejected": -0.04419945925474167, "step": 7880 }, { "epoch": 1.359407305306685, "grad_norm": 2.446213722229004, "learning_rate": 5.625780904512352e-09, "logits/chosen": -3.010209560394287, "logits/rejected": -2.9910619258880615, "logps/chosen": -56.8497428894043, "logps/rejected": -58.90478515625, "loss": 0.6877, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.024736952036619186, "rewards/margins": 0.011801651678979397, "rewards/rejected": -0.036538608372211456, "step": 7890 }, { "epoch": 1.361130254996554, "grad_norm": 2.7450051307678223, "learning_rate": 5.598758536237917e-09, "logits/chosen": -2.999788284301758, "logits/rejected": -2.996474504470825, "logps/chosen": -55.325157165527344, "logps/rejected": -58.1741943359375, "loss": 0.6868, "rewards/accuracies": 0.59375, "rewards/chosen": -0.031489770859479904, "rewards/margins": 0.013761959969997406, "rewards/rejected": -0.04525173455476761, "step": 7900 }, { "epoch": 1.361130254996554, "eval_logits/chosen": -3.1220314502716064, "eval_logits/rejected": -3.11635160446167, "eval_logps/chosen": -59.33341979980469, "eval_logps/rejected": -64.38567352294922, "eval_loss": 0.6903904676437378, "eval_rewards/accuracies": 0.5845724940299988, "eval_rewards/chosen": -0.006215228233486414, "eval_rewards/margins": 0.005840308964252472, "eval_rewards/rejected": -0.012055537663400173, "eval_runtime": 383.7646, "eval_samples_per_second": 11.215, "eval_steps_per_second": 1.402, "step": 7900 }, { "epoch": 1.3628532046864232, "grad_norm": 2.488607883453369, "learning_rate": 5.571775968728934e-09, "logits/chosen": -3.006234645843506, "logits/rejected": -2.9830057621002197, "logps/chosen": -59.74515914916992, "logps/rejected": -57.955780029296875, "loss": 0.6867, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.029217317700386047, "rewards/margins": 0.013838951475918293, "rewards/rejected": -0.043056271970272064, "step": 7910 }, { "epoch": 1.3645761543762922, "grad_norm": 2.591261863708496, "learning_rate": 5.544833445990827e-09, "logits/chosen": -3.0004377365112305, "logits/rejected": -2.9761016368865967, "logps/chosen": -57.81911087036133, "logps/rejected": -55.67632293701172, "loss": 0.6874, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02998395264148712, "rewards/margins": 0.012168223969638348, "rewards/rejected": -0.042152177542448044, "step": 7920 }, { "epoch": 1.3662991040661612, "grad_norm": 2.607875347137451, "learning_rate": 5.517931211666907e-09, "logits/chosen": -3.067615270614624, "logits/rejected": -3.0388498306274414, "logps/chosen": -60.940086364746094, "logps/rejected": -56.713706970214844, "loss": 0.6851, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.026055265218019485, "rewards/margins": 0.017093230038881302, "rewards/rejected": -0.04314848780632019, "step": 7930 }, { "epoch": 1.3680220537560304, "grad_norm": 2.8139894008636475, "learning_rate": 5.491069509036151e-09, "logits/chosen": -2.9540796279907227, "logits/rejected": -2.9387428760528564, "logps/chosen": -61.240760803222656, "logps/rejected": -59.60124588012695, "loss": 0.6851, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.026804599910974503, "rewards/margins": 0.01700502447783947, "rewards/rejected": -0.04380962252616882, "step": 7940 }, { "epoch": 1.3697450034458993, "grad_norm": 2.6647346019744873, "learning_rate": 5.464248581011002e-09, "logits/chosen": -2.9159152507781982, "logits/rejected": -2.9074153900146484, "logps/chosen": -54.17744827270508, "logps/rejected": -57.369102478027344, "loss": 0.6864, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02671412192285061, "rewards/margins": 0.014292205683887005, "rewards/rejected": -0.04100632667541504, "step": 7950 }, { "epoch": 1.3714679531357685, "grad_norm": 2.25141978263855, "learning_rate": 5.4374686701351815e-09, "logits/chosen": -2.9339842796325684, "logits/rejected": -2.905107021331787, "logps/chosen": -53.44514846801758, "logps/rejected": -56.796424865722656, "loss": 0.6869, "rewards/accuracies": 0.59375, "rewards/chosen": -0.029661059379577637, "rewards/margins": 0.013453202322125435, "rewards/rejected": -0.04311426356434822, "step": 7960 }, { "epoch": 1.3731909028256375, "grad_norm": 2.0555899143218994, "learning_rate": 5.410730018581482e-09, "logits/chosen": -3.035153865814209, "logits/rejected": -3.003742218017578, "logps/chosen": -58.52336502075195, "logps/rejected": -56.82777786254883, "loss": 0.6855, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.031260039657354355, "rewards/margins": 0.016300015151500702, "rewards/rejected": -0.047560058534145355, "step": 7970 }, { "epoch": 1.3749138525155065, "grad_norm": 2.8370964527130127, "learning_rate": 5.384032868149595e-09, "logits/chosen": -3.0796780586242676, "logits/rejected": -3.0659830570220947, "logps/chosen": -59.4162483215332, "logps/rejected": -60.17753219604492, "loss": 0.6859, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02536647394299507, "rewards/margins": 0.015350818634033203, "rewards/rejected": -0.040717292577028275, "step": 7980 }, { "epoch": 1.3766368022053757, "grad_norm": 2.404433488845825, "learning_rate": 5.357377460263893e-09, "logits/chosen": -3.1086089611053467, "logits/rejected": -3.0962395668029785, "logps/chosen": -53.78315353393555, "logps/rejected": -55.634429931640625, "loss": 0.6892, "rewards/accuracies": 0.53125, "rewards/chosen": -0.031159456819295883, "rewards/margins": 0.008595505729317665, "rewards/rejected": -0.0397549606859684, "step": 7990 }, { "epoch": 1.3783597518952446, "grad_norm": 2.7585387229919434, "learning_rate": 5.330764035971298e-09, "logits/chosen": -2.9658491611480713, "logits/rejected": -2.9562182426452637, "logps/chosen": -57.27336502075195, "logps/rejected": -60.18293380737305, "loss": 0.6876, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.02755850926041603, "rewards/margins": 0.011846770532429218, "rewards/rejected": -0.039405278861522675, "step": 8000 }, { "epoch": 1.3783597518952446, "eval_logits/chosen": -3.1204283237457275, "eval_logits/rejected": -3.1147513389587402, "eval_logps/chosen": -59.34055709838867, "eval_logps/rejected": -64.40652465820312, "eval_loss": 0.6903232932090759, "eval_rewards/accuracies": 0.5838754773139954, "eval_rewards/chosen": -0.0062866369262337685, "eval_rewards/margins": 0.005977442022413015, "eval_rewards/rejected": -0.012264078482985497, "eval_runtime": 383.7449, "eval_samples_per_second": 11.216, "eval_steps_per_second": 1.402, "step": 8000 }, { "epoch": 1.3800827015851138, "grad_norm": 2.6549692153930664, "learning_rate": 5.3041928359390415e-09, "logits/chosen": -2.9890923500061035, "logits/rejected": -2.9675803184509277, "logps/chosen": -59.91522216796875, "logps/rejected": -56.33888626098633, "loss": 0.6879, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.027553830295801163, "rewards/margins": 0.011601470410823822, "rewards/rejected": -0.039155296981334686, "step": 8010 }, { "epoch": 1.3818056512749828, "grad_norm": 2.279928684234619, "learning_rate": 5.277664100452546e-09, "logits/chosen": -3.0127570629119873, "logits/rejected": -2.9724864959716797, "logps/chosen": -63.12485885620117, "logps/rejected": -59.47083282470703, "loss": 0.6845, "rewards/accuracies": 0.59375, "rewards/chosen": -0.028870245441794395, "rewards/margins": 0.0184099730104208, "rewards/rejected": -0.047280218452215195, "step": 8020 }, { "epoch": 1.3835286009648518, "grad_norm": 2.4220051765441895, "learning_rate": 5.251178069413196e-09, "logits/chosen": -3.0053482055664062, "logits/rejected": -2.9898781776428223, "logps/chosen": -53.86786651611328, "logps/rejected": -57.864479064941406, "loss": 0.685, "rewards/accuracies": 0.625, "rewards/chosen": -0.02724533900618553, "rewards/margins": 0.01722780428826809, "rewards/rejected": -0.04447314515709877, "step": 8030 }, { "epoch": 1.385251550654721, "grad_norm": 2.450390100479126, "learning_rate": 5.224734982336216e-09, "logits/chosen": -2.96675443649292, "logits/rejected": -2.934190511703491, "logps/chosen": -57.38630294799805, "logps/rejected": -56.2953987121582, "loss": 0.6871, "rewards/accuracies": 0.5625, "rewards/chosen": -0.029596591368317604, "rewards/margins": 0.013075938448309898, "rewards/rejected": -0.0426725298166275, "step": 8040 }, { "epoch": 1.38697450034459, "grad_norm": 2.611423969268799, "learning_rate": 5.198335078348475e-09, "logits/chosen": -3.0320591926574707, "logits/rejected": -3.0146846771240234, "logps/chosen": -57.07909393310547, "logps/rejected": -59.59125900268555, "loss": 0.6847, "rewards/accuracies": 0.625, "rewards/chosen": -0.019715677946805954, "rewards/margins": 0.017734985798597336, "rewards/rejected": -0.03745066374540329, "step": 8050 }, { "epoch": 1.388697450034459, "grad_norm": 2.637826919555664, "learning_rate": 5.171978596186342e-09, "logits/chosen": -3.0777766704559326, "logits/rejected": -3.0379884243011475, "logps/chosen": -57.865684509277344, "logps/rejected": -55.034271240234375, "loss": 0.6816, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.025270912796258926, "rewards/margins": 0.024199113249778748, "rewards/rejected": -0.049470026046037674, "step": 8060 }, { "epoch": 1.390420399724328, "grad_norm": 2.563366651535034, "learning_rate": 5.145665774193511e-09, "logits/chosen": -2.9853625297546387, "logits/rejected": -2.954200267791748, "logps/chosen": -56.14939498901367, "logps/rejected": -54.62565231323242, "loss": 0.6838, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02691097930073738, "rewards/margins": 0.01969815418124199, "rewards/rejected": -0.04660913720726967, "step": 8070 }, { "epoch": 1.392143349414197, "grad_norm": 2.5079445838928223, "learning_rate": 5.1193968503188584e-09, "logits/chosen": -2.965848445892334, "logits/rejected": -2.9610848426818848, "logps/chosen": -55.743553161621094, "logps/rejected": -61.7056884765625, "loss": 0.6887, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.02757110632956028, "rewards/margins": 0.009862509556114674, "rewards/rejected": -0.03743361681699753, "step": 8080 }, { "epoch": 1.3938662991040662, "grad_norm": 2.365835428237915, "learning_rate": 5.093172062114284e-09, "logits/chosen": -2.9592156410217285, "logits/rejected": -2.9305264949798584, "logps/chosen": -55.8453369140625, "logps/rejected": -56.141029357910156, "loss": 0.6838, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.027269789949059486, "rewards/margins": 0.019605975598096848, "rewards/rejected": -0.046875763684511185, "step": 8090 }, { "epoch": 1.3955892487939352, "grad_norm": 2.5415031909942627, "learning_rate": 5.066991646732575e-09, "logits/chosen": -3.0153441429138184, "logits/rejected": -3.0083537101745605, "logps/chosen": -56.55889892578125, "logps/rejected": -61.67368698120117, "loss": 0.688, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.024770287796854973, "rewards/margins": 0.011040473356842995, "rewards/rejected": -0.03581076115369797, "step": 8100 }, { "epoch": 1.3955892487939352, "eval_logits/chosen": -3.120063304901123, "eval_logits/rejected": -3.1143927574157715, "eval_logps/chosen": -59.36701583862305, "eval_logps/rejected": -64.42515563964844, "eval_loss": 0.6903651356697083, "eval_rewards/accuracies": 0.5831784605979919, "eval_rewards/chosen": -0.006551176775246859, "eval_rewards/margins": 0.0058991494588553905, "eval_rewards/rejected": -0.01245032623410225, "eval_runtime": 384.1938, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 8100 }, { "epoch": 1.3973121984838044, "grad_norm": 2.5754048824310303, "learning_rate": 5.040855840925227e-09, "logits/chosen": -3.0326080322265625, "logits/rejected": -2.9937872886657715, "logps/chosen": -58.87183380126953, "logps/rejected": -55.71844482421875, "loss": 0.6843, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.028102803975343704, "rewards/margins": 0.01863877847790718, "rewards/rejected": -0.04674157872796059, "step": 8110 }, { "epoch": 1.3990351481736734, "grad_norm": 2.5665032863616943, "learning_rate": 5.014764881040364e-09, "logits/chosen": -2.9887478351593018, "logits/rejected": -2.9701955318450928, "logps/chosen": -56.5562744140625, "logps/rejected": -56.73915481567383, "loss": 0.6865, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.024538787081837654, "rewards/margins": 0.014023144729435444, "rewards/rejected": -0.038561929017305374, "step": 8120 }, { "epoch": 1.4007580978635423, "grad_norm": 2.2508840560913086, "learning_rate": 4.98871900302053e-09, "logits/chosen": -3.002256155014038, "logits/rejected": -2.98468279838562, "logps/chosen": -57.396080017089844, "logps/rejected": -54.70935821533203, "loss": 0.6859, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02575969137251377, "rewards/margins": 0.015312038362026215, "rewards/rejected": -0.041071731597185135, "step": 8130 }, { "epoch": 1.4024810475534115, "grad_norm": 2.453939199447632, "learning_rate": 4.962718442400611e-09, "logits/chosen": -2.9657180309295654, "logits/rejected": -2.939147710800171, "logps/chosen": -55.78546142578125, "logps/rejected": -56.93006134033203, "loss": 0.6883, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.030585378408432007, "rewards/margins": 0.010639393702149391, "rewards/rejected": -0.04122477397322655, "step": 8140 }, { "epoch": 1.4042039972432805, "grad_norm": 2.562664747238159, "learning_rate": 4.9367634343056786e-09, "logits/chosen": -3.0493216514587402, "logits/rejected": -3.028538703918457, "logps/chosen": -55.07560348510742, "logps/rejected": -57.47492599487305, "loss": 0.6862, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.029278188943862915, "rewards/margins": 0.014732874929904938, "rewards/rejected": -0.04401106387376785, "step": 8150 }, { "epoch": 1.4059269469331497, "grad_norm": 2.32053804397583, "learning_rate": 4.91085421344887e-09, "logits/chosen": -2.979808807373047, "logits/rejected": -2.959838628768921, "logps/chosen": -59.30604934692383, "logps/rejected": -57.034095764160156, "loss": 0.6864, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.032929785549640656, "rewards/margins": 0.014530050568282604, "rewards/rejected": -0.047459833323955536, "step": 8160 }, { "epoch": 1.4076498966230186, "grad_norm": 2.6533477306365967, "learning_rate": 4.884991014129263e-09, "logits/chosen": -3.0457491874694824, "logits/rejected": -3.0203018188476562, "logps/chosen": -62.3807487487793, "logps/rejected": -55.70471954345703, "loss": 0.6881, "rewards/accuracies": 0.5625, "rewards/chosen": -0.026699107140302658, "rewards/margins": 0.010825890116393566, "rewards/rejected": -0.0375249981880188, "step": 8170 }, { "epoch": 1.4093728463128876, "grad_norm": 2.831326961517334, "learning_rate": 4.8591740702297614e-09, "logits/chosen": -3.03529691696167, "logits/rejected": -3.0137627124786377, "logps/chosen": -58.8499641418457, "logps/rejected": -59.05854415893555, "loss": 0.6861, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02557712234556675, "rewards/margins": 0.014896447770297527, "rewards/rejected": -0.040473572909832, "step": 8180 }, { "epoch": 1.4110957960027566, "grad_norm": 2.409442186355591, "learning_rate": 4.8334036152149805e-09, "logits/chosen": -2.9958674907684326, "logits/rejected": -2.9718363285064697, "logps/chosen": -60.311866760253906, "logps/rejected": -58.06665802001953, "loss": 0.6842, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.02333252504467964, "rewards/margins": 0.01871083490550518, "rewards/rejected": -0.04204336181282997, "step": 8190 }, { "epoch": 1.4128187456926258, "grad_norm": 2.4591057300567627, "learning_rate": 4.807679882129118e-09, "logits/chosen": -3.0280957221984863, "logits/rejected": -2.993908405303955, "logps/chosen": -57.98280715942383, "logps/rejected": -57.11616897583008, "loss": 0.6858, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.029876669868826866, "rewards/margins": 0.015772178769111633, "rewards/rejected": -0.04564885050058365, "step": 8200 }, { "epoch": 1.4128187456926258, "eval_logits/chosen": -3.119671583175659, "eval_logits/rejected": -3.1139838695526123, "eval_logps/chosen": -59.38847351074219, "eval_logps/rejected": -64.45050811767578, "eval_loss": 0.6903483271598816, "eval_rewards/accuracies": 0.578066885471344, "eval_rewards/chosen": -0.006765724625438452, "eval_rewards/margins": 0.005938132759183645, "eval_rewards/rejected": -0.012703859247267246, "eval_runtime": 384.1597, "eval_samples_per_second": 11.204, "eval_steps_per_second": 1.4, "step": 8200 }, { "epoch": 1.414541695382495, "grad_norm": 2.437087059020996, "learning_rate": 4.782003103593887e-09, "logits/chosen": -2.9005608558654785, "logits/rejected": -2.882728099822998, "logps/chosen": -57.88407516479492, "logps/rejected": -60.46735382080078, "loss": 0.6859, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.027791589498519897, "rewards/margins": 0.015335688367486, "rewards/rejected": -0.04312727600336075, "step": 8210 }, { "epoch": 1.416264645072364, "grad_norm": 2.3415608406066895, "learning_rate": 4.756373511806359e-09, "logits/chosen": -3.0372555255889893, "logits/rejected": -3.0118002891540527, "logps/chosen": -56.182525634765625, "logps/rejected": -56.98543167114258, "loss": 0.6894, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.030894294381141663, "rewards/margins": 0.008487719111144543, "rewards/rejected": -0.03938201442360878, "step": 8220 }, { "epoch": 1.417987594762233, "grad_norm": 2.3519725799560547, "learning_rate": 4.73079133853692e-09, "logits/chosen": -2.9884510040283203, "logits/rejected": -2.969163417816162, "logps/chosen": -57.80139923095703, "logps/rejected": -55.51936721801758, "loss": 0.6854, "rewards/accuracies": 0.625, "rewards/chosen": -0.02552812732756138, "rewards/margins": 0.016272902488708496, "rewards/rejected": -0.04180102422833443, "step": 8230 }, { "epoch": 1.4197105444521019, "grad_norm": 2.396422863006592, "learning_rate": 4.705256815127122e-09, "logits/chosen": -3.105541706085205, "logits/rejected": -3.075623035430908, "logps/chosen": -58.82453155517578, "logps/rejected": -55.772682189941406, "loss": 0.6872, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.026508769020438194, "rewards/margins": 0.012797790579497814, "rewards/rejected": -0.039306558668613434, "step": 8240 }, { "epoch": 1.421433494141971, "grad_norm": 2.6921234130859375, "learning_rate": 4.679770172487632e-09, "logits/chosen": -3.02769136428833, "logits/rejected": -3.0033533573150635, "logps/chosen": -60.10667037963867, "logps/rejected": -58.4813117980957, "loss": 0.6851, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.024572988972067833, "rewards/margins": 0.017005886882543564, "rewards/rejected": -0.041578877717256546, "step": 8250 }, { "epoch": 1.42315644383184, "grad_norm": 2.8120920658111572, "learning_rate": 4.6543316410961176e-09, "logits/chosen": -3.1258349418640137, "logits/rejected": -3.104653835296631, "logps/chosen": -58.88138961791992, "logps/rejected": -59.416297912597656, "loss": 0.6852, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.024667982012033463, "rewards/margins": 0.016674166545271873, "rewards/rejected": -0.041342150419950485, "step": 8260 }, { "epoch": 1.4248793935217092, "grad_norm": 2.539137840270996, "learning_rate": 4.62894145099518e-09, "logits/chosen": -3.0966219902038574, "logits/rejected": -3.085365056991577, "logps/chosen": -56.49332809448242, "logps/rejected": -59.21570587158203, "loss": 0.6898, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.030976910144090652, "rewards/margins": 0.007497703190892935, "rewards/rejected": -0.038474611937999725, "step": 8270 }, { "epoch": 1.4266023432115782, "grad_norm": 2.646667242050171, "learning_rate": 4.603599831790262e-09, "logits/chosen": -2.9852712154388428, "logits/rejected": -2.974323272705078, "logps/chosen": -56.3748893737793, "logps/rejected": -57.87810516357422, "loss": 0.6886, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.03997837379574776, "rewards/margins": 0.01005262229591608, "rewards/rejected": -0.05003099516034126, "step": 8280 }, { "epoch": 1.4283252929014472, "grad_norm": 2.5989837646484375, "learning_rate": 4.578307012647578e-09, "logits/chosen": -2.975891590118408, "logits/rejected": -2.92683744430542, "logps/chosen": -61.94519805908203, "logps/rejected": -58.292625427246094, "loss": 0.6831, "rewards/accuracies": 0.65625, "rewards/chosen": -0.020089568570256233, "rewards/margins": 0.020902033895254135, "rewards/rejected": -0.04099160432815552, "step": 8290 }, { "epoch": 1.4300482425913164, "grad_norm": 2.4585771560668945, "learning_rate": 4.553063222292038e-09, "logits/chosen": -3.154768466949463, "logits/rejected": -3.13096284866333, "logps/chosen": -60.180511474609375, "logps/rejected": -59.13969802856445, "loss": 0.6836, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.026011323556303978, "rewards/margins": 0.02008582092821598, "rewards/rejected": -0.04609714448451996, "step": 8300 }, { "epoch": 1.4300482425913164, "eval_logits/chosen": -3.1194801330566406, "eval_logits/rejected": -3.1138548851013184, "eval_logps/chosen": -59.40495300292969, "eval_logps/rejected": -64.46597290039062, "eval_loss": 0.6903538703918457, "eval_rewards/accuracies": 0.5822490453720093, "eval_rewards/chosen": -0.006930571049451828, "eval_rewards/margins": 0.005927949212491512, "eval_rewards/rejected": -0.01285852026194334, "eval_runtime": 384.0017, "eval_samples_per_second": 11.208, "eval_steps_per_second": 1.401, "step": 8300 }, { "epoch": 1.4317711922811853, "grad_norm": 2.729517936706543, "learning_rate": 4.5278686890051835e-09, "logits/chosen": -2.925302743911743, "logits/rejected": -2.894500494003296, "logps/chosen": -58.59355926513672, "logps/rejected": -55.01383590698242, "loss": 0.6859, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.023244168609380722, "rewards/margins": 0.015363660641014576, "rewards/rejected": -0.03860782831907272, "step": 8310 }, { "epoch": 1.4334941419710545, "grad_norm": 2.2358431816101074, "learning_rate": 4.502723640623117e-09, "logits/chosen": -3.0099167823791504, "logits/rejected": -2.9926342964172363, "logps/chosen": -55.30101776123047, "logps/rejected": -58.07609939575195, "loss": 0.6874, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.027875792235136032, "rewards/margins": 0.012339317239820957, "rewards/rejected": -0.04021511226892471, "step": 8320 }, { "epoch": 1.4352170916609235, "grad_norm": 2.591007947921753, "learning_rate": 4.477628304534454e-09, "logits/chosen": -3.0072147846221924, "logits/rejected": -2.997791051864624, "logps/chosen": -51.999046325683594, "logps/rejected": -58.81654739379883, "loss": 0.6858, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03384193778038025, "rewards/margins": 0.015689820051193237, "rewards/rejected": -0.049531761556863785, "step": 8330 }, { "epoch": 1.4369400413507925, "grad_norm": 2.1845362186431885, "learning_rate": 4.45258290767824e-09, "logits/chosen": -3.0681591033935547, "logits/rejected": -3.0523247718811035, "logps/chosen": -57.44211959838867, "logps/rejected": -56.03607940673828, "loss": 0.6868, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02998577430844307, "rewards/margins": 0.013758832588791847, "rewards/rejected": -0.043744608759880066, "step": 8340 }, { "epoch": 1.4386629910406616, "grad_norm": 2.7989895343780518, "learning_rate": 4.427587676541932e-09, "logits/chosen": -2.91628360748291, "logits/rejected": -2.8969550132751465, "logps/chosen": -56.94103240966797, "logps/rejected": -57.142295837402344, "loss": 0.6898, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.037700019776821136, "rewards/margins": 0.0077774375677108765, "rewards/rejected": -0.04547745734453201, "step": 8350 }, { "epoch": 1.4403859407305306, "grad_norm": 2.4125726222991943, "learning_rate": 4.4026428371593305e-09, "logits/chosen": -2.9135327339172363, "logits/rejected": -2.877890110015869, "logps/chosen": -56.788597106933594, "logps/rejected": -56.7236328125, "loss": 0.6807, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.022345153614878654, "rewards/margins": 0.0258797500282526, "rewards/rejected": -0.048224903643131256, "step": 8360 }, { "epoch": 1.4421088904203998, "grad_norm": 2.557354211807251, "learning_rate": 4.377748615108539e-09, "logits/chosen": -2.9914212226867676, "logits/rejected": -2.973506450653076, "logps/chosen": -54.44336700439453, "logps/rejected": -57.72014617919922, "loss": 0.69, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0338708758354187, "rewards/margins": 0.007198970764875412, "rewards/rejected": -0.04106984660029411, "step": 8370 }, { "epoch": 1.4438318401102688, "grad_norm": 2.258762836456299, "learning_rate": 4.352905235509924e-09, "logits/chosen": -3.111560344696045, "logits/rejected": -3.0924363136291504, "logps/chosen": -53.72526168823242, "logps/rejected": -58.379425048828125, "loss": 0.6848, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.024840399622917175, "rewards/margins": 0.017487522214651108, "rewards/rejected": -0.04232792183756828, "step": 8380 }, { "epoch": 1.4455547898001377, "grad_norm": 2.6358630657196045, "learning_rate": 4.328112923024079e-09, "logits/chosen": -3.059661865234375, "logits/rejected": -3.0261569023132324, "logps/chosen": -59.1878547668457, "logps/rejected": -61.03777313232422, "loss": 0.6815, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.024916106835007668, "rewards/margins": 0.024374935775995255, "rewards/rejected": -0.049291037023067474, "step": 8390 }, { "epoch": 1.447277739490007, "grad_norm": 2.550302028656006, "learning_rate": 4.303371901849797e-09, "logits/chosen": -3.0383198261260986, "logits/rejected": -3.016228437423706, "logps/chosen": -53.170433044433594, "logps/rejected": -55.09442138671875, "loss": 0.6863, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.028643745929002762, "rewards/margins": 0.01448802649974823, "rewards/rejected": -0.04313177242875099, "step": 8400 }, { "epoch": 1.447277739490007, "eval_logits/chosen": -3.1202504634857178, "eval_logits/rejected": -3.1146037578582764, "eval_logps/chosen": -59.42176055908203, "eval_logps/rejected": -64.49677276611328, "eval_loss": 0.6902862787246704, "eval_rewards/accuracies": 0.5829461216926575, "eval_rewards/chosen": -0.007098623551428318, "eval_rewards/margins": 0.006067925598472357, "eval_rewards/rejected": -0.013166549615561962, "eval_runtime": 383.7451, "eval_samples_per_second": 11.216, "eval_steps_per_second": 1.402, "step": 8400 }, { "epoch": 1.449000689179876, "grad_norm": 2.573391914367676, "learning_rate": 4.278682395722035e-09, "logits/chosen": -2.9906022548675537, "logits/rejected": -2.9558684825897217, "logps/chosen": -60.6536979675293, "logps/rejected": -57.44050216674805, "loss": 0.6805, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.029932618141174316, "rewards/margins": 0.02632906101644039, "rewards/rejected": -0.05626168102025986, "step": 8410 }, { "epoch": 1.450723638869745, "grad_norm": 2.531142473220825, "learning_rate": 4.2540446279099024e-09, "logits/chosen": -2.878419876098633, "logits/rejected": -2.8656375408172607, "logps/chosen": -54.85551071166992, "logps/rejected": -55.78571319580078, "loss": 0.687, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03203719109296799, "rewards/margins": 0.013183876872062683, "rewards/rejected": -0.04522106796503067, "step": 8420 }, { "epoch": 1.452446588559614, "grad_norm": 2.613314390182495, "learning_rate": 4.229458821214621e-09, "logits/chosen": -3.008746385574341, "logits/rejected": -2.993227005004883, "logps/chosen": -55.98688888549805, "logps/rejected": -60.89069366455078, "loss": 0.6849, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.024954982101917267, "rewards/margins": 0.017395135015249252, "rewards/rejected": -0.04235011711716652, "step": 8430 }, { "epoch": 1.454169538249483, "grad_norm": 2.6283459663391113, "learning_rate": 4.2049251979675465e-09, "logits/chosen": -3.0397636890411377, "logits/rejected": -3.0217602252960205, "logps/chosen": -57.668846130371094, "logps/rejected": -56.13346481323242, "loss": 0.6849, "rewards/accuracies": 0.625, "rewards/chosen": -0.028762226924300194, "rewards/margins": 0.017270488664507866, "rewards/rejected": -0.04603271931409836, "step": 8440 }, { "epoch": 1.4558924879393522, "grad_norm": 2.6273765563964844, "learning_rate": 4.1804439800281105e-09, "logits/chosen": -2.9259116649627686, "logits/rejected": -2.9209136962890625, "logps/chosen": -51.028839111328125, "logps/rejected": -57.476707458496094, "loss": 0.6858, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.03016982041299343, "rewards/margins": 0.01554443221539259, "rewards/rejected": -0.045714251697063446, "step": 8450 }, { "epoch": 1.4576154376292212, "grad_norm": 2.246420383453369, "learning_rate": 4.156015388781864e-09, "logits/chosen": -3.038684606552124, "logits/rejected": -3.0106842517852783, "logps/chosen": -55.45277786254883, "logps/rejected": -58.68220901489258, "loss": 0.682, "rewards/accuracies": 0.65625, "rewards/chosen": -0.026461321860551834, "rewards/margins": 0.02305610477924347, "rewards/rejected": -0.049517422914505005, "step": 8460 }, { "epoch": 1.4593383873190904, "grad_norm": 2.541693687438965, "learning_rate": 4.131639645138428e-09, "logits/chosen": -3.0102734565734863, "logits/rejected": -2.9820914268493652, "logps/chosen": -59.36157989501953, "logps/rejected": -57.94231414794922, "loss": 0.6861, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.032951220870018005, "rewards/margins": 0.015014281496405602, "rewards/rejected": -0.047965504229068756, "step": 8470 }, { "epoch": 1.4610613370089593, "grad_norm": 2.3055930137634277, "learning_rate": 4.107316969529535e-09, "logits/chosen": -3.0146217346191406, "logits/rejected": -2.983560085296631, "logps/chosen": -55.6895866394043, "logps/rejected": -56.18989944458008, "loss": 0.6839, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.026523908600211143, "rewards/margins": 0.019494030624628067, "rewards/rejected": -0.04601794108748436, "step": 8480 }, { "epoch": 1.4627842866988283, "grad_norm": 2.613065242767334, "learning_rate": 4.083047581907013e-09, "logits/chosen": -3.0532450675964355, "logits/rejected": -3.0469489097595215, "logps/chosen": -57.904701232910156, "logps/rejected": -61.51108932495117, "loss": 0.6879, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.031647052615880966, "rewards/margins": 0.01144656352698803, "rewards/rejected": -0.043093618005514145, "step": 8490 }, { "epoch": 1.4645072363886975, "grad_norm": 2.272087335586548, "learning_rate": 4.0588317017408e-09, "logits/chosen": -2.9357683658599854, "logits/rejected": -2.9200870990753174, "logps/chosen": -57.107269287109375, "logps/rejected": -58.03228759765625, "loss": 0.6847, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.028004806488752365, "rewards/margins": 0.017863240092992783, "rewards/rejected": -0.04586804285645485, "step": 8500 }, { "epoch": 1.4645072363886975, "eval_logits/chosen": -3.1188855171203613, "eval_logits/rejected": -3.113229990005493, "eval_logps/chosen": -59.43946838378906, "eval_logps/rejected": -64.51103973388672, "eval_loss": 0.6903047561645508, "eval_rewards/accuracies": 0.5871282815933228, "eval_rewards/chosen": -0.007275736890733242, "eval_rewards/margins": 0.006033329293131828, "eval_rewards/rejected": -0.013309067115187645, "eval_runtime": 384.2065, "eval_samples_per_second": 11.202, "eval_steps_per_second": 1.4, "step": 8500 }, { "epoch": 1.4662301860785665, "grad_norm": 2.6604816913604736, "learning_rate": 4.0346695480169684e-09, "logits/chosen": -3.0012319087982178, "logits/rejected": -2.965122699737549, "logps/chosen": -60.46849822998047, "logps/rejected": -56.2077751159668, "loss": 0.6833, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.026192188262939453, "rewards/margins": 0.020619530230760574, "rewards/rejected": -0.04681171476840973, "step": 8510 }, { "epoch": 1.4679531357684357, "grad_norm": 2.4973299503326416, "learning_rate": 4.010561339235732e-09, "logits/chosen": -2.993504524230957, "logits/rejected": -2.957441806793213, "logps/chosen": -56.485206604003906, "logps/rejected": -53.9075813293457, "loss": 0.6821, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.029152899980545044, "rewards/margins": 0.022967610508203506, "rewards/rejected": -0.05212050676345825, "step": 8520 }, { "epoch": 1.4696760854583046, "grad_norm": 2.220632314682007, "learning_rate": 3.98650729340948e-09, "logits/chosen": -2.9751977920532227, "logits/rejected": -2.957366943359375, "logps/chosen": -57.468528747558594, "logps/rejected": -55.835716247558594, "loss": 0.6896, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.03581540286540985, "rewards/margins": 0.00826946645975113, "rewards/rejected": -0.04408486932516098, "step": 8530 }, { "epoch": 1.4713990351481736, "grad_norm": 2.472700834274292, "learning_rate": 3.962507628060802e-09, "logits/chosen": -3.1107027530670166, "logits/rejected": -3.094367742538452, "logps/chosen": -57.2710075378418, "logps/rejected": -58.294677734375, "loss": 0.6871, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.025768334046006203, "rewards/margins": 0.012841353192925453, "rewards/rejected": -0.038609687238931656, "step": 8540 }, { "epoch": 1.4731219848380428, "grad_norm": 2.5321476459503174, "learning_rate": 3.938562560220523e-09, "logits/chosen": -3.028843879699707, "logits/rejected": -3.0211856365203857, "logps/chosen": -56.91617965698242, "logps/rejected": -59.54004669189453, "loss": 0.688, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.030782613903284073, "rewards/margins": 0.011320657096803188, "rewards/rejected": -0.04210326820611954, "step": 8550 }, { "epoch": 1.4748449345279118, "grad_norm": 2.368504762649536, "learning_rate": 3.914672306425727e-09, "logits/chosen": -3.1164917945861816, "logits/rejected": -3.093698024749756, "logps/chosen": -57.331016540527344, "logps/rejected": -57.33484649658203, "loss": 0.6842, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03123745322227478, "rewards/margins": 0.018766935914754868, "rewards/rejected": -0.050004392862319946, "step": 8560 }, { "epoch": 1.476567884217781, "grad_norm": 2.386122226715088, "learning_rate": 3.890837082717822e-09, "logits/chosen": -2.973491668701172, "logits/rejected": -2.9486851692199707, "logps/chosen": -57.305992126464844, "logps/rejected": -56.970916748046875, "loss": 0.6837, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02948937751352787, "rewards/margins": 0.019870325922966003, "rewards/rejected": -0.04935970902442932, "step": 8570 }, { "epoch": 1.47829083390765, "grad_norm": 2.6596150398254395, "learning_rate": 3.867057104640573e-09, "logits/chosen": -3.0600357055664062, "logits/rejected": -3.037853479385376, "logps/chosen": -56.2053108215332, "logps/rejected": -59.14296340942383, "loss": 0.6809, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.020690327510237694, "rewards/margins": 0.02528388239443302, "rewards/rejected": -0.045974213629961014, "step": 8580 }, { "epoch": 1.480013783597519, "grad_norm": 2.690764904022217, "learning_rate": 3.843332587238151e-09, "logits/chosen": -3.0696463584899902, "logits/rejected": -3.0400075912475586, "logps/chosen": -58.996612548828125, "logps/rejected": -57.676673889160156, "loss": 0.6821, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.020020674914121628, "rewards/margins": 0.022924596443772316, "rewards/rejected": -0.04294527322053909, "step": 8590 }, { "epoch": 1.481736733287388, "grad_norm": 2.450030565261841, "learning_rate": 3.819663745053194e-09, "logits/chosen": -2.987722635269165, "logits/rejected": -2.9553515911102295, "logps/chosen": -56.3746223449707, "logps/rejected": -56.209251403808594, "loss": 0.6861, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.029363512992858887, "rewards/margins": 0.015252038836479187, "rewards/rejected": -0.044615548104047775, "step": 8600 }, { "epoch": 1.481736733287388, "eval_logits/chosen": -3.119209051132202, "eval_logits/rejected": -3.1135194301605225, "eval_logps/chosen": -59.45771408081055, "eval_logps/rejected": -64.53620147705078, "eval_loss": 0.690272867679596, "eval_rewards/accuracies": 0.5864312052726746, "eval_rewards/chosen": -0.007458226755261421, "eval_rewards/margins": 0.006102536339312792, "eval_rewards/rejected": -0.0135607635602355, "eval_runtime": 384.2938, "eval_samples_per_second": 11.2, "eval_steps_per_second": 1.4, "step": 8600 }, { "epoch": 1.483459682977257, "grad_norm": 2.50291109085083, "learning_rate": 3.796050792124867e-09, "logits/chosen": -3.011719226837158, "logits/rejected": -2.989260673522949, "logps/chosen": -56.86872482299805, "logps/rejected": -54.85808563232422, "loss": 0.6857, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.027238333597779274, "rewards/margins": 0.015780366957187653, "rewards/rejected": -0.04301869869232178, "step": 8610 }, { "epoch": 1.4851826326671262, "grad_norm": 2.6594996452331543, "learning_rate": 3.772493941986916e-09, "logits/chosen": -2.934577703475952, "logits/rejected": -2.9154083728790283, "logps/chosen": -58.123512268066406, "logps/rejected": -56.25068283081055, "loss": 0.6853, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03349360078573227, "rewards/margins": 0.016506288200616837, "rewards/rejected": -0.04999988526105881, "step": 8620 }, { "epoch": 1.4869055823569952, "grad_norm": 2.505457878112793, "learning_rate": 3.7489934076657596e-09, "logits/chosen": -2.9104361534118652, "logits/rejected": -2.898397922515869, "logps/chosen": -54.41759490966797, "logps/rejected": -56.243125915527344, "loss": 0.6862, "rewards/accuracies": 0.59375, "rewards/chosen": -0.027125433087348938, "rewards/margins": 0.01472453773021698, "rewards/rejected": -0.041849974542856216, "step": 8630 }, { "epoch": 1.4886285320468642, "grad_norm": 2.5061116218566895, "learning_rate": 3.725549401678525e-09, "logits/chosen": -3.0604488849639893, "logits/rejected": -3.0291781425476074, "logps/chosen": -58.31371307373047, "logps/rejected": -56.86163330078125, "loss": 0.686, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.02572130598127842, "rewards/margins": 0.01509423553943634, "rewards/rejected": -0.04081553965806961, "step": 8640 }, { "epoch": 1.4903514817367332, "grad_norm": 2.4240450859069824, "learning_rate": 3.7021621360311795e-09, "logits/chosen": -2.9729819297790527, "logits/rejected": -2.951951503753662, "logps/chosen": -57.3306999206543, "logps/rejected": -58.6158561706543, "loss": 0.6829, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.023774294182658195, "rewards/margins": 0.021655159071087837, "rewards/rejected": -0.045429449528455734, "step": 8650 }, { "epoch": 1.4920744314266023, "grad_norm": 2.508742332458496, "learning_rate": 3.6788318222165517e-09, "logits/chosen": -2.985428810119629, "logits/rejected": -2.9703781604766846, "logps/chosen": -56.572967529296875, "logps/rejected": -56.7482795715332, "loss": 0.6846, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.032110270112752914, "rewards/margins": 0.01814529113471508, "rewards/rejected": -0.050255559384822845, "step": 8660 }, { "epoch": 1.4937973811164715, "grad_norm": 2.3349335193634033, "learning_rate": 3.655558671212481e-09, "logits/chosen": -3.0807077884674072, "logits/rejected": -3.0629634857177734, "logps/chosen": -55.401893615722656, "logps/rejected": -58.01811599731445, "loss": 0.6845, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03406014293432236, "rewards/margins": 0.018253503367304802, "rewards/rejected": -0.05231364443898201, "step": 8670 }, { "epoch": 1.4955203308063405, "grad_norm": 2.737377405166626, "learning_rate": 3.6323428934798497e-09, "logits/chosen": -3.0251212120056152, "logits/rejected": -3.0183393955230713, "logps/chosen": -53.94529342651367, "logps/rejected": -57.3693962097168, "loss": 0.687, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03267005831003189, "rewards/margins": 0.013181130401790142, "rewards/rejected": -0.045851193368434906, "step": 8680 }, { "epoch": 1.4972432804962095, "grad_norm": 2.630596876144409, "learning_rate": 3.609184698960737e-09, "logits/chosen": -3.0407185554504395, "logits/rejected": -3.0107009410858154, "logps/chosen": -58.4240608215332, "logps/rejected": -53.02643585205078, "loss": 0.6845, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.028877729550004005, "rewards/margins": 0.018301133066415787, "rewards/rejected": -0.047178857028484344, "step": 8690 }, { "epoch": 1.4989662301860784, "grad_norm": 2.498950719833374, "learning_rate": 3.5860842970764685e-09, "logits/chosen": -3.0441231727600098, "logits/rejected": -3.0103659629821777, "logps/chosen": -59.04729461669922, "logps/rejected": -56.984161376953125, "loss": 0.6847, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.02549806609749794, "rewards/margins": 0.01791895180940628, "rewards/rejected": -0.04341701418161392, "step": 8700 }, { "epoch": 1.4989662301860784, "eval_logits/chosen": -3.1183865070343018, "eval_logits/rejected": -3.1126930713653564, "eval_logps/chosen": -59.478599548339844, "eval_logps/rejected": -64.55989837646484, "eval_loss": 0.6902616024017334, "eval_rewards/accuracies": 0.5843401551246643, "eval_rewards/chosen": -0.007667039055377245, "eval_rewards/margins": 0.006130703259259462, "eval_rewards/rejected": -0.013797740451991558, "eval_runtime": 384.0717, "eval_samples_per_second": 11.206, "eval_steps_per_second": 1.401, "step": 8700 }, { "epoch": 1.5006891798759476, "grad_norm": 2.500349760055542, "learning_rate": 3.563041896725762e-09, "logits/chosen": -3.010810375213623, "logits/rejected": -2.9961395263671875, "logps/chosen": -55.51630783081055, "logps/rejected": -56.80950927734375, "loss": 0.6886, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.026806961745023727, "rewards/margins": 0.009826353751122952, "rewards/rejected": -0.03663332015275955, "step": 8710 }, { "epoch": 1.5024121295658168, "grad_norm": 2.6423752307891846, "learning_rate": 3.5400577062828156e-09, "logits/chosen": -3.001882553100586, "logits/rejected": -2.976762294769287, "logps/chosen": -58.995338439941406, "logps/rejected": -58.893882751464844, "loss": 0.6846, "rewards/accuracies": 0.65625, "rewards/chosen": -0.026477474719285965, "rewards/margins": 0.017903322353959084, "rewards/rejected": -0.0443807952105999, "step": 8720 }, { "epoch": 1.5041350792556858, "grad_norm": 2.291661024093628, "learning_rate": 3.5171319335954356e-09, "logits/chosen": -3.035444736480713, "logits/rejected": -3.002584457397461, "logps/chosen": -55.37629318237305, "logps/rejected": -56.499671936035156, "loss": 0.6858, "rewards/accuracies": 0.625, "rewards/chosen": -0.031050005927681923, "rewards/margins": 0.01546327956020832, "rewards/rejected": -0.046513281762599945, "step": 8730 }, { "epoch": 1.5058580289455548, "grad_norm": 2.185478925704956, "learning_rate": 3.4942647859831476e-09, "logits/chosen": -2.9955713748931885, "logits/rejected": -2.985081195831299, "logps/chosen": -54.93894577026367, "logps/rejected": -57.57527542114258, "loss": 0.6885, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.03534874692559242, "rewards/margins": 0.010406061075627804, "rewards/rejected": -0.0457548089325428, "step": 8740 }, { "epoch": 1.5075809786354237, "grad_norm": 2.6699836254119873, "learning_rate": 3.47145647023533e-09, "logits/chosen": -3.0291950702667236, "logits/rejected": -3.0053293704986572, "logps/chosen": -56.25014114379883, "logps/rejected": -57.870445251464844, "loss": 0.683, "rewards/accuracies": 0.625, "rewards/chosen": -0.025520440191030502, "rewards/margins": 0.02117043174803257, "rewards/rejected": -0.04669087380170822, "step": 8750 }, { "epoch": 1.509303928325293, "grad_norm": 2.544248104095459, "learning_rate": 3.4487071926093407e-09, "logits/chosen": -2.9766876697540283, "logits/rejected": -2.9653265476226807, "logps/chosen": -53.982757568359375, "logps/rejected": -59.586402893066406, "loss": 0.684, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02485860511660576, "rewards/margins": 0.019283946603536606, "rewards/rejected": -0.044142551720142365, "step": 8760 }, { "epoch": 1.5110268780151621, "grad_norm": 2.2586305141448975, "learning_rate": 3.4260171588286427e-09, "logits/chosen": -3.024056911468506, "logits/rejected": -2.9952967166900635, "logps/chosen": -57.99956512451172, "logps/rejected": -54.73537063598633, "loss": 0.6834, "rewards/accuracies": 0.625, "rewards/chosen": -0.02924189530313015, "rewards/margins": 0.020510729402303696, "rewards/rejected": -0.049752626568078995, "step": 8770 }, { "epoch": 1.512749827705031, "grad_norm": 2.3685693740844727, "learning_rate": 3.403386574080961e-09, "logits/chosen": -3.0541341304779053, "logits/rejected": -3.0548415184020996, "logps/chosen": -52.13898468017578, "logps/rejected": -58.12361526489258, "loss": 0.6854, "rewards/accuracies": 0.625, "rewards/chosen": -0.03045285865664482, "rewards/margins": 0.016288291662931442, "rewards/rejected": -0.04674115404486656, "step": 8780 }, { "epoch": 1.5144727773949, "grad_norm": 2.505659818649292, "learning_rate": 3.380815643016417e-09, "logits/chosen": -3.0582375526428223, "logits/rejected": -3.0293161869049072, "logps/chosen": -57.86345291137695, "logps/rejected": -56.745567321777344, "loss": 0.6848, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02939668856561184, "rewards/margins": 0.01776151731610298, "rewards/rejected": -0.04715820401906967, "step": 8790 }, { "epoch": 1.516195727084769, "grad_norm": 2.6972694396972656, "learning_rate": 3.3583045697456773e-09, "logits/chosen": -2.946511745452881, "logits/rejected": -2.9125216007232666, "logps/chosen": -57.9686164855957, "logps/rejected": -56.8680534362793, "loss": 0.6866, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.029695045202970505, "rewards/margins": 0.01403980515897274, "rewards/rejected": -0.0437348447740078, "step": 8800 }, { "epoch": 1.516195727084769, "eval_logits/chosen": -3.1182050704956055, "eval_logits/rejected": -3.112541913986206, "eval_logps/chosen": -59.48351287841797, "eval_logps/rejected": -64.56842041015625, "eval_loss": 0.6902456879615784, "eval_rewards/accuracies": 0.5878252983093262, "eval_rewards/chosen": -0.007716177962720394, "eval_rewards/margins": 0.006166784558445215, "eval_rewards/rejected": -0.013882962986826897, "eval_runtime": 384.0156, "eval_samples_per_second": 11.208, "eval_steps_per_second": 1.401, "step": 8800 }, { "epoch": 1.5179186767746382, "grad_norm": 2.4620769023895264, "learning_rate": 3.335853557838112e-09, "logits/chosen": -3.1001956462860107, "logits/rejected": -3.0769476890563965, "logps/chosen": -54.146240234375, "logps/rejected": -57.28928756713867, "loss": 0.6861, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03185844421386719, "rewards/margins": 0.014973374083638191, "rewards/rejected": -0.046831823885440826, "step": 8810 }, { "epoch": 1.5196416264645074, "grad_norm": 2.396519899368286, "learning_rate": 3.3134628103199495e-09, "logits/chosen": -3.0399370193481445, "logits/rejected": -3.0198001861572266, "logps/chosen": -53.8540153503418, "logps/rejected": -54.6851806640625, "loss": 0.6863, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.03429834544658661, "rewards/margins": 0.014729444868862629, "rewards/rejected": -0.04902778938412666, "step": 8820 }, { "epoch": 1.5213645761543764, "grad_norm": 2.3248801231384277, "learning_rate": 3.291132529672444e-09, "logits/chosen": -2.8968586921691895, "logits/rejected": -2.875062942504883, "logps/chosen": -57.6630859375, "logps/rejected": -57.16463088989258, "loss": 0.6868, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.030671019107103348, "rewards/margins": 0.013551396317780018, "rewards/rejected": -0.04422241449356079, "step": 8830 }, { "epoch": 1.5230875258442453, "grad_norm": 2.518972873687744, "learning_rate": 3.2688629178300435e-09, "logits/chosen": -2.932861804962158, "logits/rejected": -2.922234296798706, "logps/chosen": -55.32551956176758, "logps/rejected": -57.99912643432617, "loss": 0.6874, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.029771555215120316, "rewards/margins": 0.012255150824785233, "rewards/rejected": -0.04202670603990555, "step": 8840 }, { "epoch": 1.5248104755341143, "grad_norm": 2.3851332664489746, "learning_rate": 3.2466541761785606e-09, "logits/chosen": -3.122568368911743, "logits/rejected": -3.1034460067749023, "logps/chosen": -54.5229606628418, "logps/rejected": -58.2146110534668, "loss": 0.6852, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.022725548595190048, "rewards/margins": 0.01649020053446293, "rewards/rejected": -0.03921574726700783, "step": 8850 }, { "epoch": 1.5265334252239835, "grad_norm": 2.5882325172424316, "learning_rate": 3.2245065055533616e-09, "logits/chosen": -3.0242655277252197, "logits/rejected": -3.002401828765869, "logps/chosen": -55.985130310058594, "logps/rejected": -56.9734992980957, "loss": 0.687, "rewards/accuracies": 0.59375, "rewards/chosen": -0.028040340170264244, "rewards/margins": 0.013081741519272327, "rewards/rejected": -0.041122086346149445, "step": 8860 }, { "epoch": 1.5282563749138525, "grad_norm": 2.635909080505371, "learning_rate": 3.2024201062375256e-09, "logits/chosen": -3.167145013809204, "logits/rejected": -3.133671283721924, "logps/chosen": -59.30853271484375, "logps/rejected": -59.278480529785156, "loss": 0.6851, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02538606896996498, "rewards/margins": 0.01714247651398182, "rewards/rejected": -0.04252853989601135, "step": 8870 }, { "epoch": 1.5299793246037217, "grad_norm": 2.15626859664917, "learning_rate": 3.180395177960077e-09, "logits/chosen": -2.962871789932251, "logits/rejected": -2.9352777004241943, "logps/chosen": -57.023284912109375, "logps/rejected": -56.61933135986328, "loss": 0.6862, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03572096303105354, "rewards/margins": 0.014982743188738823, "rewards/rejected": -0.05070370435714722, "step": 8880 }, { "epoch": 1.5317022742935906, "grad_norm": 2.1725003719329834, "learning_rate": 3.1584319198941235e-09, "logits/chosen": -2.9969067573547363, "logits/rejected": -2.9709436893463135, "logps/chosen": -57.14776611328125, "logps/rejected": -54.85799026489258, "loss": 0.6863, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.02653408609330654, "rewards/margins": 0.014357566833496094, "rewards/rejected": -0.040891654789447784, "step": 8890 }, { "epoch": 1.5334252239834596, "grad_norm": 2.323460578918457, "learning_rate": 3.1365305306551128e-09, "logits/chosen": -3.0483028888702393, "logits/rejected": -3.01611065864563, "logps/chosen": -57.7647705078125, "logps/rejected": -57.78180694580078, "loss": 0.6841, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02878654934465885, "rewards/margins": 0.019070129841566086, "rewards/rejected": -0.04785668104887009, "step": 8900 }, { "epoch": 1.5334252239834596, "eval_logits/chosen": -3.118605613708496, "eval_logits/rejected": -3.1129119396209717, "eval_logps/chosen": -59.497798919677734, "eval_logps/rejected": -64.58731079101562, "eval_loss": 0.6902238726615906, "eval_rewards/accuracies": 0.5873606204986572, "eval_rewards/chosen": -0.007859011180698872, "eval_rewards/margins": 0.006212850101292133, "eval_rewards/rejected": -0.01407186221331358, "eval_runtime": 384.1009, "eval_samples_per_second": 11.205, "eval_steps_per_second": 1.401, "step": 8900 }, { "epoch": 1.5351481736733288, "grad_norm": 2.3581268787384033, "learning_rate": 3.1146912082989853e-09, "logits/chosen": -3.007734775543213, "logits/rejected": -2.9947516918182373, "logps/chosen": -58.75102615356445, "logps/rejected": -57.87038040161133, "loss": 0.6849, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.030753616243600845, "rewards/margins": 0.017357924953103065, "rewards/rejected": -0.04811154305934906, "step": 8910 }, { "epoch": 1.5368711233631978, "grad_norm": 2.8827884197235107, "learning_rate": 3.092914150320416e-09, "logits/chosen": -3.095099687576294, "logits/rejected": -3.0627694129943848, "logps/chosen": -59.900108337402344, "logps/rejected": -57.189720153808594, "loss": 0.6825, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.028544938191771507, "rewards/margins": 0.022241828963160515, "rewards/rejected": -0.050786763429641724, "step": 8920 }, { "epoch": 1.538594073053067, "grad_norm": 2.249345064163208, "learning_rate": 3.0711995536510174e-09, "logits/chosen": -3.0434162616729736, "logits/rejected": -3.0114622116088867, "logps/chosen": -57.410850524902344, "logps/rejected": -55.26404571533203, "loss": 0.6839, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.027216464281082153, "rewards/margins": 0.019718730822205544, "rewards/rejected": -0.04693519324064255, "step": 8930 }, { "epoch": 1.540317022742936, "grad_norm": 2.743708848953247, "learning_rate": 3.0495476146575608e-09, "logits/chosen": -3.023430347442627, "logits/rejected": -3.005967378616333, "logps/chosen": -58.66585159301758, "logps/rejected": -59.8930549621582, "loss": 0.6882, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.030310818925499916, "rewards/margins": 0.010716510005295277, "rewards/rejected": -0.04102732241153717, "step": 8940 }, { "epoch": 1.5420399724328049, "grad_norm": 2.820741891860962, "learning_rate": 3.0279585291401956e-09, "logits/chosen": -2.9350593090057373, "logits/rejected": -2.9369022846221924, "logps/chosen": -56.8059196472168, "logps/rejected": -61.012367248535156, "loss": 0.6879, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.02704184129834175, "rewards/margins": 0.011341988109052181, "rewards/rejected": -0.03838383033871651, "step": 8950 }, { "epoch": 1.5437629221226739, "grad_norm": 2.4366254806518555, "learning_rate": 3.006432492330686e-09, "logits/chosen": -3.056548595428467, "logits/rejected": -3.0293049812316895, "logps/chosen": -57.80559158325195, "logps/rejected": -57.08323287963867, "loss": 0.684, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03402943164110184, "rewards/margins": 0.019523415714502335, "rewards/rejected": -0.05355284735560417, "step": 8960 }, { "epoch": 1.545485871812543, "grad_norm": 2.405066728591919, "learning_rate": 2.9849696988906426e-09, "logits/chosen": -3.005000591278076, "logits/rejected": -2.9771170616149902, "logps/chosen": -56.36186599731445, "logps/rejected": -55.556846618652344, "loss": 0.6867, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.030779168009757996, "rewards/margins": 0.013805484399199486, "rewards/rejected": -0.04458465427160263, "step": 8970 }, { "epoch": 1.5472088215024122, "grad_norm": 2.729218006134033, "learning_rate": 2.9635703429097495e-09, "logits/chosen": -3.0958468914031982, "logits/rejected": -3.0640220642089844, "logps/chosen": -57.76591873168945, "logps/rejected": -57.62318801879883, "loss": 0.6855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03285207599401474, "rewards/margins": 0.01638091169297695, "rewards/rejected": -0.04923298954963684, "step": 8980 }, { "epoch": 1.5489317711922812, "grad_norm": 2.8934600353240967, "learning_rate": 2.942234617904044e-09, "logits/chosen": -3.0019514560699463, "logits/rejected": -2.98407244682312, "logps/chosen": -57.321800231933594, "logps/rejected": -57.94104766845703, "loss": 0.6857, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.023721177130937576, "rewards/margins": 0.015485422685742378, "rewards/rejected": -0.039206597954034805, "step": 8990 }, { "epoch": 1.5506547208821502, "grad_norm": 2.526658058166504, "learning_rate": 2.9209627168141196e-09, "logits/chosen": -3.034315586090088, "logits/rejected": -3.004920482635498, "logps/chosen": -59.76508712768555, "logps/rejected": -57.68981170654297, "loss": 0.6799, "rewards/accuracies": 0.71875, "rewards/chosen": -0.023506319150328636, "rewards/margins": 0.027574270963668823, "rewards/rejected": -0.05108059197664261, "step": 9000 }, { "epoch": 1.5506547208821502, "eval_logits/chosen": -3.1180856227874756, "eval_logits/rejected": -3.1123623847961426, "eval_logps/chosen": -59.51602554321289, "eval_logps/rejected": -64.6044921875, "eval_loss": 0.6902297735214233, "eval_rewards/accuracies": 0.5857341885566711, "eval_rewards/chosen": -0.00804129522293806, "eval_rewards/margins": 0.0062024411745369434, "eval_rewards/rejected": -0.014243737794458866, "eval_runtime": 384.1986, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 9000 }, { "epoch": 1.5523776705720191, "grad_norm": 2.2845945358276367, "learning_rate": 2.8997548320034205e-09, "logits/chosen": -3.0197858810424805, "logits/rejected": -2.9934756755828857, "logps/chosen": -58.68003463745117, "logps/rejected": -57.23857879638672, "loss": 0.6868, "rewards/accuracies": 0.59375, "rewards/chosen": -0.030407551676034927, "rewards/margins": 0.013433225452899933, "rewards/rejected": -0.04384077712893486, "step": 9010 }, { "epoch": 1.5541006202618883, "grad_norm": 2.222851276397705, "learning_rate": 2.87861115525648e-09, "logits/chosen": -2.9426398277282715, "logits/rejected": -2.926056385040283, "logps/chosen": -57.41417694091797, "logps/rejected": -56.05034637451172, "loss": 0.6854, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03490142524242401, "rewards/margins": 0.016401633620262146, "rewards/rejected": -0.05130305141210556, "step": 9020 }, { "epoch": 1.5558235699517575, "grad_norm": 2.456385374069214, "learning_rate": 2.8575318777771964e-09, "logits/chosen": -2.9651219844818115, "logits/rejected": -2.9711365699768066, "logps/chosen": -54.670021057128906, "logps/rejected": -58.19325637817383, "loss": 0.6894, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03518029302358627, "rewards/margins": 0.008600231260061264, "rewards/rejected": -0.043780528008937836, "step": 9030 }, { "epoch": 1.5575465196416265, "grad_norm": 2.4180855751037598, "learning_rate": 2.836517190187098e-09, "logits/chosen": -2.981226682662964, "logits/rejected": -2.958794593811035, "logps/chosen": -54.088279724121094, "logps/rejected": -56.5379638671875, "loss": 0.6855, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.02919362485408783, "rewards/margins": 0.016179706901311874, "rewards/rejected": -0.045373331755399704, "step": 9040 }, { "epoch": 1.5592694693314955, "grad_norm": 2.4568464756011963, "learning_rate": 2.8155672825236246e-09, "logits/chosen": -3.0286495685577393, "logits/rejected": -3.009220838546753, "logps/chosen": -57.30841827392578, "logps/rejected": -57.4796142578125, "loss": 0.6862, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03550892323255539, "rewards/margins": 0.014863801188766956, "rewards/rejected": -0.05037272721529007, "step": 9050 }, { "epoch": 1.5609924190213644, "grad_norm": 2.575063467025757, "learning_rate": 2.7946823442384017e-09, "logits/chosen": -3.0351195335388184, "logits/rejected": -3.0088753700256348, "logps/chosen": -58.00165939331055, "logps/rejected": -58.076271057128906, "loss": 0.6857, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.032886531203985214, "rewards/margins": 0.015846019610762596, "rewards/rejected": -0.04873254522681236, "step": 9060 }, { "epoch": 1.5627153687112336, "grad_norm": 2.5507895946502686, "learning_rate": 2.7738625641955395e-09, "logits/chosen": -2.999799966812134, "logits/rejected": -2.9818906784057617, "logps/chosen": -57.21845245361328, "logps/rejected": -59.49110794067383, "loss": 0.6834, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.026528984308242798, "rewards/margins": 0.020386185497045517, "rewards/rejected": -0.046915166079998016, "step": 9070 }, { "epoch": 1.5644383184011028, "grad_norm": 2.39579439163208, "learning_rate": 2.7531081306699013e-09, "logits/chosen": -2.9876439571380615, "logits/rejected": -2.984792947769165, "logps/chosen": -54.6351432800293, "logps/rejected": -59.41515350341797, "loss": 0.687, "rewards/accuracies": 0.59375, "rewards/chosen": -0.026544544845819473, "rewards/margins": 0.013375622220337391, "rewards/rejected": -0.03992016613483429, "step": 9080 }, { "epoch": 1.5661612680909718, "grad_norm": 2.5818097591400146, "learning_rate": 2.732419231345441e-09, "logits/chosen": -2.964087963104248, "logits/rejected": -2.9281914234161377, "logps/chosen": -58.411651611328125, "logps/rejected": -55.20099639892578, "loss": 0.6853, "rewards/accuracies": 0.625, "rewards/chosen": -0.031430650502443314, "rewards/margins": 0.016425397247076035, "rewards/rejected": -0.04785604402422905, "step": 9090 }, { "epoch": 1.5678842177808407, "grad_norm": 2.6275980472564697, "learning_rate": 2.7117960533134556e-09, "logits/chosen": -3.0353236198425293, "logits/rejected": -3.0079779624938965, "logps/chosen": -59.2744026184082, "logps/rejected": -58.063377380371094, "loss": 0.6832, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.02477165125310421, "rewards/margins": 0.020760422572493553, "rewards/rejected": -0.04553207382559776, "step": 9100 }, { "epoch": 1.5678842177808407, "eval_logits/chosen": -3.117415189743042, "eval_logits/rejected": -3.1117069721221924, "eval_logps/chosen": -59.5157470703125, "eval_logps/rejected": -64.60614776611328, "eval_loss": 0.6902238726615906, "eval_rewards/accuracies": 0.5861988663673401, "eval_rewards/chosen": -0.008038590662181377, "eval_rewards/margins": 0.006221668794751167, "eval_rewards/rejected": -0.01426026038825512, "eval_runtime": 383.6571, "eval_samples_per_second": 11.218, "eval_steps_per_second": 1.402, "step": 9100 }, { "epoch": 1.5696071674707097, "grad_norm": 2.2990360260009766, "learning_rate": 2.691238783070944e-09, "logits/chosen": -3.040240526199341, "logits/rejected": -3.0122292041778564, "logps/chosen": -57.740074157714844, "logps/rejected": -57.43877029418945, "loss": 0.682, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.024915913119912148, "rewards/margins": 0.023519525304436684, "rewards/rejected": -0.04843544214963913, "step": 9110 }, { "epoch": 1.571330117160579, "grad_norm": 2.6483402252197266, "learning_rate": 2.670747606518872e-09, "logits/chosen": -3.0556299686431885, "logits/rejected": -3.020103931427002, "logps/chosen": -62.59893798828125, "logps/rejected": -59.377906799316406, "loss": 0.6838, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.023233408108353615, "rewards/margins": 0.0195440836250782, "rewards/rejected": -0.04277748614549637, "step": 9120 }, { "epoch": 1.573053066850448, "grad_norm": 2.4812519550323486, "learning_rate": 2.6503227089605387e-09, "logits/chosen": -2.9677350521087646, "logits/rejected": -2.944486618041992, "logps/chosen": -57.993263244628906, "logps/rejected": -58.1032600402832, "loss": 0.6885, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.029743751510977745, "rewards/margins": 0.010279293172061443, "rewards/rejected": -0.04002305120229721, "step": 9130 }, { "epoch": 1.574776016540317, "grad_norm": 2.4657132625579834, "learning_rate": 2.6299642750998564e-09, "logits/chosen": -3.052119731903076, "logits/rejected": -2.9988605976104736, "logps/chosen": -59.33286666870117, "logps/rejected": -54.122520446777344, "loss": 0.6793, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02051299437880516, "rewards/margins": 0.028978755697607994, "rewards/rejected": -0.0494917556643486, "step": 9140 }, { "epoch": 1.576498966230186, "grad_norm": 2.5840601921081543, "learning_rate": 2.6096724890397127e-09, "logits/chosen": -3.034421443939209, "logits/rejected": -3.0066089630126953, "logps/chosen": -61.0789794921875, "logps/rejected": -59.18257522583008, "loss": 0.6839, "rewards/accuracies": 0.625, "rewards/chosen": -0.02752991020679474, "rewards/margins": 0.01952740177512169, "rewards/rejected": -0.04705731198191643, "step": 9150 }, { "epoch": 1.578221915920055, "grad_norm": 2.009469509124756, "learning_rate": 2.5894475342802928e-09, "logits/chosen": -3.043092966079712, "logits/rejected": -3.0389745235443115, "logps/chosen": -52.7302360534668, "logps/rejected": -57.86452102661133, "loss": 0.6876, "rewards/accuracies": 0.59375, "rewards/chosen": -0.035803310573101044, "rewards/margins": 0.011993474327027798, "rewards/rejected": -0.047796785831451416, "step": 9160 }, { "epoch": 1.5799448656099242, "grad_norm": 2.648082971572876, "learning_rate": 2.5692895937174175e-09, "logits/chosen": -3.0427441596984863, "logits/rejected": -3.0096170902252197, "logps/chosen": -58.09638595581055, "logps/rejected": -55.04772186279297, "loss": 0.6821, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03254878148436546, "rewards/margins": 0.023205403238534927, "rewards/rejected": -0.05575418472290039, "step": 9170 }, { "epoch": 1.5816678152997934, "grad_norm": 2.5571298599243164, "learning_rate": 2.549198849640898e-09, "logits/chosen": -3.0292630195617676, "logits/rejected": -3.0079505443573, "logps/chosen": -58.021263122558594, "logps/rejected": -56.97618865966797, "loss": 0.686, "rewards/accuracies": 0.625, "rewards/chosen": -0.032903410494327545, "rewards/margins": 0.015215583145618439, "rewards/rejected": -0.04811898618936539, "step": 9180 }, { "epoch": 1.5833907649896624, "grad_norm": 2.3080177307128906, "learning_rate": 2.5291754837328786e-09, "logits/chosen": -3.144592046737671, "logits/rejected": -3.1382734775543213, "logps/chosen": -57.0882453918457, "logps/rejected": -60.96589279174805, "loss": 0.6881, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.029310131445527077, "rewards/margins": 0.011074641719460487, "rewards/rejected": -0.040384769439697266, "step": 9190 }, { "epoch": 1.5851137146795313, "grad_norm": 2.7551798820495605, "learning_rate": 2.5092196770662013e-09, "logits/chosen": -3.044981002807617, "logits/rejected": -3.0230705738067627, "logps/chosen": -56.41364669799805, "logps/rejected": -56.97838592529297, "loss": 0.6846, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.035616904497146606, "rewards/margins": 0.01805807836353779, "rewards/rejected": -0.053674984723329544, "step": 9200 }, { "epoch": 1.5851137146795313, "eval_logits/chosen": -3.1172776222229004, "eval_logits/rejected": -3.1115829944610596, "eval_logps/chosen": -59.54096984863281, "eval_logps/rejected": -64.62459564208984, "eval_loss": 0.6902568340301514, "eval_rewards/accuracies": 0.5810873508453369, "eval_rewards/chosen": -0.008290711790323257, "eval_rewards/margins": 0.006153962574899197, "eval_rewards/rejected": -0.014444672502577305, "eval_runtime": 383.9243, "eval_samples_per_second": 11.211, "eval_steps_per_second": 1.401, "step": 9200 }, { "epoch": 1.5868366643694003, "grad_norm": 2.5398905277252197, "learning_rate": 2.4893316101027586e-09, "logits/chosen": -2.9198031425476074, "logits/rejected": -2.8921570777893066, "logps/chosen": -59.11091232299805, "logps/rejected": -56.6024284362793, "loss": 0.686, "rewards/accuracies": 0.59375, "rewards/chosen": -0.03263935446739197, "rewards/margins": 0.01547915767878294, "rewards/rejected": -0.04811851307749748, "step": 9210 }, { "epoch": 1.5885596140592695, "grad_norm": 2.464064359664917, "learning_rate": 2.4695114626918715e-09, "logits/chosen": -3.019033908843994, "logits/rejected": -2.9961018562316895, "logps/chosen": -53.05615997314453, "logps/rejected": -54.33342361450195, "loss": 0.6849, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.03042665496468544, "rewards/margins": 0.017417794093489647, "rewards/rejected": -0.047844450920820236, "step": 9220 }, { "epoch": 1.5902825637491387, "grad_norm": 2.680680274963379, "learning_rate": 2.449759414068662e-09, "logits/chosen": -2.936624050140381, "logits/rejected": -2.9228339195251465, "logps/chosen": -52.629417419433594, "logps/rejected": -57.511749267578125, "loss": 0.6869, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.03322712704539299, "rewards/margins": 0.013344851322472095, "rewards/rejected": -0.04657197743654251, "step": 9230 }, { "epoch": 1.5920055134390076, "grad_norm": 2.6631574630737305, "learning_rate": 2.430075642852424e-09, "logits/chosen": -2.96937894821167, "logits/rejected": -2.9644036293029785, "logps/chosen": -58.7064208984375, "logps/rejected": -59.214813232421875, "loss": 0.6834, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0255623497068882, "rewards/margins": 0.02064797654747963, "rewards/rejected": -0.04621032625436783, "step": 9240 }, { "epoch": 1.5937284631288766, "grad_norm": 2.4485669136047363, "learning_rate": 2.4104603270450176e-09, "logits/chosen": -2.9927477836608887, "logits/rejected": -2.9674036502838135, "logps/chosen": -57.21258544921875, "logps/rejected": -58.436279296875, "loss": 0.6866, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.024265926331281662, "rewards/margins": 0.01401626504957676, "rewards/rejected": -0.03828219324350357, "step": 9250 }, { "epoch": 1.5954514128187456, "grad_norm": 2.4200003147125244, "learning_rate": 2.3909136440292543e-09, "logits/chosen": -2.9640910625457764, "logits/rejected": -2.9449548721313477, "logps/chosen": -52.23038864135742, "logps/rejected": -54.45415115356445, "loss": 0.6871, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.04167623072862625, "rewards/margins": 0.012958998791873455, "rewards/rejected": -0.05463522672653198, "step": 9260 }, { "epoch": 1.5971743625086148, "grad_norm": 2.290677785873413, "learning_rate": 2.371435770567294e-09, "logits/chosen": -3.109546184539795, "logits/rejected": -3.07560133934021, "logps/chosen": -57.62566375732422, "logps/rejected": -56.7308464050293, "loss": 0.6828, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.027324050664901733, "rewards/margins": 0.021839747205376625, "rewards/rejected": -0.04916379600763321, "step": 9270 }, { "epoch": 1.598897312198484, "grad_norm": 2.8847193717956543, "learning_rate": 2.3520268827990443e-09, "logits/chosen": -3.0211873054504395, "logits/rejected": -3.005167245864868, "logps/chosen": -56.53217315673828, "logps/rejected": -58.4986572265625, "loss": 0.685, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.029618997126817703, "rewards/margins": 0.017433568835258484, "rewards/rejected": -0.04705256223678589, "step": 9280 }, { "epoch": 1.600620261888353, "grad_norm": 2.370560884475708, "learning_rate": 2.3326871562405736e-09, "logits/chosen": -2.9160075187683105, "logits/rejected": -2.893542528152466, "logps/chosen": -57.7754020690918, "logps/rejected": -54.99779510498047, "loss": 0.6885, "rewards/accuracies": 0.5625, "rewards/chosen": -0.031204476952552795, "rewards/margins": 0.010111426003277302, "rewards/rejected": -0.04131590202450752, "step": 9290 }, { "epoch": 1.602343211578222, "grad_norm": 2.956653594970703, "learning_rate": 2.31341676578252e-09, "logits/chosen": -3.002432346343994, "logits/rejected": -2.997676372528076, "logps/chosen": -57.20106887817383, "logps/rejected": -61.02766036987305, "loss": 0.6853, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.03517729789018631, "rewards/margins": 0.016421284526586533, "rewards/rejected": -0.05159858614206314, "step": 9300 }, { "epoch": 1.602343211578222, "eval_logits/chosen": -3.1176798343658447, "eval_logits/rejected": -3.111994743347168, "eval_logps/chosen": -59.54670333862305, "eval_logps/rejected": -64.63745880126953, "eval_loss": 0.6902238726615906, "eval_rewards/accuracies": 0.582713782787323, "eval_rewards/chosen": -0.00834809523075819, "eval_rewards/margins": 0.006225237622857094, "eval_rewards/rejected": -0.014573332853615284, "eval_runtime": 384.037, "eval_samples_per_second": 11.207, "eval_steps_per_second": 1.401, "step": 9300 }, { "epoch": 1.6040661612680909, "grad_norm": 2.5189545154571533, "learning_rate": 2.2942158856884998e-09, "logits/chosen": -3.0237507820129395, "logits/rejected": -2.994091510772705, "logps/chosen": -57.047607421875, "logps/rejected": -55.44449996948242, "loss": 0.6848, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.03301980346441269, "rewards/margins": 0.017662758007645607, "rewards/rejected": -0.050682563334703445, "step": 9310 }, { "epoch": 1.60578911095796, "grad_norm": 2.653944492340088, "learning_rate": 2.2750846895935627e-09, "logits/chosen": -3.0877106189727783, "logits/rejected": -3.0671634674072266, "logps/chosen": -54.3978157043457, "logps/rejected": -59.697364807128906, "loss": 0.685, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.035656653344631195, "rewards/margins": 0.017190592363476753, "rewards/rejected": -0.052847255021333694, "step": 9320 }, { "epoch": 1.607512060647829, "grad_norm": 2.990072727203369, "learning_rate": 2.256023350502575e-09, "logits/chosen": -3.040903091430664, "logits/rejected": -3.0141682624816895, "logps/chosen": -59.84055709838867, "logps/rejected": -60.19636154174805, "loss": 0.6844, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.032625067979097366, "rewards/margins": 0.018737614154815674, "rewards/rejected": -0.05136268213391304, "step": 9330 }, { "epoch": 1.6092350103376982, "grad_norm": 2.7256877422332764, "learning_rate": 2.2370320407887056e-09, "logits/chosen": -2.959775447845459, "logits/rejected": -2.947470188140869, "logps/chosen": -55.50397872924805, "logps/rejected": -56.908599853515625, "loss": 0.6855, "rewards/accuracies": 0.625, "rewards/chosen": -0.036947958171367645, "rewards/margins": 0.016250457614660263, "rewards/rejected": -0.05319841578602791, "step": 9340 }, { "epoch": 1.6109579600275672, "grad_norm": 3.4841971397399902, "learning_rate": 2.2181109321918236e-09, "logits/chosen": -3.015531063079834, "logits/rejected": -2.9840874671936035, "logps/chosen": -55.771270751953125, "logps/rejected": -57.008934020996094, "loss": 0.6843, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03221204876899719, "rewards/margins": 0.018690291792154312, "rewards/rejected": -0.0509023442864418, "step": 9350 }, { "epoch": 1.6126809097174362, "grad_norm": 2.2437381744384766, "learning_rate": 2.199260195816971e-09, "logits/chosen": -3.0981364250183105, "logits/rejected": -3.058584213256836, "logps/chosen": -59.319190979003906, "logps/rejected": -57.41559600830078, "loss": 0.6826, "rewards/accuracies": 0.625, "rewards/chosen": -0.02636829949915409, "rewards/margins": 0.022022128105163574, "rewards/rejected": -0.048390429466962814, "step": 9360 }, { "epoch": 1.6144038594073054, "grad_norm": 2.5757100582122803, "learning_rate": 2.1804800021328107e-09, "logits/chosen": -3.1047720909118652, "logits/rejected": -3.070584774017334, "logps/chosen": -56.7910041809082, "logps/rejected": -56.188743591308594, "loss": 0.6844, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02951735258102417, "rewards/margins": 0.0183617752045393, "rewards/rejected": -0.04787912219762802, "step": 9370 }, { "epoch": 1.6161268090971743, "grad_norm": 2.4145779609680176, "learning_rate": 2.16177052097008e-09, "logits/chosen": -3.1098639965057373, "logits/rejected": -3.0928986072540283, "logps/chosen": -57.31666946411133, "logps/rejected": -57.55403518676758, "loss": 0.6892, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.029079124331474304, "rewards/margins": 0.008899955078959465, "rewards/rejected": -0.03797907754778862, "step": 9380 }, { "epoch": 1.6178497587870435, "grad_norm": 2.444182872772217, "learning_rate": 2.143131921520055e-09, "logits/chosen": -2.8663852214813232, "logits/rejected": -2.8523082733154297, "logps/chosen": -56.66484832763672, "logps/rejected": -59.6016845703125, "loss": 0.6875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03564850240945816, "rewards/margins": 0.012243666686117649, "rewards/rejected": -0.04789217188954353, "step": 9390 }, { "epoch": 1.6195727084769125, "grad_norm": 2.6653835773468018, "learning_rate": 2.12456437233303e-09, "logits/chosen": -2.9787285327911377, "logits/rejected": -2.960437059402466, "logps/chosen": -57.28019332885742, "logps/rejected": -58.5770378112793, "loss": 0.6882, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.03212272375822067, "rewards/margins": 0.010934999212622643, "rewards/rejected": -0.043057721108198166, "step": 9400 }, { "epoch": 1.6195727084769125, "eval_logits/chosen": -3.1168527603149414, "eval_logits/rejected": -3.1111221313476562, "eval_logps/chosen": -59.551536560058594, "eval_logps/rejected": -64.65276336669922, "eval_loss": 0.6901717782020569, "eval_rewards/accuracies": 0.5885223150253296, "eval_rewards/chosen": -0.008396387100219727, "eval_rewards/margins": 0.006330016069114208, "eval_rewards/rejected": -0.014726405031979084, "eval_runtime": 383.6566, "eval_samples_per_second": 11.218, "eval_steps_per_second": 1.402, "step": 9400 }, { "epoch": 1.6212956581667815, "grad_norm": 2.3467307090759277, "learning_rate": 2.106068041316781e-09, "logits/chosen": -2.9938812255859375, "logits/rejected": -2.9771664142608643, "logps/chosen": -57.26462936401367, "logps/rejected": -57.30256271362305, "loss": 0.6848, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.03390895202755928, "rewards/margins": 0.017822980880737305, "rewards/rejected": -0.051731932908296585, "step": 9410 }, { "epoch": 1.6230186078566504, "grad_norm": 2.4428353309631348, "learning_rate": 2.0876430957350466e-09, "logits/chosen": -3.0373952388763428, "logits/rejected": -3.017669677734375, "logps/chosen": -58.389183044433594, "logps/rejected": -57.229583740234375, "loss": 0.6852, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.033502496778964996, "rewards/margins": 0.01688491180539131, "rewards/rejected": -0.05038740485906601, "step": 9420 }, { "epoch": 1.6247415575465196, "grad_norm": 2.479332208633423, "learning_rate": 2.0692897022060397e-09, "logits/chosen": -3.0020992755889893, "logits/rejected": -2.9905407428741455, "logps/chosen": -56.156005859375, "logps/rejected": -53.7123908996582, "loss": 0.6884, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.037239111959934235, "rewards/margins": 0.010548432357609272, "rewards/rejected": -0.04778754711151123, "step": 9430 }, { "epoch": 1.6264645072363888, "grad_norm": 2.5780246257781982, "learning_rate": 2.0510080267009023e-09, "logits/chosen": -2.995959758758545, "logits/rejected": -2.9901649951934814, "logps/chosen": -53.41289138793945, "logps/rejected": -57.17323684692383, "loss": 0.6889, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.03778151422739029, "rewards/margins": 0.009339074604213238, "rewards/rejected": -0.0471205934882164, "step": 9440 }, { "epoch": 1.6281874569262578, "grad_norm": 2.6421189308166504, "learning_rate": 2.032798234542237e-09, "logits/chosen": -3.1385090351104736, "logits/rejected": -3.12992787361145, "logps/chosen": -57.56132125854492, "logps/rejected": -55.61620330810547, "loss": 0.6897, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.04172375798225403, "rewards/margins": 0.008058389648795128, "rewards/rejected": -0.049782149493694305, "step": 9450 }, { "epoch": 1.6299104066161267, "grad_norm": 2.476637601852417, "learning_rate": 2.0146604904025955e-09, "logits/chosen": -2.9790234565734863, "logits/rejected": -2.975015163421631, "logps/chosen": -55.6374626159668, "logps/rejected": -59.5582275390625, "loss": 0.683, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03386567533016205, "rewards/margins": 0.02146020717918873, "rewards/rejected": -0.055325884371995926, "step": 9460 }, { "epoch": 1.6316333563059957, "grad_norm": 2.6225929260253906, "learning_rate": 1.996594958302996e-09, "logits/chosen": -2.8437275886535645, "logits/rejected": -2.811704397201538, "logps/chosen": -56.132415771484375, "logps/rejected": -55.46335983276367, "loss": 0.6861, "rewards/accuracies": 0.59375, "rewards/chosen": -0.034872956573963165, "rewards/margins": 0.015074771828949451, "rewards/rejected": -0.04994772747159004, "step": 9470 }, { "epoch": 1.633356305995865, "grad_norm": 2.4534597396850586, "learning_rate": 1.978601801611436e-09, "logits/chosen": -2.9890143871307373, "logits/rejected": -2.95599627494812, "logps/chosen": -57.860130310058594, "logps/rejected": -58.361412048339844, "loss": 0.6844, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.03867609426379204, "rewards/margins": 0.018373996019363403, "rewards/rejected": -0.05705009773373604, "step": 9480 }, { "epoch": 1.635079255685734, "grad_norm": 2.2696869373321533, "learning_rate": 1.9606811830414163e-09, "logits/chosen": -2.9858505725860596, "logits/rejected": -2.96655535697937, "logps/chosen": -57.28430938720703, "logps/rejected": -57.04707717895508, "loss": 0.6856, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03171836584806442, "rewards/margins": 0.01598692685365677, "rewards/rejected": -0.04770529270172119, "step": 9490 }, { "epoch": 1.636802205375603, "grad_norm": 2.2463831901550293, "learning_rate": 1.94283326465047e-09, "logits/chosen": -3.016265869140625, "logits/rejected": -3.0025973320007324, "logps/chosen": -56.819480895996094, "logps/rejected": -59.62360382080078, "loss": 0.6867, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03173057362437248, "rewards/margins": 0.0137772336602211, "rewards/rejected": -0.04550781100988388, "step": 9500 }, { "epoch": 1.636802205375603, "eval_logits/chosen": -3.116647720336914, "eval_logits/rejected": -3.111027240753174, "eval_logps/chosen": -59.55279541015625, "eval_logps/rejected": -64.6480712890625, "eval_loss": 0.6902014017105103, "eval_rewards/accuracies": 0.5815520286560059, "eval_rewards/chosen": -0.008409040048718452, "eval_rewards/margins": 0.0062704309821128845, "eval_rewards/rejected": -0.014679470099508762, "eval_runtime": 384.0968, "eval_samples_per_second": 11.206, "eval_steps_per_second": 1.401, "step": 9500 }, { "epoch": 1.638525155065472, "grad_norm": 2.787724256515503, "learning_rate": 1.925058207838699e-09, "logits/chosen": -2.931727886199951, "logits/rejected": -2.9154107570648193, "logps/chosen": -53.7954216003418, "logps/rejected": -59.11879348754883, "loss": 0.6823, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.029123950749635696, "rewards/margins": 0.022644540295004845, "rewards/rejected": -0.05176848918199539, "step": 9510 }, { "epoch": 1.640248104755341, "grad_norm": 2.510998487472534, "learning_rate": 1.9073561733473088e-09, "logits/chosen": -3.1452136039733887, "logits/rejected": -3.1307685375213623, "logps/chosen": -55.496925354003906, "logps/rejected": -62.52506637573242, "loss": 0.6861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.031591176986694336, "rewards/margins": 0.015155290253460407, "rewards/rejected": -0.04674647003412247, "step": 9520 }, { "epoch": 1.6419710544452102, "grad_norm": 2.6163759231567383, "learning_rate": 1.8897273212571643e-09, "logits/chosen": -3.048109769821167, "logits/rejected": -3.025350332260132, "logps/chosen": -56.559532165527344, "logps/rejected": -55.84375762939453, "loss": 0.6872, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.03663558140397072, "rewards/margins": 0.012950601987540722, "rewards/rejected": -0.04958617687225342, "step": 9530 }, { "epoch": 1.6436940041350794, "grad_norm": 2.477329730987549, "learning_rate": 1.872171810987324e-09, "logits/chosen": -2.9872732162475586, "logits/rejected": -2.963569164276123, "logps/chosen": -57.75830078125, "logps/rejected": -58.98407745361328, "loss": 0.6841, "rewards/accuracies": 0.59375, "rewards/chosen": -0.036397237330675125, "rewards/margins": 0.019381213933229446, "rewards/rejected": -0.05577845126390457, "step": 9540 }, { "epoch": 1.6454169538249483, "grad_norm": 2.7419731616973877, "learning_rate": 1.8546898012936297e-09, "logits/chosen": -3.0292882919311523, "logits/rejected": -3.032219886779785, "logps/chosen": -56.93639373779297, "logps/rejected": -61.3848876953125, "loss": 0.6892, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.037130676209926605, "rewards/margins": 0.008765211328864098, "rewards/rejected": -0.045895881950855255, "step": 9550 }, { "epoch": 1.6471399035148173, "grad_norm": 2.5584213733673096, "learning_rate": 1.8372814502672308e-09, "logits/chosen": -3.0927374362945557, "logits/rejected": -3.0865063667297363, "logps/chosen": -56.66698455810547, "logps/rejected": -57.6358528137207, "loss": 0.686, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03044191002845764, "rewards/margins": 0.015346214175224304, "rewards/rejected": -0.04578813165426254, "step": 9560 }, { "epoch": 1.6488628532046863, "grad_norm": 2.4465668201446533, "learning_rate": 1.8199469153331949e-09, "logits/chosen": -3.003964900970459, "logits/rejected": -2.9885926246643066, "logps/chosen": -53.80454635620117, "logps/rejected": -55.86109161376953, "loss": 0.6853, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03552521392703056, "rewards/margins": 0.0164842177182436, "rewards/rejected": -0.052009426057338715, "step": 9570 }, { "epoch": 1.6505858028945555, "grad_norm": 2.3839058876037598, "learning_rate": 1.802686353249051e-09, "logits/chosen": -3.0763354301452637, "logits/rejected": -3.0583791732788086, "logps/chosen": -57.54524612426758, "logps/rejected": -61.7070426940918, "loss": 0.685, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.030573736876249313, "rewards/margins": 0.017048945650458336, "rewards/rejected": -0.0476226843893528, "step": 9580 }, { "epoch": 1.6523087525844247, "grad_norm": 2.5615997314453125, "learning_rate": 1.7854999201033917e-09, "logits/chosen": -3.1601243019104004, "logits/rejected": -3.1410508155822754, "logps/chosen": -59.41094970703125, "logps/rejected": -60.86851119995117, "loss": 0.6875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03314214199781418, "rewards/margins": 0.012432152405381203, "rewards/rejected": -0.04557429626584053, "step": 9590 }, { "epoch": 1.6540317022742936, "grad_norm": 2.9601426124572754, "learning_rate": 1.7683877713144559e-09, "logits/chosen": -2.979534149169922, "logits/rejected": -2.945681095123291, "logps/chosen": -58.09977340698242, "logps/rejected": -57.2403450012207, "loss": 0.6845, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.030393335968255997, "rewards/margins": 0.01838875375688076, "rewards/rejected": -0.048782095313072205, "step": 9600 }, { "epoch": 1.6540317022742936, "eval_logits/chosen": -3.1166112422943115, "eval_logits/rejected": -3.1109259128570557, "eval_logps/chosen": -59.561134338378906, "eval_logps/rejected": -64.66480255126953, "eval_loss": 0.6901615858078003, "eval_rewards/accuracies": 0.5861988663673401, "eval_rewards/chosen": -0.00849241204559803, "eval_rewards/margins": 0.006354290526360273, "eval_rewards/rejected": -0.014846701174974442, "eval_runtime": 384.14, "eval_samples_per_second": 11.204, "eval_steps_per_second": 1.401, "step": 9600 }, { "epoch": 1.6557546519641626, "grad_norm": 2.4565086364746094, "learning_rate": 1.7513500616287226e-09, "logits/chosen": -3.0060410499572754, "logits/rejected": -2.985361099243164, "logps/chosen": -58.816650390625, "logps/rejected": -59.23149490356445, "loss": 0.6847, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02508339285850525, "rewards/margins": 0.017644105479121208, "rewards/rejected": -0.042727500200271606, "step": 9610 }, { "epoch": 1.6574776016540316, "grad_norm": 2.263732671737671, "learning_rate": 1.734386945119515e-09, "logits/chosen": -3.125436782836914, "logits/rejected": -3.1177351474761963, "logps/chosen": -57.58256912231445, "logps/rejected": -59.40592575073242, "loss": 0.6886, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.029961705207824707, "rewards/margins": 0.010090869851410389, "rewards/rejected": -0.04005257040262222, "step": 9620 }, { "epoch": 1.6592005513439008, "grad_norm": 2.6932501792907715, "learning_rate": 1.7174985751855931e-09, "logits/chosen": -3.0406494140625, "logits/rejected": -3.0130789279937744, "logps/chosen": -59.420860290527344, "logps/rejected": -57.45256805419922, "loss": 0.6913, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.03962600976228714, "rewards/margins": 0.004532286431640387, "rewards/rejected": -0.044158294796943665, "step": 9630 }, { "epoch": 1.66092350103377, "grad_norm": 2.2227652072906494, "learning_rate": 1.7006851045497996e-09, "logits/chosen": -3.0980916023254395, "logits/rejected": -3.0667550563812256, "logps/chosen": -59.05376052856445, "logps/rejected": -55.8654899597168, "loss": 0.6822, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.030568838119506836, "rewards/margins": 0.02280599996447563, "rewards/rejected": -0.05337483808398247, "step": 9640 }, { "epoch": 1.662646450723639, "grad_norm": 2.3098556995391846, "learning_rate": 1.6839466852576314e-09, "logits/chosen": -2.9595775604248047, "logits/rejected": -2.9384946823120117, "logps/chosen": -59.28965377807617, "logps/rejected": -58.59055709838867, "loss": 0.6897, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0381181500852108, "rewards/margins": 0.007864254526793957, "rewards/rejected": -0.04598240926861763, "step": 9650 }, { "epoch": 1.664369400413508, "grad_norm": 3.269378185272217, "learning_rate": 1.667283468675913e-09, "logits/chosen": -2.989434003829956, "logits/rejected": -2.983867645263672, "logps/chosen": -53.243675231933594, "logps/rejected": -57.77482986450195, "loss": 0.6862, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.026591796427965164, "rewards/margins": 0.014660343527793884, "rewards/rejected": -0.04125214368104935, "step": 9660 }, { "epoch": 1.6660923501033769, "grad_norm": 2.3100833892822266, "learning_rate": 1.6506956054913892e-09, "logits/chosen": -3.077094793319702, "logits/rejected": -3.04704213142395, "logps/chosen": -58.68497848510742, "logps/rejected": -58.46355056762695, "loss": 0.681, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02568379044532776, "rewards/margins": 0.025521423667669296, "rewards/rejected": -0.051205217838287354, "step": 9670 }, { "epoch": 1.667815299793246, "grad_norm": 2.4696285724639893, "learning_rate": 1.6341832457093853e-09, "logits/chosen": -3.039754867553711, "logits/rejected": -3.023983955383301, "logps/chosen": -57.664756774902344, "logps/rejected": -56.45305633544922, "loss": 0.6883, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.03195812553167343, "rewards/margins": 0.010615061037242413, "rewards/rejected": -0.04257318750023842, "step": 9680 }, { "epoch": 1.6695382494831152, "grad_norm": 2.5291388034820557, "learning_rate": 1.6177465386524426e-09, "logits/chosen": -2.9263358116149902, "logits/rejected": -2.906914472579956, "logps/chosen": -58.816925048828125, "logps/rejected": -59.23249053955078, "loss": 0.6856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02988823689520359, "rewards/margins": 0.016320116817951202, "rewards/rejected": -0.04620835557579994, "step": 9690 }, { "epoch": 1.6712611991729842, "grad_norm": 2.3730547428131104, "learning_rate": 1.6013856329589683e-09, "logits/chosen": -2.9970877170562744, "logits/rejected": -2.964787721633911, "logps/chosen": -56.075889587402344, "logps/rejected": -57.289794921875, "loss": 0.6855, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03369338810443878, "rewards/margins": 0.01647287793457508, "rewards/rejected": -0.05016626790165901, "step": 9700 }, { "epoch": 1.6712611991729842, "eval_logits/chosen": -3.1167359352111816, "eval_logits/rejected": -3.1110715866088867, "eval_logps/chosen": -59.564579010009766, "eval_logps/rejected": -64.6676254272461, "eval_loss": 0.6901634335517883, "eval_rewards/accuracies": 0.5875929594039917, "eval_rewards/chosen": -0.008526891469955444, "eval_rewards/margins": 0.006348147988319397, "eval_rewards/rejected": -0.014875039458274841, "eval_runtime": 383.9173, "eval_samples_per_second": 11.211, "eval_steps_per_second": 1.401, "step": 9700 }, { "epoch": 1.6729841488628532, "grad_norm": 2.4649150371551514, "learning_rate": 1.585100676581892e-09, "logits/chosen": -2.86004376411438, "logits/rejected": -2.8453660011291504, "logps/chosen": -56.4981803894043, "logps/rejected": -56.37230682373047, "loss": 0.6857, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02869611419737339, "rewards/margins": 0.01581619493663311, "rewards/rejected": -0.0445123091340065, "step": 9710 }, { "epoch": 1.6747070985527222, "grad_norm": 2.7789175510406494, "learning_rate": 1.568891816787329e-09, "logits/chosen": -3.03645920753479, "logits/rejected": -3.022475242614746, "logps/chosen": -56.657142639160156, "logps/rejected": -56.0443229675293, "loss": 0.6867, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.03298606723546982, "rewards/margins": 0.013968385756015778, "rewards/rejected": -0.046954452991485596, "step": 9720 }, { "epoch": 1.6764300482425913, "grad_norm": 2.554271936416626, "learning_rate": 1.5527592001532465e-09, "logits/chosen": -3.0012001991271973, "logits/rejected": -2.987691879272461, "logps/chosen": -56.674346923828125, "logps/rejected": -59.074127197265625, "loss": 0.6852, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.029376983642578125, "rewards/margins": 0.016980426385998726, "rewards/rejected": -0.0463574156165123, "step": 9730 }, { "epoch": 1.6781529979324605, "grad_norm": 2.4061145782470703, "learning_rate": 1.5367029725681403e-09, "logits/chosen": -3.041320562362671, "logits/rejected": -3.0223727226257324, "logps/chosen": -57.524688720703125, "logps/rejected": -57.442771911621094, "loss": 0.6879, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.030505716800689697, "rewards/margins": 0.011272273026406765, "rewards/rejected": -0.04177799075841904, "step": 9740 }, { "epoch": 1.6798759476223295, "grad_norm": 2.8132548332214355, "learning_rate": 1.5207232792297065e-09, "logits/chosen": -3.0355067253112793, "logits/rejected": -2.992421865463257, "logps/chosen": -58.13688278198242, "logps/rejected": -54.65366744995117, "loss": 0.6822, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.030089890584349632, "rewards/margins": 0.02290867641568184, "rewards/rejected": -0.05299856513738632, "step": 9750 }, { "epoch": 1.6815988973121985, "grad_norm": 2.302518129348755, "learning_rate": 1.5048202646435528e-09, "logits/chosen": -3.0535900592803955, "logits/rejected": -3.0285425186157227, "logps/chosen": -56.118263244628906, "logps/rejected": -56.26769256591797, "loss": 0.6833, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.029349148273468018, "rewards/margins": 0.020514894276857376, "rewards/rejected": -0.049864038825035095, "step": 9760 }, { "epoch": 1.6833218470020674, "grad_norm": 2.493689775466919, "learning_rate": 1.4889940726218521e-09, "logits/chosen": -2.963839292526245, "logits/rejected": -2.9446358680725098, "logps/chosen": -57.1699333190918, "logps/rejected": -58.53014373779297, "loss": 0.6861, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.030619194731116295, "rewards/margins": 0.015159929171204567, "rewards/rejected": -0.04577913135290146, "step": 9770 }, { "epoch": 1.6850447966919366, "grad_norm": 2.4732000827789307, "learning_rate": 1.4732448462820902e-09, "logits/chosen": -3.0382137298583984, "logits/rejected": -2.9898781776428223, "logps/chosen": -63.5068473815918, "logps/rejected": -56.90974807739258, "loss": 0.6804, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.026242714375257492, "rewards/margins": 0.026783952489495277, "rewards/rejected": -0.05302666500210762, "step": 9780 }, { "epoch": 1.6867677463818056, "grad_norm": 2.587778091430664, "learning_rate": 1.457572728045724e-09, "logits/chosen": -3.0090510845184326, "logits/rejected": -3.0093512535095215, "logps/chosen": -56.01578903198242, "logps/rejected": -62.18316650390625, "loss": 0.6837, "rewards/accuracies": 0.71875, "rewards/chosen": -0.026041844859719276, "rewards/margins": 0.019954070448875427, "rewards/rejected": -0.045995913445949554, "step": 9790 }, { "epoch": 1.6884906960716748, "grad_norm": 2.735978603363037, "learning_rate": 1.4419778596369293e-09, "logits/chosen": -3.040757656097412, "logits/rejected": -3.0044455528259277, "logps/chosen": -59.152740478515625, "logps/rejected": -58.39259719848633, "loss": 0.682, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.023504730314016342, "rewards/margins": 0.023418856784701347, "rewards/rejected": -0.04692358896136284, "step": 9800 }, { "epoch": 1.6884906960716748, "eval_logits/chosen": -3.1164276599884033, "eval_logits/rejected": -3.110750436782837, "eval_logps/chosen": -59.58139419555664, "eval_logps/rejected": -64.676513671875, "eval_loss": 0.6902053952217102, "eval_rewards/accuracies": 0.586663544178009, "eval_rewards/chosen": -0.008694971911609173, "eval_rewards/margins": 0.006268941797316074, "eval_rewards/rejected": -0.014963913708925247, "eval_runtime": 383.8504, "eval_samples_per_second": 11.213, "eval_steps_per_second": 1.402, "step": 9800 }, { "epoch": 1.6902136457615438, "grad_norm": 2.47900390625, "learning_rate": 1.4264603820813005e-09, "logits/chosen": -2.9712750911712646, "logits/rejected": -2.9366753101348877, "logps/chosen": -58.309791564941406, "logps/rejected": -59.19429397583008, "loss": 0.6815, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02890641614794731, "rewards/margins": 0.02413240447640419, "rewards/rejected": -0.0530388243496418, "step": 9810 }, { "epoch": 1.6919365954514127, "grad_norm": 2.372298002243042, "learning_rate": 1.411020435704584e-09, "logits/chosen": -3.037036180496216, "logits/rejected": -3.0111517906188965, "logps/chosen": -56.780792236328125, "logps/rejected": -59.128944396972656, "loss": 0.6812, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.026590263471007347, "rewards/margins": 0.025052299723029137, "rewards/rejected": -0.05164256691932678, "step": 9820 }, { "epoch": 1.693659545141282, "grad_norm": 2.340632677078247, "learning_rate": 1.3956581601314045e-09, "logits/chosen": -3.0698554515838623, "logits/rejected": -3.0650217533111572, "logps/chosen": -61.13444137573242, "logps/rejected": -61.3189811706543, "loss": 0.6895, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.03146522864699364, "rewards/margins": 0.008265768177807331, "rewards/rejected": -0.039730995893478394, "step": 9830 }, { "epoch": 1.6953824948311509, "grad_norm": 2.3163886070251465, "learning_rate": 1.3803736942839963e-09, "logits/chosen": -2.9985289573669434, "logits/rejected": -2.9663424491882324, "logps/chosen": -59.07982635498047, "logps/rejected": -59.1026725769043, "loss": 0.6844, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.032019712030887604, "rewards/margins": 0.018637120723724365, "rewards/rejected": -0.05065683275461197, "step": 9840 }, { "epoch": 1.69710544452102, "grad_norm": 2.3132269382476807, "learning_rate": 1.3651671763809692e-09, "logits/chosen": -3.017277479171753, "logits/rejected": -3.005635976791382, "logps/chosen": -57.1483039855957, "logps/rejected": -55.78985595703125, "loss": 0.6857, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.02546665631234646, "rewards/margins": 0.01578855887055397, "rewards/rejected": -0.04125521332025528, "step": 9850 }, { "epoch": 1.698828394210889, "grad_norm": 2.7721080780029297, "learning_rate": 1.3500387439360285e-09, "logits/chosen": -3.083782196044922, "logits/rejected": -3.0732412338256836, "logps/chosen": -60.18684005737305, "logps/rejected": -58.596580505371094, "loss": 0.6893, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.03860507160425186, "rewards/margins": 0.008516994304955006, "rewards/rejected": -0.04712206497788429, "step": 9860 }, { "epoch": 1.700551343900758, "grad_norm": 2.747910499572754, "learning_rate": 1.3349885337567613e-09, "logits/chosen": -3.0581815242767334, "logits/rejected": -3.030397891998291, "logps/chosen": -55.94462966918945, "logps/rejected": -56.11644744873047, "loss": 0.6836, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.032134730368852615, "rewards/margins": 0.020175369456410408, "rewards/rejected": -0.052310097962617874, "step": 9870 }, { "epoch": 1.7022742935906272, "grad_norm": 2.2323834896087646, "learning_rate": 1.3200166819433701e-09, "logits/chosen": -2.945613145828247, "logits/rejected": -2.941906213760376, "logps/chosen": -55.61360549926758, "logps/rejected": -60.08890914916992, "loss": 0.6882, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.03548043593764305, "rewards/margins": 0.010891259647905827, "rewards/rejected": -0.0463716983795166, "step": 9880 }, { "epoch": 1.7039972432804962, "grad_norm": 2.667999267578125, "learning_rate": 1.305123323887467e-09, "logits/chosen": -2.9106783866882324, "logits/rejected": -2.904560089111328, "logps/chosen": -59.61014938354492, "logps/rejected": -58.7868537902832, "loss": 0.6852, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.031924955546855927, "rewards/margins": 0.01715216599404812, "rewards/rejected": -0.049077123403549194, "step": 9890 }, { "epoch": 1.7057201929703654, "grad_norm": 2.61613392829895, "learning_rate": 1.2903085942708348e-09, "logits/chosen": -3.014256715774536, "logits/rejected": -2.9844601154327393, "logps/chosen": -54.362159729003906, "logps/rejected": -54.91518020629883, "loss": 0.6814, "rewards/accuracies": 0.625, "rewards/chosen": -0.02655635215342045, "rewards/margins": 0.024641280993819237, "rewards/rejected": -0.051197636872529984, "step": 9900 }, { "epoch": 1.7057201929703654, "eval_logits/chosen": -3.116485595703125, "eval_logits/rejected": -3.110800266265869, "eval_logps/chosen": -59.58063888549805, "eval_logps/rejected": -64.68134307861328, "eval_loss": 0.6901763677597046, "eval_rewards/accuracies": 0.5913103818893433, "eval_rewards/chosen": -0.008687433786690235, "eval_rewards/margins": 0.006324751302599907, "eval_rewards/rejected": -0.015012186020612717, "eval_runtime": 383.8753, "eval_samples_per_second": 11.212, "eval_steps_per_second": 1.401, "step": 9900 }, { "epoch": 1.7074431426602343, "grad_norm": 2.469726324081421, "learning_rate": 1.2755726270642133e-09, "logits/chosen": -3.070533514022827, "logits/rejected": -3.0258636474609375, "logps/chosen": -60.8514404296875, "logps/rejected": -55.90861129760742, "loss": 0.6838, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03283166512846947, "rewards/margins": 0.01975090429186821, "rewards/rejected": -0.05258256942033768, "step": 9910 }, { "epoch": 1.7091660923501033, "grad_norm": 2.480678081512451, "learning_rate": 1.260915555526091e-09, "logits/chosen": -3.094026803970337, "logits/rejected": -3.0649092197418213, "logps/chosen": -55.47735595703125, "logps/rejected": -54.31943893432617, "loss": 0.6826, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02788475528359413, "rewards/margins": 0.022009506821632385, "rewards/rejected": -0.04989425837993622, "step": 9920 }, { "epoch": 1.7108890420399723, "grad_norm": 2.630247116088867, "learning_rate": 1.246337512201492e-09, "logits/chosen": -3.027109146118164, "logits/rejected": -3.024893283843994, "logps/chosen": -55.09284591674805, "logps/rejected": -59.29688262939453, "loss": 0.6874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.031861864030361176, "rewards/margins": 0.012505242601037025, "rewards/rejected": -0.04436710476875305, "step": 9930 }, { "epoch": 1.7126119917298415, "grad_norm": 2.31946063041687, "learning_rate": 1.2318386289207862e-09, "logits/chosen": -3.093456745147705, "logits/rejected": -3.088930368423462, "logps/chosen": -57.84920120239258, "logps/rejected": -64.39171600341797, "loss": 0.6893, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.02787485346198082, "rewards/margins": 0.008635136298835278, "rewards/rejected": -0.03650998696684837, "step": 9940 }, { "epoch": 1.7143349414197107, "grad_norm": 2.4557619094848633, "learning_rate": 1.2174190367984905e-09, "logits/chosen": -3.057157039642334, "logits/rejected": -3.0204882621765137, "logps/chosen": -56.916282653808594, "logps/rejected": -56.3023681640625, "loss": 0.6835, "rewards/accuracies": 0.65625, "rewards/chosen": -0.029519572854042053, "rewards/margins": 0.02020849846303463, "rewards/rejected": -0.04972807317972183, "step": 9950 }, { "epoch": 1.7160578911095796, "grad_norm": 2.373345136642456, "learning_rate": 1.203078866232088e-09, "logits/chosen": -3.090630054473877, "logits/rejected": -3.073500394821167, "logps/chosen": -57.05073928833008, "logps/rejected": -59.31227493286133, "loss": 0.6883, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.036136444658041, "rewards/margins": 0.010895757004618645, "rewards/rejected": -0.047032203525304794, "step": 9960 }, { "epoch": 1.7177808407994486, "grad_norm": 2.390754222869873, "learning_rate": 1.1888182469008457e-09, "logits/chosen": -3.0301880836486816, "logits/rejected": -3.0101847648620605, "logps/chosen": -54.38376998901367, "logps/rejected": -56.96045684814453, "loss": 0.6828, "rewards/accuracies": 0.6875, "rewards/chosen": -0.030236411839723587, "rewards/margins": 0.021668527275323868, "rewards/rejected": -0.051904939115047455, "step": 9970 }, { "epoch": 1.7195037904893176, "grad_norm": 2.6026532649993896, "learning_rate": 1.1746373077646366e-09, "logits/chosen": -2.977801561355591, "logits/rejected": -2.964418649673462, "logps/chosen": -56.35908126831055, "logps/rejected": -58.4140510559082, "loss": 0.6853, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03563179820775986, "rewards/margins": 0.016755376011133194, "rewards/rejected": -0.05238717794418335, "step": 9980 }, { "epoch": 1.7212267401791868, "grad_norm": 2.3901596069335938, "learning_rate": 1.1605361770627943e-09, "logits/chosen": -3.069767475128174, "logits/rejected": -3.0197501182556152, "logps/chosen": -57.231842041015625, "logps/rejected": -54.852928161621094, "loss": 0.6864, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.03499136120080948, "rewards/margins": 0.014375315979123116, "rewards/rejected": -0.049366679042577744, "step": 9990 }, { "epoch": 1.722949689869056, "grad_norm": 2.420463800430298, "learning_rate": 1.1465149823129207e-09, "logits/chosen": -3.076472282409668, "logits/rejected": -3.055633783340454, "logps/chosen": -58.2899055480957, "logps/rejected": -61.497642517089844, "loss": 0.6837, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.025413617491722107, "rewards/margins": 0.019886162132024765, "rewards/rejected": -0.045299775898456573, "step": 10000 }, { "epoch": 1.722949689869056, "eval_logits/chosen": -3.116307258605957, "eval_logits/rejected": -3.110671043395996, "eval_logps/chosen": -59.58539962768555, "eval_logps/rejected": -64.69255065917969, "eval_loss": 0.6901459693908691, "eval_rewards/accuracies": 0.5927044749259949, "eval_rewards/chosen": -0.00873502902686596, "eval_rewards/margins": 0.006389264483004808, "eval_rewards/rejected": -0.015124293975532055, "eval_runtime": 384.2617, "eval_samples_per_second": 11.201, "eval_steps_per_second": 1.4, "step": 10000 }, { "epoch": 1.724672639558925, "grad_norm": 2.7465927600860596, "learning_rate": 1.132573850309767e-09, "logits/chosen": -3.0503437519073486, "logits/rejected": -3.029636859893799, "logps/chosen": -55.57611083984375, "logps/rejected": -57.7164192199707, "loss": 0.6849, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0319674126803875, "rewards/margins": 0.017423031851649284, "rewards/rejected": -0.04939044266939163, "step": 10010 }, { "epoch": 1.7263955892487939, "grad_norm": 2.1733815670013428, "learning_rate": 1.1187129071240588e-09, "logits/chosen": -2.9910526275634766, "logits/rejected": -2.958216428756714, "logps/chosen": -57.27119827270508, "logps/rejected": -56.076995849609375, "loss": 0.6862, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03598593920469284, "rewards/margins": 0.014988981187343597, "rewards/rejected": -0.050974924117326736, "step": 10020 }, { "epoch": 1.7281185389386629, "grad_norm": 2.466876268386841, "learning_rate": 1.1049322781013726e-09, "logits/chosen": -3.0376193523406982, "logits/rejected": -3.004720449447632, "logps/chosen": -60.057777404785156, "logps/rejected": -56.2112922668457, "loss": 0.6864, "rewards/accuracies": 0.625, "rewards/chosen": -0.026665765792131424, "rewards/margins": 0.014138095080852509, "rewards/rejected": -0.04080386459827423, "step": 10030 }, { "epoch": 1.729841488628532, "grad_norm": 2.6095364093780518, "learning_rate": 1.0912320878610017e-09, "logits/chosen": -3.1675872802734375, "logits/rejected": -3.1530065536499023, "logps/chosen": -57.85478973388672, "logps/rejected": -59.01996612548828, "loss": 0.6858, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.030137866735458374, "rewards/margins": 0.01570282131433487, "rewards/rejected": -0.045840684324502945, "step": 10040 }, { "epoch": 1.7315644383184012, "grad_norm": 2.6305747032165527, "learning_rate": 1.0776124602948146e-09, "logits/chosen": -3.009827136993408, "logits/rejected": -3.0053369998931885, "logps/chosen": -57.649658203125, "logps/rejected": -60.40259552001953, "loss": 0.6912, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.03677880018949509, "rewards/margins": 0.004884940572082996, "rewards/rejected": -0.04166373983025551, "step": 10050 }, { "epoch": 1.7332873880082702, "grad_norm": 2.219585418701172, "learning_rate": 1.06407351856616e-09, "logits/chosen": -2.963155746459961, "logits/rejected": -2.9584078788757324, "logps/chosen": -52.41069793701172, "logps/rejected": -56.978782653808594, "loss": 0.689, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0367356613278389, "rewards/margins": 0.009188707917928696, "rewards/rejected": -0.04592436924576759, "step": 10060 }, { "epoch": 1.7350103376981392, "grad_norm": 3.5949621200561523, "learning_rate": 1.050615385108722e-09, "logits/chosen": -2.9376320838928223, "logits/rejected": -2.927889347076416, "logps/chosen": -55.84636306762695, "logps/rejected": -57.358253479003906, "loss": 0.6862, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.033914633095264435, "rewards/margins": 0.014869148842990398, "rewards/rejected": -0.04878378286957741, "step": 10070 }, { "epoch": 1.7367332873880081, "grad_norm": 2.440239191055298, "learning_rate": 1.037238181625446e-09, "logits/chosen": -3.0707335472106934, "logits/rejected": -3.0552597045898438, "logps/chosen": -59.8310432434082, "logps/rejected": -59.3591423034668, "loss": 0.6887, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03293101117014885, "rewards/margins": 0.009801891632378101, "rewards/rejected": -0.042732901871204376, "step": 10080 }, { "epoch": 1.7384562370778773, "grad_norm": 2.5599682331085205, "learning_rate": 1.0239420290874058e-09, "logits/chosen": -3.1050057411193848, "logits/rejected": -3.0850796699523926, "logps/chosen": -57.10601806640625, "logps/rejected": -62.16572952270508, "loss": 0.685, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.027327552437782288, "rewards/margins": 0.017170695587992668, "rewards/rejected": -0.044498249888420105, "step": 10090 }, { "epoch": 1.7401791867677465, "grad_norm": 2.586700439453125, "learning_rate": 1.010727047732739e-09, "logits/chosen": -3.0433762073516846, "logits/rejected": -3.0341668128967285, "logps/chosen": -57.99790573120117, "logps/rejected": -59.83086395263672, "loss": 0.6821, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.030119875445961952, "rewards/margins": 0.023133408278226852, "rewards/rejected": -0.053253281861543655, "step": 10100 }, { "epoch": 1.7401791867677465, "eval_logits/chosen": -3.116196870803833, "eval_logits/rejected": -3.110548257827759, "eval_logps/chosen": -59.580108642578125, "eval_logps/rejected": -64.69314575195312, "eval_loss": 0.6901180148124695, "eval_rewards/accuracies": 0.5841078162193298, "eval_rewards/chosen": -0.008682068437337875, "eval_rewards/margins": 0.006448162719607353, "eval_rewards/rejected": -0.015130231156945229, "eval_runtime": 383.9779, "eval_samples_per_second": 11.209, "eval_steps_per_second": 1.401, "step": 10100 }, { "epoch": 1.7419021364576155, "grad_norm": 2.562401533126831, "learning_rate": 9.97593357065536e-10, "logits/chosen": -3.0932211875915527, "logits/rejected": -3.102454900741577, "logps/chosen": -54.34955596923828, "logps/rejected": -59.22336959838867, "loss": 0.6897, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03776390105485916, "rewards/margins": 0.008016002364456654, "rewards/rejected": -0.04577990248799324, "step": 10110 }, { "epoch": 1.7436250861474845, "grad_norm": 2.4685213565826416, "learning_rate": 9.845410758547724e-10, "logits/chosen": -3.0438075065612793, "logits/rejected": -3.0213944911956787, "logps/chosen": -57.01942825317383, "logps/rejected": -55.703758239746094, "loss": 0.6846, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0329575315117836, "rewards/margins": 0.018011055886745453, "rewards/rejected": -0.05096858739852905, "step": 10120 }, { "epoch": 1.7453480358373534, "grad_norm": 2.5328216552734375, "learning_rate": 9.715703221332328e-10, "logits/chosen": -3.0835208892822266, "logits/rejected": -3.063122034072876, "logps/chosen": -55.781517028808594, "logps/rejected": -57.80461502075195, "loss": 0.685, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.024351779371500015, "rewards/margins": 0.01719535142183304, "rewards/rejected": -0.041547130793333054, "step": 10130 }, { "epoch": 1.7470709855272226, "grad_norm": 2.9124436378479004, "learning_rate": 9.586812131964429e-10, "logits/chosen": -3.0506045818328857, "logits/rejected": -3.0077712535858154, "logps/chosen": -62.746177673339844, "logps/rejected": -58.44568634033203, "loss": 0.6829, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.023203754797577858, "rewards/margins": 0.02158413641154766, "rewards/rejected": -0.04478789120912552, "step": 10140 }, { "epoch": 1.7487939352170918, "grad_norm": 2.400768518447876, "learning_rate": 9.458738656016063e-10, "logits/chosen": -2.9910178184509277, "logits/rejected": -2.972029685974121, "logps/chosen": -58.909873962402344, "logps/rejected": -59.960166931152344, "loss": 0.6886, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.03536384552717209, "rewards/margins": 0.010202957317233086, "rewards/rejected": -0.045566804707050323, "step": 10150 }, { "epoch": 1.7505168849069608, "grad_norm": 2.4359378814697266, "learning_rate": 9.331483951665532e-10, "logits/chosen": -3.0172200202941895, "logits/rejected": -3.0022635459899902, "logps/chosen": -55.558441162109375, "logps/rejected": -57.4552001953125, "loss": 0.6849, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.03486592322587967, "rewards/margins": 0.01763620227575302, "rewards/rejected": -0.05250212550163269, "step": 10160 }, { "epoch": 1.7522398345968297, "grad_norm": 2.5012121200561523, "learning_rate": 9.20504916968693e-10, "logits/chosen": -3.0220370292663574, "logits/rejected": -3.0115466117858887, "logps/chosen": -57.70402145385742, "logps/rejected": -58.722206115722656, "loss": 0.686, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.027941424399614334, "rewards/margins": 0.015233149752020836, "rewards/rejected": -0.04317457601428032, "step": 10170 }, { "epoch": 1.7539627842866987, "grad_norm": 2.5249717235565186, "learning_rate": 9.079435453439744e-10, "logits/chosen": -2.958926200866699, "logits/rejected": -2.930947780609131, "logps/chosen": -55.34897994995117, "logps/rejected": -57.743553161621094, "loss": 0.686, "rewards/accuracies": 0.625, "rewards/chosen": -0.029070192947983742, "rewards/margins": 0.01526731252670288, "rewards/rejected": -0.04433750361204147, "step": 10180 }, { "epoch": 1.755685733976568, "grad_norm": 2.646099805831909, "learning_rate": 8.954643938858431e-10, "logits/chosen": -3.001873731613159, "logits/rejected": -2.977242946624756, "logps/chosen": -53.825775146484375, "logps/rejected": -55.69243240356445, "loss": 0.6866, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.02867623046040535, "rewards/margins": 0.014177098870277405, "rewards/rejected": -0.042853329330682755, "step": 10190 }, { "epoch": 1.757408683666437, "grad_norm": 2.4003872871398926, "learning_rate": 8.83067575444233e-10, "logits/chosen": -3.066793441772461, "logits/rejected": -3.0473647117614746, "logps/chosen": -56.400489807128906, "logps/rejected": -56.30890655517578, "loss": 0.6867, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.039797525852918625, "rewards/margins": 0.014033086597919464, "rewards/rejected": -0.05383061245083809, "step": 10200 }, { "epoch": 1.757408683666437, "eval_logits/chosen": -3.116189956665039, "eval_logits/rejected": -3.1104912757873535, "eval_logps/chosen": -59.59709548950195, "eval_logps/rejected": -64.70321655273438, "eval_loss": 0.6901513934135437, "eval_rewards/accuracies": 0.5815520286560059, "eval_rewards/chosen": -0.008852045051753521, "eval_rewards/margins": 0.00637889513745904, "eval_rewards/rejected": -0.015230940654873848, "eval_runtime": 384.2513, "eval_samples_per_second": 11.201, "eval_steps_per_second": 1.4, "step": 10200 }, { "epoch": 1.759131633356306, "grad_norm": 2.68137264251709, "learning_rate": 8.707532021245213e-10, "logits/chosen": -3.070094585418701, "logits/rejected": -3.065809726715088, "logps/chosen": -57.35698699951172, "logps/rejected": -62.095924377441406, "loss": 0.6855, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03344056382775307, "rewards/margins": 0.016259726136922836, "rewards/rejected": -0.049700286239385605, "step": 10210 }, { "epoch": 1.760854583046175, "grad_norm": 2.283804178237915, "learning_rate": 8.585213852865415e-10, "logits/chosen": -3.018273115158081, "logits/rejected": -3.013718843460083, "logps/chosen": -52.13434600830078, "logps/rejected": -55.81071090698242, "loss": 0.6888, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03748854249715805, "rewards/margins": 0.009387836791574955, "rewards/rejected": -0.04687637463212013, "step": 10220 }, { "epoch": 1.762577532736044, "grad_norm": 2.206413984298706, "learning_rate": 8.463722355435466e-10, "logits/chosen": -2.9376513957977295, "logits/rejected": -2.9067463874816895, "logps/chosen": -56.89196014404297, "logps/rejected": -56.93088912963867, "loss": 0.6855, "rewards/accuracies": 0.59375, "rewards/chosen": -0.03451027721166611, "rewards/margins": 0.01620904728770256, "rewards/rejected": -0.050719328224658966, "step": 10230 }, { "epoch": 1.7643004824259132, "grad_norm": 2.3493030071258545, "learning_rate": 8.343058627612421e-10, "logits/chosen": -2.991508722305298, "logits/rejected": -2.969237804412842, "logps/chosen": -54.86486053466797, "logps/rejected": -57.530853271484375, "loss": 0.6849, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.034078199416399, "rewards/margins": 0.017571553587913513, "rewards/rejected": -0.05164974927902222, "step": 10240 }, { "epoch": 1.7660234321157822, "grad_norm": 2.20393705368042, "learning_rate": 8.223223760567588e-10, "logits/chosen": -3.0698676109313965, "logits/rejected": -3.028144359588623, "logps/chosen": -59.7209358215332, "logps/rejected": -54.87019729614258, "loss": 0.6824, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.029454076662659645, "rewards/margins": 0.02245365083217621, "rewards/rejected": -0.051907729357481, "step": 10250 }, { "epoch": 1.7677463818056514, "grad_norm": 2.4347379207611084, "learning_rate": 8.10421883797694e-10, "logits/chosen": -2.9993271827697754, "logits/rejected": -3.0037426948547363, "logps/chosen": -56.778282165527344, "logps/rejected": -61.5457878112793, "loss": 0.6878, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03513524681329727, "rewards/margins": 0.011598859913647175, "rewards/rejected": -0.04673410579562187, "step": 10260 }, { "epoch": 1.7694693314955203, "grad_norm": 3.3981211185455322, "learning_rate": 7.986044936011149e-10, "logits/chosen": -3.0045104026794434, "logits/rejected": -2.9947686195373535, "logps/chosen": -58.53948211669922, "logps/rejected": -60.971046447753906, "loss": 0.6826, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0310873631387949, "rewards/margins": 0.022157657891511917, "rewards/rejected": -0.05324501916766167, "step": 10270 }, { "epoch": 1.7711922811853893, "grad_norm": 2.483266592025757, "learning_rate": 7.868703123325871e-10, "logits/chosen": -2.9915993213653564, "logits/rejected": -2.9726366996765137, "logps/chosen": -57.83294677734375, "logps/rejected": -57.76380157470703, "loss": 0.6875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03503817319869995, "rewards/margins": 0.012229489162564278, "rewards/rejected": -0.04726766422390938, "step": 10280 }, { "epoch": 1.7729152308752585, "grad_norm": 2.4922218322753906, "learning_rate": 7.75219446105222e-10, "logits/chosen": -2.9411327838897705, "logits/rejected": -2.937307834625244, "logps/chosen": -54.51689529418945, "logps/rejected": -60.47893142700195, "loss": 0.6841, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.030806083232164383, "rewards/margins": 0.019144952297210693, "rewards/rejected": -0.049951035529375076, "step": 10290 }, { "epoch": 1.7746381805651275, "grad_norm": 2.6811490058898926, "learning_rate": 7.636520002786928e-10, "logits/chosen": -3.024101495742798, "logits/rejected": -3.0088281631469727, "logps/chosen": -58.08661651611328, "logps/rejected": -59.34272384643555, "loss": 0.6867, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03438801318407059, "rewards/margins": 0.014171205461025238, "rewards/rejected": -0.048559218645095825, "step": 10300 }, { "epoch": 1.7746381805651275, "eval_logits/chosen": -3.116088628768921, "eval_logits/rejected": -3.1104257106781006, "eval_logps/chosen": -59.588050842285156, "eval_logps/rejected": -64.6971664428711, "eval_loss": 0.6901378035545349, "eval_rewards/accuracies": 0.5871282815933228, "eval_rewards/chosen": -0.00876156147569418, "eval_rewards/margins": 0.0064088571816682816, "eval_rewards/rejected": -0.015170418657362461, "eval_runtime": 384.1877, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 10300 }, { "epoch": 1.7763611302549966, "grad_norm": 2.2249252796173096, "learning_rate": 7.521680794583096e-10, "logits/chosen": -2.943056583404541, "logits/rejected": -2.917048931121826, "logps/chosen": -58.94867706298828, "logps/rejected": -56.73337936401367, "loss": 0.6807, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.026859750971198082, "rewards/margins": 0.02593878284096718, "rewards/rejected": -0.05279853194952011, "step": 10310 }, { "epoch": 1.7780840799448656, "grad_norm": 2.479624032974243, "learning_rate": 7.407677874940477e-10, "logits/chosen": -3.0229122638702393, "logits/rejected": -2.990391254425049, "logps/chosen": -59.6657600402832, "logps/rejected": -58.287437438964844, "loss": 0.6843, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02904113195836544, "rewards/margins": 0.01862339861690998, "rewards/rejected": -0.04766453057527542, "step": 10320 }, { "epoch": 1.7798070296347346, "grad_norm": 2.1177711486816406, "learning_rate": 7.294512274796261e-10, "logits/chosen": -3.037933349609375, "logits/rejected": -3.020717144012451, "logps/chosen": -55.4109001159668, "logps/rejected": -59.27170944213867, "loss": 0.6862, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.032665468752384186, "rewards/margins": 0.014889764599502087, "rewards/rejected": -0.04755523055791855, "step": 10330 }, { "epoch": 1.7815299793246038, "grad_norm": 2.83569598197937, "learning_rate": 7.182185017515707e-10, "logits/chosen": -3.041626453399658, "logits/rejected": -3.0205533504486084, "logps/chosen": -59.1336669921875, "logps/rejected": -59.606727600097656, "loss": 0.6837, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.030461763963103294, "rewards/margins": 0.01992267370223999, "rewards/rejected": -0.050384439527988434, "step": 10340 }, { "epoch": 1.7832529290144727, "grad_norm": 2.156789541244507, "learning_rate": 7.070697118882819e-10, "logits/chosen": -3.137744903564453, "logits/rejected": -3.1120965480804443, "logps/chosen": -56.865562438964844, "logps/rejected": -55.8765754699707, "loss": 0.6835, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.027836069464683533, "rewards/margins": 0.020382162183523178, "rewards/rejected": -0.04821822792291641, "step": 10350 }, { "epoch": 1.784975878704342, "grad_norm": 2.3762242794036865, "learning_rate": 6.960049587091277e-10, "logits/chosen": -2.9819459915161133, "logits/rejected": -2.9459383487701416, "logps/chosen": -58.7189826965332, "logps/rejected": -57.51030731201172, "loss": 0.6839, "rewards/accuracies": 0.625, "rewards/chosen": -0.02772151306271553, "rewards/margins": 0.019328957423567772, "rewards/rejected": -0.04705046862363815, "step": 10360 }, { "epoch": 1.786698828394211, "grad_norm": 2.5589816570281982, "learning_rate": 6.850243422735214e-10, "logits/chosen": -2.8506741523742676, "logits/rejected": -2.819082736968994, "logps/chosen": -54.847389221191406, "logps/rejected": -58.32719802856445, "loss": 0.6807, "rewards/accuracies": 0.71875, "rewards/chosen": -0.028699740767478943, "rewards/margins": 0.025770461186766624, "rewards/rejected": -0.05447020009160042, "step": 10370 }, { "epoch": 1.7884217780840799, "grad_norm": 2.3986716270446777, "learning_rate": 6.741279618800234e-10, "logits/chosen": -3.0086491107940674, "logits/rejected": -2.996666669845581, "logps/chosen": -55.70782470703125, "logps/rejected": -57.116722106933594, "loss": 0.6888, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03828931599855423, "rewards/margins": 0.00983067974448204, "rewards/rejected": -0.04811999946832657, "step": 10380 }, { "epoch": 1.7901447277739488, "grad_norm": 2.4709324836730957, "learning_rate": 6.633159160654411e-10, "logits/chosen": -3.0464024543762207, "logits/rejected": -3.016256809234619, "logps/chosen": -58.832664489746094, "logps/rejected": -59.042266845703125, "loss": 0.6841, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.030234787613153458, "rewards/margins": 0.019277578219771385, "rewards/rejected": -0.049512363970279694, "step": 10390 }, { "epoch": 1.791867677463818, "grad_norm": 2.5254104137420654, "learning_rate": 6.525883026039358e-10, "logits/chosen": -3.004934549331665, "logits/rejected": -2.986795425415039, "logps/chosen": -55.586334228515625, "logps/rejected": -60.53899002075195, "loss": 0.6847, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.03446076437830925, "rewards/margins": 0.017832253128290176, "rewards/rejected": -0.052293021231889725, "step": 10400 }, { "epoch": 1.791867677463818, "eval_logits/chosen": -3.1158525943756104, "eval_logits/rejected": -3.110152244567871, "eval_logps/chosen": -59.59916305541992, "eval_logps/rejected": -64.68959045410156, "eval_loss": 0.690230131149292, "eval_rewards/accuracies": 0.5868958830833435, "eval_rewards/chosen": -0.008872650563716888, "eval_rewards/margins": 0.006222100462764502, "eval_rewards/rejected": -0.015094749629497528, "eval_runtime": 383.947, "eval_samples_per_second": 11.21, "eval_steps_per_second": 1.401, "step": 10400 }, { "epoch": 1.7935906271536872, "grad_norm": 2.282289981842041, "learning_rate": 6.419452185061447e-10, "logits/chosen": -3.0068869590759277, "logits/rejected": -2.9670393466949463, "logps/chosen": -59.276466369628906, "logps/rejected": -58.21703338623047, "loss": 0.6817, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.027589306235313416, "rewards/margins": 0.023819051682949066, "rewards/rejected": -0.05140835791826248, "step": 10410 }, { "epoch": 1.7953135768435562, "grad_norm": 2.372286319732666, "learning_rate": 6.313867600182932e-10, "logits/chosen": -3.0341315269470215, "logits/rejected": -3.006105422973633, "logps/chosen": -57.386985778808594, "logps/rejected": -58.883995056152344, "loss": 0.6861, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.032664235681295395, "rewards/margins": 0.014921456575393677, "rewards/rejected": -0.04758569225668907, "step": 10420 }, { "epoch": 1.7970365265334252, "grad_norm": 2.8999135494232178, "learning_rate": 6.209130226213377e-10, "logits/chosen": -3.0889573097229004, "logits/rejected": -3.0721592903137207, "logps/chosen": -57.90520095825195, "logps/rejected": -57.81034469604492, "loss": 0.685, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.030346263200044632, "rewards/margins": 0.0172057393938303, "rewards/rejected": -0.04755200073122978, "step": 10430 }, { "epoch": 1.7987594762232941, "grad_norm": 2.485287666320801, "learning_rate": 6.105241010300888e-10, "logits/chosen": -3.089571952819824, "logits/rejected": -3.0788373947143555, "logps/chosen": -61.08458709716797, "logps/rejected": -61.87718963623047, "loss": 0.6867, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0321456603705883, "rewards/margins": 0.013696420006453991, "rewards/rejected": -0.04584207385778427, "step": 10440 }, { "epoch": 1.8004824259131633, "grad_norm": 2.1716952323913574, "learning_rate": 6.002200891923693e-10, "logits/chosen": -3.045029878616333, "logits/rejected": -3.0410609245300293, "logps/chosen": -56.7320442199707, "logps/rejected": -62.7134895324707, "loss": 0.6838, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03110019862651825, "rewards/margins": 0.019653640687465668, "rewards/rejected": -0.05075383931398392, "step": 10450 }, { "epoch": 1.8022053756030325, "grad_norm": 2.5009636878967285, "learning_rate": 5.900010802881462e-10, "logits/chosen": -3.0444865226745605, "logits/rejected": -3.0173439979553223, "logps/chosen": -57.57358932495117, "logps/rejected": -57.37907028198242, "loss": 0.6869, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.029986029490828514, "rewards/margins": 0.013526695780456066, "rewards/rejected": -0.043512724339962006, "step": 10460 }, { "epoch": 1.8039283252929015, "grad_norm": 2.530627965927124, "learning_rate": 5.798671667287059e-10, "logits/chosen": -2.8856892585754395, "logits/rejected": -2.862014055252075, "logps/chosen": -55.18822479248047, "logps/rejected": -55.52335739135742, "loss": 0.6863, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.03281798213720322, "rewards/margins": 0.014614596962928772, "rewards/rejected": -0.04743257910013199, "step": 10470 }, { "epoch": 1.8056512749827704, "grad_norm": 2.694502830505371, "learning_rate": 5.698184401558093e-10, "logits/chosen": -3.0464718341827393, "logits/rejected": -3.008328914642334, "logps/chosen": -58.82337188720703, "logps/rejected": -57.099815368652344, "loss": 0.6845, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.02852488122880459, "rewards/margins": 0.018450209870934486, "rewards/rejected": -0.046975087374448776, "step": 10480 }, { "epoch": 1.8073742246726394, "grad_norm": 2.5299627780914307, "learning_rate": 5.598549914408657e-10, "logits/chosen": -3.0337061882019043, "logits/rejected": -3.002572774887085, "logps/chosen": -56.425636291503906, "logps/rejected": -57.16019821166992, "loss": 0.6837, "rewards/accuracies": 0.65625, "rewards/chosen": -0.02615206502377987, "rewards/margins": 0.01992407813668251, "rewards/rejected": -0.04607614129781723, "step": 10490 }, { "epoch": 1.8090971743625086, "grad_norm": 2.505113124847412, "learning_rate": 5.499769106841079e-10, "logits/chosen": -3.029731035232544, "logits/rejected": -3.016737461090088, "logps/chosen": -55.8951416015625, "logps/rejected": -59.8373908996582, "loss": 0.6861, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.031956687569618225, "rewards/margins": 0.01530187763273716, "rewards/rejected": -0.047258567065000534, "step": 10500 }, { "epoch": 1.8090971743625086, "eval_logits/chosen": -3.116119623184204, "eval_logits/rejected": -3.1104183197021484, "eval_logps/chosen": -59.59355926513672, "eval_logps/rejected": -64.70457458496094, "eval_loss": 0.6901285648345947, "eval_rewards/accuracies": 0.5861988663673401, "eval_rewards/chosen": -0.008816661313176155, "eval_rewards/margins": 0.006427871994674206, "eval_rewards/rejected": -0.015244533307850361, "eval_runtime": 384.1744, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 10500 }, { "epoch": 1.8108201240523778, "grad_norm": 2.4923551082611084, "learning_rate": 5.401842872137786e-10, "logits/chosen": -3.013028621673584, "logits/rejected": -2.988231897354126, "logps/chosen": -59.1893196105957, "logps/rejected": -57.93567657470703, "loss": 0.6878, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.036012202501297, "rewards/margins": 0.011595193296670914, "rewards/rejected": -0.04760739207267761, "step": 10510 }, { "epoch": 1.8125430737422468, "grad_norm": 2.3233695030212402, "learning_rate": 5.304772095853305e-10, "logits/chosen": -3.082209348678589, "logits/rejected": -3.0868873596191406, "logps/chosen": -53.884185791015625, "logps/rejected": -59.48823165893555, "loss": 0.6858, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.033986646682024, "rewards/margins": 0.01549739670008421, "rewards/rejected": -0.049484044313430786, "step": 10520 }, { "epoch": 1.8142660234321157, "grad_norm": 2.3757970333099365, "learning_rate": 5.208557655806078e-10, "logits/chosen": -2.98966646194458, "logits/rejected": -2.9673266410827637, "logps/chosen": -56.67155075073242, "logps/rejected": -56.58771896362305, "loss": 0.6879, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03330652788281441, "rewards/margins": 0.011316435411572456, "rewards/rejected": -0.04462296515703201, "step": 10530 }, { "epoch": 1.8159889731219847, "grad_norm": 2.238402843475342, "learning_rate": 5.113200422070763e-10, "logits/chosen": -2.97110915184021, "logits/rejected": -2.9450035095214844, "logps/chosen": -55.549415588378906, "logps/rejected": -54.656227111816406, "loss": 0.6834, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.032187167555093765, "rewards/margins": 0.0204298235476017, "rewards/rejected": -0.052616991102695465, "step": 10540 }, { "epoch": 1.817711922811854, "grad_norm": 2.7323215007781982, "learning_rate": 5.018701256970127e-10, "logits/chosen": -3.127934694290161, "logits/rejected": -3.1271016597747803, "logps/chosen": -57.295440673828125, "logps/rejected": -60.423919677734375, "loss": 0.6882, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.035445090383291245, "rewards/margins": 0.010981904342770576, "rewards/rejected": -0.04642698913812637, "step": 10550 }, { "epoch": 1.819434872501723, "grad_norm": 2.640737771987915, "learning_rate": 4.9250610150674e-10, "logits/chosen": -2.9855422973632812, "logits/rejected": -2.966156005859375, "logps/chosen": -58.918701171875, "logps/rejected": -60.0306282043457, "loss": 0.6832, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0328650139272213, "rewards/margins": 0.021228309720754623, "rewards/rejected": -0.05409331992268562, "step": 10560 }, { "epoch": 1.821157822191592, "grad_norm": 2.298959493637085, "learning_rate": 4.832280543158507e-10, "logits/chosen": -3.0651791095733643, "logits/rejected": -3.0477185249328613, "logps/chosen": -59.076416015625, "logps/rejected": -61.52764129638672, "loss": 0.6855, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.031295984983444214, "rewards/margins": 0.016338912770152092, "rewards/rejected": -0.047634903341531754, "step": 10570 }, { "epoch": 1.822880771881461, "grad_norm": 2.188692331314087, "learning_rate": 4.740360680264388e-10, "logits/chosen": -3.0644102096557617, "logits/rejected": -3.0470824241638184, "logps/chosen": -56.48671340942383, "logps/rejected": -57.2518196105957, "loss": 0.687, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03124716505408287, "rewards/margins": 0.0131332753226161, "rewards/rejected": -0.044380445033311844, "step": 10580 }, { "epoch": 1.82460372157133, "grad_norm": 2.293623924255371, "learning_rate": 4.6493022576234556e-10, "logits/chosen": -3.0044636726379395, "logits/rejected": -2.962233066558838, "logps/chosen": -59.2132568359375, "logps/rejected": -55.89684295654297, "loss": 0.6805, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03308730944991112, "rewards/margins": 0.026532579213380814, "rewards/rejected": -0.05961988493800163, "step": 10590 }, { "epoch": 1.8263266712611992, "grad_norm": 2.2917075157165527, "learning_rate": 4.559106098684029e-10, "logits/chosen": -3.11020565032959, "logits/rejected": -3.099874496459961, "logps/chosen": -56.04803466796875, "logps/rejected": -59.04853439331055, "loss": 0.6877, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.03165452927350998, "rewards/margins": 0.011951565742492676, "rewards/rejected": -0.043606095016002655, "step": 10600 }, { "epoch": 1.8263266712611992, "eval_logits/chosen": -3.1160738468170166, "eval_logits/rejected": -3.1104249954223633, "eval_logps/chosen": -59.59670639038086, "eval_logps/rejected": -64.7072982788086, "eval_loss": 0.6901311278343201, "eval_rewards/accuracies": 0.5920074582099915, "eval_rewards/chosen": -0.00884803757071495, "eval_rewards/margins": 0.006423789542168379, "eval_rewards/rejected": -0.015271826647222042, "eval_runtime": 384.1659, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 10600 }, { "epoch": 1.8280496209510684, "grad_norm": 2.4631903171539307, "learning_rate": 4.4697730190969235e-10, "logits/chosen": -3.001817226409912, "logits/rejected": -2.9916493892669678, "logps/chosen": -54.29302978515625, "logps/rejected": -56.63573455810547, "loss": 0.6858, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0322897806763649, "rewards/margins": 0.015405547805130482, "rewards/rejected": -0.047695327550172806, "step": 10610 }, { "epoch": 1.8297725706409373, "grad_norm": 2.605523109436035, "learning_rate": 4.381303826708061e-10, "logits/chosen": -2.9537954330444336, "logits/rejected": -2.930615186691284, "logps/chosen": -56.80295944213867, "logps/rejected": -56.80694580078125, "loss": 0.6862, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.029991215094923973, "rewards/margins": 0.014791592955589294, "rewards/rejected": -0.04478280618786812, "step": 10620 }, { "epoch": 1.8314955203308063, "grad_norm": 2.5096333026885986, "learning_rate": 4.2936993215511257e-10, "logits/chosen": -3.041898250579834, "logits/rejected": -3.023247718811035, "logps/chosen": -57.2591438293457, "logps/rejected": -58.80683517456055, "loss": 0.6855, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.03000602126121521, "rewards/margins": 0.01637157052755356, "rewards/rejected": -0.04637759551405907, "step": 10630 }, { "epoch": 1.8332184700206753, "grad_norm": 2.578603506088257, "learning_rate": 4.206960295840456e-10, "logits/chosen": -2.9995503425598145, "logits/rejected": -2.975860595703125, "logps/chosen": -58.27387237548828, "logps/rejected": -57.60478591918945, "loss": 0.6846, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.03207911550998688, "rewards/margins": 0.018181387335062027, "rewards/rejected": -0.050260502845048904, "step": 10640 }, { "epoch": 1.8349414197105445, "grad_norm": 2.3200182914733887, "learning_rate": 4.1210875339636854e-10, "logits/chosen": -3.082303524017334, "logits/rejected": -3.059866428375244, "logps/chosen": -56.590576171875, "logps/rejected": -59.08875274658203, "loss": 0.6821, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.029690880328416824, "rewards/margins": 0.02302057109773159, "rewards/rejected": -0.052711449563503265, "step": 10650 }, { "epoch": 1.8366643694004137, "grad_norm": 2.352198839187622, "learning_rate": 4.0360818124748677e-10, "logits/chosen": -2.9877002239227295, "logits/rejected": -2.9751667976379395, "logps/chosen": -58.22998809814453, "logps/rejected": -59.70201873779297, "loss": 0.6887, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03531861677765846, "rewards/margins": 0.009900919161736965, "rewards/rejected": -0.045219533145427704, "step": 10660 }, { "epoch": 1.8383873190902826, "grad_norm": 2.7161898612976074, "learning_rate": 3.9519439000872404e-10, "logits/chosen": -3.0114142894744873, "logits/rejected": -2.992724657058716, "logps/chosen": -55.91082763671875, "logps/rejected": -56.84636688232422, "loss": 0.6861, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03772654011845589, "rewards/margins": 0.015098011121153831, "rewards/rejected": -0.052824556827545166, "step": 10670 }, { "epoch": 1.8401102687801516, "grad_norm": 2.565969705581665, "learning_rate": 3.8686745576664626e-10, "logits/chosen": -3.039262294769287, "logits/rejected": -2.9938862323760986, "logps/chosen": -59.46996307373047, "logps/rejected": -56.1522102355957, "loss": 0.6827, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03180866315960884, "rewards/margins": 0.021904734894633293, "rewards/rejected": -0.05371339991688728, "step": 10680 }, { "epoch": 1.8418332184700206, "grad_norm": 2.3504178524017334, "learning_rate": 3.7862745382235775e-10, "logits/chosen": -3.025742292404175, "logits/rejected": -3.019537925720215, "logps/chosen": -56.9988899230957, "logps/rejected": -60.743568420410156, "loss": 0.688, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03248900920152664, "rewards/margins": 0.011180490255355835, "rewards/rejected": -0.043669503182172775, "step": 10690 }, { "epoch": 1.8435561681598898, "grad_norm": 2.2883052825927734, "learning_rate": 3.704744586908315e-10, "logits/chosen": -3.083956003189087, "logits/rejected": -3.0425620079040527, "logps/chosen": -58.91267013549805, "logps/rejected": -55.345741271972656, "loss": 0.6824, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.027451183646917343, "rewards/margins": 0.02261456288397312, "rewards/rejected": -0.050065744668245316, "step": 10700 }, { "epoch": 1.8435561681598898, "eval_logits/chosen": -3.115992784500122, "eval_logits/rejected": -3.110304832458496, "eval_logps/chosen": -59.599796295166016, "eval_logps/rejected": -64.70921325683594, "eval_loss": 0.6901361346244812, "eval_rewards/accuracies": 0.586663544178009, "eval_rewards/chosen": -0.008878974243998528, "eval_rewards/margins": 0.00641192402690649, "eval_rewards/rejected": -0.01529089454561472, "eval_runtime": 384.0656, "eval_samples_per_second": 11.206, "eval_steps_per_second": 1.401, "step": 10700 }, { "epoch": 1.8452791178497587, "grad_norm": 2.8073954582214355, "learning_rate": 3.6240854410023116e-10, "logits/chosen": -2.9952645301818848, "logits/rejected": -2.9651360511779785, "logps/chosen": -56.67476272583008, "logps/rejected": -54.63886642456055, "loss": 0.6838, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.032301269471645355, "rewards/margins": 0.019710997119545937, "rewards/rejected": -0.05201226472854614, "step": 10710 }, { "epoch": 1.847002067539628, "grad_norm": 2.4006543159484863, "learning_rate": 3.5442978299124126e-10, "logits/chosen": -3.021350383758545, "logits/rejected": -2.9896862506866455, "logps/chosen": -56.3643913269043, "logps/rejected": -57.85981369018555, "loss": 0.6846, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.03777404502034187, "rewards/margins": 0.018012024462223053, "rewards/rejected": -0.055786073207855225, "step": 10720 }, { "epoch": 1.848725017229497, "grad_norm": 2.676990509033203, "learning_rate": 3.465382475164169e-10, "logits/chosen": -3.1102206707000732, "logits/rejected": -3.073194980621338, "logps/chosen": -56.90290451049805, "logps/rejected": -55.273040771484375, "loss": 0.6815, "rewards/accuracies": 0.625, "rewards/chosen": -0.0294378399848938, "rewards/margins": 0.024509413167834282, "rewards/rejected": -0.05394725129008293, "step": 10730 }, { "epoch": 1.8504479669193659, "grad_norm": 3.0840725898742676, "learning_rate": 3.3873400903951634e-10, "logits/chosen": -2.979149580001831, "logits/rejected": -2.96920108795166, "logps/chosen": -58.513206481933594, "logps/rejected": -59.453834533691406, "loss": 0.6912, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.03822885453701019, "rewards/margins": 0.0048066796734929085, "rewards/rejected": -0.04303553327918053, "step": 10740 }, { "epoch": 1.852170916609235, "grad_norm": 2.448347330093384, "learning_rate": 3.310171381348726e-10, "logits/chosen": -2.982016086578369, "logits/rejected": -2.9525201320648193, "logps/chosen": -55.0046501159668, "logps/rejected": -57.35972213745117, "loss": 0.6848, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.029463868588209152, "rewards/margins": 0.017485864460468292, "rewards/rejected": -0.046949733048677444, "step": 10750 }, { "epoch": 1.853893866299104, "grad_norm": 2.85001802444458, "learning_rate": 3.233877045867417e-10, "logits/chosen": -2.994340419769287, "logits/rejected": -2.97330379486084, "logps/chosen": -56.65723419189453, "logps/rejected": -57.377845764160156, "loss": 0.6855, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.029764290899038315, "rewards/margins": 0.01615387573838234, "rewards/rejected": -0.045918166637420654, "step": 10760 }, { "epoch": 1.8556168159889732, "grad_norm": 2.89906907081604, "learning_rate": 3.1584577738867804e-10, "logits/chosen": -2.9776854515075684, "logits/rejected": -2.9613919258117676, "logps/chosen": -59.64055252075195, "logps/rejected": -58.95122528076172, "loss": 0.6846, "rewards/accuracies": 0.625, "rewards/chosen": -0.025688624009490013, "rewards/margins": 0.017932411283254623, "rewards/rejected": -0.043621040880680084, "step": 10770 }, { "epoch": 1.8573397656788422, "grad_norm": 2.3571207523345947, "learning_rate": 3.0839142474291206e-10, "logits/chosen": -3.1085402965545654, "logits/rejected": -3.0907883644104004, "logps/chosen": -56.31730270385742, "logps/rejected": -59.65522003173828, "loss": 0.6866, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.031476687639951706, "rewards/margins": 0.014222445897758007, "rewards/rejected": -0.04569913074374199, "step": 10780 }, { "epoch": 1.8590627153687111, "grad_norm": 2.8656840324401855, "learning_rate": 3.0102471405972666e-10, "logits/chosen": -3.1140193939208984, "logits/rejected": -3.082620143890381, "logps/chosen": -55.114166259765625, "logps/rejected": -58.22220993041992, "loss": 0.6835, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.027730736881494522, "rewards/margins": 0.020436272025108337, "rewards/rejected": -0.04816700890660286, "step": 10790 }, { "epoch": 1.8607856650585803, "grad_norm": 2.6412203311920166, "learning_rate": 2.937457119568554e-10, "logits/chosen": -3.110588312149048, "logits/rejected": -3.088531255722046, "logps/chosen": -55.08143997192383, "logps/rejected": -58.99031448364258, "loss": 0.6839, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.031220849603414536, "rewards/margins": 0.019479000940918922, "rewards/rejected": -0.05069985240697861, "step": 10800 }, { "epoch": 1.8607856650585803, "eval_logits/chosen": -3.11584734916687, "eval_logits/rejected": -3.1101913452148438, "eval_logps/chosen": -59.59832763671875, "eval_logps/rejected": -64.71131896972656, "eval_loss": 0.6901180148124695, "eval_rewards/accuracies": 0.5878252983093262, "eval_rewards/chosen": -0.008864316157996655, "eval_rewards/margins": 0.006447718013077974, "eval_rewards/rejected": -0.015312033705413342, "eval_runtime": 384.2453, "eval_samples_per_second": 11.201, "eval_steps_per_second": 1.4, "step": 10800 }, { "epoch": 1.8625086147484493, "grad_norm": 2.6558892726898193, "learning_rate": 2.8655448425887407e-10, "logits/chosen": -3.0395102500915527, "logits/rejected": -2.996107339859009, "logps/chosen": -60.2518424987793, "logps/rejected": -55.66035079956055, "loss": 0.6804, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02872023545205593, "rewards/margins": 0.026779908686876297, "rewards/rejected": -0.05550014227628708, "step": 10810 }, { "epoch": 1.8642315644383185, "grad_norm": 2.5476865768432617, "learning_rate": 2.794510959966079e-10, "logits/chosen": -2.981799364089966, "logits/rejected": -2.957087755203247, "logps/chosen": -55.7774772644043, "logps/rejected": -55.99811935424805, "loss": 0.684, "rewards/accuracies": 0.65625, "rewards/chosen": -0.026825163513422012, "rewards/margins": 0.01911497488617897, "rewards/rejected": -0.045940134674310684, "step": 10820 }, { "epoch": 1.8659545141281875, "grad_norm": 2.5901870727539062, "learning_rate": 2.724356114065452e-10, "logits/chosen": -3.0013303756713867, "logits/rejected": -2.974020481109619, "logps/chosen": -55.19731521606445, "logps/rejected": -58.499778747558594, "loss": 0.6846, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0338115394115448, "rewards/margins": 0.018128041177988052, "rewards/rejected": -0.05193958431482315, "step": 10830 }, { "epoch": 1.8676774638180564, "grad_norm": 2.523749351501465, "learning_rate": 2.6550809393025233e-10, "logits/chosen": -3.051769495010376, "logits/rejected": -3.020822525024414, "logps/chosen": -55.566810607910156, "logps/rejected": -55.53137969970703, "loss": 0.6834, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.025759857147932053, "rewards/margins": 0.02048664726316929, "rewards/rejected": -0.04624650999903679, "step": 10840 }, { "epoch": 1.8694004135079254, "grad_norm": 2.379290819168091, "learning_rate": 2.586686062138044e-10, "logits/chosen": -2.9762279987335205, "logits/rejected": -2.9606053829193115, "logps/chosen": -52.855323791503906, "logps/rejected": -59.426116943359375, "loss": 0.6823, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02694542333483696, "rewards/margins": 0.02276376634836197, "rewards/rejected": -0.04970919340848923, "step": 10850 }, { "epoch": 1.8711233631977946, "grad_norm": 2.553745746612549, "learning_rate": 2.5191721010721204e-10, "logits/chosen": -3.0979971885681152, "logits/rejected": -3.0644125938415527, "logps/chosen": -58.5058479309082, "logps/rejected": -56.959007263183594, "loss": 0.6871, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.03247032314538956, "rewards/margins": 0.0129841985180974, "rewards/rejected": -0.04545452445745468, "step": 10860 }, { "epoch": 1.8728463128876638, "grad_norm": 2.415626287460327, "learning_rate": 2.4525396666387534e-10, "logits/chosen": -2.92629337310791, "logits/rejected": -2.9215917587280273, "logps/chosen": -54.1297492980957, "logps/rejected": -61.116294860839844, "loss": 0.6866, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.036440201103687286, "rewards/margins": 0.014107112772762775, "rewards/rejected": -0.05054731294512749, "step": 10870 }, { "epoch": 1.8745692625775328, "grad_norm": 2.4143502712249756, "learning_rate": 2.386789361400121e-10, "logits/chosen": -3.021678924560547, "logits/rejected": -3.000797748565674, "logps/chosen": -57.00726318359375, "logps/rejected": -57.3067626953125, "loss": 0.6845, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.027981415390968323, "rewards/margins": 0.018140435218811035, "rewards/rejected": -0.04612184688448906, "step": 10880 }, { "epoch": 1.8762922122674017, "grad_norm": 2.297905683517456, "learning_rate": 2.3219217799413604e-10, "logits/chosen": -3.074444532394409, "logits/rejected": -3.0573339462280273, "logps/chosen": -56.519920349121094, "logps/rejected": -59.452972412109375, "loss": 0.6873, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.033595021814107895, "rewards/margins": 0.01268724538385868, "rewards/rejected": -0.04628227278590202, "step": 10890 }, { "epoch": 1.8780151619572707, "grad_norm": 2.3045783042907715, "learning_rate": 2.257937508864949e-10, "logits/chosen": -3.1231143474578857, "logits/rejected": -3.0894017219543457, "logps/chosen": -57.63786697387695, "logps/rejected": -55.3487663269043, "loss": 0.6831, "rewards/accuracies": 0.625, "rewards/chosen": -0.026455294340848923, "rewards/margins": 0.020993905141949654, "rewards/rejected": -0.047449201345443726, "step": 10900 }, { "epoch": 1.8780151619572707, "eval_logits/chosen": -3.116037607192993, "eval_logits/rejected": -3.110398054122925, "eval_logps/chosen": -59.60280227661133, "eval_logps/rejected": -64.71473693847656, "eval_loss": 0.6901231408119202, "eval_rewards/accuracies": 0.5845724940299988, "eval_rewards/chosen": -0.008909125812351704, "eval_rewards/margins": 0.006437050178647041, "eval_rewards/rejected": -0.015346175990998745, "eval_runtime": 384.1875, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 10900 }, { "epoch": 1.8797381116471399, "grad_norm": 2.752044200897217, "learning_rate": 2.1948371267855983e-10, "logits/chosen": -3.0186257362365723, "logits/rejected": -2.984849214553833, "logps/chosen": -58.02870559692383, "logps/rejected": -59.342620849609375, "loss": 0.6797, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.023622671142220497, "rewards/margins": 0.028002118691802025, "rewards/rejected": -0.051624786108732224, "step": 10910 }, { "epoch": 1.881461061337009, "grad_norm": 2.5073671340942383, "learning_rate": 2.132621204324925e-10, "logits/chosen": -2.987391948699951, "logits/rejected": -2.962049961090088, "logps/chosen": -59.12227249145508, "logps/rejected": -57.84357833862305, "loss": 0.6854, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.03284627944231033, "rewards/margins": 0.016383716836571693, "rewards/rejected": -0.049229998141527176, "step": 10920 }, { "epoch": 1.883184011026878, "grad_norm": 2.84432315826416, "learning_rate": 2.0712903041063102e-10, "logits/chosen": -2.965153455734253, "logits/rejected": -2.934025287628174, "logps/chosen": -57.401832580566406, "logps/rejected": -56.0716667175293, "loss": 0.6867, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.031898193061351776, "rewards/margins": 0.013943254947662354, "rewards/rejected": -0.04584144800901413, "step": 10930 }, { "epoch": 1.884906960716747, "grad_norm": 2.7386891841888428, "learning_rate": 2.010844980749793e-10, "logits/chosen": -2.919647693634033, "logits/rejected": -2.8994479179382324, "logps/chosen": -59.5832405090332, "logps/rejected": -60.03248977661133, "loss": 0.6849, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.031240392476320267, "rewards/margins": 0.017366571351885796, "rewards/rejected": -0.04860696569085121, "step": 10940 }, { "epoch": 1.886629910406616, "grad_norm": 2.3839938640594482, "learning_rate": 1.951285780867096e-10, "logits/chosen": -2.9704971313476562, "logits/rejected": -2.952685832977295, "logps/chosen": -57.843719482421875, "logps/rejected": -59.376014709472656, "loss": 0.6864, "rewards/accuracies": 0.59375, "rewards/chosen": -0.03237280622124672, "rewards/margins": 0.014441991224884987, "rewards/rejected": -0.046814799308776855, "step": 10950 }, { "epoch": 1.8883528600964852, "grad_norm": 2.438133955001831, "learning_rate": 1.8926132430566512e-10, "logits/chosen": -3.043850898742676, "logits/rejected": -3.0247528553009033, "logps/chosen": -57.020118713378906, "logps/rejected": -57.449073791503906, "loss": 0.6853, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.03209669888019562, "rewards/margins": 0.016656579449772835, "rewards/rejected": -0.048753272742033005, "step": 10960 }, { "epoch": 1.8900758097863544, "grad_norm": 2.7471344470977783, "learning_rate": 1.8348278978987166e-10, "logits/chosen": -3.0624217987060547, "logits/rejected": -3.0322635173797607, "logps/chosen": -59.22570037841797, "logps/rejected": -58.56772994995117, "loss": 0.6833, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.023627672344446182, "rewards/margins": 0.020939035341143608, "rewards/rejected": -0.04456670582294464, "step": 10970 }, { "epoch": 1.8917987594762233, "grad_norm": 2.3950552940368652, "learning_rate": 1.777930267950656e-10, "logits/chosen": -3.0392160415649414, "logits/rejected": -3.0243992805480957, "logps/chosen": -57.84577178955078, "logps/rejected": -61.35871124267578, "loss": 0.6842, "rewards/accuracies": 0.625, "rewards/chosen": -0.032726358622312546, "rewards/margins": 0.01907447539269924, "rewards/rejected": -0.05180083587765694, "step": 10980 }, { "epoch": 1.8935217091660923, "grad_norm": 2.4089016914367676, "learning_rate": 1.7219208677420882e-10, "logits/chosen": -2.959484815597534, "logits/rejected": -2.9480016231536865, "logps/chosen": -55.77641677856445, "logps/rejected": -57.395286560058594, "loss": 0.6855, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.032888785004615784, "rewards/margins": 0.016107559204101562, "rewards/rejected": -0.048996344208717346, "step": 10990 }, { "epoch": 1.8952446588559613, "grad_norm": 2.7076034545898438, "learning_rate": 1.6668002037703244e-10, "logits/chosen": -3.1547889709472656, "logits/rejected": -3.1513662338256836, "logps/chosen": -56.488006591796875, "logps/rejected": -61.58770751953125, "loss": 0.6886, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.03598905727267265, "rewards/margins": 0.010044041089713573, "rewards/rejected": -0.0460330992937088, "step": 11000 }, { "epoch": 1.8952446588559613, "eval_logits/chosen": -3.1160166263580322, "eval_logits/rejected": -3.1103060245513916, "eval_logps/chosen": -59.60320281982422, "eval_logps/rejected": -64.71551513671875, "eval_loss": 0.6901227235794067, "eval_rewards/accuracies": 0.5908457040786743, "eval_rewards/chosen": -0.008913068100810051, "eval_rewards/margins": 0.006440852303057909, "eval_rewards/rejected": -0.015353920869529247, "eval_runtime": 383.863, "eval_samples_per_second": 11.212, "eval_steps_per_second": 1.402, "step": 11000 }, { "epoch": 1.8969676085458305, "grad_norm": 2.4201393127441406, "learning_rate": 1.6125687744958039e-10, "logits/chosen": -3.0817818641662598, "logits/rejected": -3.049903154373169, "logps/chosen": -56.0413932800293, "logps/rejected": -56.055267333984375, "loss": 0.6848, "rewards/accuracies": 0.6875, "rewards/chosen": -0.028350725769996643, "rewards/margins": 0.017499305307865143, "rewards/rejected": -0.045850031077861786, "step": 11010 }, { "epoch": 1.8986905582356997, "grad_norm": 2.7356576919555664, "learning_rate": 1.5592270703374988e-10, "logits/chosen": -3.0850062370300293, "logits/rejected": -3.0661559104919434, "logps/chosen": -57.593597412109375, "logps/rejected": -57.5640869140625, "loss": 0.684, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.025895848870277405, "rewards/margins": 0.019343126565217972, "rewards/rejected": -0.045238979160785675, "step": 11020 }, { "epoch": 1.9004135079255686, "grad_norm": 2.9743478298187256, "learning_rate": 1.5067755736685395e-10, "logits/chosen": -3.058427333831787, "logits/rejected": -3.0497522354125977, "logps/chosen": -58.04417037963867, "logps/rejected": -59.50719451904297, "loss": 0.6848, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.03255953639745712, "rewards/margins": 0.0176172386854887, "rewards/rejected": -0.05017677694559097, "step": 11030 }, { "epoch": 1.9021364576154376, "grad_norm": 2.6739232540130615, "learning_rate": 1.4552147588118735e-10, "logits/chosen": -3.087679147720337, "logits/rejected": -3.072469711303711, "logps/chosen": -59.788185119628906, "logps/rejected": -60.3827018737793, "loss": 0.686, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.03321398049592972, "rewards/margins": 0.015200495719909668, "rewards/rejected": -0.048414479941129684, "step": 11040 }, { "epoch": 1.9038594073053066, "grad_norm": 2.4157602787017822, "learning_rate": 1.4045450920358916e-10, "logits/chosen": -3.0092153549194336, "logits/rejected": -2.989617109298706, "logps/chosen": -54.784645080566406, "logps/rejected": -59.533729553222656, "loss": 0.6864, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.028205087408423424, "rewards/margins": 0.014475090429186821, "rewards/rejected": -0.04268018156290054, "step": 11050 }, { "epoch": 1.9055823569951758, "grad_norm": 2.6217141151428223, "learning_rate": 1.354767031550308e-10, "logits/chosen": -3.0563225746154785, "logits/rejected": -3.0126149654388428, "logps/chosen": -57.26979446411133, "logps/rejected": -57.185638427734375, "loss": 0.6814, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.029949193820357323, "rewards/margins": 0.024533966556191444, "rewards/rejected": -0.054483164101839066, "step": 11060 }, { "epoch": 1.907305306685045, "grad_norm": 2.354989528656006, "learning_rate": 1.305881027501965e-10, "logits/chosen": -3.0827879905700684, "logits/rejected": -3.034569025039673, "logps/chosen": -55.78126907348633, "logps/rejected": -54.82404708862305, "loss": 0.6804, "rewards/accuracies": 0.6875, "rewards/chosen": -0.026058072224259377, "rewards/margins": 0.026529595255851746, "rewards/rejected": -0.052587658166885376, "step": 11070 }, { "epoch": 1.909028256374914, "grad_norm": 2.645761728286743, "learning_rate": 1.2578875219707463e-10, "logits/chosen": -3.06398344039917, "logits/rejected": -3.037116765975952, "logps/chosen": -58.64666748046875, "logps/rejected": -57.29736328125, "loss": 0.6853, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.031432170420885086, "rewards/margins": 0.016599904745817184, "rewards/rejected": -0.04803207889199257, "step": 11080 }, { "epoch": 1.9107512060647829, "grad_norm": 2.631910562515259, "learning_rate": 1.2107869489656141e-10, "logits/chosen": -3.0288023948669434, "logits/rejected": -3.009167432785034, "logps/chosen": -60.91929244995117, "logps/rejected": -57.264122009277344, "loss": 0.6865, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.030957896262407303, "rewards/margins": 0.014416252262890339, "rewards/rejected": -0.04537414759397507, "step": 11090 }, { "epoch": 1.9124741557546519, "grad_norm": 2.188575506210327, "learning_rate": 1.16457973442069e-10, "logits/chosen": -2.915553569793701, "logits/rejected": -2.900320529937744, "logps/chosen": -54.17964553833008, "logps/rejected": -58.8646240234375, "loss": 0.6859, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03301596641540527, "rewards/margins": 0.015402073971927166, "rewards/rejected": -0.048418037593364716, "step": 11100 }, { "epoch": 1.9124741557546519, "eval_logits/chosen": -3.1159143447875977, "eval_logits/rejected": -3.1102497577667236, "eval_logps/chosen": -59.591861724853516, "eval_logps/rejected": -64.70150756835938, "eval_loss": 0.6901371479034424, "eval_rewards/accuracies": 0.5845724940299988, "eval_rewards/chosen": -0.008799640461802483, "eval_rewards/margins": 0.006414216477423906, "eval_rewards/rejected": -0.01521385833621025, "eval_runtime": 383.9987, "eval_samples_per_second": 11.208, "eval_steps_per_second": 1.401, "step": 11100 }, { "epoch": 1.914197105444521, "grad_norm": 2.3288257122039795, "learning_rate": 1.119266296191368e-10, "logits/chosen": -3.0594656467437744, "logits/rejected": -3.034283399581909, "logps/chosen": -57.703758239746094, "logps/rejected": -59.37468719482422, "loss": 0.6835, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03802116960287094, "rewards/margins": 0.02051617205142975, "rewards/rejected": -0.05853734165430069, "step": 11110 }, { "epoch": 1.9159200551343902, "grad_norm": 2.249931812286377, "learning_rate": 1.0748470440505532e-10, "logits/chosen": -3.150852918624878, "logits/rejected": -3.1203830242156982, "logps/chosen": -62.930511474609375, "logps/rejected": -58.691131591796875, "loss": 0.6857, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.034155942499637604, "rewards/margins": 0.016003701835870743, "rewards/rejected": -0.05015964433550835, "step": 11120 }, { "epoch": 1.9176430048242592, "grad_norm": 2.8102667331695557, "learning_rate": 1.0313223796849735e-10, "logits/chosen": -2.976644992828369, "logits/rejected": -2.9540998935699463, "logps/chosen": -59.52458572387695, "logps/rejected": -57.91312789916992, "loss": 0.6882, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03648515045642853, "rewards/margins": 0.010790010914206505, "rewards/rejected": -0.04727516323328018, "step": 11130 }, { "epoch": 1.9193659545141282, "grad_norm": 2.495495319366455, "learning_rate": 9.886926966915178e-11, "logits/chosen": -2.9461255073547363, "logits/rejected": -2.9252490997314453, "logps/chosen": -56.85634231567383, "logps/rejected": -56.508995056152344, "loss": 0.6842, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.030429724603891373, "rewards/margins": 0.018887072801589966, "rewards/rejected": -0.04931679740548134, "step": 11140 }, { "epoch": 1.9210889042039971, "grad_norm": 2.9053337574005127, "learning_rate": 9.469583805736925e-11, "logits/chosen": -3.0301146507263184, "logits/rejected": -3.017643451690674, "logps/chosen": -61.63774490356445, "logps/rejected": -60.53892135620117, "loss": 0.6913, "rewards/accuracies": 0.53125, "rewards/chosen": -0.03544190898537636, "rewards/margins": 0.004841863643378019, "rewards/rejected": -0.04028376564383507, "step": 11150 }, { "epoch": 1.9228118538938663, "grad_norm": 2.2418413162231445, "learning_rate": 9.06119808738115e-11, "logits/chosen": -3.1213736534118652, "logits/rejected": -3.1018104553222656, "logps/chosen": -54.93082809448242, "logps/rejected": -55.081199645996094, "loss": 0.6874, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.028121694922447205, "rewards/margins": 0.012537782080471516, "rewards/rejected": -0.04065947234630585, "step": 11160 }, { "epoch": 1.9245348035837355, "grad_norm": 2.396535634994507, "learning_rate": 8.661773504911486e-11, "logits/chosen": -2.9440383911132812, "logits/rejected": -2.9148967266082764, "logps/chosen": -59.909454345703125, "logps/rejected": -53.46514892578125, "loss": 0.6851, "rewards/accuracies": 0.65625, "rewards/chosen": -0.027473077178001404, "rewards/margins": 0.016955533996224403, "rewards/rejected": -0.04442860931158066, "step": 11170 }, { "epoch": 1.9262577532736045, "grad_norm": 2.487661600112915, "learning_rate": 8.271313670355163e-11, "logits/chosen": -3.0768094062805176, "logits/rejected": -3.053123950958252, "logps/chosen": -57.1087760925293, "logps/rejected": -55.69123077392578, "loss": 0.685, "rewards/accuracies": 0.625, "rewards/chosen": -0.03142940253019333, "rewards/margins": 0.01724282279610634, "rewards/rejected": -0.04867222160100937, "step": 11180 }, { "epoch": 1.9279807029634735, "grad_norm": 2.548030376434326, "learning_rate": 7.889822114670708e-11, "logits/chosen": -3.079629421234131, "logits/rejected": -3.0492467880249023, "logps/chosen": -58.1506233215332, "logps/rejected": -57.200965881347656, "loss": 0.6823, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03268757462501526, "rewards/margins": 0.022937973961234093, "rewards/rejected": -0.0556255504488945, "step": 11190 }, { "epoch": 1.9297036526533424, "grad_norm": 2.3734309673309326, "learning_rate": 7.5173022877153e-11, "logits/chosen": -3.077995777130127, "logits/rejected": -3.056281566619873, "logps/chosen": -55.8563346862793, "logps/rejected": -58.005287170410156, "loss": 0.685, "rewards/accuracies": 0.65625, "rewards/chosen": -0.025432473048567772, "rewards/margins": 0.0171569362282753, "rewards/rejected": -0.04258941113948822, "step": 11200 }, { "epoch": 1.9297036526533424, "eval_logits/chosen": -3.115981101989746, "eval_logits/rejected": -3.1102943420410156, "eval_logps/chosen": -59.59299087524414, "eval_logps/rejected": -64.6996841430664, "eval_loss": 0.6901500225067139, "eval_rewards/accuracies": 0.5845724940299988, "eval_rewards/chosen": -0.00881095789372921, "eval_rewards/margins": 0.006384588778018951, "eval_rewards/rejected": -0.015195546671748161, "eval_runtime": 384.1727, "eval_samples_per_second": 11.203, "eval_steps_per_second": 1.4, "step": 11200 }, { "epoch": 1.9314266023432116, "grad_norm": 2.562915325164795, "learning_rate": 7.15375755821468e-11, "logits/chosen": -2.9258503913879395, "logits/rejected": -2.909374237060547, "logps/chosen": -53.963531494140625, "logps/rejected": -59.4776496887207, "loss": 0.6839, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.03552749380469322, "rewards/margins": 0.019793927669525146, "rewards/rejected": -0.05532141774892807, "step": 11210 }, { "epoch": 1.9331495520330806, "grad_norm": 2.5470385551452637, "learning_rate": 6.799191213731737e-11, "logits/chosen": -3.0415053367614746, "logits/rejected": -3.0274736881256104, "logps/chosen": -55.11113739013672, "logps/rejected": -60.2578239440918, "loss": 0.6861, "rewards/accuracies": 0.625, "rewards/chosen": -0.030614599585533142, "rewards/margins": 0.015119021758437157, "rewards/rejected": -0.04573361948132515, "step": 11220 }, { "epoch": 1.9348725017229498, "grad_norm": 2.789475202560425, "learning_rate": 6.453606460637195e-11, "logits/chosen": -3.0025832653045654, "logits/rejected": -2.9894137382507324, "logps/chosen": -59.69022750854492, "logps/rejected": -56.4006462097168, "loss": 0.6889, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.039212532341480255, "rewards/margins": 0.009541334584355354, "rewards/rejected": -0.04875386878848076, "step": 11230 }, { "epoch": 1.9365954514128187, "grad_norm": 2.2757625579833984, "learning_rate": 6.11700642408064e-11, "logits/chosen": -2.9875824451446533, "logits/rejected": -2.971897602081299, "logps/chosen": -57.854469299316406, "logps/rejected": -58.3641471862793, "loss": 0.6851, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.032043833285570145, "rewards/margins": 0.017164278775453568, "rewards/rejected": -0.04920811206102371, "step": 11240 }, { "epoch": 1.9383184011026877, "grad_norm": 2.598924398422241, "learning_rate": 5.7893941479620904e-11, "logits/chosen": -3.018852472305298, "logits/rejected": -2.991450786590576, "logps/chosen": -57.508934020996094, "logps/rejected": -58.6025390625, "loss": 0.6848, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.032207388430833817, "rewards/margins": 0.01755516789853573, "rewards/rejected": -0.049762558192014694, "step": 11250 }, { "epoch": 1.940041350792557, "grad_norm": 2.611050844192505, "learning_rate": 5.4707725949045826e-11, "logits/chosen": -2.967341661453247, "logits/rejected": -2.9327094554901123, "logps/chosen": -60.662452697753906, "logps/rejected": -56.185386657714844, "loss": 0.6846, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.028289467096328735, "rewards/margins": 0.017964553087949753, "rewards/rejected": -0.04625401645898819, "step": 11260 }, { "epoch": 1.9417643004824259, "grad_norm": 2.6599137783050537, "learning_rate": 5.1611446462274116e-11, "logits/chosen": -3.0220043659210205, "logits/rejected": -2.9908974170684814, "logps/chosen": -58.7435417175293, "logps/rejected": -55.7933235168457, "loss": 0.6857, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0317700020968914, "rewards/margins": 0.015762265771627426, "rewards/rejected": -0.04753226786851883, "step": 11270 }, { "epoch": 1.943487250172295, "grad_norm": 2.3055601119995117, "learning_rate": 4.8605131019198165e-11, "logits/chosen": -3.0796470642089844, "logits/rejected": -3.049293041229248, "logps/chosen": -59.201881408691406, "logps/rejected": -57.14641189575195, "loss": 0.6843, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.028897400945425034, "rewards/margins": 0.01845616102218628, "rewards/rejected": -0.04735356569290161, "step": 11280 }, { "epoch": 1.945210199862164, "grad_norm": 2.618269681930542, "learning_rate": 4.568880680616228e-11, "logits/chosen": -3.031619071960449, "logits/rejected": -3.0159995555877686, "logps/chosen": -62.53656005859375, "logps/rejected": -60.65777587890625, "loss": 0.6841, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.025777708739042282, "rewards/margins": 0.019383911043405533, "rewards/rejected": -0.045161619782447815, "step": 11290 }, { "epoch": 1.946933149552033, "grad_norm": 2.4982001781463623, "learning_rate": 4.2862500195708364e-11, "logits/chosen": -3.007124423980713, "logits/rejected": -2.9809606075286865, "logps/chosen": -61.85832977294922, "logps/rejected": -58.00861740112305, "loss": 0.6869, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.029342034831643105, "rewards/margins": 0.01363422255963087, "rewards/rejected": -0.0429762564599514, "step": 11300 }, { "epoch": 1.946933149552033, "eval_logits/chosen": -3.116105318069458, "eval_logits/rejected": -3.110396146774292, "eval_logps/chosen": -59.598411560058594, "eval_logps/rejected": -64.70812225341797, "eval_loss": 0.6901355385780334, "eval_rewards/accuracies": 0.5875929594039917, "eval_rewards/chosen": -0.00886518508195877, "eval_rewards/margins": 0.006414768751710653, "eval_rewards/rejected": -0.015279954299330711, "eval_runtime": 384.0506, "eval_samples_per_second": 11.207, "eval_steps_per_second": 1.401, "step": 11300 }, { "epoch": 1.948656099241902, "grad_norm": 2.3274354934692383, "learning_rate": 4.01262367463473e-11, "logits/chosen": -3.119818687438965, "logits/rejected": -3.113994598388672, "logps/chosen": -53.55225372314453, "logps/rejected": -56.986839294433594, "loss": 0.6817, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.030891120433807373, "rewards/margins": 0.02375052683055401, "rewards/rejected": -0.05464165285229683, "step": 11310 }, { "epoch": 1.9503790489317712, "grad_norm": 2.3976454734802246, "learning_rate": 3.748004120231685e-11, "logits/chosen": -3.033069133758545, "logits/rejected": -3.0026774406433105, "logps/chosen": -57.32920455932617, "logps/rejected": -58.893646240234375, "loss": 0.6808, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02407662384212017, "rewards/margins": 0.025767523795366287, "rewards/rejected": -0.04984414950013161, "step": 11320 }, { "epoch": 1.9521019986216404, "grad_norm": 2.341428279876709, "learning_rate": 3.492393749336964e-11, "logits/chosen": -3.0544650554656982, "logits/rejected": -3.017298698425293, "logps/chosen": -55.70989990234375, "logps/rejected": -53.46592330932617, "loss": 0.6827, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03151210397481918, "rewards/margins": 0.021801788359880447, "rewards/rejected": -0.05331388860940933, "step": 11330 }, { "epoch": 1.9538249483115093, "grad_norm": 2.4387247562408447, "learning_rate": 3.245794873454777e-11, "logits/chosen": -2.930663585662842, "logits/rejected": -2.9110989570617676, "logps/chosen": -58.4086799621582, "logps/rejected": -59.36699295043945, "loss": 0.6857, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.030292293056845665, "rewards/margins": 0.015786411240696907, "rewards/rejected": -0.04607870429754257, "step": 11340 }, { "epoch": 1.9555478980013783, "grad_norm": 2.442936420440674, "learning_rate": 3.0082097225977436e-11, "logits/chosen": -3.0895800590515137, "logits/rejected": -3.0782761573791504, "logps/chosen": -60.061737060546875, "logps/rejected": -60.759864807128906, "loss": 0.6859, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0315508171916008, "rewards/margins": 0.015562380664050579, "rewards/rejected": -0.04711320251226425, "step": 11350 }, { "epoch": 1.9572708476912473, "grad_norm": 2.343500852584839, "learning_rate": 2.7796404452666847e-11, "logits/chosen": -3.0479581356048584, "logits/rejected": -3.0179901123046875, "logps/chosen": -55.6785774230957, "logps/rejected": -54.041419982910156, "loss": 0.6834, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03423875570297241, "rewards/margins": 0.020591724663972855, "rewards/rejected": -0.05483048036694527, "step": 11360 }, { "epoch": 1.9589937973811165, "grad_norm": 2.565880298614502, "learning_rate": 2.5600891084311962e-11, "logits/chosen": -3.0038440227508545, "logits/rejected": -2.984208345413208, "logps/chosen": -59.0855598449707, "logps/rejected": -58.140045166015625, "loss": 0.6851, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.028102219104766846, "rewards/margins": 0.017020905390381813, "rewards/rejected": -0.04512312263250351, "step": 11370 }, { "epoch": 1.9607167470709856, "grad_norm": 2.500283718109131, "learning_rate": 2.3495576975107737e-11, "logits/chosen": -2.9798200130462646, "logits/rejected": -2.9610161781311035, "logps/chosen": -55.38152313232422, "logps/rejected": -55.190673828125, "loss": 0.6857, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.03548261523246765, "rewards/margins": 0.015744756907224655, "rewards/rejected": -0.051227372139692307, "step": 11380 }, { "epoch": 1.9624396967608546, "grad_norm": 2.4395084381103516, "learning_rate": 2.1480481163572707e-11, "logits/chosen": -3.0970280170440674, "logits/rejected": -3.0766406059265137, "logps/chosen": -55.200523376464844, "logps/rejected": -60.348594665527344, "loss": 0.6829, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.028758252039551735, "rewards/margins": 0.021567989140748978, "rewards/rejected": -0.05032623931765556, "step": 11390 }, { "epoch": 1.9641626464507236, "grad_norm": 2.49170184135437, "learning_rate": 1.9555621872374695e-11, "logits/chosen": -3.0487561225891113, "logits/rejected": -3.0144057273864746, "logps/chosen": -58.76116180419922, "logps/rejected": -57.61921310424805, "loss": 0.6864, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.028487438336014748, "rewards/margins": 0.014596112072467804, "rewards/rejected": -0.043083555996418, "step": 11400 }, { "epoch": 1.9641626464507236, "eval_logits/chosen": -3.116114377975464, "eval_logits/rejected": -3.1104519367218018, "eval_logps/chosen": -59.5885124206543, "eval_logps/rejected": -64.69520568847656, "eval_loss": 0.6901496052742004, "eval_rewards/accuracies": 0.5908457040786743, "eval_rewards/chosen": -0.008766171522438526, "eval_rewards/margins": 0.006384550128132105, "eval_rewards/rejected": -0.015150722116231918, "eval_runtime": 383.7144, "eval_samples_per_second": 11.217, "eval_steps_per_second": 1.402, "step": 11400 }, { "epoch": 1.9658855961405926, "grad_norm": 2.6475255489349365, "learning_rate": 1.7721016508163158e-11, "logits/chosen": -3.0536608695983887, "logits/rejected": -3.0251669883728027, "logps/chosen": -58.230804443359375, "logps/rejected": -59.3325309753418, "loss": 0.685, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.025450700893998146, "rewards/margins": 0.017182698473334312, "rewards/rejected": -0.04263339936733246, "step": 11410 }, { "epoch": 1.9676085458304617, "grad_norm": 2.4510061740875244, "learning_rate": 1.597668166141486e-11, "logits/chosen": -3.0368571281433105, "logits/rejected": -3.004257917404175, "logps/chosen": -56.6202278137207, "logps/rejected": -55.45795822143555, "loss": 0.6824, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.029352698475122452, "rewards/margins": 0.022640466690063477, "rewards/rejected": -0.05199316143989563, "step": 11420 }, { "epoch": 1.969331495520331, "grad_norm": 2.4990296363830566, "learning_rate": 1.4322633106286232e-11, "logits/chosen": -2.9539122581481934, "logits/rejected": -2.92718505859375, "logps/chosen": -58.123313903808594, "logps/rejected": -57.20905303955078, "loss": 0.6866, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.03152982145547867, "rewards/margins": 0.013926585204899311, "rewards/rejected": -0.045456402003765106, "step": 11430 }, { "epoch": 1.9710544452102, "grad_norm": 2.4815919399261475, "learning_rate": 1.2758885800464581e-11, "logits/chosen": -3.035094738006592, "logits/rejected": -3.0234522819519043, "logps/chosen": -56.13129806518555, "logps/rejected": -56.96406936645508, "loss": 0.6898, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.03581147640943527, "rewards/margins": 0.007677781395614147, "rewards/rejected": -0.043489255011081696, "step": 11440 }, { "epoch": 1.9727773949000689, "grad_norm": 2.4564032554626465, "learning_rate": 1.12854538850371e-11, "logits/chosen": -2.955756664276123, "logits/rejected": -2.9337503910064697, "logps/chosen": -55.9810676574707, "logps/rejected": -56.99578857421875, "loss": 0.6835, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.032729096710681915, "rewards/margins": 0.02022354118525982, "rewards/rejected": -0.05295264720916748, "step": 11450 }, { "epoch": 1.9745003445899378, "grad_norm": 2.609710931777954, "learning_rate": 9.90235068436207e-12, "logits/chosen": -2.999922275543213, "logits/rejected": -2.9787392616271973, "logps/chosen": -58.519287109375, "logps/rejected": -59.29533767700195, "loss": 0.6838, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.029814541339874268, "rewards/margins": 0.019777704030275345, "rewards/rejected": -0.049592241644859314, "step": 11460 }, { "epoch": 1.976223294279807, "grad_norm": 2.5704898834228516, "learning_rate": 8.609588705947857e-12, "logits/chosen": -3.0523245334625244, "logits/rejected": -3.0218346118927, "logps/chosen": -60.27888870239258, "logps/rejected": -56.3681526184082, "loss": 0.684, "rewards/accuracies": 0.59375, "rewards/chosen": -0.03436389937996864, "rewards/margins": 0.019532622769474983, "rewards/rejected": -0.053896524012088776, "step": 11470 }, { "epoch": 1.9779462439696762, "grad_norm": 2.4101979732513428, "learning_rate": 7.407179640341877e-12, "logits/chosen": -3.090115547180176, "logits/rejected": -3.056811809539795, "logps/chosen": -58.93994140625, "logps/rejected": -55.08115768432617, "loss": 0.6846, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0284547321498394, "rewards/margins": 0.01805991493165493, "rewards/rejected": -0.04651464894413948, "step": 11480 }, { "epoch": 1.9796691936595452, "grad_norm": 2.46321177482605, "learning_rate": 6.295134361020694e-12, "logits/chosen": -3.010709047317505, "logits/rejected": -2.9817841053009033, "logps/chosen": -57.873619079589844, "logps/rejected": -55.653099060058594, "loss": 0.6868, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.03182325139641762, "rewards/margins": 0.013528557494282722, "rewards/rejected": -0.04535181075334549, "step": 11490 }, { "epoch": 1.9813921433494142, "grad_norm": 2.106982469558716, "learning_rate": 5.273462924296757e-12, "logits/chosen": -3.058711528778076, "logits/rejected": -3.036954402923584, "logps/chosen": -53.891815185546875, "logps/rejected": -55.3429069519043, "loss": 0.689, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.03603522107005119, "rewards/margins": 0.009504115208983421, "rewards/rejected": -0.04553934186697006, "step": 11500 }, { "epoch": 1.9813921433494142, "eval_logits/chosen": -3.1161394119262695, "eval_logits/rejected": -3.1104989051818848, "eval_logps/chosen": -59.606449127197266, "eval_logps/rejected": -64.71166229248047, "eval_loss": 0.6901578307151794, "eval_rewards/accuracies": 0.5820167064666748, "eval_rewards/chosen": -0.008945533074438572, "eval_rewards/margins": 0.006369884591549635, "eval_rewards/rejected": -0.01531541720032692, "eval_runtime": 383.8703, "eval_samples_per_second": 11.212, "eval_steps_per_second": 1.402, "step": 11500 }, { "epoch": 1.9831150930392831, "grad_norm": 2.3931286334991455, "learning_rate": 4.342174569221813e-12, "logits/chosen": -2.955284833908081, "logits/rejected": -2.945343017578125, "logps/chosen": -58.84697723388672, "logps/rejected": -58.00005340576172, "loss": 0.687, "rewards/accuracies": 0.59375, "rewards/chosen": -0.033665161579847336, "rewards/margins": 0.013647237792611122, "rewards/rejected": -0.04731239750981331, "step": 11510 }, { "epoch": 1.9848380427291523, "grad_norm": 2.32000994682312, "learning_rate": 3.501277717508078e-12, "logits/chosen": -3.0707528591156006, "logits/rejected": -3.0135512351989746, "logps/chosen": -61.96906280517578, "logps/rejected": -55.12891387939453, "loss": 0.6788, "rewards/accuracies": 0.71875, "rewards/chosen": -0.02170426771044731, "rewards/margins": 0.029822617769241333, "rewards/rejected": -0.051526885479688644, "step": 11520 }, { "epoch": 1.9865609924190215, "grad_norm": 2.522261142730713, "learning_rate": 2.750779973452744e-12, "logits/chosen": -3.03159761428833, "logits/rejected": -2.9960219860076904, "logps/chosen": -56.40028762817383, "logps/rejected": -53.7711296081543, "loss": 0.6845, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.029385218396782875, "rewards/margins": 0.018221555277705193, "rewards/rejected": -0.04760677367448807, "step": 11530 }, { "epoch": 1.9882839421088905, "grad_norm": 2.307715892791748, "learning_rate": 2.0906881238624833e-12, "logits/chosen": -3.074688196182251, "logits/rejected": -3.0627036094665527, "logps/chosen": -53.6716194152832, "logps/rejected": -59.98317337036133, "loss": 0.6845, "rewards/accuracies": 0.625, "rewards/chosen": -0.029241954907774925, "rewards/margins": 0.01832282915711403, "rewards/rejected": -0.047564782202243805, "step": 11540 }, { "epoch": 1.9900068917987594, "grad_norm": 2.5106430053710938, "learning_rate": 1.5210081380001572e-12, "logits/chosen": -3.000810146331787, "logits/rejected": -2.9803338050842285, "logps/chosen": -54.16124725341797, "logps/rejected": -57.501487731933594, "loss": 0.6844, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.028317982330918312, "rewards/margins": 0.018404236063361168, "rewards/rejected": -0.04672221839427948, "step": 11550 }, { "epoch": 1.9917298414886284, "grad_norm": 2.4142873287200928, "learning_rate": 1.0417451675248657e-12, "logits/chosen": -3.031704902648926, "logits/rejected": -3.0022201538085938, "logps/chosen": -58.48185348510742, "logps/rejected": -57.05162811279297, "loss": 0.6848, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0349888876080513, "rewards/margins": 0.01772366650402546, "rewards/rejected": -0.05271255224943161, "step": 11560 }, { "epoch": 1.9934527911784976, "grad_norm": 2.451390266418457, "learning_rate": 6.529035464486466e-13, "logits/chosen": -3.0356264114379883, "logits/rejected": -3.015634536743164, "logps/chosen": -57.8956413269043, "logps/rejected": -60.697242736816406, "loss": 0.6876, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.037846293300390244, "rewards/margins": 0.012129221111536026, "rewards/rejected": -0.04997551441192627, "step": 11570 }, { "epoch": 1.9951757408683668, "grad_norm": 2.7996134757995605, "learning_rate": 3.5448679109761907e-13, "logits/chosen": -3.0607693195343018, "logits/rejected": -3.0405261516571045, "logps/chosen": -59.582923889160156, "logps/rejected": -60.716270446777344, "loss": 0.685, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03324711322784424, "rewards/margins": 0.017170798033475876, "rewards/rejected": -0.050417911261320114, "step": 11580 }, { "epoch": 1.9968986905582358, "grad_norm": 2.7875218391418457, "learning_rate": 1.4649760007534597e-13, "logits/chosen": -2.9645540714263916, "logits/rejected": -2.924387216567993, "logps/chosen": -61.875083923339844, "logps/rejected": -55.68303680419922, "loss": 0.6822, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.023736396804451942, "rewards/margins": 0.02287469431757927, "rewards/rejected": -0.04661108925938606, "step": 11590 }, { "epoch": 1.9986216402481047, "grad_norm": 2.565707206726074, "learning_rate": 2.8937854245070226e-14, "logits/chosen": -3.003178119659424, "logits/rejected": -2.991347551345825, "logps/chosen": -53.818214416503906, "logps/rejected": -58.562896728515625, "loss": 0.6865, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.033688973635435104, "rewards/margins": 0.014114851132035255, "rewards/rejected": -0.04780382663011551, "step": 11600 }, { "epoch": 1.9986216402481047, "eval_logits/chosen": -3.11618709564209, "eval_logits/rejected": -3.1104962825775146, "eval_logps/chosen": -59.58960723876953, "eval_logps/rejected": -64.70088958740234, "eval_loss": 0.6901275515556335, "eval_rewards/accuracies": 0.589219331741333, "eval_rewards/chosen": -0.00877712108194828, "eval_rewards/margins": 0.006430591456592083, "eval_rewards/rejected": -0.015207710675895214, "eval_runtime": 385.2313, "eval_samples_per_second": 11.173, "eval_steps_per_second": 1.397, "step": 11600 }, { "epoch": 2.0, "step": 11608, "total_flos": 0.0, "train_loss": 0.6883086910911629, "train_runtime": 95005.2512, "train_samples_per_second": 1.955, "train_steps_per_second": 0.122 } ], "logging_steps": 10, "max_steps": 11608, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }