diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,19313 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 100, + "global_step": 11608, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017229496898690558, + "grad_norm": 2.1822969913482666, + "learning_rate": 1.7226528854435833e-11, + "logits/chosen": -2.967046022415161, + "logits/rejected": -2.9243061542510986, + "logps/chosen": -43.99115753173828, + "logps/rejected": -41.627906799316406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0017229496898690559, + "grad_norm": 2.3854050636291504, + "learning_rate": 1.7226528854435832e-10, + "logits/chosen": -3.055169105529785, + "logits/rejected": -3.025726795196533, + "logps/chosen": -50.45830535888672, + "logps/rejected": -49.59857177734375, + "loss": 0.693, + "rewards/accuracies": 0.3819444477558136, + "rewards/chosen": 7.992664905032143e-05, + "rewards/margins": 0.00021500879665836692, + "rewards/rejected": -0.00013508212578017265, + "step": 10 + }, + { + "epoch": 0.0034458993797381117, + "grad_norm": 2.243231773376465, + "learning_rate": 3.4453057708871663e-10, + "logits/chosen": -3.1189680099487305, + "logits/rejected": -3.110758066177368, + "logps/chosen": -52.657142639160156, + "logps/rejected": -52.99263381958008, + "loss": 0.6932, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 5.9444828366395086e-05, + "rewards/margins": -6.718172517139465e-05, + "rewards/rejected": 0.0001266265317099169, + "step": 20 + }, + { + "epoch": 0.005168849069607168, + "grad_norm": 2.578056573867798, + "learning_rate": 5.167958656330749e-10, + "logits/chosen": -3.0915324687957764, + "logits/rejected": -3.067788600921631, + "logps/chosen": -56.78974151611328, + "logps/rejected": -58.443809509277344, + "loss": 0.6931, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 6.385785673046485e-05, + "rewards/margins": 0.0001299582072533667, + "rewards/rejected": -6.610035052290186e-05, + "step": 30 + }, + { + "epoch": 0.006891798759476223, + "grad_norm": 2.0117297172546387, + "learning_rate": 6.890611541774333e-10, + "logits/chosen": -3.105164051055908, + "logits/rejected": -3.0736613273620605, + "logps/chosen": -55.2633056640625, + "logps/rejected": -50.67898178100586, + "loss": 0.6931, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": 8.863389666657895e-05, + "rewards/margins": 2.5206496502505615e-05, + "rewards/rejected": 6.342738925013691e-05, + "step": 40 + }, + { + "epoch": 0.00861474844934528, + "grad_norm": 2.3875701427459717, + "learning_rate": 8.613264427217916e-10, + "logits/chosen": -3.1009817123413086, + "logits/rejected": -3.0846290588378906, + "logps/chosen": -53.1203498840332, + "logps/rejected": -51.499549865722656, + "loss": 0.6932, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -3.746306902030483e-05, + "rewards/margins": -4.0330953197553754e-05, + "rewards/rejected": 2.86787394543353e-06, + "step": 50 + }, + { + "epoch": 0.010337698139214336, + "grad_norm": 2.789041757583618, + "learning_rate": 1.0335917312661499e-09, + "logits/chosen": -3.153869867324829, + "logits/rejected": -3.1241626739501953, + "logps/chosen": -57.59900665283203, + "logps/rejected": -54.145477294921875, + "loss": 0.6934, + "rewards/accuracies": 0.40625, + "rewards/chosen": -0.00012567281373776495, + "rewards/margins": -0.0004897199687547982, + "rewards/rejected": 0.0003640470968093723, + "step": 60 + }, + { + "epoch": 0.012060647829083391, + "grad_norm": 2.1988277435302734, + "learning_rate": 1.2058570198105082e-09, + "logits/chosen": -3.0509283542633057, + "logits/rejected": -3.030928134918213, + "logps/chosen": -53.77088165283203, + "logps/rejected": -53.22446823120117, + "loss": 0.6932, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.639079346205108e-05, + "rewards/margins": -1.7232532627531327e-05, + "rewards/rejected": 8.417169965468929e-07, + "step": 70 + }, + { + "epoch": 0.013783597518952447, + "grad_norm": 2.439988136291504, + "learning_rate": 1.3781223083548665e-09, + "logits/chosen": -3.159721851348877, + "logits/rejected": -3.126398801803589, + "logps/chosen": -59.07847213745117, + "logps/rejected": -54.11749267578125, + "loss": 0.6929, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.00017951996414922178, + "rewards/margins": 0.0003985256771557033, + "rewards/rejected": -0.0002190056984545663, + "step": 80 + }, + { + "epoch": 0.015506547208821502, + "grad_norm": 2.474202871322632, + "learning_rate": 1.5503875968992249e-09, + "logits/chosen": -2.9933810234069824, + "logits/rejected": -2.9786269664764404, + "logps/chosen": -53.468894958496094, + "logps/rejected": -52.8430290222168, + "loss": 0.6931, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 2.1566442228504457e-05, + "rewards/margins": 3.089628808083944e-05, + "rewards/rejected": -9.329845852334984e-06, + "step": 90 + }, + { + "epoch": 0.01722949689869056, + "grad_norm": 2.488950252532959, + "learning_rate": 1.7226528854435832e-09, + "logits/chosen": -3.169787883758545, + "logits/rejected": -3.107752561569214, + "logps/chosen": -55.961708068847656, + "logps/rejected": -49.63905715942383, + "loss": 0.6932, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.00016789429355412722, + "rewards/margins": -7.354038825724274e-05, + "rewards/rejected": -9.435390529688448e-05, + "step": 100 + }, + { + "epoch": 0.01722949689869056, + "eval_logits/chosen": -3.1630771160125732, + "eval_logits/rejected": -3.157426118850708, + "eval_logps/chosen": -58.701412200927734, + "eval_logps/rejected": -63.16501998901367, + "eval_loss": 0.6931708455085754, + "eval_rewards/accuracies": 0.49465614557266235, + "eval_rewards/chosen": 0.00010478924377821386, + "eval_rewards/margins": -4.6228298742789775e-05, + "eval_rewards/rejected": 0.00015101753524504602, + "eval_runtime": 383.2857, + "eval_samples_per_second": 11.229, + "eval_steps_per_second": 1.404, + "step": 100 + }, + { + "epoch": 0.018952446588559616, + "grad_norm": 2.542346715927124, + "learning_rate": 1.8949181739879414e-09, + "logits/chosen": -3.121835470199585, + "logits/rejected": -3.0979726314544678, + "logps/chosen": -55.59098434448242, + "logps/rejected": -52.3350715637207, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00013937894254922867, + "rewards/margins": -9.85497172223404e-05, + "rewards/rejected": -4.08292435167823e-05, + "step": 110 + }, + { + "epoch": 0.02067539627842867, + "grad_norm": 2.5591018199920654, + "learning_rate": 2.0671834625322997e-09, + "logits/chosen": -3.065739870071411, + "logits/rejected": -3.0502285957336426, + "logps/chosen": -53.17518997192383, + "logps/rejected": -55.5625, + "loss": 0.6932, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 6.941436004126444e-05, + "rewards/margins": -2.7006733944290318e-05, + "rewards/rejected": 9.642112127039582e-05, + "step": 120 + }, + { + "epoch": 0.022398345968297727, + "grad_norm": 2.138871908187866, + "learning_rate": 2.239448751076658e-09, + "logits/chosen": -3.1010239124298096, + "logits/rejected": -3.087006092071533, + "logps/chosen": -55.1888313293457, + "logps/rejected": -53.76519775390625, + "loss": 0.6931, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 4.798931331606582e-05, + "rewards/margins": 9.56843578023836e-05, + "rewards/rejected": -4.769503721036017e-05, + "step": 130 + }, + { + "epoch": 0.024121295658166782, + "grad_norm": 2.427386999130249, + "learning_rate": 2.4117140396210164e-09, + "logits/chosen": -3.122720241546631, + "logits/rejected": -3.1043787002563477, + "logps/chosen": -54.18378829956055, + "logps/rejected": -53.78192901611328, + "loss": 0.693, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 2.194441003666725e-05, + "rewards/margins": 0.0002684879000298679, + "rewards/rejected": -0.0002465434663463384, + "step": 140 + }, + { + "epoch": 0.025844245348035838, + "grad_norm": 2.211906909942627, + "learning_rate": 2.5839793281653743e-09, + "logits/chosen": -3.027405261993408, + "logits/rejected": -3.0095810890197754, + "logps/chosen": -52.62847137451172, + "logps/rejected": -52.40435791015625, + "loss": 0.6931, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.676296778896358e-05, + "rewards/margins": 5.470390988193685e-06, + "rewards/rejected": -2.2233365598367527e-05, + "step": 150 + }, + { + "epoch": 0.027567195037904894, + "grad_norm": 2.1558420658111572, + "learning_rate": 2.756244616709733e-09, + "logits/chosen": -3.0889861583709717, + "logits/rejected": -3.0681259632110596, + "logps/chosen": -53.5107536315918, + "logps/rejected": -54.70793914794922, + "loss": 0.6932, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.00023508230515290052, + "rewards/margins": -0.0001835847506299615, + "rewards/rejected": -5.1497579988790676e-05, + "step": 160 + }, + { + "epoch": 0.02929014472777395, + "grad_norm": 2.360699415206909, + "learning_rate": 2.9285099052540914e-09, + "logits/chosen": -3.076531410217285, + "logits/rejected": -3.056931734085083, + "logps/chosen": -56.27750778198242, + "logps/rejected": -51.34051513671875, + "loss": 0.693, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 7.097180059645325e-05, + "rewards/margins": 0.00026213008095510304, + "rewards/rejected": -0.00019115829491056502, + "step": 170 + }, + { + "epoch": 0.031013094417643005, + "grad_norm": 2.610349655151367, + "learning_rate": 3.1007751937984498e-09, + "logits/chosen": -3.0624287128448486, + "logits/rejected": -3.043592929840088, + "logps/chosen": -56.41620635986328, + "logps/rejected": -53.785743713378906, + "loss": 0.6932, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.00010555762855801731, + "rewards/margins": -5.6592289183754474e-05, + "rewards/rejected": -4.8965321184368804e-05, + "step": 180 + }, + { + "epoch": 0.03273604410751206, + "grad_norm": 2.6376547813415527, + "learning_rate": 3.2730404823428077e-09, + "logits/chosen": -3.124223232269287, + "logits/rejected": -3.0807528495788574, + "logps/chosen": -58.18146896362305, + "logps/rejected": -52.5531120300293, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": 2.0690204109996557e-05, + "rewards/margins": 0.00017370580462738872, + "rewards/rejected": -0.0001530156150693074, + "step": 190 + }, + { + "epoch": 0.03445899379738112, + "grad_norm": 2.581326484680176, + "learning_rate": 3.4453057708871665e-09, + "logits/chosen": -3.059694528579712, + "logits/rejected": -3.0440192222595215, + "logps/chosen": -54.1214714050293, + "logps/rejected": -54.7166862487793, + "loss": 0.6932, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": 1.2389538824209012e-05, + "rewards/margins": -4.3077256123069674e-05, + "rewards/rejected": 5.546676038648002e-05, + "step": 200 + }, + { + "epoch": 0.03445899379738112, + "eval_logits/chosen": -3.163004159927368, + "eval_logits/rejected": -3.1573050022125244, + "eval_logps/chosen": -58.7076301574707, + "eval_logps/rejected": -63.1617546081543, + "eval_loss": 0.6932182908058167, + "eval_rewards/accuracies": 0.4839684069156647, + "eval_rewards/chosen": 4.26560036430601e-05, + "eval_rewards/margins": -0.00014103209832683206, + "eval_rewards/rejected": 0.00018368809833191335, + "eval_runtime": 383.3111, + "eval_samples_per_second": 11.228, + "eval_steps_per_second": 1.404, + "step": 200 + }, + { + "epoch": 0.03618194348725017, + "grad_norm": 2.2879650592803955, + "learning_rate": 3.617571059431525e-09, + "logits/chosen": -3.014284610748291, + "logits/rejected": -3.005575656890869, + "logps/chosen": -53.272361755371094, + "logps/rejected": -57.27521896362305, + "loss": 0.6933, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -8.317607716890052e-05, + "rewards/margins": -0.00023591746867168695, + "rewards/rejected": 0.00015274141333065927, + "step": 210 + }, + { + "epoch": 0.03790489317711923, + "grad_norm": 2.327847957611084, + "learning_rate": 3.789836347975883e-09, + "logits/chosen": -3.0507373809814453, + "logits/rejected": -3.019326686859131, + "logps/chosen": -52.20600128173828, + "logps/rejected": -51.31000900268555, + "loss": 0.6932, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -6.532550469273701e-05, + "rewards/margins": -0.0001269731583306566, + "rewards/rejected": 6.164763908600435e-05, + "step": 220 + }, + { + "epoch": 0.03962784286698828, + "grad_norm": 2.3976006507873535, + "learning_rate": 3.962101636520241e-09, + "logits/chosen": -3.0511653423309326, + "logits/rejected": -3.032834529876709, + "logps/chosen": -48.90367889404297, + "logps/rejected": -49.952972412109375, + "loss": 0.6931, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 5.9426492953207344e-05, + "rewards/margins": 0.0001413938298355788, + "rewards/rejected": -8.196735143428668e-05, + "step": 230 + }, + { + "epoch": 0.04135079255685734, + "grad_norm": 2.251117706298828, + "learning_rate": 4.134366925064599e-09, + "logits/chosen": -3.0247445106506348, + "logits/rejected": -2.982290029525757, + "logps/chosen": -55.94219207763672, + "logps/rejected": -52.15814208984375, + "loss": 0.6931, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0001196784432977438, + "rewards/margins": 0.0001547091087559238, + "rewards/rejected": -3.503066545818001e-05, + "step": 240 + }, + { + "epoch": 0.043073742246726394, + "grad_norm": 2.3111438751220703, + "learning_rate": 4.306632213608958e-09, + "logits/chosen": -3.1180098056793213, + "logits/rejected": -3.0976080894470215, + "logps/chosen": -52.28120803833008, + "logps/rejected": -51.09599685668945, + "loss": 0.6932, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.1321669262542855e-05, + "rewards/margins": -2.742862307059113e-05, + "rewards/rejected": 1.61069674504688e-05, + "step": 250 + }, + { + "epoch": 0.044796691936595454, + "grad_norm": 2.313485622406006, + "learning_rate": 4.478897502153316e-09, + "logits/chosen": -3.09426212310791, + "logits/rejected": -3.0820038318634033, + "logps/chosen": -54.8555908203125, + "logps/rejected": -56.63024139404297, + "loss": 0.693, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.00013580010272562504, + "rewards/margins": 0.0002656346478033811, + "rewards/rejected": -0.0001298345159739256, + "step": 260 + }, + { + "epoch": 0.046519641626464506, + "grad_norm": 2.2110888957977295, + "learning_rate": 4.6511627906976744e-09, + "logits/chosen": -3.03316593170166, + "logits/rejected": -3.0150184631347656, + "logps/chosen": -53.122108459472656, + "logps/rejected": -54.308738708496094, + "loss": 0.6931, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.5679284590296447e-05, + "rewards/margins": 9.607634638086893e-06, + "rewards/rejected": -3.52868992194999e-05, + "step": 270 + }, + { + "epoch": 0.048242591316333565, + "grad_norm": 2.4297540187835693, + "learning_rate": 4.823428079242033e-09, + "logits/chosen": -3.124643325805664, + "logits/rejected": -3.090567111968994, + "logps/chosen": -57.60612106323242, + "logps/rejected": -53.4041862487793, + "loss": 0.6932, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 7.05211132299155e-05, + "rewards/margins": -3.20916369673796e-05, + "rewards/rejected": 0.00010261273564537987, + "step": 280 + }, + { + "epoch": 0.04996554100620262, + "grad_norm": 2.245150566101074, + "learning_rate": 4.995693367786391e-09, + "logits/chosen": -3.046997547149658, + "logits/rejected": -3.0325276851654053, + "logps/chosen": -55.36955642700195, + "logps/rejected": -54.27775192260742, + "loss": 0.6931, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 6.89334119670093e-05, + "rewards/margins": 2.902208325394895e-05, + "rewards/rejected": 3.991132689407095e-05, + "step": 290 + }, + { + "epoch": 0.051688490696071676, + "grad_norm": 2.361457347869873, + "learning_rate": 5.167958656330749e-09, + "logits/chosen": -3.0027079582214355, + "logits/rejected": -2.9940621852874756, + "logps/chosen": -52.8454475402832, + "logps/rejected": -53.935462951660156, + "loss": 0.6932, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -3.082995681324974e-05, + "rewards/margins": -0.00011777142208302394, + "rewards/rejected": 8.694147254573181e-05, + "step": 300 + }, + { + "epoch": 0.051688490696071676, + "eval_logits/chosen": -3.163076639175415, + "eval_logits/rejected": -3.1573708057403564, + "eval_logps/chosen": -58.71210479736328, + "eval_logps/rejected": -63.17204284667969, + "eval_loss": 0.6931892037391663, + "eval_rewards/accuracies": 0.48420074582099915, + "eval_rewards/chosen": -2.114654535034788e-06, + "eval_rewards/margins": -8.291260019177571e-05, + "eval_rewards/rejected": 8.079793769866228e-05, + "eval_runtime": 383.4681, + "eval_samples_per_second": 11.224, + "eval_steps_per_second": 1.403, + "step": 300 + }, + { + "epoch": 0.05341144038594073, + "grad_norm": 2.4729714393615723, + "learning_rate": 5.340223944875108e-09, + "logits/chosen": -3.0653252601623535, + "logits/rejected": -3.060011625289917, + "logps/chosen": -53.5208854675293, + "logps/rejected": -53.29474639892578, + "loss": 0.6932, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.4837738490314223e-05, + "rewards/margins": -2.0687919459305704e-05, + "rewards/rejected": 5.8501582316239364e-06, + "step": 310 + }, + { + "epoch": 0.05513439007580979, + "grad_norm": 2.3792271614074707, + "learning_rate": 5.512489233419466e-09, + "logits/chosen": -3.023054599761963, + "logits/rejected": -2.9967429637908936, + "logps/chosen": -54.5090446472168, + "logps/rejected": -49.25890350341797, + "loss": 0.6931, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.0441066478961147e-05, + "rewards/margins": 0.00017671639216132462, + "rewards/rejected": -0.00018715743499342352, + "step": 320 + }, + { + "epoch": 0.05685733976567884, + "grad_norm": 2.3428661823272705, + "learning_rate": 5.6847545219638245e-09, + "logits/chosen": -3.083404064178467, + "logits/rejected": -3.059711456298828, + "logps/chosen": -55.03895950317383, + "logps/rejected": -52.21526336669922, + "loss": 0.6932, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -5.828489156556316e-05, + "rewards/margins": -8.575782885600347e-06, + "rewards/rejected": -4.9709080485627055e-05, + "step": 330 + }, + { + "epoch": 0.0585802894555479, + "grad_norm": 2.1629602909088135, + "learning_rate": 5.857019810508183e-09, + "logits/chosen": -3.005366802215576, + "logits/rejected": -2.983691692352295, + "logps/chosen": -52.5191535949707, + "logps/rejected": -51.93767166137695, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 7.401472248602659e-05, + "rewards/margins": 9.557695739204064e-05, + "rewards/rejected": -2.1562223992077634e-05, + "step": 340 + }, + { + "epoch": 0.06030323914541695, + "grad_norm": 2.317671298980713, + "learning_rate": 6.02928509905254e-09, + "logits/chosen": -2.977421998977661, + "logits/rejected": -2.937859296798706, + "logps/chosen": -56.217933654785156, + "logps/rejected": -53.575584411621094, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -8.471935871057212e-05, + "rewards/margins": 0.00010555875633144751, + "rewards/rejected": -0.00019027813686989248, + "step": 350 + }, + { + "epoch": 0.06202618883528601, + "grad_norm": 2.4035069942474365, + "learning_rate": 6.2015503875968995e-09, + "logits/chosen": -3.128420114517212, + "logits/rejected": -3.105487585067749, + "logps/chosen": -54.573753356933594, + "logps/rejected": -50.5115966796875, + "loss": 0.6931, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.0001374700659653172, + "rewards/margins": 0.00011274970893282443, + "rewards/rejected": 2.4720366127439775e-05, + "step": 360 + }, + { + "epoch": 0.06374913852515507, + "grad_norm": 2.323314666748047, + "learning_rate": 6.373815676141258e-09, + "logits/chosen": -3.10359263420105, + "logits/rejected": -3.074420213699341, + "logps/chosen": -52.39251708984375, + "logps/rejected": -51.35547637939453, + "loss": 0.6931, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -7.354580156970769e-05, + "rewards/margins": 7.115851622074842e-05, + "rewards/rejected": -0.00014470433234237134, + "step": 370 + }, + { + "epoch": 0.06547208821502412, + "grad_norm": 2.0820016860961914, + "learning_rate": 6.546080964685615e-09, + "logits/chosen": -3.203951597213745, + "logits/rejected": -3.17911958694458, + "logps/chosen": -53.507789611816406, + "logps/rejected": -52.19771194458008, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -4.075709875905886e-05, + "rewards/margins": 3.0424340366153046e-05, + "rewards/rejected": -7.11814864189364e-05, + "step": 380 + }, + { + "epoch": 0.06719503790489317, + "grad_norm": 2.4141087532043457, + "learning_rate": 6.7183462532299746e-09, + "logits/chosen": -3.1004703044891357, + "logits/rejected": -3.0749688148498535, + "logps/chosen": -56.06284713745117, + "logps/rejected": -55.22441864013672, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": 2.6862904633162543e-05, + "rewards/margins": 0.00019700443954207003, + "rewards/rejected": -0.00017014151671901345, + "step": 390 + }, + { + "epoch": 0.06891798759476224, + "grad_norm": 2.0874099731445312, + "learning_rate": 6.890611541774333e-09, + "logits/chosen": -3.0706710815429688, + "logits/rejected": -3.054839849472046, + "logps/chosen": -52.64934539794922, + "logps/rejected": -52.7557258605957, + "loss": 0.6933, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.00010136842320207506, + "rewards/margins": -0.00021895563986618072, + "rewards/rejected": 0.00011758720211219043, + "step": 400 + }, + { + "epoch": 0.06891798759476224, + "eval_logits/chosen": -3.163289785385132, + "eval_logits/rejected": -3.1576626300811768, + "eval_logps/chosen": -58.715309143066406, + "eval_logps/rejected": -63.17881774902344, + "eval_loss": 0.6931713819503784, + "eval_rewards/accuracies": 0.48559480905532837, + "eval_rewards/chosen": -3.4172830055467784e-05, + "eval_rewards/margins": -4.7213809011736885e-05, + "eval_rewards/rejected": 1.304098896071082e-05, + "eval_runtime": 383.4664, + "eval_samples_per_second": 11.224, + "eval_steps_per_second": 1.403, + "step": 400 + }, + { + "epoch": 0.07064093728463129, + "grad_norm": 2.1497490406036377, + "learning_rate": 7.0628768303186904e-09, + "logits/chosen": -3.0745506286621094, + "logits/rejected": -3.0703208446502686, + "logps/chosen": -50.75954818725586, + "logps/rejected": -55.532379150390625, + "loss": 0.6932, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.00014055120118428022, + "rewards/margins": -9.702378883957863e-05, + "rewards/rejected": -4.35274196206592e-05, + "step": 410 + }, + { + "epoch": 0.07236388697450034, + "grad_norm": 2.540682792663574, + "learning_rate": 7.23514211886305e-09, + "logits/chosen": -3.0597333908081055, + "logits/rejected": -3.0517995357513428, + "logps/chosen": -54.171173095703125, + "logps/rejected": -53.88414764404297, + "loss": 0.6933, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -5.193626202526502e-05, + "rewards/margins": -0.00034823891473934054, + "rewards/rejected": 0.00029630266362801194, + "step": 420 + }, + { + "epoch": 0.0740868366643694, + "grad_norm": 2.244760274887085, + "learning_rate": 7.407407407407407e-09, + "logits/chosen": -3.0882716178894043, + "logits/rejected": -3.074573040008545, + "logps/chosen": -53.08251953125, + "logps/rejected": -54.14112091064453, + "loss": 0.6932, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": 8.592494123149663e-05, + "rewards/margins": -0.00013404383207671344, + "rewards/rejected": 0.00021996880241204053, + "step": 430 + }, + { + "epoch": 0.07580978635423846, + "grad_norm": 2.5090692043304443, + "learning_rate": 7.579672695951765e-09, + "logits/chosen": -3.1335277557373047, + "logits/rejected": -3.098327398300171, + "logps/chosen": -54.31789016723633, + "logps/rejected": -53.2615966796875, + "loss": 0.6931, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.00013140590453986079, + "rewards/margins": 2.339702950848732e-05, + "rewards/rejected": -0.00015480289584957063, + "step": 440 + }, + { + "epoch": 0.07753273604410751, + "grad_norm": 2.2801172733306885, + "learning_rate": 7.751937984496125e-09, + "logits/chosen": -3.056034803390503, + "logits/rejected": -3.023494005203247, + "logps/chosen": -56.08808517456055, + "logps/rejected": -54.58913040161133, + "loss": 0.6932, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": 7.79370020609349e-05, + "rewards/margins": -4.6285451389849186e-05, + "rewards/rejected": 0.0001242224534507841, + "step": 450 + }, + { + "epoch": 0.07925568573397657, + "grad_norm": 2.3933279514312744, + "learning_rate": 7.924203273040482e-09, + "logits/chosen": -3.0245463848114014, + "logits/rejected": -3.0049405097961426, + "logps/chosen": -56.2148551940918, + "logps/rejected": -52.991844177246094, + "loss": 0.6932, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.00014541010023094714, + "rewards/margins": -0.00010043817746918648, + "rewards/rejected": -4.497191548580304e-05, + "step": 460 + }, + { + "epoch": 0.08097863542384562, + "grad_norm": 2.2035293579101562, + "learning_rate": 8.096468561584841e-09, + "logits/chosen": -3.0475990772247314, + "logits/rejected": -3.016028642654419, + "logps/chosen": -53.28934860229492, + "logps/rejected": -51.39487838745117, + "loss": 0.6933, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.0002375897893216461, + "rewards/margins": -0.0002828095166478306, + "rewards/rejected": 4.52197200502269e-05, + "step": 470 + }, + { + "epoch": 0.08270158511371468, + "grad_norm": 2.4459140300750732, + "learning_rate": 8.268733850129199e-09, + "logits/chosen": -3.045457363128662, + "logits/rejected": -3.040175199508667, + "logps/chosen": -54.21479415893555, + "logps/rejected": -58.94578170776367, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00024394100182689726, + "rewards/margins": 0.00010521084914216772, + "rewards/rejected": 0.0001387301308568567, + "step": 480 + }, + { + "epoch": 0.08442453480358374, + "grad_norm": 2.470845937728882, + "learning_rate": 8.440999138673558e-09, + "logits/chosen": -2.955177068710327, + "logits/rejected": -2.905369520187378, + "logps/chosen": -60.59124755859375, + "logps/rejected": -51.36162567138672, + "loss": 0.6929, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.00024259297060780227, + "rewards/margins": 0.00047093481407500803, + "rewards/rejected": -0.000228341858019121, + "step": 490 + }, + { + "epoch": 0.08614748449345279, + "grad_norm": 2.259693145751953, + "learning_rate": 8.613264427217916e-09, + "logits/chosen": -3.0174524784088135, + "logits/rejected": -2.9887466430664062, + "logps/chosen": -54.963653564453125, + "logps/rejected": -51.62682342529297, + "loss": 0.693, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.00013784432667307556, + "rewards/margins": 0.00024374081112910062, + "rewards/rejected": -0.0003815850941464305, + "step": 500 + }, + { + "epoch": 0.08614748449345279, + "eval_logits/chosen": -3.1633033752441406, + "eval_logits/rejected": -3.157639265060425, + "eval_logps/chosen": -58.70395278930664, + "eval_logps/rejected": -63.16484832763672, + "eval_loss": 0.6931844353675842, + "eval_rewards/accuracies": 0.4846654236316681, + "eval_rewards/chosen": 7.943623495521024e-05, + "eval_rewards/margins": -7.329090294661e-05, + "eval_rewards/rejected": 0.00015272715245373547, + "eval_runtime": 383.5733, + "eval_samples_per_second": 11.221, + "eval_steps_per_second": 1.403, + "step": 500 + }, + { + "epoch": 0.08787043418332184, + "grad_norm": 2.237663984298706, + "learning_rate": 8.785529715762273e-09, + "logits/chosen": -3.0099520683288574, + "logits/rejected": -2.9886584281921387, + "logps/chosen": -58.27411651611328, + "logps/rejected": -51.98424530029297, + "loss": 0.6932, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 8.301792149723042e-06, + "rewards/margins": -9.65881918091327e-05, + "rewards/rejected": 0.00010488999396329746, + "step": 510 + }, + { + "epoch": 0.08959338387319091, + "grad_norm": 2.0645158290863037, + "learning_rate": 8.957795004306632e-09, + "logits/chosen": -3.0576930046081543, + "logits/rejected": -3.032172441482544, + "logps/chosen": -56.39656448364258, + "logps/rejected": -51.65898895263672, + "loss": 0.693, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.00010374795965617523, + "rewards/margins": 0.00039428164018318057, + "rewards/rejected": -0.00029053367325104773, + "step": 520 + }, + { + "epoch": 0.09131633356305996, + "grad_norm": 2.061357259750366, + "learning_rate": 9.130060292850991e-09, + "logits/chosen": -3.0545859336853027, + "logits/rejected": -3.012650966644287, + "logps/chosen": -55.69318771362305, + "logps/rejected": -51.20554733276367, + "loss": 0.693, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 6.437679257942364e-05, + "rewards/margins": 0.0002606067282613367, + "rewards/rejected": -0.00019622994295787066, + "step": 530 + }, + { + "epoch": 0.09303928325292901, + "grad_norm": 2.2379069328308105, + "learning_rate": 9.302325581395349e-09, + "logits/chosen": -3.0406105518341064, + "logits/rejected": -3.0240485668182373, + "logps/chosen": -52.84473419189453, + "logps/rejected": -52.934791564941406, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 7.517748599639162e-05, + "rewards/margins": 0.0002457521914038807, + "rewards/rejected": -0.00017057466902770102, + "step": 540 + }, + { + "epoch": 0.09476223294279806, + "grad_norm": 2.266610622406006, + "learning_rate": 9.474590869939708e-09, + "logits/chosen": -3.1011712551116943, + "logits/rejected": -3.0831127166748047, + "logps/chosen": -53.59284210205078, + "logps/rejected": -51.971588134765625, + "loss": 0.6932, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.0001898288755910471, + "rewards/margins": -0.0001018575276248157, + "rewards/rejected": -8.797131886240095e-05, + "step": 550 + }, + { + "epoch": 0.09648518263266713, + "grad_norm": 2.573582649230957, + "learning_rate": 9.646856158484066e-09, + "logits/chosen": -3.072469472885132, + "logits/rejected": -3.0646400451660156, + "logps/chosen": -52.102210998535156, + "logps/rejected": -54.95751190185547, + "loss": 0.6932, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 6.365185981849208e-05, + "rewards/margins": -6.325983122223988e-05, + "rewards/rejected": 0.00012691169104073197, + "step": 560 + }, + { + "epoch": 0.09820813232253618, + "grad_norm": 2.2397186756134033, + "learning_rate": 9.819121447028425e-09, + "logits/chosen": -3.0452890396118164, + "logits/rejected": -3.0365958213806152, + "logps/chosen": -51.370277404785156, + "logps/rejected": -53.8305778503418, + "loss": 0.6931, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -8.120768325170502e-05, + "rewards/margins": 4.172849003225565e-06, + "rewards/rejected": -8.538054680684581e-05, + "step": 570 + }, + { + "epoch": 0.09993108201240523, + "grad_norm": 1.768967628479004, + "learning_rate": 9.991386735572782e-09, + "logits/chosen": -3.0521512031555176, + "logits/rejected": -3.0463624000549316, + "logps/chosen": -51.12982940673828, + "logps/rejected": -53.257110595703125, + "loss": 0.6932, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.00021762735559605062, + "rewards/margins": -6.527120422106236e-05, + "rewards/rejected": -0.00015235615137498826, + "step": 580 + }, + { + "epoch": 0.1016540317022743, + "grad_norm": 2.06492018699646, + "learning_rate": 1.016365202411714e-08, + "logits/chosen": -3.051334857940674, + "logits/rejected": -3.0294275283813477, + "logps/chosen": -54.873023986816406, + "logps/rejected": -54.8026008605957, + "loss": 0.693, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 5.9067795518785715e-05, + "rewards/margins": 0.000223180977627635, + "rewards/rejected": -0.00016411316755693406, + "step": 590 + }, + { + "epoch": 0.10337698139214335, + "grad_norm": 2.32629132270813, + "learning_rate": 1.0335917312661497e-08, + "logits/chosen": -3.024348735809326, + "logits/rejected": -3.0000967979431152, + "logps/chosen": -53.887939453125, + "logps/rejected": -56.68596649169922, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.189003448118456e-05, + "rewards/margins": 0.00013417910668067634, + "rewards/rejected": -0.0001760691375238821, + "step": 600 + }, + { + "epoch": 0.10337698139214335, + "eval_logits/chosen": -3.162902355194092, + "eval_logits/rejected": -3.157241106033325, + "eval_logps/chosen": -58.70703125, + "eval_logps/rejected": -63.17946243286133, + "eval_loss": 0.6931267976760864, + "eval_rewards/accuracies": 0.4960501790046692, + "eval_rewards/chosen": 4.8649228119757026e-05, + "eval_rewards/margins": 4.204766082693823e-05, + "eval_rewards/rejected": 6.601568657060852e-06, + "eval_runtime": 383.5477, + "eval_samples_per_second": 11.222, + "eval_steps_per_second": 1.403, + "step": 600 + }, + { + "epoch": 0.1050999310820124, + "grad_norm": 2.2237448692321777, + "learning_rate": 1.0508182601205858e-08, + "logits/chosen": -2.9901814460754395, + "logits/rejected": -2.987638473510742, + "logps/chosen": -52.5461540222168, + "logps/rejected": -53.254302978515625, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00018452919903211296, + "rewards/margins": 0.00011761534551624209, + "rewards/rejected": 6.691385351587087e-05, + "step": 610 + }, + { + "epoch": 0.10682288077188146, + "grad_norm": 2.368446111679077, + "learning_rate": 1.0680447889750216e-08, + "logits/chosen": -3.149543523788452, + "logits/rejected": -3.122969150543213, + "logps/chosen": -55.02565383911133, + "logps/rejected": -53.259056091308594, + "loss": 0.6932, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -6.642246444243938e-05, + "rewards/margins": -0.00014482057304121554, + "rewards/rejected": 7.839810859877616e-05, + "step": 620 + }, + { + "epoch": 0.10854583046175052, + "grad_norm": 2.478334903717041, + "learning_rate": 1.0852713178294573e-08, + "logits/chosen": -3.1329097747802734, + "logits/rejected": -3.1060264110565186, + "logps/chosen": -53.866172790527344, + "logps/rejected": -50.85861587524414, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3715110071643721e-05, + "rewards/margins": 3.362820280017331e-05, + "rewards/rejected": -4.7343302867375314e-05, + "step": 630 + }, + { + "epoch": 0.11026878015161957, + "grad_norm": 2.4779860973358154, + "learning_rate": 1.1024978466838932e-08, + "logits/chosen": -3.1013245582580566, + "logits/rejected": -3.0902600288391113, + "logps/chosen": -52.90496063232422, + "logps/rejected": -54.33183670043945, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 7.12585897417739e-05, + "rewards/margins": 0.00012497395800892264, + "rewards/rejected": -5.371534643927589e-05, + "step": 640 + }, + { + "epoch": 0.11199172984148863, + "grad_norm": 2.6754417419433594, + "learning_rate": 1.1197243755383291e-08, + "logits/chosen": -3.1137092113494873, + "logits/rejected": -3.1150639057159424, + "logps/chosen": -51.477821350097656, + "logps/rejected": -54.8226318359375, + "loss": 0.6932, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -2.929338370449841e-05, + "rewards/margins": -1.7334494259557687e-05, + "rewards/rejected": -1.1958894901908934e-05, + "step": 650 + }, + { + "epoch": 0.11371467953135768, + "grad_norm": 2.2342963218688965, + "learning_rate": 1.1369509043927649e-08, + "logits/chosen": -3.0037097930908203, + "logits/rejected": -2.9974873065948486, + "logps/chosen": -54.63112258911133, + "logps/rejected": -52.28680419921875, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.072547769988887e-05, + "rewards/margins": 0.00020177674014121294, + "rewards/rejected": -0.0002325022069271654, + "step": 660 + }, + { + "epoch": 0.11543762922122675, + "grad_norm": 2.200890064239502, + "learning_rate": 1.1541774332472008e-08, + "logits/chosen": -3.0266995429992676, + "logits/rejected": -3.02182936668396, + "logps/chosen": -53.01947021484375, + "logps/rejected": -57.498291015625, + "loss": 0.6933, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.00013879637117497623, + "rewards/margins": -0.00036192036350257695, + "rewards/rejected": 0.0002231239777756855, + "step": 670 + }, + { + "epoch": 0.1171605789110958, + "grad_norm": 2.317094564437866, + "learning_rate": 1.1714039621016366e-08, + "logits/chosen": -2.97847056388855, + "logits/rejected": -2.9534149169921875, + "logps/chosen": -53.9498405456543, + "logps/rejected": -50.85622024536133, + "loss": 0.6933, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -0.00016623221745248884, + "rewards/margins": -0.00025777897099033, + "rewards/rejected": 9.154676081379876e-05, + "step": 680 + }, + { + "epoch": 0.11888352860096485, + "grad_norm": 2.5922181606292725, + "learning_rate": 1.1886304909560723e-08, + "logits/chosen": -3.1273410320281982, + "logits/rejected": -3.0974478721618652, + "logps/chosen": -59.101234436035156, + "logps/rejected": -50.5319709777832, + "loss": 0.6933, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -0.00018734775949269533, + "rewards/margins": -0.00028559009660966694, + "rewards/rejected": 9.8242329841014e-05, + "step": 690 + }, + { + "epoch": 0.1206064782908339, + "grad_norm": 2.1908469200134277, + "learning_rate": 1.205857019810508e-08, + "logits/chosen": -3.0850281715393066, + "logits/rejected": -3.0565896034240723, + "logps/chosen": -55.72981643676758, + "logps/rejected": -52.97819137573242, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.571721804793924e-05, + "rewards/margins": -4.1126662836177275e-05, + "rewards/rejected": 2.540946297813207e-05, + "step": 700 + }, + { + "epoch": 0.1206064782908339, + "eval_logits/chosen": -3.16308331489563, + "eval_logits/rejected": -3.1573946475982666, + "eval_logps/chosen": -58.70769119262695, + "eval_logps/rejected": -63.170066833496094, + "eval_loss": 0.6931769847869873, + "eval_rewards/accuracies": 0.4911710023880005, + "eval_rewards/chosen": 4.198389797238633e-05, + "eval_rewards/margins": -5.857193536940031e-05, + "eval_rewards/rejected": 0.00010055583697976544, + "eval_runtime": 383.7985, + "eval_samples_per_second": 11.214, + "eval_steps_per_second": 1.402, + "step": 700 + }, + { + "epoch": 0.12232942798070297, + "grad_norm": 2.2663283348083496, + "learning_rate": 1.2230835486649442e-08, + "logits/chosen": -3.063431978225708, + "logits/rejected": -3.0346083641052246, + "logps/chosen": -54.637359619140625, + "logps/rejected": -54.698516845703125, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00011339558841427788, + "rewards/margins": 0.00011806526163127273, + "rewards/rejected": -4.669668669521343e-06, + "step": 710 + }, + { + "epoch": 0.12405237767057202, + "grad_norm": 2.4980812072753906, + "learning_rate": 1.2403100775193799e-08, + "logits/chosen": -3.0252647399902344, + "logits/rejected": -3.0209569931030273, + "logps/chosen": -53.32914352416992, + "logps/rejected": -54.40614700317383, + "loss": 0.6932, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.00012032913218718022, + "rewards/margins": -3.563683640095405e-05, + "rewards/rejected": -8.469230670016259e-05, + "step": 720 + }, + { + "epoch": 0.12577532736044109, + "grad_norm": 2.3726646900177, + "learning_rate": 1.2575366063738157e-08, + "logits/chosen": -3.1419854164123535, + "logits/rejected": -3.116884708404541, + "logps/chosen": -56.341453552246094, + "logps/rejected": -52.523292541503906, + "loss": 0.6932, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -9.19977537705563e-05, + "rewards/margins": -7.002463098615408e-05, + "rewards/rejected": -2.197313733631745e-05, + "step": 730 + }, + { + "epoch": 0.12749827705031014, + "grad_norm": 2.221714735031128, + "learning_rate": 1.2747631352282516e-08, + "logits/chosen": -3.02409291267395, + "logits/rejected": -2.9982411861419678, + "logps/chosen": -54.91374588012695, + "logps/rejected": -53.7316780090332, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 3.111477053607814e-05, + "rewards/margins": 0.00035662594018504024, + "rewards/rejected": -0.0003255111805628985, + "step": 740 + }, + { + "epoch": 0.1292212267401792, + "grad_norm": 2.4172091484069824, + "learning_rate": 1.2919896640826873e-08, + "logits/chosen": -3.1980748176574707, + "logits/rejected": -3.169914960861206, + "logps/chosen": -56.0087776184082, + "logps/rejected": -54.03033447265625, + "loss": 0.693, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 3.72965369024314e-05, + "rewards/margins": 0.0002863496483769268, + "rewards/rejected": -0.000249053118750453, + "step": 750 + }, + { + "epoch": 0.13094417643004824, + "grad_norm": 2.5564565658569336, + "learning_rate": 1.309216192937123e-08, + "logits/chosen": -3.0481514930725098, + "logits/rejected": -3.009295701980591, + "logps/chosen": -54.42664337158203, + "logps/rejected": -49.540592193603516, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": 9.81381963356398e-05, + "rewards/margins": 0.0003268247819505632, + "rewards/rejected": -0.00022868656378705055, + "step": 760 + }, + { + "epoch": 0.1326671261199173, + "grad_norm": 2.1019692420959473, + "learning_rate": 1.3264427217915592e-08, + "logits/chosen": -3.098783254623413, + "logits/rejected": -3.075409412384033, + "logps/chosen": -52.925331115722656, + "logps/rejected": -52.193687438964844, + "loss": 0.6933, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -6.0318860050756484e-05, + "rewards/margins": -0.00021020628628320992, + "rewards/rejected": 0.0001498874044045806, + "step": 770 + }, + { + "epoch": 0.13439007580978635, + "grad_norm": 2.6707863807678223, + "learning_rate": 1.3436692506459949e-08, + "logits/chosen": -3.093926191329956, + "logits/rejected": -3.062821865081787, + "logps/chosen": -53.164955139160156, + "logps/rejected": -51.190185546875, + "loss": 0.6932, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.00024976825807243586, + "rewards/margins": -5.909635729040019e-05, + "rewards/rejected": -0.00019067191169597208, + "step": 780 + }, + { + "epoch": 0.1361130254996554, + "grad_norm": 2.2457172870635986, + "learning_rate": 1.3608957795004307e-08, + "logits/chosen": -3.101630687713623, + "logits/rejected": -3.0681843757629395, + "logps/chosen": -53.92539596557617, + "logps/rejected": -53.56317138671875, + "loss": 0.6933, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.00018752229516394436, + "rewards/margins": -0.00024204616784118116, + "rewards/rejected": 5.452388359117322e-05, + "step": 790 + }, + { + "epoch": 0.13783597518952448, + "grad_norm": 2.69878888130188, + "learning_rate": 1.3781223083548666e-08, + "logits/chosen": -2.9877758026123047, + "logits/rejected": -2.961682081222534, + "logps/chosen": -55.00641632080078, + "logps/rejected": -54.427734375, + "loss": 0.693, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -8.858703949954361e-05, + "rewards/margins": 0.00023625604808330536, + "rewards/rejected": -0.0003248431021347642, + "step": 800 + }, + { + "epoch": 0.13783597518952448, + "eval_logits/chosen": -3.163255214691162, + "eval_logits/rejected": -3.1575722694396973, + "eval_logps/chosen": -58.69498062133789, + "eval_logps/rejected": -63.16041946411133, + "eval_loss": 0.6931617259979248, + "eval_rewards/accuracies": 0.490938663482666, + "eval_rewards/chosen": 0.0001691343932179734, + "eval_rewards/margins": -2.792733175738249e-05, + "eval_rewards/rejected": 0.00019706170132849365, + "eval_runtime": 383.156, + "eval_samples_per_second": 11.233, + "eval_steps_per_second": 1.404, + "step": 800 + }, + { + "epoch": 0.13955892487939353, + "grad_norm": 2.2642996311187744, + "learning_rate": 1.3953488372093023e-08, + "logits/chosen": -3.0624594688415527, + "logits/rejected": -3.0343310832977295, + "logps/chosen": -56.66850662231445, + "logps/rejected": -55.576751708984375, + "loss": 0.6932, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.00031107664108276367, + "rewards/margins": -0.00017458898946642876, + "rewards/rejected": -0.00013648762251250446, + "step": 810 + }, + { + "epoch": 0.14128187456926258, + "grad_norm": 2.155913829803467, + "learning_rate": 1.4125753660637381e-08, + "logits/chosen": -3.1119720935821533, + "logits/rejected": -3.086433172225952, + "logps/chosen": -51.55561447143555, + "logps/rejected": -50.739524841308594, + "loss": 0.6931, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -7.932411972433329e-05, + "rewards/margins": 4.6575911255786195e-05, + "rewards/rejected": -0.0001259000418940559, + "step": 820 + }, + { + "epoch": 0.14300482425913164, + "grad_norm": 2.428403615951538, + "learning_rate": 1.429801894918174e-08, + "logits/chosen": -3.039891481399536, + "logits/rejected": -3.0244946479797363, + "logps/chosen": -54.586090087890625, + "logps/rejected": -54.00188446044922, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -6.285020208451897e-05, + "rewards/margins": 0.00019378354772925377, + "rewards/rejected": -0.0002566337352618575, + "step": 830 + }, + { + "epoch": 0.1447277739490007, + "grad_norm": 2.541550397872925, + "learning_rate": 1.44702842377261e-08, + "logits/chosen": -3.14371919631958, + "logits/rejected": -3.117215156555176, + "logps/chosen": -54.11638259887695, + "logps/rejected": -49.55205535888672, + "loss": 0.6932, + "rewards/accuracies": 0.41874998807907104, + "rewards/chosen": -0.00021488538186531514, + "rewards/margins": -0.00011695261491695419, + "rewards/rejected": -9.793278150027618e-05, + "step": 840 + }, + { + "epoch": 0.14645072363886974, + "grad_norm": 2.313411235809326, + "learning_rate": 1.4642549526270457e-08, + "logits/chosen": -3.0190269947052, + "logits/rejected": -3.008479118347168, + "logps/chosen": -50.86606979370117, + "logps/rejected": -54.94173049926758, + "loss": 0.6933, + "rewards/accuracies": 0.41874998807907104, + "rewards/chosen": -0.00044976655044592917, + "rewards/margins": -0.00038919810322113335, + "rewards/rejected": -6.056845450075343e-05, + "step": 850 + }, + { + "epoch": 0.1481736733287388, + "grad_norm": 2.3254234790802, + "learning_rate": 1.4814814814814814e-08, + "logits/chosen": -3.0448098182678223, + "logits/rejected": -3.0247676372528076, + "logps/chosen": -52.861968994140625, + "logps/rejected": -52.325035095214844, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.7349426090950146e-05, + "rewards/margins": 0.00015882976003922522, + "rewards/rejected": -0.0001861791533883661, + "step": 860 + }, + { + "epoch": 0.14989662301860784, + "grad_norm": 1.9174031019210815, + "learning_rate": 1.4987080103359175e-08, + "logits/chosen": -3.1186442375183105, + "logits/rejected": -3.114907741546631, + "logps/chosen": -51.29471969604492, + "logps/rejected": -53.679344177246094, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": 6.848828343208879e-05, + "rewards/margins": 0.0003035779227502644, + "rewards/rejected": -0.00023508965387009084, + "step": 870 + }, + { + "epoch": 0.15161957270847692, + "grad_norm": 1.8897603750228882, + "learning_rate": 1.515934539190353e-08, + "logits/chosen": -3.032423734664917, + "logits/rejected": -3.009608507156372, + "logps/chosen": -51.75300979614258, + "logps/rejected": -51.415382385253906, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0002309719566255808, + "rewards/margins": 0.00012134017742937431, + "rewards/rejected": -0.0003523121413309127, + "step": 880 + }, + { + "epoch": 0.15334252239834598, + "grad_norm": 2.2520952224731445, + "learning_rate": 1.533161068044789e-08, + "logits/chosen": -3.060913562774658, + "logits/rejected": -3.0235142707824707, + "logps/chosen": -58.44805145263672, + "logps/rejected": -54.19799041748047, + "loss": 0.6928, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0003900781739503145, + "rewards/margins": 0.000767029938288033, + "rewards/rejected": -0.0003769517061300576, + "step": 890 + }, + { + "epoch": 0.15506547208821503, + "grad_norm": 2.146815538406372, + "learning_rate": 1.550387596899225e-08, + "logits/chosen": -3.0814216136932373, + "logits/rejected": -3.070983409881592, + "logps/chosen": -54.16011428833008, + "logps/rejected": -52.36823272705078, + "loss": 0.6934, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.00023367287940345705, + "rewards/margins": -0.00040599278872832656, + "rewards/rejected": 0.00017231988022103906, + "step": 900 + }, + { + "epoch": 0.15506547208821503, + "eval_logits/chosen": -3.1633338928222656, + "eval_logits/rejected": -3.1576449871063232, + "eval_logps/chosen": -58.702510833740234, + "eval_logps/rejected": -63.16947937011719, + "eval_loss": 0.6931542754173279, + "eval_rewards/accuracies": 0.5060408711433411, + "eval_rewards/chosen": 9.381815470987931e-05, + "eval_rewards/margins": -1.2631733625312336e-05, + "eval_rewards/rejected": 0.00010644988651620224, + "eval_runtime": 383.1861, + "eval_samples_per_second": 11.232, + "eval_steps_per_second": 1.404, + "step": 900 + }, + { + "epoch": 0.15678842177808408, + "grad_norm": 2.1222777366638184, + "learning_rate": 1.567614125753661e-08, + "logits/chosen": -3.0520882606506348, + "logits/rejected": -3.0439276695251465, + "logps/chosen": -51.022674560546875, + "logps/rejected": -52.06635284423828, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 4.3686806748155504e-05, + "rewards/margins": 0.0001305050973314792, + "rewards/rejected": -8.68182978592813e-05, + "step": 910 + }, + { + "epoch": 0.15851137146795313, + "grad_norm": 2.3210599422454834, + "learning_rate": 1.5848406546080964e-08, + "logits/chosen": -3.0938668251037598, + "logits/rejected": -3.0504050254821777, + "logps/chosen": -54.33452224731445, + "logps/rejected": -49.48955535888672, + "loss": 0.693, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.00012349920871201903, + "rewards/margins": 0.0002427975705359131, + "rewards/rejected": -0.00036629679379984736, + "step": 920 + }, + { + "epoch": 0.16023432115782218, + "grad_norm": 2.5764107704162598, + "learning_rate": 1.6020671834625323e-08, + "logits/chosen": -3.155245780944824, + "logits/rejected": -3.1392102241516113, + "logps/chosen": -52.4669189453125, + "logps/rejected": -54.50859832763672, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0001693625090410933, + "rewards/margins": 0.0003471010422799736, + "rewards/rejected": -0.00017773854779079556, + "step": 930 + }, + { + "epoch": 0.16195727084769124, + "grad_norm": 2.1913082599639893, + "learning_rate": 1.6192937123169683e-08, + "logits/chosen": -3.1352171897888184, + "logits/rejected": -3.097580671310425, + "logps/chosen": -60.212013244628906, + "logps/rejected": -54.92094802856445, + "loss": 0.693, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -6.684206891804934e-05, + "rewards/margins": 0.00029070020536892116, + "rewards/rejected": -0.0003575422742869705, + "step": 940 + }, + { + "epoch": 0.16368022053756032, + "grad_norm": 2.1950390338897705, + "learning_rate": 1.636520241171404e-08, + "logits/chosen": -2.9277501106262207, + "logits/rejected": -2.910341739654541, + "logps/chosen": -55.29688262939453, + "logps/rejected": -55.66237258911133, + "loss": 0.6931, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.00025500665651634336, + "rewards/margins": 5.84846711717546e-05, + "rewards/rejected": -0.00031349132768809795, + "step": 950 + }, + { + "epoch": 0.16540317022742937, + "grad_norm": 2.2826006412506104, + "learning_rate": 1.6537467700258398e-08, + "logits/chosen": -2.8967947959899902, + "logits/rejected": -2.8999621868133545, + "logps/chosen": -50.75765609741211, + "logps/rejected": -55.64331817626953, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0001634789223317057, + "rewards/margins": 0.000268703734036535, + "rewards/rejected": -0.0004321826563682407, + "step": 960 + }, + { + "epoch": 0.16712611991729842, + "grad_norm": 2.3014798164367676, + "learning_rate": 1.6709732988802757e-08, + "logits/chosen": -3.0674057006835938, + "logits/rejected": -3.0318312644958496, + "logps/chosen": -60.67380905151367, + "logps/rejected": -52.45458221435547, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0001028587794280611, + "rewards/margins": -4.980314315616852e-06, + "rewards/rejected": -9.787843737285584e-05, + "step": 970 + }, + { + "epoch": 0.16884906960716747, + "grad_norm": 3.0363972187042236, + "learning_rate": 1.6881998277347116e-08, + "logits/chosen": -3.1694116592407227, + "logits/rejected": -3.149758815765381, + "logps/chosen": -56.066307067871094, + "logps/rejected": -54.85820388793945, + "loss": 0.6931, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.00017563713481649756, + "rewards/margins": 7.07071740180254e-06, + "rewards/rejected": -0.00018270780856255442, + "step": 980 + }, + { + "epoch": 0.17057201929703653, + "grad_norm": 2.4502651691436768, + "learning_rate": 1.7054263565891472e-08, + "logits/chosen": -3.036480665206909, + "logits/rejected": -3.013042449951172, + "logps/chosen": -54.58531951904297, + "logps/rejected": -52.66328048706055, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.00016072092694230378, + "rewards/margins": 0.0002869007585104555, + "rewards/rejected": -0.0004476216563489288, + "step": 990 + }, + { + "epoch": 0.17229496898690558, + "grad_norm": 2.1972694396972656, + "learning_rate": 1.722652885443583e-08, + "logits/chosen": -2.9883952140808105, + "logits/rejected": -2.9606387615203857, + "logps/chosen": -56.881141662597656, + "logps/rejected": -51.748878479003906, + "loss": 0.6932, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.0001291980588575825, + "rewards/margins": -6.897748971823603e-05, + "rewards/rejected": -6.022059096721932e-05, + "step": 1000 + }, + { + "epoch": 0.17229496898690558, + "eval_logits/chosen": -3.1630799770355225, + "eval_logits/rejected": -3.157437562942505, + "eval_logps/chosen": -58.69733810424805, + "eval_logps/rejected": -63.16838073730469, + "eval_loss": 0.6931337118148804, + "eval_rewards/accuracies": 0.4948884844779968, + "eval_rewards/chosen": 0.0001455719320802018, + "eval_rewards/margins": 2.818080065480899e-05, + "eval_rewards/rejected": 0.00011739113688236102, + "eval_runtime": 383.2811, + "eval_samples_per_second": 11.229, + "eval_steps_per_second": 1.404, + "step": 1000 + }, + { + "epoch": 0.17401791867677463, + "grad_norm": 2.363290309906006, + "learning_rate": 1.739879414298019e-08, + "logits/chosen": -2.9239678382873535, + "logits/rejected": -2.934494972229004, + "logps/chosen": -53.7801399230957, + "logps/rejected": -57.93299102783203, + "loss": 0.693, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.00022317534603644162, + "rewards/margins": 0.0002450795436743647, + "rewards/rejected": -0.000468254933366552, + "step": 1010 + }, + { + "epoch": 0.17574086836664368, + "grad_norm": 2.214242458343506, + "learning_rate": 1.7571059431524546e-08, + "logits/chosen": -3.1074411869049072, + "logits/rejected": -3.0718629360198975, + "logps/chosen": -57.12139892578125, + "logps/rejected": -54.26393508911133, + "loss": 0.6931, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.00018779639503918588, + "rewards/margins": 0.0001408702664775774, + "rewards/rejected": -0.00032866670517250896, + "step": 1020 + }, + { + "epoch": 0.17746381805651276, + "grad_norm": 2.365598201751709, + "learning_rate": 1.774332472006891e-08, + "logits/chosen": -3.1569809913635254, + "logits/rejected": -3.129786729812622, + "logps/chosen": -53.72365188598633, + "logps/rejected": -51.948936462402344, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.860901754000224e-05, + "rewards/margins": 0.0003156032762490213, + "rewards/rejected": -0.00036421226104721427, + "step": 1030 + }, + { + "epoch": 0.17918676774638181, + "grad_norm": 2.4483449459075928, + "learning_rate": 1.7915590008613264e-08, + "logits/chosen": -3.0721476078033447, + "logits/rejected": -3.0619688034057617, + "logps/chosen": -53.41161346435547, + "logps/rejected": -55.581390380859375, + "loss": 0.693, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.00013245883747003973, + "rewards/margins": 0.00030615561990998685, + "rewards/rejected": -0.0004386144573800266, + "step": 1040 + }, + { + "epoch": 0.18090971743625087, + "grad_norm": 2.4459915161132812, + "learning_rate": 1.8087855297157624e-08, + "logits/chosen": -2.99334716796875, + "logits/rejected": -2.9578709602355957, + "logps/chosen": -57.15093231201172, + "logps/rejected": -50.6357307434082, + "loss": 0.6929, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -7.875073060858995e-05, + "rewards/margins": 0.0005473211058415473, + "rewards/rejected": -0.000626071821898222, + "step": 1050 + }, + { + "epoch": 0.18263266712611992, + "grad_norm": 2.437818765640259, + "learning_rate": 1.8260120585701983e-08, + "logits/chosen": -3.002201795578003, + "logits/rejected": -2.982909917831421, + "logps/chosen": -56.52927780151367, + "logps/rejected": -54.999961853027344, + "loss": 0.6932, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0002802074304781854, + "rewards/margins": -4.4669039198197424e-05, + "rewards/rejected": -0.0002355383476242423, + "step": 1060 + }, + { + "epoch": 0.18435561681598897, + "grad_norm": 2.2553157806396484, + "learning_rate": 1.843238587424634e-08, + "logits/chosen": -3.146183490753174, + "logits/rejected": -3.112220287322998, + "logps/chosen": -55.97260665893555, + "logps/rejected": -53.28546905517578, + "loss": 0.6932, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.000413569767260924, + "rewards/margins": -4.205874938634224e-05, + "rewards/rejected": -0.0003715109487529844, + "step": 1070 + }, + { + "epoch": 0.18607856650585802, + "grad_norm": 2.253410816192627, + "learning_rate": 1.8604651162790698e-08, + "logits/chosen": -3.123624086380005, + "logits/rejected": -3.100271463394165, + "logps/chosen": -55.81609344482422, + "logps/rejected": -50.781005859375, + "loss": 0.693, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0003030496300198138, + "rewards/margins": 0.00020159417181275785, + "rewards/rejected": -0.0005046438309364021, + "step": 1080 + }, + { + "epoch": 0.18780151619572708, + "grad_norm": 2.353863477706909, + "learning_rate": 1.8776916451335057e-08, + "logits/chosen": -3.000624656677246, + "logits/rejected": -2.9921963214874268, + "logps/chosen": -52.26164627075195, + "logps/rejected": -52.68159866333008, + "loss": 0.6931, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.0002996811526827514, + "rewards/margins": 0.00017872537137009203, + "rewards/rejected": -0.000478406494949013, + "step": 1090 + }, + { + "epoch": 0.18952446588559613, + "grad_norm": 2.1127638816833496, + "learning_rate": 1.8949181739879416e-08, + "logits/chosen": -3.0667271614074707, + "logits/rejected": -3.0638859272003174, + "logps/chosen": -50.94106674194336, + "logps/rejected": -54.64240646362305, + "loss": 0.6931, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0003363923169672489, + "rewards/margins": 1.02457470347872e-05, + "rewards/rejected": -0.0003466380585450679, + "step": 1100 + }, + { + "epoch": 0.18952446588559613, + "eval_logits/chosen": -3.162712812423706, + "eval_logits/rejected": -3.1570920944213867, + "eval_logps/chosen": -58.683189392089844, + "eval_logps/rejected": -63.15974426269531, + "eval_loss": 0.6931062936782837, + "eval_rewards/accuracies": 0.515566885471344, + "eval_rewards/chosen": 0.0002870217140298337, + "eval_rewards/margins": 8.319580228999257e-05, + "eval_rewards/rejected": 0.0002038259117398411, + "eval_runtime": 383.1432, + "eval_samples_per_second": 11.233, + "eval_steps_per_second": 1.404, + "step": 1100 + }, + { + "epoch": 0.1912474155754652, + "grad_norm": 2.639432668685913, + "learning_rate": 1.9121447028423772e-08, + "logits/chosen": -3.064542055130005, + "logits/rejected": -3.0769705772399902, + "logps/chosen": -52.93410110473633, + "logps/rejected": -56.49309539794922, + "loss": 0.6931, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.00038267840864136815, + "rewards/margins": 2.7892394427908584e-05, + "rewards/rejected": -0.00041057082125917077, + "step": 1110 + }, + { + "epoch": 0.19297036526533426, + "grad_norm": 2.38193941116333, + "learning_rate": 1.929371231696813e-08, + "logits/chosen": -3.0945816040039062, + "logits/rejected": -3.072700262069702, + "logps/chosen": -56.38603973388672, + "logps/rejected": -53.792144775390625, + "loss": 0.693, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.00015010975766927004, + "rewards/margins": 0.0003310061583761126, + "rewards/rejected": -0.0004811159451492131, + "step": 1120 + }, + { + "epoch": 0.1946933149552033, + "grad_norm": 2.2352566719055176, + "learning_rate": 1.946597760551249e-08, + "logits/chosen": -3.1612792015075684, + "logits/rejected": -3.1366684436798096, + "logps/chosen": -52.0703125, + "logps/rejected": -54.243202209472656, + "loss": 0.6931, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.00037932602572254837, + "rewards/margins": 0.0001295518159167841, + "rewards/rejected": -0.0005088779143989086, + "step": 1130 + }, + { + "epoch": 0.19641626464507236, + "grad_norm": 2.4157116413116455, + "learning_rate": 1.963824289405685e-08, + "logits/chosen": -3.0681800842285156, + "logits/rejected": -3.0344736576080322, + "logps/chosen": -56.911476135253906, + "logps/rejected": -53.044189453125, + "loss": 0.693, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.00030110584339126945, + "rewards/margins": 0.0002923770225606859, + "rewards/rejected": -0.0005934828659519553, + "step": 1140 + }, + { + "epoch": 0.19813921433494142, + "grad_norm": 2.3486196994781494, + "learning_rate": 1.9810508182601205e-08, + "logits/chosen": -3.007859706878662, + "logits/rejected": -2.9886839389801025, + "logps/chosen": -53.38232421875, + "logps/rejected": -54.55126953125, + "loss": 0.6929, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.00025944452499970794, + "rewards/margins": 0.000397385039832443, + "rewards/rejected": -0.0006568295648321509, + "step": 1150 + }, + { + "epoch": 0.19986216402481047, + "grad_norm": 2.2162816524505615, + "learning_rate": 1.9982773471145565e-08, + "logits/chosen": -3.1045355796813965, + "logits/rejected": -3.095886707305908, + "logps/chosen": -54.0484619140625, + "logps/rejected": -54.620849609375, + "loss": 0.6929, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0002283039502799511, + "rewards/margins": 0.0005001203389838338, + "rewards/rejected": -0.0007284242892637849, + "step": 1160 + }, + { + "epoch": 0.20158511371467952, + "grad_norm": 2.117940902709961, + "learning_rate": 1.9999963375532916e-08, + "logits/chosen": -2.9765725135803223, + "logits/rejected": -2.9637269973754883, + "logps/chosen": -52.097686767578125, + "logps/rejected": -54.725807189941406, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.00018397597887087613, + "rewards/margins": 0.00034015963319689035, + "rewards/rejected": -0.0005241355393081903, + "step": 1170 + }, + { + "epoch": 0.2033080634045486, + "grad_norm": 2.5803020000457764, + "learning_rate": 1.9999836772781233e-08, + "logits/chosen": -2.948564291000366, + "logits/rejected": -2.917327642440796, + "logps/chosen": -52.84502410888672, + "logps/rejected": -51.07862091064453, + "loss": 0.693, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.00043227747664786875, + "rewards/margins": 0.000333493750076741, + "rewards/rejected": -0.0007657711976207793, + "step": 1180 + }, + { + "epoch": 0.20503101309441765, + "grad_norm": 2.5155222415924072, + "learning_rate": 1.9999619740735644e-08, + "logits/chosen": -3.1464123725891113, + "logits/rejected": -3.111192226409912, + "logps/chosen": -58.7384147644043, + "logps/rejected": -50.48283386230469, + "loss": 0.6929, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0003894003457389772, + "rewards/margins": 0.0004400517209433019, + "rewards/rejected": -0.0008294520666822791, + "step": 1190 + }, + { + "epoch": 0.2067539627842867, + "grad_norm": 2.093356132507324, + "learning_rate": 1.999931228135879e-08, + "logits/chosen": -2.978224515914917, + "logits/rejected": -2.9631943702697754, + "logps/chosen": -53.191490173339844, + "logps/rejected": -51.32482147216797, + "loss": 0.693, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.0002702943456824869, + "rewards/margins": 0.0002604659821372479, + "rewards/rejected": -0.0005307603860273957, + "step": 1200 + }, + { + "epoch": 0.2067539627842867, + "eval_logits/chosen": -3.1626968383789062, + "eval_logits/rejected": -3.157034158706665, + "eval_logps/chosen": -58.677303314208984, + "eval_logps/rejected": -63.153099060058594, + "eval_loss": 0.6931099891662598, + "eval_rewards/accuracies": 0.5153345465660095, + "eval_rewards/chosen": 0.0003459024301264435, + "eval_rewards/margins": 7.563854160252959e-05, + "eval_rewards/rejected": 0.00027026390307582915, + "eval_runtime": 383.167, + "eval_samples_per_second": 11.233, + "eval_steps_per_second": 1.404, + "step": 1200 + }, + { + "epoch": 0.20847691247415576, + "grad_norm": 2.264655590057373, + "learning_rate": 1.999891439743105e-08, + "logits/chosen": -3.0778591632843018, + "logits/rejected": -3.0437171459198, + "logps/chosen": -53.799530029296875, + "logps/rejected": -53.07401657104492, + "loss": 0.6929, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.00011281321349088103, + "rewards/margins": 0.0004704711027443409, + "rewards/rejected": -0.0005832842434756458, + "step": 1210 + }, + { + "epoch": 0.2101998621640248, + "grad_norm": 2.1167376041412354, + "learning_rate": 1.9998426092550514e-08, + "logits/chosen": -3.089040994644165, + "logits/rejected": -3.061993360519409, + "logps/chosen": -53.419349670410156, + "logps/rejected": -52.35260772705078, + "loss": 0.693, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.00038046788540668786, + "rewards/margins": 0.0003108192759100348, + "rewards/rejected": -0.0006912872195243835, + "step": 1220 + }, + { + "epoch": 0.21192281185389386, + "grad_norm": 2.5345876216888428, + "learning_rate": 1.999784737113296e-08, + "logits/chosen": -3.1593711376190186, + "logits/rejected": -3.117147922515869, + "logps/chosen": -55.76252365112305, + "logps/rejected": -52.289398193359375, + "loss": 0.6929, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.00011659580195555463, + "rewards/margins": 0.0005294819129630923, + "rewards/rejected": -0.0006460777367465198, + "step": 1230 + }, + { + "epoch": 0.2136457615437629, + "grad_norm": 2.0671513080596924, + "learning_rate": 1.999717823841182e-08, + "logits/chosen": -3.0202252864837646, + "logits/rejected": -3.0034167766571045, + "logps/chosen": -52.499916076660156, + "logps/rejected": -52.897300720214844, + "loss": 0.6931, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.0006282069953158498, + "rewards/margins": 0.0001435217709513381, + "rewards/rejected": -0.0007717286935076118, + "step": 1240 + }, + { + "epoch": 0.21536871123363197, + "grad_norm": 2.48652720451355, + "learning_rate": 1.99964187004381e-08, + "logits/chosen": -3.1409010887145996, + "logits/rejected": -3.1046009063720703, + "logps/chosen": -53.774330139160156, + "logps/rejected": -51.2626953125, + "loss": 0.6931, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0007029867265373468, + "rewards/margins": 5.009355299989693e-05, + "rewards/rejected": -0.000753080181311816, + "step": 1250 + }, + { + "epoch": 0.21709166092350105, + "grad_norm": 2.1381783485412598, + "learning_rate": 1.999556876408037e-08, + "logits/chosen": -2.9890284538269043, + "logits/rejected": -2.9575300216674805, + "logps/chosen": -53.025726318359375, + "logps/rejected": -50.35152816772461, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0005038737435825169, + "rewards/margins": 0.00016801193123683333, + "rewards/rejected": -0.0006718856748193502, + "step": 1260 + }, + { + "epoch": 0.2188146106133701, + "grad_norm": 2.4455270767211914, + "learning_rate": 1.9994628437024666e-08, + "logits/chosen": -3.049938678741455, + "logits/rejected": -3.043519973754883, + "logps/chosen": -51.91309356689453, + "logps/rejected": -54.79779815673828, + "loss": 0.6929, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0003726807772181928, + "rewards/margins": 0.0005565760657191277, + "rewards/rejected": -0.0009292567847296596, + "step": 1270 + }, + { + "epoch": 0.22053756030323915, + "grad_norm": 2.275219678878784, + "learning_rate": 1.9993597727774438e-08, + "logits/chosen": -3.1218161582946777, + "logits/rejected": -3.1289162635803223, + "logps/chosen": -51.21726608276367, + "logps/rejected": -59.98350143432617, + "loss": 0.693, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00045720464549958706, + "rewards/margins": 0.00024440709967166185, + "rewards/rejected": -0.000701611686963588, + "step": 1280 + }, + { + "epoch": 0.2222605099931082, + "grad_norm": 2.1652839183807373, + "learning_rate": 1.999247664565047e-08, + "logits/chosen": -3.0220205783843994, + "logits/rejected": -2.99029278755188, + "logps/chosen": -54.62388229370117, + "logps/rejected": -50.48380661010742, + "loss": 0.6929, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.00022962580260355026, + "rewards/margins": 0.0004112176247872412, + "rewards/rejected": -0.0006408434128388762, + "step": 1290 + }, + { + "epoch": 0.22398345968297725, + "grad_norm": 2.1271426677703857, + "learning_rate": 1.9991265200790797e-08, + "logits/chosen": -3.1019668579101562, + "logits/rejected": -3.089221715927124, + "logps/chosen": -50.25926971435547, + "logps/rejected": -53.80472946166992, + "loss": 0.693, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0004433942958712578, + "rewards/margins": 0.0003799190162681043, + "rewards/rejected": -0.000823313370347023, + "step": 1300 + }, + { + "epoch": 0.22398345968297725, + "eval_logits/chosen": -3.16233229637146, + "eval_logits/rejected": -3.1566648483276367, + "eval_logps/chosen": -58.66947555541992, + "eval_logps/rejected": -63.150753021240234, + "eval_loss": 0.6930826306343079, + "eval_rewards/accuracies": 0.5174256563186646, + "eval_rewards/chosen": 0.0004242155991960317, + "eval_rewards/margins": 0.00013055592717137188, + "eval_rewards/rejected": 0.00029365968657657504, + "eval_runtime": 383.1603, + "eval_samples_per_second": 11.233, + "eval_steps_per_second": 1.404, + "step": 1300 + }, + { + "epoch": 0.2257064093728463, + "grad_norm": 1.916630506515503, + "learning_rate": 1.99899634041506e-08, + "logits/chosen": -3.1023964881896973, + "logits/rejected": -3.066821575164795, + "logps/chosen": -53.39400100708008, + "logps/rejected": -49.54502487182617, + "loss": 0.6928, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0005346934776753187, + "rewards/margins": 0.0007152494508773088, + "rewards/rejected": -0.0012499429285526276, + "step": 1310 + }, + { + "epoch": 0.22742935906271536, + "grad_norm": 2.2478299140930176, + "learning_rate": 1.9988571267502137e-08, + "logits/chosen": -3.075390338897705, + "logits/rejected": -3.049856662750244, + "logps/chosen": -55.4655647277832, + "logps/rejected": -50.82777404785156, + "loss": 0.6929, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.00045771937584504485, + "rewards/margins": 0.0004580998793244362, + "rewards/rejected": -0.000915819313377142, + "step": 1320 + }, + { + "epoch": 0.22915230875258444, + "grad_norm": 2.4180092811584473, + "learning_rate": 1.9987088803434594e-08, + "logits/chosen": -3.1548409461975098, + "logits/rejected": -3.1240522861480713, + "logps/chosen": -55.25426483154297, + "logps/rejected": -50.222694396972656, + "loss": 0.6929, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.00036666609230451286, + "rewards/margins": 0.0004698067787103355, + "rewards/rejected": -0.0008364729583263397, + "step": 1330 + }, + { + "epoch": 0.2308752584424535, + "grad_norm": 2.302684783935547, + "learning_rate": 1.9985516025354018e-08, + "logits/chosen": -2.9962856769561768, + "logits/rejected": -2.974853515625, + "logps/chosen": -55.167320251464844, + "logps/rejected": -52.973182678222656, + "loss": 0.6928, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.00027919316198676825, + "rewards/margins": 0.0007761950837448239, + "rewards/rejected": -0.0010553881293162704, + "step": 1340 + }, + { + "epoch": 0.23259820813232254, + "grad_norm": 2.2061896324157715, + "learning_rate": 1.9983852947483158e-08, + "logits/chosen": -3.0621368885040283, + "logits/rejected": -3.037357807159424, + "logps/chosen": -54.15734100341797, + "logps/rejected": -53.328369140625, + "loss": 0.693, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.0005606129416264594, + "rewards/margins": 0.00038336380384862423, + "rewards/rejected": -0.0009439766290597618, + "step": 1350 + }, + { + "epoch": 0.2343211578221916, + "grad_norm": 2.3170135021209717, + "learning_rate": 1.9982099584861356e-08, + "logits/chosen": -3.0194990634918213, + "logits/rejected": -3.0071558952331543, + "logps/chosen": -55.27729034423828, + "logps/rejected": -56.09923553466797, + "loss": 0.6932, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0007478878833353519, + "rewards/margins": -4.510110102273757e-06, + "rewards/rejected": -0.0007433776627294719, + "step": 1360 + }, + { + "epoch": 0.23604410751206065, + "grad_norm": 2.3610928058624268, + "learning_rate": 1.9980255953344406e-08, + "logits/chosen": -3.133690595626831, + "logits/rejected": -3.1139533519744873, + "logps/chosen": -52.73761749267578, + "logps/rejected": -52.88722610473633, + "loss": 0.693, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.0006277118809521198, + "rewards/margins": 0.0003715711645781994, + "rewards/rejected": -0.000999283161945641, + "step": 1370 + }, + { + "epoch": 0.2377670572019297, + "grad_norm": 2.0576012134552, + "learning_rate": 1.9978322069604412e-08, + "logits/chosen": -3.025172710418701, + "logits/rejected": -3.0068202018737793, + "logps/chosen": -54.36109161376953, + "logps/rejected": -52.417701721191406, + "loss": 0.6928, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.00028318905970081687, + "rewards/margins": 0.0007199858082458377, + "rewards/rejected": -0.0010031748097389936, + "step": 1380 + }, + { + "epoch": 0.23949000689179875, + "grad_norm": 2.3447775840759277, + "learning_rate": 1.9976297951129625e-08, + "logits/chosen": -3.175565242767334, + "logits/rejected": -3.147883892059326, + "logps/chosen": -56.92084503173828, + "logps/rejected": -53.5631103515625, + "loss": 0.6928, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.00027007010066881776, + "rewards/margins": 0.0007962186937220395, + "rewards/rejected": -0.0010662887943908572, + "step": 1390 + }, + { + "epoch": 0.2412129565816678, + "grad_norm": 2.098050832748413, + "learning_rate": 1.9974183616224314e-08, + "logits/chosen": -2.9952683448791504, + "logits/rejected": -2.9694926738739014, + "logps/chosen": -55.85969161987305, + "logps/rejected": -52.920196533203125, + "loss": 0.6928, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.0005089627811685205, + "rewards/margins": 0.0006413789233192801, + "rewards/rejected": -0.0011503419373184443, + "step": 1400 + }, + { + "epoch": 0.2412129565816678, + "eval_logits/chosen": -3.1622049808502197, + "eval_logits/rejected": -3.1565866470336914, + "eval_logps/chosen": -58.66376876831055, + "eval_logps/rejected": -63.146671295166016, + "eval_loss": 0.693074643611908, + "eval_rewards/accuracies": 0.5130111575126648, + "eval_rewards/chosen": 0.0004812688275706023, + "eval_rewards/margins": 0.00014677205763291568, + "eval_rewards/rejected": 0.0003344967553857714, + "eval_runtime": 383.5374, + "eval_samples_per_second": 11.222, + "eval_steps_per_second": 1.403, + "step": 1400 + }, + { + "epoch": 0.24293590627153688, + "grad_norm": 2.009655475616455, + "learning_rate": 1.9971979084008567e-08, + "logits/chosen": -3.0599923133850098, + "logits/rejected": -3.047905445098877, + "logps/chosen": -54.46318435668945, + "logps/rejected": -53.312583923339844, + "loss": 0.693, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.0008952345815487206, + "rewards/margins": 0.0002275546285090968, + "rewards/rejected": -0.0011227892246097326, + "step": 1410 + }, + { + "epoch": 0.24465885596140594, + "grad_norm": 2.1165614128112793, + "learning_rate": 1.9969684374418137e-08, + "logits/chosen": -2.977726697921753, + "logits/rejected": -2.983685255050659, + "logps/chosen": -50.83002471923828, + "logps/rejected": -54.78386306762695, + "loss": 0.693, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0007586570573039353, + "rewards/margins": 0.00024669113918207586, + "rewards/rejected": -0.0010053481673821807, + "step": 1420 + }, + { + "epoch": 0.246381805651275, + "grad_norm": 2.3889706134796143, + "learning_rate": 1.9967299508204266e-08, + "logits/chosen": -3.097414016723633, + "logits/rejected": -3.085890531539917, + "logps/chosen": -53.62162399291992, + "logps/rejected": -56.60115432739258, + "loss": 0.6928, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.00036977732088416815, + "rewards/margins": 0.0007430274854414165, + "rewards/rejected": -0.0011128047481179237, + "step": 1430 + }, + { + "epoch": 0.24810475534114404, + "grad_norm": 2.135133981704712, + "learning_rate": 1.996482450693348e-08, + "logits/chosen": -3.0247979164123535, + "logits/rejected": -2.993356704711914, + "logps/chosen": -50.533348083496094, + "logps/rejected": -48.5119743347168, + "loss": 0.6928, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0007086361292749643, + "rewards/margins": 0.000766909564845264, + "rewards/rejected": -0.0014755458105355501, + "step": 1440 + }, + { + "epoch": 0.2498277050310131, + "grad_norm": 2.7499895095825195, + "learning_rate": 1.9962259392987405e-08, + "logits/chosen": -3.0386104583740234, + "logits/rejected": -3.0010645389556885, + "logps/chosen": -55.5077018737793, + "logps/rejected": -52.076683044433594, + "loss": 0.6925, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0003259262884967029, + "rewards/margins": 0.0012405237648636103, + "rewards/rejected": -0.0015664503443986177, + "step": 1450 + }, + { + "epoch": 0.25155065472088217, + "grad_norm": 2.2248263359069824, + "learning_rate": 1.995960418956256e-08, + "logits/chosen": -3.041229248046875, + "logits/rejected": -3.008714437484741, + "logps/chosen": -51.79349899291992, + "logps/rejected": -50.624542236328125, + "loss": 0.6928, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0009388105827383697, + "rewards/margins": 0.0006548297824338078, + "rewards/rejected": -0.0015936403069645166, + "step": 1460 + }, + { + "epoch": 0.2532736044107512, + "grad_norm": 2.4349653720855713, + "learning_rate": 1.9956858920670163e-08, + "logits/chosen": -3.2067325115203857, + "logits/rejected": -3.1706488132476807, + "logps/chosen": -57.066429138183594, + "logps/rejected": -54.8380241394043, + "loss": 0.6926, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0004277366679161787, + "rewards/margins": 0.0011886181309819221, + "rewards/rejected": -0.0016163547988981009, + "step": 1470 + }, + { + "epoch": 0.2549965541006203, + "grad_norm": 2.150383472442627, + "learning_rate": 1.9954023611135885e-08, + "logits/chosen": -3.173959970474243, + "logits/rejected": -3.1377644538879395, + "logps/chosen": -51.36625289916992, + "logps/rejected": -51.846641540527344, + "loss": 0.6929, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0005552283255383372, + "rewards/margins": 0.0005073813372291625, + "rewards/rejected": -0.0010626097209751606, + "step": 1480 + }, + { + "epoch": 0.2567195037904893, + "grad_norm": 2.321821928024292, + "learning_rate": 1.995109828659965e-08, + "logits/chosen": -3.105658769607544, + "logits/rejected": -3.0809221267700195, + "logps/chosen": -57.459449768066406, + "logps/rejected": -53.830970764160156, + "loss": 0.6927, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.000757657631766051, + "rewards/margins": 0.0008538023685105145, + "rewards/rejected": -0.0016114600002765656, + "step": 1490 + }, + { + "epoch": 0.2584424534803584, + "grad_norm": 2.2787139415740967, + "learning_rate": 1.9948082973515395e-08, + "logits/chosen": -3.014392614364624, + "logits/rejected": -3.0117087364196777, + "logps/chosen": -50.05243682861328, + "logps/rejected": -52.818359375, + "loss": 0.6927, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0006372680654749274, + "rewards/margins": 0.0008813614840619266, + "rewards/rejected": -0.0015186297241598368, + "step": 1500 + }, + { + "epoch": 0.2584424534803584, + "eval_logits/chosen": -3.1620614528656006, + "eval_logits/rejected": -3.1564226150512695, + "eval_logps/chosen": -58.660011291503906, + "eval_logps/rejected": -63.144344329833984, + "eval_loss": 0.6930677890777588, + "eval_rewards/accuracies": 0.5206784605979919, + "eval_rewards/chosen": 0.0005188515642657876, + "eval_rewards/margins": 0.0001610520266694948, + "eval_rewards/rejected": 0.00035779952304437757, + "eval_runtime": 383.1959, + "eval_samples_per_second": 11.232, + "eval_steps_per_second": 1.404, + "step": 1500 + }, + { + "epoch": 0.2601654031702274, + "grad_norm": 2.407702684402466, + "learning_rate": 1.9944977699150825e-08, + "logits/chosen": -2.9633898735046387, + "logits/rejected": -2.927666425704956, + "logps/chosen": -58.6661262512207, + "logps/rejected": -52.97742462158203, + "loss": 0.6927, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0006018796702846885, + "rewards/margins": 0.0009065620834007859, + "rewards/rejected": -0.0015084416372701526, + "step": 1510 + }, + { + "epoch": 0.2618883528600965, + "grad_norm": 2.5089685916900635, + "learning_rate": 1.9941782491587175e-08, + "logits/chosen": -3.0775656700134277, + "logits/rejected": -3.0695385932922363, + "logps/chosen": -52.35771560668945, + "logps/rejected": -51.60698318481445, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0010706728789955378, + "rewards/margins": 2.0194147509755567e-05, + "rewards/rejected": -0.0010908670956268907, + "step": 1520 + }, + { + "epoch": 0.26361130254996556, + "grad_norm": 2.536482334136963, + "learning_rate": 1.993849737971896e-08, + "logits/chosen": -2.96296763420105, + "logits/rejected": -2.9462900161743164, + "logps/chosen": -51.77375030517578, + "logps/rejected": -50.902069091796875, + "loss": 0.6928, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.0008512031054124236, + "rewards/margins": 0.000627328990958631, + "rewards/rejected": -0.0014785320963710546, + "step": 1530 + }, + { + "epoch": 0.2653342522398346, + "grad_norm": 2.257230043411255, + "learning_rate": 1.9935122393253692e-08, + "logits/chosen": -3.09374737739563, + "logits/rejected": -3.0615248680114746, + "logps/chosen": -55.999359130859375, + "logps/rejected": -51.18012237548828, + "loss": 0.6927, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0009388996404595673, + "rewards/margins": 0.0008283822098746896, + "rewards/rejected": -0.0017672820249572396, + "step": 1540 + }, + { + "epoch": 0.26705720192970367, + "grad_norm": 2.0905497074127197, + "learning_rate": 1.9931657562711637e-08, + "logits/chosen": -3.0119576454162598, + "logits/rejected": -2.9918830394744873, + "logps/chosen": -53.45903396606445, + "logps/rejected": -50.73937225341797, + "loss": 0.6926, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0008969675982370973, + "rewards/margins": 0.0010906046954914927, + "rewards/rejected": -0.00198757229372859, + "step": 1550 + }, + { + "epoch": 0.2687801516195727, + "grad_norm": 2.2873430252075195, + "learning_rate": 1.9928102919425526e-08, + "logits/chosen": -3.026608943939209, + "logits/rejected": -3.0091512203216553, + "logps/chosen": -50.974910736083984, + "logps/rejected": -50.85519027709961, + "loss": 0.6928, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.0012713803444057703, + "rewards/margins": 0.0006782411364838481, + "rewards/rejected": -0.0019496215973049402, + "step": 1560 + }, + { + "epoch": 0.2705031013094418, + "grad_norm": 2.2136454582214355, + "learning_rate": 1.9924458495540268e-08, + "logits/chosen": -3.0785794258117676, + "logits/rejected": -3.0804882049560547, + "logps/chosen": -51.9824104309082, + "logps/rejected": -56.40099334716797, + "loss": 0.6926, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0006633226876147091, + "rewards/margins": 0.0010450478876009583, + "rewards/rejected": -0.0017083704005926847, + "step": 1570 + }, + { + "epoch": 0.2722260509993108, + "grad_norm": 2.671224355697632, + "learning_rate": 1.992072432401267e-08, + "logits/chosen": -3.013683557510376, + "logits/rejected": -3.0035407543182373, + "logps/chosen": -53.052703857421875, + "logps/rejected": -54.06764602661133, + "loss": 0.693, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0013744436437264085, + "rewards/margins": 0.0003955420688726008, + "rewards/rejected": -0.0017699853051453829, + "step": 1580 + }, + { + "epoch": 0.2739490006891799, + "grad_norm": 2.372692584991455, + "learning_rate": 1.991690043861113e-08, + "logits/chosen": -3.0596675872802734, + "logits/rejected": -3.0450618267059326, + "logps/chosen": -54.20551681518555, + "logps/rejected": -54.7797966003418, + "loss": 0.6928, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0007508709677495062, + "rewards/margins": 0.0006255079642869532, + "rewards/rejected": -0.0013763790484517813, + "step": 1590 + }, + { + "epoch": 0.27567195037904896, + "grad_norm": 2.3430051803588867, + "learning_rate": 1.9912986873915344e-08, + "logits/chosen": -3.051609992980957, + "logits/rejected": -3.0133116245269775, + "logps/chosen": -52.2765998840332, + "logps/rejected": -51.117454528808594, + "loss": 0.6928, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0009734455379657447, + "rewards/margins": 0.000701104465406388, + "rewards/rejected": -0.001674549886956811, + "step": 1600 + }, + { + "epoch": 0.27567195037904896, + "eval_logits/chosen": -3.161592483520508, + "eval_logits/rejected": -3.155942678451538, + "eval_logps/chosen": -58.649715423583984, + "eval_logps/rejected": -63.13517379760742, + "eval_loss": 0.6930622458457947, + "eval_rewards/accuracies": 0.515566885471344, + "eval_rewards/chosen": 0.0006218124181032181, + "eval_rewards/margins": 0.00017230722005479038, + "eval_rewards/rejected": 0.0004495051980484277, + "eval_runtime": 383.4363, + "eval_samples_per_second": 11.225, + "eval_steps_per_second": 1.403, + "step": 1600 + }, + { + "epoch": 0.277394900068918, + "grad_norm": 2.1574578285217285, + "learning_rate": 1.9908983665315976e-08, + "logits/chosen": -3.090954065322876, + "logits/rejected": -3.0656261444091797, + "logps/chosen": -56.01924514770508, + "logps/rejected": -57.913787841796875, + "loss": 0.6927, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0012514353729784489, + "rewards/margins": 0.0009009768255054951, + "rewards/rejected": -0.0021524124313145876, + "step": 1610 + }, + { + "epoch": 0.27911784975878706, + "grad_norm": 2.0792019367218018, + "learning_rate": 1.990489084901435e-08, + "logits/chosen": -3.012794017791748, + "logits/rejected": -2.990811347961426, + "logps/chosen": -51.55316925048828, + "logps/rejected": -54.24964141845703, + "loss": 0.6928, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0011311458656564355, + "rewards/margins": 0.000712685112375766, + "rewards/rejected": -0.0018438309198245406, + "step": 1620 + }, + { + "epoch": 0.2808407994486561, + "grad_norm": 2.3274433612823486, + "learning_rate": 1.990070846202212e-08, + "logits/chosen": -3.0834336280822754, + "logits/rejected": -3.0569839477539062, + "logps/chosen": -55.578704833984375, + "logps/rejected": -51.934776306152344, + "loss": 0.6925, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.000689486158080399, + "rewards/margins": 0.001395157421939075, + "rewards/rejected": -0.0020846438128501177, + "step": 1630 + }, + { + "epoch": 0.28256374913852517, + "grad_norm": 2.333423614501953, + "learning_rate": 1.989643654216093e-08, + "logits/chosen": -3.1261088848114014, + "logits/rejected": -3.0887601375579834, + "logps/chosen": -56.98992156982422, + "logps/rejected": -51.56941604614258, + "loss": 0.6922, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0003556433948688209, + "rewards/margins": 0.001908325357362628, + "rewards/rejected": -0.00226396881043911, + "step": 1640 + }, + { + "epoch": 0.2842866988283942, + "grad_norm": 2.48422908782959, + "learning_rate": 1.9892075128062082e-08, + "logits/chosen": -3.0957982540130615, + "logits/rejected": -3.0681676864624023, + "logps/chosen": -57.281890869140625, + "logps/rejected": -55.1717414855957, + "loss": 0.6924, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0004545752890408039, + "rewards/margins": 0.0014074406353756785, + "rewards/rejected": -0.0018620159244164824, + "step": 1650 + }, + { + "epoch": 0.28600964851826327, + "grad_norm": 2.484135627746582, + "learning_rate": 1.988762425916618e-08, + "logits/chosen": -3.131685256958008, + "logits/rejected": -3.084357500076294, + "logps/chosen": -55.02167892456055, + "logps/rejected": -49.93708038330078, + "loss": 0.6923, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0005012772744521499, + "rewards/margins": 0.0017122188583016396, + "rewards/rejected": -0.0022134962491691113, + "step": 1660 + }, + { + "epoch": 0.2877325982081323, + "grad_norm": 2.482999324798584, + "learning_rate": 1.9883083975722772e-08, + "logits/chosen": -3.1197116374969482, + "logits/rejected": -3.0976428985595703, + "logps/chosen": -54.56291961669922, + "logps/rejected": -55.3899040222168, + "loss": 0.6925, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.00048305519158020616, + "rewards/margins": 0.001361898030154407, + "rewards/rejected": -0.0018449531635269523, + "step": 1670 + }, + { + "epoch": 0.2894555478980014, + "grad_norm": 2.213054895401001, + "learning_rate": 1.987845431879e-08, + "logits/chosen": -3.077357530593872, + "logits/rejected": -3.051088809967041, + "logps/chosen": -55.71039962768555, + "logps/rejected": -54.652687072753906, + "loss": 0.6924, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0010206119623035192, + "rewards/margins": 0.0014232432004064322, + "rewards/rejected": -0.0024438551627099514, + "step": 1680 + }, + { + "epoch": 0.29117849758787046, + "grad_norm": 2.3456971645355225, + "learning_rate": 1.9873735330234196e-08, + "logits/chosen": -3.0565898418426514, + "logits/rejected": -3.0409586429595947, + "logps/chosen": -55.663421630859375, + "logps/rejected": -52.42894744873047, + "loss": 0.693, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.001181212835945189, + "rewards/margins": 0.00037458629230968654, + "rewards/rejected": -0.001555799157358706, + "step": 1690 + }, + { + "epoch": 0.2929014472777395, + "grad_norm": 2.397773265838623, + "learning_rate": 1.986892705272954e-08, + "logits/chosen": -2.9650046825408936, + "logits/rejected": -2.9699463844299316, + "logps/chosen": -49.70085906982422, + "logps/rejected": -55.52735137939453, + "loss": 0.6928, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0013781094457954168, + "rewards/margins": 0.0007676255772821605, + "rewards/rejected": -0.0021457350812852383, + "step": 1700 + }, + { + "epoch": 0.2929014472777395, + "eval_logits/chosen": -3.1609573364257812, + "eval_logits/rejected": -3.1553165912628174, + "eval_logps/chosen": -58.63572311401367, + "eval_logps/rejected": -63.1287841796875, + "eval_loss": 0.6930245757102966, + "eval_rewards/accuracies": 0.5290427803993225, + "eval_rewards/chosen": 0.0007616986404173076, + "eval_rewards/margins": 0.00024828972527757287, + "eval_rewards/rejected": 0.0005134089151397347, + "eval_runtime": 383.8785, + "eval_samples_per_second": 11.212, + "eval_steps_per_second": 1.401, + "step": 1700 + }, + { + "epoch": 0.29462439696760856, + "grad_norm": 2.4840126037597656, + "learning_rate": 1.986402952975766e-08, + "logits/chosen": -3.097963809967041, + "logits/rejected": -3.057067394256592, + "logps/chosen": -58.48628616333008, + "logps/rejected": -54.680519104003906, + "loss": 0.6925, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.000765048258472234, + "rewards/margins": 0.0012052144156768918, + "rewards/rejected": -0.001970262499526143, + "step": 1710 + }, + { + "epoch": 0.2963473466574776, + "grad_norm": 2.4435672760009766, + "learning_rate": 1.985904280560723e-08, + "logits/chosen": -3.107060432434082, + "logits/rejected": -3.077709674835205, + "logps/chosen": -56.552574157714844, + "logps/rejected": -51.2525749206543, + "loss": 0.6928, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.0012668697163462639, + "rewards/margins": 0.0007003343780525029, + "rewards/rejected": -0.001967204036191106, + "step": 1720 + }, + { + "epoch": 0.29807029634734666, + "grad_norm": 2.1897132396698, + "learning_rate": 1.9853966925373585e-08, + "logits/chosen": -3.1019604206085205, + "logits/rejected": -3.0847129821777344, + "logps/chosen": -54.06304168701172, + "logps/rejected": -52.69061279296875, + "loss": 0.6929, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0012872053775936365, + "rewards/margins": 0.0005615145200863481, + "rewards/rejected": -0.0018487200140953064, + "step": 1730 + }, + { + "epoch": 0.2997932460372157, + "grad_norm": 2.3099875450134277, + "learning_rate": 1.9848801934958293e-08, + "logits/chosen": -3.0509893894195557, + "logits/rejected": -3.041018009185791, + "logps/chosen": -52.259376525878906, + "logps/rejected": -54.26354217529297, + "loss": 0.6929, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.001581055112183094, + "rewards/margins": 0.0004412340931594372, + "rewards/rejected": -0.002022289205342531, + "step": 1740 + }, + { + "epoch": 0.30151619572708477, + "grad_norm": 2.471845865249634, + "learning_rate": 1.9843547881068763e-08, + "logits/chosen": -3.0957400798797607, + "logits/rejected": -3.0844783782958984, + "logps/chosen": -55.27183151245117, + "logps/rejected": -56.19903564453125, + "loss": 0.6927, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0008580518770031631, + "rewards/margins": 0.0008658823790028691, + "rewards/rejected": -0.0017239341977983713, + "step": 1750 + }, + { + "epoch": 0.30323914541695385, + "grad_norm": 2.350327253341675, + "learning_rate": 1.983820481121781e-08, + "logits/chosen": -3.0834641456604004, + "logits/rejected": -3.046614408493042, + "logps/chosen": -56.12206268310547, + "logps/rejected": -53.220245361328125, + "loss": 0.6923, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0009891155641525984, + "rewards/margins": 0.0017277583247050643, + "rewards/rejected": -0.0027168740052729845, + "step": 1760 + }, + { + "epoch": 0.3049620951068229, + "grad_norm": 1.9723304510116577, + "learning_rate": 1.9832772773723228e-08, + "logits/chosen": -3.110938310623169, + "logits/rejected": -3.074063777923584, + "logps/chosen": -55.12842559814453, + "logps/rejected": -49.23452377319336, + "loss": 0.6924, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0014672544784843922, + "rewards/margins": 0.001455864286981523, + "rewards/rejected": -0.002923118881881237, + "step": 1770 + }, + { + "epoch": 0.30668504479669195, + "grad_norm": 2.564880609512329, + "learning_rate": 1.9827251817707347e-08, + "logits/chosen": -3.02485990524292, + "logits/rejected": -3.0266916751861572, + "logps/chosen": -54.67890548706055, + "logps/rejected": -58.6965446472168, + "loss": 0.6928, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0016034668078646064, + "rewards/margins": 0.0007381063187494874, + "rewards/rejected": -0.00234157289378345, + "step": 1780 + }, + { + "epoch": 0.308407994486561, + "grad_norm": 2.5322887897491455, + "learning_rate": 1.98216419930966e-08, + "logits/chosen": -3.1726322174072266, + "logits/rejected": -3.1501011848449707, + "logps/chosen": -52.1053352355957, + "logps/rejected": -52.75413131713867, + "loss": 0.6926, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0010994903277605772, + "rewards/margins": 0.0011364398524165154, + "rewards/rejected": -0.002235929947346449, + "step": 1790 + }, + { + "epoch": 0.31013094417643006, + "grad_norm": 2.2157833576202393, + "learning_rate": 1.9815943350621065e-08, + "logits/chosen": -3.1358418464660645, + "logits/rejected": -3.1119844913482666, + "logps/chosen": -51.9261589050293, + "logps/rejected": -52.291221618652344, + "loss": 0.6923, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.001239894307218492, + "rewards/margins": 0.0017638758290559053, + "rewards/rejected": -0.0030037700198590755, + "step": 1800 + }, + { + "epoch": 0.31013094417643006, + "eval_logits/chosen": -3.1606557369232178, + "eval_logits/rejected": -3.154991626739502, + "eval_logps/chosen": -58.635379791259766, + "eval_logps/rejected": -63.13025665283203, + "eval_loss": 0.6930158734321594, + "eval_rewards/accuracies": 0.5394981503486633, + "eval_rewards/chosen": 0.0007651591440662742, + "eval_rewards/margins": 0.00026646017795428634, + "eval_rewards/rejected": 0.0004986989079043269, + "eval_runtime": 383.5622, + "eval_samples_per_second": 11.221, + "eval_steps_per_second": 1.403, + "step": 1800 + }, + { + "epoch": 0.3118538938662991, + "grad_norm": 2.3355607986450195, + "learning_rate": 1.9810155941813995e-08, + "logits/chosen": -3.139138698577881, + "logits/rejected": -3.1034657955169678, + "logps/chosen": -56.69929885864258, + "logps/rejected": -53.90766143798828, + "loss": 0.6925, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0008215238340198994, + "rewards/margins": 0.0013491030549630523, + "rewards/rejected": -0.002170627238228917, + "step": 1810 + }, + { + "epoch": 0.31357684355616816, + "grad_norm": 2.256990909576416, + "learning_rate": 1.9804279819011383e-08, + "logits/chosen": -3.110410690307617, + "logits/rejected": -3.0756146907806396, + "logps/chosen": -55.28718948364258, + "logps/rejected": -50.91753005981445, + "loss": 0.6926, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0012509961379691958, + "rewards/margins": 0.00110354193020612, + "rewards/rejected": -0.002354537835344672, + "step": 1820 + }, + { + "epoch": 0.31529979324603724, + "grad_norm": 2.239717483520508, + "learning_rate": 1.9798315035351457e-08, + "logits/chosen": -3.004528522491455, + "logits/rejected": -2.990546941757202, + "logps/chosen": -55.655792236328125, + "logps/rejected": -54.94952392578125, + "loss": 0.6926, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0011249512899667025, + "rewards/margins": 0.001042040647007525, + "rewards/rejected": -0.0021669918205589056, + "step": 1830 + }, + { + "epoch": 0.31702274293590627, + "grad_norm": 2.331162929534912, + "learning_rate": 1.9792261644774218e-08, + "logits/chosen": -3.215100049972534, + "logits/rejected": -3.2110018730163574, + "logps/chosen": -54.98247146606445, + "logps/rejected": -55.251922607421875, + "loss": 0.6929, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0016236340161412954, + "rewards/margins": 0.0004331713425926864, + "rewards/rejected": -0.0020568054169416428, + "step": 1840 + }, + { + "epoch": 0.31874569262577535, + "grad_norm": 2.3451895713806152, + "learning_rate": 1.9786119702020934e-08, + "logits/chosen": -3.0801799297332764, + "logits/rejected": -3.0801517963409424, + "logps/chosen": -53.336570739746094, + "logps/rejected": -54.976661682128906, + "loss": 0.6924, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.00126861990429461, + "rewards/margins": 0.0014225415652617812, + "rewards/rejected": -0.002691161585971713, + "step": 1850 + }, + { + "epoch": 0.32046864231564437, + "grad_norm": 2.414311647415161, + "learning_rate": 1.9779889262633673e-08, + "logits/chosen": -3.0122768878936768, + "logits/rejected": -2.976020097732544, + "logps/chosen": -57.33064651489258, + "logps/rejected": -54.029083251953125, + "loss": 0.6919, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.0003869426145683974, + "rewards/margins": 0.002549747470766306, + "rewards/rejected": -0.0029366896487772465, + "step": 1860 + }, + { + "epoch": 0.32219159200551345, + "grad_norm": 2.6407299041748047, + "learning_rate": 1.9773570382954776e-08, + "logits/chosen": -3.0591697692871094, + "logits/rejected": -3.0335114002227783, + "logps/chosen": -56.03263473510742, + "logps/rejected": -54.18107986450195, + "loss": 0.6916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0008292980492115021, + "rewards/margins": 0.0030972822569310665, + "rewards/rejected": -0.003926579840481281, + "step": 1870 + }, + { + "epoch": 0.3239145416953825, + "grad_norm": 2.2970800399780273, + "learning_rate": 1.9767163120126365e-08, + "logits/chosen": -3.1058273315429688, + "logits/rejected": -3.0676872730255127, + "logps/chosen": -52.84998321533203, + "logps/rejected": -51.77935791015625, + "loss": 0.6923, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0013415589928627014, + "rewards/margins": 0.0017453646287322044, + "rewards/rejected": -0.003086923388764262, + "step": 1880 + }, + { + "epoch": 0.32563749138525155, + "grad_norm": 2.5498745441436768, + "learning_rate": 1.97606675320898e-08, + "logits/chosen": -3.1286263465881348, + "logits/rejected": -3.1229748725891113, + "logps/chosen": -53.16112518310547, + "logps/rejected": -54.714317321777344, + "loss": 0.6923, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0010337254498153925, + "rewards/margins": 0.0016309624770656228, + "rewards/rejected": -0.002664688043296337, + "step": 1890 + }, + { + "epoch": 0.32736044107512063, + "grad_norm": 2.452503204345703, + "learning_rate": 1.975408367758519e-08, + "logits/chosen": -3.0616016387939453, + "logits/rejected": -3.0282962322235107, + "logps/chosen": -57.5360107421875, + "logps/rejected": -52.49907684326172, + "loss": 0.6924, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0009620963828638196, + "rewards/margins": 0.0014106438029557467, + "rewards/rejected": -0.002372740302234888, + "step": 1900 + }, + { + "epoch": 0.32736044107512063, + "eval_logits/chosen": -3.160248279571533, + "eval_logits/rejected": -3.1546409130096436, + "eval_logps/chosen": -58.629112243652344, + "eval_logps/rejected": -63.12491226196289, + "eval_loss": 0.6930115222930908, + "eval_rewards/accuracies": 0.5223048329353333, + "eval_rewards/chosen": 0.0008278373279608786, + "eval_rewards/margins": 0.0002757786714937538, + "eval_rewards/rejected": 0.0005520587437786162, + "eval_runtime": 384.1694, + "eval_samples_per_second": 11.203, + "eval_steps_per_second": 1.4, + "step": 1900 + }, + { + "epoch": 0.32908339076498966, + "grad_norm": 2.60693621635437, + "learning_rate": 1.9747411616150837e-08, + "logits/chosen": -2.960416316986084, + "logits/rejected": -2.9253084659576416, + "logps/chosen": -54.61848068237305, + "logps/rejected": -53.64332962036133, + "loss": 0.6922, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0015078171854838729, + "rewards/margins": 0.0019502185750752687, + "rewards/rejected": -0.00345803564414382, + "step": 1910 + }, + { + "epoch": 0.33080634045485874, + "grad_norm": 2.3648858070373535, + "learning_rate": 1.974065140812271e-08, + "logits/chosen": -3.0746426582336426, + "logits/rejected": -3.0466296672821045, + "logps/chosen": -55.4968147277832, + "logps/rejected": -54.046043395996094, + "loss": 0.6921, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0012814325746148825, + "rewards/margins": 0.0021591049153357744, + "rewards/rejected": -0.0034405372571200132, + "step": 1920 + }, + { + "epoch": 0.33252929014472776, + "grad_norm": 2.1132709980010986, + "learning_rate": 1.973380311463389e-08, + "logits/chosen": -3.0233805179595947, + "logits/rejected": -2.986288070678711, + "logps/chosen": -53.99530029296875, + "logps/rejected": -54.19614791870117, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0010488248663023114, + "rewards/margins": 0.002408870728686452, + "rewards/rejected": -0.003457695245742798, + "step": 1930 + }, + { + "epoch": 0.33425223983459684, + "grad_norm": 2.1448163986206055, + "learning_rate": 1.9726866797614016e-08, + "logits/chosen": -3.0474705696105957, + "logits/rejected": -3.0288310050964355, + "logps/chosen": -51.166717529296875, + "logps/rejected": -50.376094818115234, + "loss": 0.6925, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0022671956103295088, + "rewards/margins": 0.0012547748629003763, + "rewards/rejected": -0.0035219707060605288, + "step": 1940 + }, + { + "epoch": 0.33597518952446587, + "grad_norm": 2.253101348876953, + "learning_rate": 1.9719842519788743e-08, + "logits/chosen": -3.053658962249756, + "logits/rejected": -3.052180051803589, + "logps/chosen": -52.709800720214844, + "logps/rejected": -55.065162658691406, + "loss": 0.6927, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0020216540433466434, + "rewards/margins": 0.0008553097140975296, + "rewards/rejected": -0.002876963932067156, + "step": 1950 + }, + { + "epoch": 0.33769813921433495, + "grad_norm": 2.281599998474121, + "learning_rate": 1.971273034467915e-08, + "logits/chosen": -3.059936046600342, + "logits/rejected": -3.0406055450439453, + "logps/chosen": -54.529571533203125, + "logps/rejected": -54.80952835083008, + "loss": 0.6924, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0017005018889904022, + "rewards/margins": 0.0014261369360610843, + "rewards/rejected": -0.0031266387086361647, + "step": 1960 + }, + { + "epoch": 0.33942108890420397, + "grad_norm": 2.1517691612243652, + "learning_rate": 1.9705530336601192e-08, + "logits/chosen": -3.105541706085205, + "logits/rejected": -3.0760109424591064, + "logps/chosen": -56.24677658081055, + "logps/rejected": -52.138221740722656, + "loss": 0.6922, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0010016715386882424, + "rewards/margins": 0.0019247450400143862, + "rewards/rejected": -0.0029264166951179504, + "step": 1970 + }, + { + "epoch": 0.34114403859407305, + "grad_norm": 2.4106578826904297, + "learning_rate": 1.969824256066509e-08, + "logits/chosen": -3.010801315307617, + "logits/rejected": -3.0054256916046143, + "logps/chosen": -55.7724609375, + "logps/rejected": -54.3217658996582, + "loss": 0.6927, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.0018466083565726876, + "rewards/margins": 0.0009771850891411304, + "rewards/rejected": -0.00282379356212914, + "step": 1980 + }, + { + "epoch": 0.34286698828394213, + "grad_norm": 2.3051235675811768, + "learning_rate": 1.9690867082774768e-08, + "logits/chosen": -3.1406359672546387, + "logits/rejected": -3.105625629425049, + "logps/chosen": -51.02665710449219, + "logps/rejected": -48.61326217651367, + "loss": 0.6919, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0019225224386900663, + "rewards/margins": 0.002514256862923503, + "rewards/rejected": -0.004436778835952282, + "step": 1990 + }, + { + "epoch": 0.34458993797381116, + "grad_norm": 2.5171186923980713, + "learning_rate": 1.968340396962724e-08, + "logits/chosen": -3.0621447563171387, + "logits/rejected": -3.0685670375823975, + "logps/chosen": -50.19961929321289, + "logps/rejected": -57.98683547973633, + "loss": 0.6925, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.002387142274528742, + "rewards/margins": 0.0013573340838775039, + "rewards/rejected": -0.003744476707652211, + "step": 2000 + }, + { + "epoch": 0.34458993797381116, + "eval_logits/chosen": -3.160059928894043, + "eval_logits/rejected": -3.154402017593384, + "eval_logps/chosen": -58.621490478515625, + "eval_logps/rejected": -63.13191223144531, + "eval_loss": 0.6929388046264648, + "eval_rewards/accuracies": 0.542286217212677, + "eval_rewards/chosen": 0.0009040239383466542, + "eval_rewards/margins": 0.0004219270485918969, + "eval_rewards/rejected": 0.00048209683154709637, + "eval_runtime": 383.2101, + "eval_samples_per_second": 11.231, + "eval_steps_per_second": 1.404, + "step": 2000 + }, + { + "epoch": 0.34631288766368024, + "grad_norm": 2.4561853408813477, + "learning_rate": 1.9675853288712007e-08, + "logits/chosen": -3.0682225227355957, + "logits/rejected": -3.039653778076172, + "logps/chosen": -55.76312255859375, + "logps/rejected": -52.18657302856445, + "loss": 0.692, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0016124986577779055, + "rewards/margins": 0.0022291613277047873, + "rewards/rejected": -0.0038416602183133364, + "step": 2010 + }, + { + "epoch": 0.34803583735354926, + "grad_norm": 2.3522374629974365, + "learning_rate": 1.9668215108310464e-08, + "logits/chosen": -3.0565547943115234, + "logits/rejected": -3.034590244293213, + "logps/chosen": -50.00489044189453, + "logps/rejected": -55.06145095825195, + "loss": 0.6921, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0017557486426085234, + "rewards/margins": 0.002072775736451149, + "rewards/rejected": -0.0038285241462290287, + "step": 2020 + }, + { + "epoch": 0.34975878704341834, + "grad_norm": 2.365412950515747, + "learning_rate": 1.9660489497495258e-08, + "logits/chosen": -3.1443276405334473, + "logits/rejected": -3.1226987838745117, + "logps/chosen": -57.24473190307617, + "logps/rejected": -55.498451232910156, + "loss": 0.6925, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0019245322328060865, + "rewards/margins": 0.0012448631459847093, + "rewards/rejected": -0.0031693950295448303, + "step": 2030 + }, + { + "epoch": 0.35148173673328736, + "grad_norm": 2.3502867221832275, + "learning_rate": 1.965267652612969e-08, + "logits/chosen": -3.007511615753174, + "logits/rejected": -2.9947307109832764, + "logps/chosen": -52.93482208251953, + "logps/rejected": -54.30632781982422, + "loss": 0.6925, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0021886585745960474, + "rewards/margins": 0.0013397895963862538, + "rewards/rejected": -0.003528448287397623, + "step": 2040 + }, + { + "epoch": 0.35320468642315644, + "grad_norm": 2.388796329498291, + "learning_rate": 1.964477626486706e-08, + "logits/chosen": -3.1260132789611816, + "logits/rejected": -3.100912094116211, + "logps/chosen": -51.36164093017578, + "logps/rejected": -54.71668243408203, + "loss": 0.6923, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.0013962425291538239, + "rewards/margins": 0.0016656548250466585, + "rewards/rejected": -0.0030618971213698387, + "step": 2050 + }, + { + "epoch": 0.3549276361130255, + "grad_norm": 2.1621086597442627, + "learning_rate": 1.9636788785150038e-08, + "logits/chosen": -3.1135175228118896, + "logits/rejected": -3.0747947692871094, + "logps/chosen": -55.415794372558594, + "logps/rejected": -51.89849090576172, + "loss": 0.692, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0008633068646304309, + "rewards/margins": 0.002299419604241848, + "rewards/rejected": -0.0031627260614186525, + "step": 2060 + }, + { + "epoch": 0.35665058580289455, + "grad_norm": 2.4029855728149414, + "learning_rate": 1.962871415921001e-08, + "logits/chosen": -3.131016969680786, + "logits/rejected": -3.107579469680786, + "logps/chosen": -55.44580078125, + "logps/rejected": -55.06316375732422, + "loss": 0.6925, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0015260230284184217, + "rewards/margins": 0.0012810361804440618, + "rewards/rejected": -0.0028070593252778053, + "step": 2070 + }, + { + "epoch": 0.35837353549276363, + "grad_norm": 2.4090819358825684, + "learning_rate": 1.9620552460066455e-08, + "logits/chosen": -3.071326732635498, + "logits/rejected": -3.0413899421691895, + "logps/chosen": -52.11077880859375, + "logps/rejected": -51.01923370361328, + "loss": 0.6924, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0024296611081808805, + "rewards/margins": 0.001453958684578538, + "rewards/rejected": -0.003883620025590062, + "step": 2080 + }, + { + "epoch": 0.36009648518263265, + "grad_norm": 2.2452566623687744, + "learning_rate": 1.9612303761526236e-08, + "logits/chosen": -3.095968246459961, + "logits/rejected": -3.092430353164673, + "logps/chosen": -54.56378173828125, + "logps/rejected": -54.91994094848633, + "loss": 0.6927, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0016902232309803367, + "rewards/margins": 0.0009237364865839481, + "rewards/rejected": -0.0026139598339796066, + "step": 2090 + }, + { + "epoch": 0.36181943487250173, + "grad_norm": 2.5663130283355713, + "learning_rate": 1.9603968138182974e-08, + "logits/chosen": -3.045274257659912, + "logits/rejected": -3.0230183601379395, + "logps/chosen": -55.53925323486328, + "logps/rejected": -51.63383102416992, + "loss": 0.6922, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0019278887193650007, + "rewards/margins": 0.0019202090334147215, + "rewards/rejected": -0.0038480982184410095, + "step": 2100 + }, + { + "epoch": 0.36181943487250173, + "eval_logits/chosen": -3.1595144271850586, + "eval_logits/rejected": -3.1538989543914795, + "eval_logps/chosen": -58.603946685791016, + "eval_logps/rejected": -63.11533737182617, + "eval_loss": 0.692934513092041, + "eval_rewards/accuracies": 0.5511152148246765, + "eval_rewards/chosen": 0.0010795381385833025, + "eval_rewards/margins": 0.0004316373378969729, + "eval_rewards/rejected": 0.0006479007424786687, + "eval_runtime": 383.4209, + "eval_samples_per_second": 11.225, + "eval_steps_per_second": 1.403, + "step": 2100 + }, + { + "epoch": 0.36354238456237076, + "grad_norm": 2.277183771133423, + "learning_rate": 1.959554566541635e-08, + "logits/chosen": -3.1013741493225098, + "logits/rejected": -3.104506731033325, + "logps/chosen": -49.140750885009766, + "logps/rejected": -55.5598258972168, + "loss": 0.6925, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0024113706313073635, + "rewards/margins": 0.0012931081000715494, + "rewards/rejected": -0.0037044784985482693, + "step": 2110 + }, + { + "epoch": 0.36526533425223984, + "grad_norm": 2.345186471939087, + "learning_rate": 1.9587036419391437e-08, + "logits/chosen": -2.9661850929260254, + "logits/rejected": -2.938690185546875, + "logps/chosen": -53.660194396972656, + "logps/rejected": -51.288002014160156, + "loss": 0.6918, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.0008453844930045307, + "rewards/margins": 0.002739228308200836, + "rewards/rejected": -0.0035846128594130278, + "step": 2120 + }, + { + "epoch": 0.3669882839421089, + "grad_norm": 2.1597959995269775, + "learning_rate": 1.9578440477057998e-08, + "logits/chosen": -3.0039591789245605, + "logits/rejected": -2.9852240085601807, + "logps/chosen": -54.717498779296875, + "logps/rejected": -52.96318817138672, + "loss": 0.6923, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.002471815561875701, + "rewards/margins": 0.0017909994348883629, + "rewards/rejected": -0.0042628152295947075, + "step": 2130 + }, + { + "epoch": 0.36871123363197794, + "grad_norm": 2.2517848014831543, + "learning_rate": 1.9569757916149805e-08, + "logits/chosen": -2.9810574054718018, + "logits/rejected": -2.9728846549987793, + "logps/chosen": -49.01877975463867, + "logps/rejected": -54.04807662963867, + "loss": 0.6925, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0036360882222652435, + "rewards/margins": 0.001321372459642589, + "rewards/rejected": -0.00495745986700058, + "step": 2140 + }, + { + "epoch": 0.370434183321847, + "grad_norm": 2.262716054916382, + "learning_rate": 1.956098881518392e-08, + "logits/chosen": -3.0333046913146973, + "logits/rejected": -2.9868977069854736, + "logps/chosen": -54.80281448364258, + "logps/rejected": -48.4425163269043, + "loss": 0.6916, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.001142291584983468, + "rewards/margins": 0.00319478427991271, + "rewards/rejected": -0.004337075632065535, + "step": 2150 + }, + { + "epoch": 0.37215713301171605, + "grad_norm": 2.2613468170166016, + "learning_rate": 1.9552133253460006e-08, + "logits/chosen": -2.999462366104126, + "logits/rejected": -2.984152317047119, + "logps/chosen": -54.93077850341797, + "logps/rejected": -49.919715881347656, + "loss": 0.6922, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0017375343013554811, + "rewards/margins": 0.0019211741164326668, + "rewards/rejected": -0.0036587081849575043, + "step": 2160 + }, + { + "epoch": 0.3738800827015851, + "grad_norm": 2.3072938919067383, + "learning_rate": 1.954319131105958e-08, + "logits/chosen": -3.123497247695923, + "logits/rejected": -3.1076152324676514, + "logps/chosen": -54.589698791503906, + "logps/rejected": -51.89857864379883, + "loss": 0.6924, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0025846485514193773, + "rewards/margins": 0.0015800563851371408, + "rewards/rejected": -0.004164704121649265, + "step": 2170 + }, + { + "epoch": 0.37560303239145415, + "grad_norm": 2.694261074066162, + "learning_rate": 1.953416306884532e-08, + "logits/chosen": -3.1719765663146973, + "logits/rejected": -3.133807897567749, + "logps/chosen": -58.207847595214844, + "logps/rejected": -53.22174072265625, + "loss": 0.6916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0018373355269432068, + "rewards/margins": 0.003037033136934042, + "rewards/rejected": -0.0048743681982159615, + "step": 2180 + }, + { + "epoch": 0.37732598208132323, + "grad_norm": 2.5449416637420654, + "learning_rate": 1.952504860846032e-08, + "logits/chosen": -3.235800266265869, + "logits/rejected": -3.2293059825897217, + "logps/chosen": -53.08635711669922, + "logps/rejected": -54.184967041015625, + "loss": 0.6927, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0024451869539916515, + "rewards/margins": 0.0008518371032550931, + "rewards/rejected": -0.0032970241736620665, + "step": 2190 + }, + { + "epoch": 0.37904893177119225, + "grad_norm": 2.451497793197632, + "learning_rate": 1.951584801232734e-08, + "logits/chosen": -3.065656900405884, + "logits/rejected": -3.0425562858581543, + "logps/chosen": -52.4706916809082, + "logps/rejected": -55.25239181518555, + "loss": 0.6917, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0016196580836549401, + "rewards/margins": 0.0029670994263142347, + "rewards/rejected": -0.004586757160723209, + "step": 2200 + }, + { + "epoch": 0.37904893177119225, + "eval_logits/chosen": -3.159013032913208, + "eval_logits/rejected": -3.1533420085906982, + "eval_logps/chosen": -58.596683502197266, + "eval_logps/rejected": -63.1153450012207, + "eval_loss": 0.6928985714912415, + "eval_rewards/accuracies": 0.5378717184066772, + "eval_rewards/chosen": 0.0011521215783432126, + "eval_rewards/margins": 0.0005043414421379566, + "eval_rewards/rejected": 0.0006477802526205778, + "eval_runtime": 383.2871, + "eval_samples_per_second": 11.229, + "eval_steps_per_second": 1.404, + "step": 2200 + }, + { + "epoch": 0.38077188146106133, + "grad_norm": 2.220224618911743, + "learning_rate": 1.9506561363648082e-08, + "logits/chosen": -3.1231906414031982, + "logits/rejected": -3.1068406105041504, + "logps/chosen": -54.8387451171875, + "logps/rejected": -54.0641975402832, + "loss": 0.6919, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0011239739833399653, + "rewards/margins": 0.002422004472464323, + "rewards/rejected": -0.00354597857221961, + "step": 2210 + }, + { + "epoch": 0.3824948311509304, + "grad_norm": 2.312995433807373, + "learning_rate": 1.9497188746402428e-08, + "logits/chosen": -2.9493377208709717, + "logits/rejected": -2.941772937774658, + "logps/chosen": -54.039085388183594, + "logps/rejected": -54.20904541015625, + "loss": 0.6922, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0016108605777844787, + "rewards/margins": 0.0020083982963114977, + "rewards/rejected": -0.003619259223341942, + "step": 2220 + }, + { + "epoch": 0.38421778084079944, + "grad_norm": 2.4227945804595947, + "learning_rate": 1.948773024534767e-08, + "logits/chosen": -3.1078758239746094, + "logits/rejected": -3.075817584991455, + "logps/chosen": -53.9468879699707, + "logps/rejected": -51.3045654296875, + "loss": 0.6917, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0026930735912173986, + "rewards/margins": 0.002932693576440215, + "rewards/rejected": -0.005625767167657614, + "step": 2230 + }, + { + "epoch": 0.3859407305306685, + "grad_norm": 2.5007476806640625, + "learning_rate": 1.9478185946017774e-08, + "logits/chosen": -3.0606689453125, + "logits/rejected": -3.0243403911590576, + "logps/chosen": -57.04365921020508, + "logps/rejected": -54.07979202270508, + "loss": 0.6919, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.002340201986953616, + "rewards/margins": 0.0025223747361451387, + "rewards/rejected": -0.00486257579177618, + "step": 2240 + }, + { + "epoch": 0.38766368022053754, + "grad_norm": 2.6046512126922607, + "learning_rate": 1.946855593472256e-08, + "logits/chosen": -3.0197103023529053, + "logits/rejected": -2.9851489067077637, + "logps/chosen": -55.551429748535156, + "logps/rejected": -53.84883499145508, + "loss": 0.6917, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0020156418904662132, + "rewards/margins": 0.0030193165875971317, + "rewards/rejected": -0.005034958478063345, + "step": 2250 + }, + { + "epoch": 0.3893866299104066, + "grad_norm": 2.2071502208709717, + "learning_rate": 1.945884029854697e-08, + "logits/chosen": -2.9758896827697754, + "logits/rejected": -2.94950270652771, + "logps/chosen": -60.996726989746094, + "logps/rejected": -57.81394577026367, + "loss": 0.6919, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0018908934434875846, + "rewards/margins": 0.0025968493428081274, + "rewards/rejected": -0.00448774266988039, + "step": 2260 + }, + { + "epoch": 0.39110957960027565, + "grad_norm": 2.4286463260650635, + "learning_rate": 1.9449039125350245e-08, + "logits/chosen": -2.9611337184906006, + "logits/rejected": -2.927670478820801, + "logps/chosen": -54.38924026489258, + "logps/rejected": -52.332435607910156, + "loss": 0.6917, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0032784740906208754, + "rewards/margins": 0.0028812182135879993, + "rewards/rejected": -0.006159692537039518, + "step": 2270 + }, + { + "epoch": 0.3928325292901447, + "grad_norm": 2.2948031425476074, + "learning_rate": 1.943915250376515e-08, + "logits/chosen": -3.0407679080963135, + "logits/rejected": -3.030714750289917, + "logps/chosen": -53.62548828125, + "logps/rejected": -55.8744010925293, + "loss": 0.6923, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0018194593722000718, + "rewards/margins": 0.00162306590937078, + "rewards/rejected": -0.0034425253979861736, + "step": 2280 + }, + { + "epoch": 0.3945554789800138, + "grad_norm": 2.675964593887329, + "learning_rate": 1.9429180523197173e-08, + "logits/chosen": -2.9342846870422363, + "logits/rejected": -2.9056806564331055, + "logps/chosen": -53.1612663269043, + "logps/rejected": -54.345176696777344, + "loss": 0.6916, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.002783877542242408, + "rewards/margins": 0.00306482776068151, + "rewards/rejected": -0.0058487048372626305, + "step": 2290 + }, + { + "epoch": 0.39627842866988283, + "grad_norm": 2.2668910026550293, + "learning_rate": 1.9419123273823692e-08, + "logits/chosen": -3.121488571166992, + "logits/rejected": -3.087228775024414, + "logps/chosen": -56.041099548339844, + "logps/rejected": -54.0965576171875, + "loss": 0.6914, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0009602559730410576, + "rewards/margins": 0.0034466232173144817, + "rewards/rejected": -0.004406879656016827, + "step": 2300 + }, + { + "epoch": 0.39627842866988283, + "eval_logits/chosen": -3.1587371826171875, + "eval_logits/rejected": -3.153092622756958, + "eval_logps/chosen": -58.58064270019531, + "eval_logps/rejected": -63.10941696166992, + "eval_loss": 0.6928492188453674, + "eval_rewards/accuracies": 0.5480948090553284, + "eval_rewards/chosen": 0.001312516164034605, + "eval_rewards/margins": 0.0006054288824088871, + "eval_rewards/rejected": 0.000707087223418057, + "eval_runtime": 383.4093, + "eval_samples_per_second": 11.226, + "eval_steps_per_second": 1.403, + "step": 2300 + }, + { + "epoch": 0.3980013783597519, + "grad_norm": 2.4323816299438477, + "learning_rate": 1.940898084659319e-08, + "logits/chosen": -3.022576093673706, + "logits/rejected": -3.0067625045776367, + "logps/chosen": -51.424705505371094, + "logps/rejected": -51.964454650878906, + "loss": 0.6922, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.003465785412117839, + "rewards/margins": 0.0018342696130275726, + "rewards/rejected": -0.005300055257976055, + "step": 2310 + }, + { + "epoch": 0.39972432804962094, + "grad_norm": 2.3269340991973877, + "learning_rate": 1.939875333322442e-08, + "logits/chosen": -3.1188583374023438, + "logits/rejected": -3.077375888824463, + "logps/chosen": -57.623809814453125, + "logps/rejected": -51.6974983215332, + "loss": 0.6913, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.0018242349615320563, + "rewards/margins": 0.003677531611174345, + "rewards/rejected": -0.005501766689121723, + "step": 2320 + }, + { + "epoch": 0.40144727773949, + "grad_norm": 2.243847608566284, + "learning_rate": 1.938844082620557e-08, + "logits/chosen": -3.029540538787842, + "logits/rejected": -3.007463216781616, + "logps/chosen": -56.02894973754883, + "logps/rejected": -52.85606002807617, + "loss": 0.6913, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.002195675391703844, + "rewards/margins": 0.0038120609242469072, + "rewards/rejected": -0.006007737014442682, + "step": 2330 + }, + { + "epoch": 0.40317022742935904, + "grad_norm": 2.2276456356048584, + "learning_rate": 1.9378043418793438e-08, + "logits/chosen": -3.0718817710876465, + "logits/rejected": -3.0597751140594482, + "logps/chosen": -52.65851974487305, + "logps/rejected": -55.89642333984375, + "loss": 0.6924, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.003569630440324545, + "rewards/margins": 0.0016181267565116286, + "rewards/rejected": -0.005187757313251495, + "step": 2340 + }, + { + "epoch": 0.4048931771192281, + "grad_norm": 2.4954779148101807, + "learning_rate": 1.936756120501258e-08, + "logits/chosen": -3.0566139221191406, + "logits/rejected": -3.0337390899658203, + "logps/chosen": -58.45566940307617, + "logps/rejected": -55.84288787841797, + "loss": 0.6915, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.0019436674192547798, + "rewards/margins": 0.0033232986461371183, + "rewards/rejected": -0.005266966298222542, + "step": 2350 + }, + { + "epoch": 0.4066161268090972, + "grad_norm": 2.192545175552368, + "learning_rate": 1.935699427965446e-08, + "logits/chosen": -3.07662296295166, + "logits/rejected": -3.0633997917175293, + "logps/chosen": -50.66261291503906, + "logps/rejected": -51.67461395263672, + "loss": 0.6919, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0030053765513002872, + "rewards/margins": 0.0024440365377813578, + "rewards/rejected": -0.005449412856251001, + "step": 2360 + }, + { + "epoch": 0.4083390764989662, + "grad_norm": 2.514657497406006, + "learning_rate": 1.9346342738276593e-08, + "logits/chosen": -3.0921218395233154, + "logits/rejected": -3.0855157375335693, + "logps/chosen": -54.12009811401367, + "logps/rejected": -54.41124725341797, + "loss": 0.6922, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0035813034046441317, + "rewards/margins": 0.0018840819830074906, + "rewards/rejected": -0.005465385504066944, + "step": 2370 + }, + { + "epoch": 0.4100620261888353, + "grad_norm": 2.1899521350860596, + "learning_rate": 1.93356066772017e-08, + "logits/chosen": -3.0172626972198486, + "logits/rejected": -2.9923696517944336, + "logps/chosen": -54.54243850708008, + "logps/rejected": -53.3748893737793, + "loss": 0.6916, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0020027304999530315, + "rewards/margins": 0.003100222907960415, + "rewards/rejected": -0.005102953407913446, + "step": 2380 + }, + { + "epoch": 0.41178497587870433, + "grad_norm": 2.286336660385132, + "learning_rate": 1.9324786193516794e-08, + "logits/chosen": -3.1032328605651855, + "logits/rejected": -3.0758352279663086, + "logps/chosen": -56.50042724609375, + "logps/rejected": -53.74065017700195, + "loss": 0.6913, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.0026643401943147182, + "rewards/margins": 0.0038009281270205975, + "rewards/rejected": -0.006465268321335316, + "step": 2390 + }, + { + "epoch": 0.4135079255685734, + "grad_norm": 2.1463067531585693, + "learning_rate": 1.9313881385072357e-08, + "logits/chosen": -3.151245355606079, + "logits/rejected": -3.1303699016571045, + "logps/chosen": -52.655494689941406, + "logps/rejected": -53.832435607910156, + "loss": 0.6921, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.003549426095560193, + "rewards/margins": 0.002139916643500328, + "rewards/rejected": -0.005689342971891165, + "step": 2400 + }, + { + "epoch": 0.4135079255685734, + "eval_logits/chosen": -3.157942771911621, + "eval_logits/rejected": -3.1523544788360596, + "eval_logps/chosen": -58.57807159423828, + "eval_logps/rejected": -63.113616943359375, + "eval_loss": 0.6928165555000305, + "eval_rewards/accuracies": 0.5499535202980042, + "eval_rewards/chosen": 0.0013382199686020613, + "eval_rewards/margins": 0.0006732027977705002, + "eval_rewards/rejected": 0.0006650172872468829, + "eval_runtime": 383.7102, + "eval_samples_per_second": 11.217, + "eval_steps_per_second": 1.402, + "step": 2400 + }, + { + "epoch": 0.41523087525844243, + "grad_norm": 2.061424970626831, + "learning_rate": 1.9302892350481398e-08, + "logits/chosen": -3.1260993480682373, + "logits/rejected": -3.0860531330108643, + "logps/chosen": -54.02878952026367, + "logps/rejected": -49.81685256958008, + "loss": 0.6913, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0017845083493739367, + "rewards/margins": 0.003701354144141078, + "rewards/rejected": -0.005485862959176302, + "step": 2410 + }, + { + "epoch": 0.4169538249483115, + "grad_norm": 2.0637965202331543, + "learning_rate": 1.9291819189118608e-08, + "logits/chosen": -3.1225945949554443, + "logits/rejected": -3.0999155044555664, + "logps/chosen": -56.397972106933594, + "logps/rejected": -55.43430709838867, + "loss": 0.6916, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0021585775539278984, + "rewards/margins": 0.003159626852720976, + "rewards/rejected": -0.005318204872310162, + "step": 2420 + }, + { + "epoch": 0.41867677463818054, + "grad_norm": 2.2857463359832764, + "learning_rate": 1.9280662001119444e-08, + "logits/chosen": -3.085228443145752, + "logits/rejected": -3.062695026397705, + "logps/chosen": -55.07440948486328, + "logps/rejected": -52.27777099609375, + "loss": 0.6916, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.002314269542694092, + "rewards/margins": 0.00313003221526742, + "rewards/rejected": -0.005444302223622799, + "step": 2430 + }, + { + "epoch": 0.4203997243280496, + "grad_norm": 2.347783327102661, + "learning_rate": 1.9269420887379205e-08, + "logits/chosen": -3.0803112983703613, + "logits/rejected": -3.0668201446533203, + "logps/chosen": -55.31931686401367, + "logps/rejected": -54.99340057373047, + "loss": 0.692, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.002624379238113761, + "rewards/margins": 0.0023085675202310085, + "rewards/rejected": -0.004932946525514126, + "step": 2440 + }, + { + "epoch": 0.4221226740179187, + "grad_norm": 2.415978193283081, + "learning_rate": 1.9258095949552154e-08, + "logits/chosen": -3.0366785526275635, + "logits/rejected": -3.013326406478882, + "logps/chosen": -53.44938278198242, + "logps/rejected": -52.85466766357422, + "loss": 0.6921, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.003910476807504892, + "rewards/margins": 0.0020568217150866985, + "rewards/rejected": -0.005967298522591591, + "step": 2450 + }, + { + "epoch": 0.4238456237077877, + "grad_norm": 2.3307220935821533, + "learning_rate": 1.9246687290050577e-08, + "logits/chosen": -3.020193576812744, + "logits/rejected": -2.98730731010437, + "logps/chosen": -56.455039978027344, + "logps/rejected": -53.13280487060547, + "loss": 0.6917, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.003136079292744398, + "rewards/margins": 0.0028621095698326826, + "rewards/rejected": -0.005998189095407724, + "step": 2460 + }, + { + "epoch": 0.4255685733976568, + "grad_norm": 2.3056654930114746, + "learning_rate": 1.923519501204386e-08, + "logits/chosen": -3.1820156574249268, + "logits/rejected": -3.160578966140747, + "logps/chosen": -55.23564910888672, + "logps/rejected": -53.224754333496094, + "loss": 0.6915, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0019143400713801384, + "rewards/margins": 0.003250572830438614, + "rewards/rejected": -0.005164912901818752, + "step": 2470 + }, + { + "epoch": 0.4272915230875258, + "grad_norm": 2.56144642829895, + "learning_rate": 1.9223619219457556e-08, + "logits/chosen": -3.0781826972961426, + "logits/rejected": -3.046678066253662, + "logps/chosen": -54.766204833984375, + "logps/rejected": -51.062870025634766, + "loss": 0.6919, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.003880419535562396, + "rewards/margins": 0.00249149976298213, + "rewards/rejected": -0.0063719190657138824, + "step": 2480 + }, + { + "epoch": 0.4290144727773949, + "grad_norm": 2.0401649475097656, + "learning_rate": 1.9211960016972447e-08, + "logits/chosen": -3.061281681060791, + "logits/rejected": -3.0525238513946533, + "logps/chosen": -50.482879638671875, + "logps/rejected": -52.11334228515625, + "loss": 0.6928, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.004440463148057461, + "rewards/margins": 0.0007436785381287336, + "rewards/rejected": -0.005184141453355551, + "step": 2490 + }, + { + "epoch": 0.43073742246726393, + "grad_norm": 2.23286509513855, + "learning_rate": 1.9200217510023604e-08, + "logits/chosen": -3.140615224838257, + "logits/rejected": -3.116429090499878, + "logps/chosen": -57.23732376098633, + "logps/rejected": -57.09992218017578, + "loss": 0.6922, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0034662075340747833, + "rewards/margins": 0.0018366838339716196, + "rewards/rejected": -0.005302890669554472, + "step": 2500 + }, + { + "epoch": 0.43073742246726393, + "eval_logits/chosen": -3.1574504375457764, + "eval_logits/rejected": -3.1518211364746094, + "eval_logps/chosen": -58.564754486083984, + "eval_logps/rejected": -63.113121032714844, + "eval_loss": 0.6927535533905029, + "eval_rewards/accuracies": 0.5601765513420105, + "eval_rewards/chosen": 0.0014713724376633763, + "eval_rewards/margins": 0.0008013962069526315, + "eval_rewards/rejected": 0.0006699761725030839, + "eval_runtime": 383.6708, + "eval_samples_per_second": 11.218, + "eval_steps_per_second": 1.402, + "step": 2500 + }, + { + "epoch": 0.432460372157133, + "grad_norm": 2.3200342655181885, + "learning_rate": 1.9188391804799416e-08, + "logits/chosen": -2.9886715412139893, + "logits/rejected": -2.978713274002075, + "logps/chosen": -52.80669403076172, + "logps/rejected": -52.61443328857422, + "loss": 0.6921, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.004210265818983316, + "rewards/margins": 0.002118567703291774, + "rewards/rejected": -0.006328833755105734, + "step": 2510 + }, + { + "epoch": 0.4341833218470021, + "grad_norm": 2.2182769775390625, + "learning_rate": 1.9176483008240652e-08, + "logits/chosen": -2.9787347316741943, + "logits/rejected": -2.9508373737335205, + "logps/chosen": -52.73773193359375, + "logps/rejected": -49.517173767089844, + "loss": 0.6919, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0026951669715344906, + "rewards/margins": 0.0024579425808042288, + "rewards/rejected": -0.005153109785169363, + "step": 2520 + }, + { + "epoch": 0.4359062715368711, + "grad_norm": 2.1345114707946777, + "learning_rate": 1.916449122803947e-08, + "logits/chosen": -3.067570924758911, + "logits/rejected": -3.0630366802215576, + "logps/chosen": -51.600563049316406, + "logps/rejected": -54.767860412597656, + "loss": 0.6922, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.004309863317757845, + "rewards/margins": 0.0018437877297401428, + "rewards/rejected": -0.0061536505818367004, + "step": 2530 + }, + { + "epoch": 0.4376292212267402, + "grad_norm": 2.2189536094665527, + "learning_rate": 1.9152416572638466e-08, + "logits/chosen": -3.1077988147735596, + "logits/rejected": -3.0970089435577393, + "logps/chosen": -54.01378631591797, + "logps/rejected": -54.51206588745117, + "loss": 0.6922, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.004733550362288952, + "rewards/margins": 0.0019095508614555001, + "rewards/rejected": -0.006643100641667843, + "step": 2540 + }, + { + "epoch": 0.4393521709166092, + "grad_norm": 2.328441619873047, + "learning_rate": 1.9140259151229674e-08, + "logits/chosen": -3.0512402057647705, + "logits/rejected": -3.018428325653076, + "logps/chosen": -58.816741943359375, + "logps/rejected": -54.53766632080078, + "loss": 0.691, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.00133642612490803, + "rewards/margins": 0.0043244450353085995, + "rewards/rejected": -0.005660871509462595, + "step": 2550 + }, + { + "epoch": 0.4410751206064783, + "grad_norm": 2.3890929222106934, + "learning_rate": 1.9128019073753598e-08, + "logits/chosen": -3.140418291091919, + "logits/rejected": -3.1156299114227295, + "logps/chosen": -55.52399826049805, + "logps/rejected": -54.633018493652344, + "loss": 0.6914, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0026364210061728954, + "rewards/margins": 0.0034923895727843046, + "rewards/rejected": -0.006128811277449131, + "step": 2560 + }, + { + "epoch": 0.4427980702963473, + "grad_norm": 2.3404576778411865, + "learning_rate": 1.9115696450898193e-08, + "logits/chosen": -3.073380947113037, + "logits/rejected": -3.0549139976501465, + "logps/chosen": -58.83415985107422, + "logps/rejected": -57.058067321777344, + "loss": 0.6916, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.002324377652257681, + "rewards/margins": 0.0032200622372329235, + "rewards/rejected": -0.005544439889490604, + "step": 2570 + }, + { + "epoch": 0.4445210199862164, + "grad_norm": 2.5390334129333496, + "learning_rate": 1.9103291394097894e-08, + "logits/chosen": -3.1005444526672363, + "logits/rejected": -3.0815181732177734, + "logps/chosen": -53.89350128173828, + "logps/rejected": -53.769981384277344, + "loss": 0.6928, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.005077004432678223, + "rewards/margins": 0.0007953291060402989, + "rewards/rejected": -0.0058723329566419125, + "step": 2580 + }, + { + "epoch": 0.4462439696760855, + "grad_norm": 2.608225107192993, + "learning_rate": 1.9090804015532585e-08, + "logits/chosen": -3.0780763626098633, + "logits/rejected": -3.040466785430908, + "logps/chosen": -57.0039176940918, + "logps/rejected": -52.00291061401367, + "loss": 0.6909, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.003094766056165099, + "rewards/margins": 0.004467259161174297, + "rewards/rejected": -0.007562024984508753, + "step": 2590 + }, + { + "epoch": 0.4479669193659545, + "grad_norm": 2.4454474449157715, + "learning_rate": 1.9078234428126585e-08, + "logits/chosen": -3.0497794151306152, + "logits/rejected": -3.0013108253479004, + "logps/chosen": -57.414405822753906, + "logps/rejected": -49.24188995361328, + "loss": 0.6909, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.003182294312864542, + "rewards/margins": 0.004597029648721218, + "rewards/rejected": -0.007779325358569622, + "step": 2600 + }, + { + "epoch": 0.4479669193659545, + "eval_logits/chosen": -3.1568431854248047, + "eval_logits/rejected": -3.1512274742126465, + "eval_logps/chosen": -58.55170822143555, + "eval_logps/rejected": -63.10794448852539, + "eval_loss": 0.6927151679992676, + "eval_rewards/accuracies": 0.5580855011940002, + "eval_rewards/chosen": 0.0016019355971366167, + "eval_rewards/margins": 0.0008801804506219923, + "eval_rewards/rejected": 0.0007217551465146244, + "eval_runtime": 383.6818, + "eval_samples_per_second": 11.218, + "eval_steps_per_second": 1.402, + "step": 2600 + }, + { + "epoch": 0.4496898690558236, + "grad_norm": 2.163145065307617, + "learning_rate": 1.9065582745547646e-08, + "logits/chosen": -3.0288872718811035, + "logits/rejected": -2.989107131958008, + "logps/chosen": -58.72133255004883, + "logps/rejected": -52.37580108642578, + "loss": 0.6914, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0034227855503559113, + "rewards/margins": 0.0036105778999626637, + "rewards/rejected": -0.007033363915979862, + "step": 2610 + }, + { + "epoch": 0.4514128187456926, + "grad_norm": 2.1911659240722656, + "learning_rate": 1.9052849082205908e-08, + "logits/chosen": -3.1205995082855225, + "logits/rejected": -3.0899150371551514, + "logps/chosen": -51.6873664855957, + "logps/rejected": -51.2896614074707, + "loss": 0.6911, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.004214108921587467, + "rewards/margins": 0.004206494893878698, + "rewards/rejected": -0.008420604281127453, + "step": 2620 + }, + { + "epoch": 0.4531357684355617, + "grad_norm": 2.6254820823669434, + "learning_rate": 1.9040033553252865e-08, + "logits/chosen": -3.0019752979278564, + "logits/rejected": -2.9693052768707275, + "logps/chosen": -55.170799255371094, + "logps/rejected": -53.92267990112305, + "loss": 0.6911, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0035726067144423723, + "rewards/margins": 0.004099712241441011, + "rewards/rejected": -0.00767231872305274, + "step": 2630 + }, + { + "epoch": 0.4548587181254307, + "grad_norm": 2.2283997535705566, + "learning_rate": 1.9027136274580334e-08, + "logits/chosen": -3.0246224403381348, + "logits/rejected": -3.0031864643096924, + "logps/chosen": -50.46599578857422, + "logps/rejected": -49.7197151184082, + "loss": 0.6914, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0037057590670883656, + "rewards/margins": 0.0036159877199679613, + "rewards/rejected": -0.007321746554225683, + "step": 2640 + }, + { + "epoch": 0.4565816678152998, + "grad_norm": 2.218834638595581, + "learning_rate": 1.90141573628194e-08, + "logits/chosen": -2.9965646266937256, + "logits/rejected": -2.97887921333313, + "logps/chosen": -52.707855224609375, + "logps/rejected": -54.880126953125, + "loss": 0.6916, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0027702811639755964, + "rewards/margins": 0.0031878617592155933, + "rewards/rejected": -0.005958142690360546, + "step": 2650 + }, + { + "epoch": 0.4583046175051689, + "grad_norm": 2.5716030597686768, + "learning_rate": 1.9001096935339365e-08, + "logits/chosen": -3.025402784347534, + "logits/rejected": -2.982842445373535, + "logps/chosen": -58.6676139831543, + "logps/rejected": -53.14191436767578, + "loss": 0.6917, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.003777928650379181, + "rewards/margins": 0.0028960234485566616, + "rewards/rejected": -0.006673953030258417, + "step": 2660 + }, + { + "epoch": 0.4600275671950379, + "grad_norm": 2.253568172454834, + "learning_rate": 1.898795511024667e-08, + "logits/chosen": -2.997163772583008, + "logits/rejected": -2.968554973602295, + "logps/chosen": -54.23125076293945, + "logps/rejected": -52.85374069213867, + "loss": 0.6906, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0028527709655463696, + "rewards/margins": 0.005068852566182613, + "rewards/rejected": -0.007921623066067696, + "step": 2670 + }, + { + "epoch": 0.461750516884907, + "grad_norm": 2.7172999382019043, + "learning_rate": 1.8974732006383862e-08, + "logits/chosen": -3.0443577766418457, + "logits/rejected": -3.016765594482422, + "logps/chosen": -58.591461181640625, + "logps/rejected": -54.34435272216797, + "loss": 0.6915, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0034392178058624268, + "rewards/margins": 0.0033766250126063824, + "rewards/rejected": -0.0068158432841300964, + "step": 2680 + }, + { + "epoch": 0.463473466574776, + "grad_norm": 2.3720362186431885, + "learning_rate": 1.8961427743328484e-08, + "logits/chosen": -3.0178565979003906, + "logits/rejected": -2.9950802326202393, + "logps/chosen": -51.14887237548828, + "logps/rejected": -50.86174774169922, + "loss": 0.6908, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.005077795125544071, + "rewards/margins": 0.0046742986887693405, + "rewards/rejected": -0.009752093814313412, + "step": 2690 + }, + { + "epoch": 0.4651964162646451, + "grad_norm": 2.2392497062683105, + "learning_rate": 1.8948042441392008e-08, + "logits/chosen": -3.0468106269836426, + "logits/rejected": -3.0200257301330566, + "logps/chosen": -53.548301696777344, + "logps/rejected": -54.21494674682617, + "loss": 0.6911, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.003175341058522463, + "rewards/margins": 0.004210122860968113, + "rewards/rejected": -0.0073854634538292885, + "step": 2700 + }, + { + "epoch": 0.4651964162646451, + "eval_logits/chosen": -3.156172752380371, + "eval_logits/rejected": -3.15053391456604, + "eval_logps/chosen": -58.5521354675293, + "eval_logps/rejected": -63.113643646240234, + "eval_loss": 0.6926901340484619, + "eval_rewards/accuracies": 0.5627323389053345, + "eval_rewards/chosen": 0.001597628928720951, + "eval_rewards/margins": 0.0009329087333753705, + "eval_rewards/rejected": 0.0006647202535532415, + "eval_runtime": 383.58, + "eval_samples_per_second": 11.221, + "eval_steps_per_second": 1.403, + "step": 2700 + }, + { + "epoch": 0.4669193659545141, + "grad_norm": 2.4848990440368652, + "learning_rate": 1.893457622161875e-08, + "logits/chosen": -3.0706381797790527, + "logits/rejected": -3.051726818084717, + "logps/chosen": -59.67988967895508, + "logps/rejected": -55.850067138671875, + "loss": 0.6918, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0023324606008827686, + "rewards/margins": 0.0026608449406921864, + "rewards/rejected": -0.004993305075913668, + "step": 2710 + }, + { + "epoch": 0.4686423156443832, + "grad_norm": 2.4114415645599365, + "learning_rate": 1.8921029205784776e-08, + "logits/chosen": -3.0598363876342773, + "logits/rejected": -3.0630555152893066, + "logps/chosen": -52.59183883666992, + "logps/rejected": -54.42897415161133, + "loss": 0.6926, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.005445893853902817, + "rewards/margins": 0.0012161575723439455, + "rewards/rejected": -0.006662050727754831, + "step": 2720 + }, + { + "epoch": 0.4703652653342522, + "grad_norm": 2.2148189544677734, + "learning_rate": 1.890740151639679e-08, + "logits/chosen": -3.0595743656158447, + "logits/rejected": -3.0394186973571777, + "logps/chosen": -58.775352478027344, + "logps/rejected": -55.89299392700195, + "loss": 0.6909, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0020863967947661877, + "rewards/margins": 0.004591117147356272, + "rewards/rejected": -0.006677514407783747, + "step": 2730 + }, + { + "epoch": 0.4720882150241213, + "grad_norm": 2.4539008140563965, + "learning_rate": 1.8893693276691043e-08, + "logits/chosen": -3.049955368041992, + "logits/rejected": -3.028224468231201, + "logps/chosen": -53.324928283691406, + "logps/rejected": -50.06505584716797, + "loss": 0.6921, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.004656701814383268, + "rewards/margins": 0.002166991587728262, + "rewards/rejected": -0.00682369340211153, + "step": 2740 + }, + { + "epoch": 0.4738111647139904, + "grad_norm": 2.2547569274902344, + "learning_rate": 1.8879904610632196e-08, + "logits/chosen": -2.9805612564086914, + "logits/rejected": -2.975369930267334, + "logps/chosen": -49.73875427246094, + "logps/rejected": -54.2640495300293, + "loss": 0.6914, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0047501143999397755, + "rewards/margins": 0.003620445728302002, + "rewards/rejected": -0.008370560593903065, + "step": 2750 + }, + { + "epoch": 0.4755341144038594, + "grad_norm": 2.424126625061035, + "learning_rate": 1.8866035642912217e-08, + "logits/chosen": -3.030247211456299, + "logits/rejected": -3.01254940032959, + "logps/chosen": -54.67096710205078, + "logps/rejected": -55.503089904785156, + "loss": 0.6917, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.003959754481911659, + "rewards/margins": 0.0030137132853269577, + "rewards/rejected": -0.006973467767238617, + "step": 2760 + }, + { + "epoch": 0.4772570640937285, + "grad_norm": 2.4591829776763916, + "learning_rate": 1.885208649894925e-08, + "logits/chosen": -3.1635355949401855, + "logits/rejected": -3.133530616760254, + "logps/chosen": -55.5648078918457, + "logps/rejected": -53.63425827026367, + "loss": 0.692, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0035004555247724056, + "rewards/margins": 0.002456140238791704, + "rewards/rejected": -0.00595659576356411, + "step": 2770 + }, + { + "epoch": 0.4789800137835975, + "grad_norm": 2.2963922023773193, + "learning_rate": 1.8838057304886483e-08, + "logits/chosen": -2.9820423126220703, + "logits/rejected": -2.951641082763672, + "logps/chosen": -53.36084747314453, + "logps/rejected": -51.40031814575195, + "loss": 0.6919, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.005320178810507059, + "rewards/margins": 0.002531546400859952, + "rewards/rejected": -0.007851725444197655, + "step": 2780 + }, + { + "epoch": 0.4807029634734666, + "grad_norm": 2.3971450328826904, + "learning_rate": 1.8823948187590994e-08, + "logits/chosen": -3.1007752418518066, + "logits/rejected": -3.066845655441284, + "logps/chosen": -51.98784255981445, + "logps/rejected": -50.7843132019043, + "loss": 0.6909, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.002275830367580056, + "rewards/margins": 0.004438784904778004, + "rewards/rejected": -0.006714615970849991, + "step": 2790 + }, + { + "epoch": 0.4824259131633356, + "grad_norm": 2.691331148147583, + "learning_rate": 1.8809759274652614e-08, + "logits/chosen": -3.080821990966797, + "logits/rejected": -3.055692195892334, + "logps/chosen": -60.45173263549805, + "logps/rejected": -57.539276123046875, + "loss": 0.6917, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.004160616081207991, + "rewards/margins": 0.002856313716620207, + "rewards/rejected": -0.0070169297978281975, + "step": 2800 + }, + { + "epoch": 0.4824259131633356, + "eval_logits/chosen": -3.1558778285980225, + "eval_logits/rejected": -3.150256395339966, + "eval_logps/chosen": -58.53827667236328, + "eval_logps/rejected": -63.10443878173828, + "eval_loss": 0.6926683783531189, + "eval_rewards/accuracies": 0.5506505370140076, + "eval_rewards/chosen": 0.0017361408099532127, + "eval_rewards/margins": 0.0009792475029826164, + "eval_rewards/rejected": 0.0007568933651782572, + "eval_runtime": 383.2845, + "eval_samples_per_second": 11.229, + "eval_steps_per_second": 1.404, + "step": 2800 + }, + { + "epoch": 0.4841488628532047, + "grad_norm": 2.172553300857544, + "learning_rate": 1.8795490694382782e-08, + "logits/chosen": -2.971768617630005, + "logits/rejected": -2.9470152854919434, + "logps/chosen": -55.66022872924805, + "logps/rejected": -56.47652053833008, + "loss": 0.6912, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0032442144583910704, + "rewards/margins": 0.0038853243459016085, + "rewards/rejected": -0.007129538804292679, + "step": 2810 + }, + { + "epoch": 0.48587181254307377, + "grad_norm": 2.5765879154205322, + "learning_rate": 1.8781142575813362e-08, + "logits/chosen": -3.121720314025879, + "logits/rejected": -3.109372615814209, + "logps/chosen": -54.74897384643555, + "logps/rejected": -53.752784729003906, + "loss": 0.6916, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.00421771639958024, + "rewards/margins": 0.003111992496997118, + "rewards/rejected": -0.007329708896577358, + "step": 2820 + }, + { + "epoch": 0.4875947622329428, + "grad_norm": 2.184145212173462, + "learning_rate": 1.8766715048695498e-08, + "logits/chosen": -2.9223580360412598, + "logits/rejected": -2.9071013927459717, + "logps/chosen": -55.8566780090332, + "logps/rejected": -55.659027099609375, + "loss": 0.6915, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.003561650635674596, + "rewards/margins": 0.0033836769871413708, + "rewards/rejected": -0.00694532785564661, + "step": 2830 + }, + { + "epoch": 0.48931771192281187, + "grad_norm": 2.2509357929229736, + "learning_rate": 1.875220824349843e-08, + "logits/chosen": -3.0975735187530518, + "logits/rejected": -3.086153507232666, + "logps/chosen": -53.18671417236328, + "logps/rejected": -52.27134323120117, + "loss": 0.6915, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.005504312459379435, + "rewards/margins": 0.0034088040702044964, + "rewards/rejected": -0.008913116529583931, + "step": 2840 + }, + { + "epoch": 0.4910406616126809, + "grad_norm": 2.483192205429077, + "learning_rate": 1.873762229140831e-08, + "logits/chosen": -3.0591354370117188, + "logits/rejected": -3.039513111114502, + "logps/chosen": -52.63386917114258, + "logps/rejected": -55.8605842590332, + "loss": 0.6901, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.0038743845652788877, + "rewards/margins": 0.006244526244699955, + "rewards/rejected": -0.010118911042809486, + "step": 2850 + }, + { + "epoch": 0.49276361130255, + "grad_norm": 2.017402172088623, + "learning_rate": 1.872295732432703e-08, + "logits/chosen": -3.0378129482269287, + "logits/rejected": -3.0134940147399902, + "logps/chosen": -55.37115478515625, + "logps/rejected": -52.80224609375, + "loss": 0.6911, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.004066367633640766, + "rewards/margins": 0.004180265124887228, + "rewards/rejected": -0.008246633224189281, + "step": 2860 + }, + { + "epoch": 0.494486560992419, + "grad_norm": 2.3846540451049805, + "learning_rate": 1.8708213474871015e-08, + "logits/chosen": -3.0799076557159424, + "logits/rejected": -3.055649757385254, + "logps/chosen": -56.450767517089844, + "logps/rejected": -53.93091583251953, + "loss": 0.6904, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0033499475102871656, + "rewards/margins": 0.005589387379586697, + "rewards/rejected": -0.008939335122704506, + "step": 2870 + }, + { + "epoch": 0.4962095106822881, + "grad_norm": 2.6427111625671387, + "learning_rate": 1.8693390876370032e-08, + "logits/chosen": -3.156221389770508, + "logits/rejected": -3.129972219467163, + "logps/chosen": -55.811683654785156, + "logps/rejected": -51.90478515625, + "loss": 0.6906, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0046365829184651375, + "rewards/margins": 0.005148191004991531, + "rewards/rejected": -0.009784774854779243, + "step": 2880 + }, + { + "epoch": 0.49793246037215716, + "grad_norm": 2.230509042739868, + "learning_rate": 1.867848966286598e-08, + "logits/chosen": -3.1655468940734863, + "logits/rejected": -3.160212993621826, + "logps/chosen": -53.45560836791992, + "logps/rejected": -53.241310119628906, + "loss": 0.6921, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.004979230929166079, + "rewards/margins": 0.0022403167095035315, + "rewards/rejected": -0.007219547871500254, + "step": 2890 + }, + { + "epoch": 0.4996554100620262, + "grad_norm": 2.3694610595703125, + "learning_rate": 1.8663509969111677e-08, + "logits/chosen": -3.084071636199951, + "logits/rejected": -3.0744006633758545, + "logps/chosen": -52.7962532043457, + "logps/rejected": -52.0960578918457, + "loss": 0.6919, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0030820216052234173, + "rewards/margins": 0.002532669808715582, + "rewards/rejected": -0.005614691413938999, + "step": 2900 + }, + { + "epoch": 0.4996554100620262, + "eval_logits/chosen": -3.155229330062866, + "eval_logits/rejected": -3.1495659351348877, + "eval_logps/chosen": -58.53911209106445, + "eval_logps/rejected": -63.11810302734375, + "eval_loss": 0.6926056742668152, + "eval_rewards/accuracies": 0.5608736276626587, + "eval_rewards/chosen": 0.001727823168039322, + "eval_rewards/margins": 0.0011076563969254494, + "eval_rewards/rejected": 0.0006201668875291944, + "eval_runtime": 383.253, + "eval_samples_per_second": 11.23, + "eval_steps_per_second": 1.404, + "step": 2900 + }, + { + "epoch": 0.5013783597518953, + "grad_norm": 2.7788450717926025, + "learning_rate": 1.8648451930569647e-08, + "logits/chosen": -3.1230292320251465, + "logits/rejected": -3.1110596656799316, + "logps/chosen": -56.4962158203125, + "logps/rejected": -56.69160842895508, + "loss": 0.6912, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.0038903437089174986, + "rewards/margins": 0.004020698834210634, + "rewards/rejected": -0.007911041378974915, + "step": 2910 + }, + { + "epoch": 0.5031013094417643, + "grad_norm": 2.253162145614624, + "learning_rate": 1.8633315683410898e-08, + "logits/chosen": -3.0678889751434326, + "logits/rejected": -3.0639030933380127, + "logps/chosen": -54.0466194152832, + "logps/rejected": -56.43535614013672, + "loss": 0.6909, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.004764976445585489, + "rewards/margins": 0.004528032150119543, + "rewards/rejected": -0.009293009527027607, + "step": 2920 + }, + { + "epoch": 0.5048242591316333, + "grad_norm": 2.342252254486084, + "learning_rate": 1.8618101364513675e-08, + "logits/chosen": -3.0404350757598877, + "logits/rejected": -3.0122365951538086, + "logps/chosen": -53.86481857299805, + "logps/rejected": -52.56145477294922, + "loss": 0.6908, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.004135873168706894, + "rewards/margins": 0.00479888916015625, + "rewards/rejected": -0.008934763260185719, + "step": 2930 + }, + { + "epoch": 0.5065472088215024, + "grad_norm": 2.1050474643707275, + "learning_rate": 1.8602809111462233e-08, + "logits/chosen": -3.071373462677002, + "logits/rejected": -3.0339467525482178, + "logps/chosen": -51.91243362426758, + "logps/rejected": -51.6020622253418, + "loss": 0.6914, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.004520986694842577, + "rewards/margins": 0.00364798866212368, + "rewards/rejected": -0.00816897489130497, + "step": 2940 + }, + { + "epoch": 0.5082701585113715, + "grad_norm": 2.170177936553955, + "learning_rate": 1.8587439062545598e-08, + "logits/chosen": -3.1068174839019775, + "logits/rejected": -3.0881857872009277, + "logps/chosen": -54.690635681152344, + "logps/rejected": -55.344505310058594, + "loss": 0.691, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0036951780784875154, + "rewards/margins": 0.004416085779666901, + "rewards/rejected": -0.008111263625323772, + "step": 2950 + }, + { + "epoch": 0.5099931082012406, + "grad_norm": 2.382573366165161, + "learning_rate": 1.8571991356756304e-08, + "logits/chosen": -3.0656509399414062, + "logits/rejected": -3.038205623626709, + "logps/chosen": -54.41986083984375, + "logps/rejected": -52.79435348510742, + "loss": 0.6911, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0047190822660923, + "rewards/margins": 0.004267274402081966, + "rewards/rejected": -0.008986357599496841, + "step": 2960 + }, + { + "epoch": 0.5117160578911096, + "grad_norm": 2.9030227661132812, + "learning_rate": 1.8556466133789146e-08, + "logits/chosen": -2.9982943534851074, + "logits/rejected": -2.9701733589172363, + "logps/chosen": -55.818634033203125, + "logps/rejected": -53.390159606933594, + "loss": 0.6912, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.005811252165585756, + "rewards/margins": 0.00395150575786829, + "rewards/rejected": -0.009762757457792759, + "step": 2970 + }, + { + "epoch": 0.5134390075809786, + "grad_norm": 2.39404034614563, + "learning_rate": 1.8540863534039903e-08, + "logits/chosen": -2.9953501224517822, + "logits/rejected": -2.9691390991210938, + "logps/chosen": -54.135398864746094, + "logps/rejected": -53.153411865234375, + "loss": 0.6898, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.001961924834176898, + "rewards/margins": 0.0067636966705322266, + "rewards/rejected": -0.008725621737539768, + "step": 2980 + }, + { + "epoch": 0.5151619572708477, + "grad_norm": 2.3001177310943604, + "learning_rate": 1.8525183698604096e-08, + "logits/chosen": -3.04237699508667, + "logits/rejected": -3.014385938644409, + "logps/chosen": -56.99365234375, + "logps/rejected": -55.51732635498047, + "loss": 0.6906, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0038381018675863743, + "rewards/margins": 0.005188227631151676, + "rewards/rejected": -0.009026329033076763, + "step": 2990 + }, + { + "epoch": 0.5168849069607168, + "grad_norm": 1.937101125717163, + "learning_rate": 1.8509426769275677e-08, + "logits/chosen": -3.0544161796569824, + "logits/rejected": -3.048501968383789, + "logps/chosen": -52.471229553222656, + "logps/rejected": -54.636680603027344, + "loss": 0.6918, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.004588799085468054, + "rewards/margins": 0.002834170823916793, + "rewards/rejected": -0.00742297014221549, + "step": 3000 + }, + { + "epoch": 0.5168849069607168, + "eval_logits/chosen": -3.154435634613037, + "eval_logits/rejected": -3.148829460144043, + "eval_logps/chosen": -58.526187896728516, + "eval_logps/rejected": -63.12165832519531, + "eval_loss": 0.6925256848335266, + "eval_rewards/accuracies": 0.5606412887573242, + "eval_rewards/chosen": 0.0018571042455732822, + "eval_rewards/margins": 0.0012724484549835324, + "eval_rewards/rejected": 0.0005846557905897498, + "eval_runtime": 383.6244, + "eval_samples_per_second": 11.219, + "eval_steps_per_second": 1.402, + "step": 3000 + }, + { + "epoch": 0.5186078566505858, + "grad_norm": 2.4951703548431396, + "learning_rate": 1.8493592888545773e-08, + "logits/chosen": -3.080109119415283, + "logits/rejected": -3.0565946102142334, + "logps/chosen": -55.65546798706055, + "logps/rejected": -53.9165153503418, + "loss": 0.6914, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.004678068216890097, + "rewards/margins": 0.0035416565369814634, + "rewards/rejected": -0.008219725452363491, + "step": 3010 + }, + { + "epoch": 0.5203308063404548, + "grad_norm": 2.145470142364502, + "learning_rate": 1.8477682199601388e-08, + "logits/chosen": -3.166893720626831, + "logits/rejected": -3.134974956512451, + "logps/chosen": -54.71784591674805, + "logps/rejected": -51.342918395996094, + "loss": 0.6905, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.004793129861354828, + "rewards/margins": 0.0054800366051495075, + "rewards/rejected": -0.010273166000843048, + "step": 3020 + }, + { + "epoch": 0.5220537560303239, + "grad_norm": 2.2682783603668213, + "learning_rate": 1.8461694846324108e-08, + "logits/chosen": -3.0667412281036377, + "logits/rejected": -3.044809341430664, + "logps/chosen": -56.32188034057617, + "logps/rejected": -55.66552734375, + "loss": 0.6925, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.006469712592661381, + "rewards/margins": 0.00129136280156672, + "rewards/rejected": -0.00776107469573617, + "step": 3030 + }, + { + "epoch": 0.523776705720193, + "grad_norm": 2.118988275527954, + "learning_rate": 1.84456309732888e-08, + "logits/chosen": -3.171809196472168, + "logits/rejected": -3.1654465198516846, + "logps/chosen": -51.78215789794922, + "logps/rejected": -55.33644485473633, + "loss": 0.6911, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.005943716503679752, + "rewards/margins": 0.004264301620423794, + "rewards/rejected": -0.010208018124103546, + "step": 3040 + }, + { + "epoch": 0.525499655410062, + "grad_norm": 1.9871397018432617, + "learning_rate": 1.84294907257623e-08, + "logits/chosen": -3.030111789703369, + "logits/rejected": -3.016605854034424, + "logps/chosen": -54.00185012817383, + "logps/rejected": -56.15374755859375, + "loss": 0.6914, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004437591414898634, + "rewards/margins": 0.0035246661864221096, + "rewards/rejected": -0.007962257601320744, + "step": 3050 + }, + { + "epoch": 0.5272226050999311, + "grad_norm": 2.395477533340454, + "learning_rate": 1.8413274249702112e-08, + "logits/chosen": -3.0659689903259277, + "logits/rejected": -3.0472943782806396, + "logps/chosen": -56.21149444580078, + "logps/rejected": -55.056556701660156, + "loss": 0.6924, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0053570386953651905, + "rewards/margins": 0.0015337929362431169, + "rewards/rejected": -0.006890831049531698, + "step": 3060 + }, + { + "epoch": 0.5289455547898001, + "grad_norm": 2.4162089824676514, + "learning_rate": 1.839698169175508e-08, + "logits/chosen": -3.0253570079803467, + "logits/rejected": -2.9994709491729736, + "logps/chosen": -58.442176818847656, + "logps/rejected": -54.90386199951172, + "loss": 0.6909, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.004995794501155615, + "rewards/margins": 0.004661207552999258, + "rewards/rejected": -0.009657002054154873, + "step": 3070 + }, + { + "epoch": 0.5306685044796692, + "grad_norm": 2.121013641357422, + "learning_rate": 1.8380613199256057e-08, + "logits/chosen": -2.9383175373077393, + "logits/rejected": -2.921355724334717, + "logps/chosen": -51.76934051513672, + "logps/rejected": -55.88709259033203, + "loss": 0.6923, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.009108955040574074, + "rewards/margins": 0.0017499884124845266, + "rewards/rejected": -0.01085894275456667, + "step": 3080 + }, + { + "epoch": 0.5323914541695383, + "grad_norm": 2.1115143299102783, + "learning_rate": 1.836416892022658e-08, + "logits/chosen": -3.023068428039551, + "logits/rejected": -3.0035014152526855, + "logps/chosen": -54.106475830078125, + "logps/rejected": -54.712646484375, + "loss": 0.6906, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.004943912848830223, + "rewards/margins": 0.005139966495335102, + "rewards/rejected": -0.010083879344165325, + "step": 3090 + }, + { + "epoch": 0.5341144038594073, + "grad_norm": 2.477569341659546, + "learning_rate": 1.8347649003373534e-08, + "logits/chosen": -3.061750888824463, + "logits/rejected": -3.0320868492126465, + "logps/chosen": -54.39912033081055, + "logps/rejected": -52.48120880126953, + "loss": 0.691, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00478705670684576, + "rewards/margins": 0.004317191429436207, + "rewards/rejected": -0.009104247204959393, + "step": 3100 + }, + { + "epoch": 0.5341144038594073, + "eval_logits/chosen": -3.154158592224121, + "eval_logits/rejected": -3.148547887802124, + "eval_logps/chosen": -58.521949768066406, + "eval_logps/rejected": -63.126922607421875, + "eval_loss": 0.6924790740013123, + "eval_rewards/accuracies": 0.5669144988059998, + "eval_rewards/chosen": 0.0018994332058355212, + "eval_rewards/margins": 0.0013674128567799926, + "eval_rewards/rejected": 0.0005320201744325459, + "eval_runtime": 383.3353, + "eval_samples_per_second": 11.228, + "eval_steps_per_second": 1.403, + "step": 3100 + }, + { + "epoch": 0.5358373535492763, + "grad_norm": 2.5770890712738037, + "learning_rate": 1.8331053598087794e-08, + "logits/chosen": -3.0007739067077637, + "logits/rejected": -3.010702133178711, + "logps/chosen": -51.33858108520508, + "logps/rejected": -55.857177734375, + "loss": 0.6923, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.007260960526764393, + "rewards/margins": 0.001832063077017665, + "rewards/rejected": -0.009093021973967552, + "step": 3110 + }, + { + "epoch": 0.5375603032391454, + "grad_norm": 2.4113028049468994, + "learning_rate": 1.8314382854442894e-08, + "logits/chosen": -3.049830198287964, + "logits/rejected": -3.025566339492798, + "logps/chosen": -56.523887634277344, + "logps/rejected": -56.6429328918457, + "loss": 0.6916, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005131120793521404, + "rewards/margins": 0.003246209817007184, + "rewards/rejected": -0.008377330377697945, + "step": 3120 + }, + { + "epoch": 0.5392832529290145, + "grad_norm": 2.1546311378479004, + "learning_rate": 1.8297636923193653e-08, + "logits/chosen": -2.966508626937866, + "logits/rejected": -2.9583373069763184, + "logps/chosen": -52.656578063964844, + "logps/rejected": -54.178009033203125, + "loss": 0.6919, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.00575623381882906, + "rewards/margins": 0.002625588094815612, + "rewards/rejected": -0.008381822146475315, + "step": 3130 + }, + { + "epoch": 0.5410062026188835, + "grad_norm": 2.1864230632781982, + "learning_rate": 1.828081595577481e-08, + "logits/chosen": -3.084721088409424, + "logits/rejected": -3.0690340995788574, + "logps/chosen": -52.35600662231445, + "logps/rejected": -54.71647262573242, + "loss": 0.6916, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.006588843651115894, + "rewards/margins": 0.003160575870424509, + "rewards/rejected": -0.00974941998720169, + "step": 3140 + }, + { + "epoch": 0.5427291523087526, + "grad_norm": 2.3320834636688232, + "learning_rate": 1.8263920104299668e-08, + "logits/chosen": -3.0859665870666504, + "logits/rejected": -3.0568442344665527, + "logps/chosen": -54.78154373168945, + "logps/rejected": -54.6657829284668, + "loss": 0.6899, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004125826992094517, + "rewards/margins": 0.006509957369416952, + "rewards/rejected": -0.010635784827172756, + "step": 3150 + }, + { + "epoch": 0.5444521019986216, + "grad_norm": 2.330270767211914, + "learning_rate": 1.824694952155872e-08, + "logits/chosen": -3.0266149044036865, + "logits/rejected": -2.997025728225708, + "logps/chosen": -54.46602249145508, + "logps/rejected": -53.62001419067383, + "loss": 0.6899, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0031971274875104427, + "rewards/margins": 0.006669213529676199, + "rewards/rejected": -0.009866341017186642, + "step": 3160 + }, + { + "epoch": 0.5461750516884907, + "grad_norm": 2.5265400409698486, + "learning_rate": 1.822990436101825e-08, + "logits/chosen": -3.052417516708374, + "logits/rejected": -3.019367218017578, + "logps/chosen": -56.2938346862793, + "logps/rejected": -50.660343170166016, + "loss": 0.6894, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.004044114612042904, + "rewards/margins": 0.007699139416217804, + "rewards/rejected": -0.011743253096938133, + "step": 3170 + }, + { + "epoch": 0.5478980013783598, + "grad_norm": 2.0642013549804688, + "learning_rate": 1.8212784776818955e-08, + "logits/chosen": -3.1040587425231934, + "logits/rejected": -3.0662312507629395, + "logps/chosen": -53.784767150878906, + "logps/rejected": -52.4791145324707, + "loss": 0.6906, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.005283757578581572, + "rewards/margins": 0.005227072164416313, + "rewards/rejected": -0.010510829277336597, + "step": 3180 + }, + { + "epoch": 0.5496209510682288, + "grad_norm": 2.294727325439453, + "learning_rate": 1.8195590923774554e-08, + "logits/chosen": -3.1036376953125, + "logits/rejected": -3.0982773303985596, + "logps/chosen": -52.858741760253906, + "logps/rejected": -57.697471618652344, + "loss": 0.6908, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.005610528867691755, + "rewards/margins": 0.004711043555289507, + "rewards/rejected": -0.010321573354303837, + "step": 3190 + }, + { + "epoch": 0.5513439007580979, + "grad_norm": 2.7669525146484375, + "learning_rate": 1.8178322957370386e-08, + "logits/chosen": -3.0280566215515137, + "logits/rejected": -3.013529062271118, + "logps/chosen": -55.881103515625, + "logps/rejected": -52.9276008605957, + "loss": 0.692, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.00552431819960475, + "rewards/margins": 0.0024294995237141848, + "rewards/rejected": -0.007953817956149578, + "step": 3200 + }, + { + "epoch": 0.5513439007580979, + "eval_logits/chosen": -3.1533045768737793, + "eval_logits/rejected": -3.1476891040802, + "eval_logps/chosen": -58.526756286621094, + "eval_logps/rejected": -63.130882263183594, + "eval_loss": 0.6924848556518555, + "eval_rewards/accuracies": 0.5606412887573242, + "eval_rewards/chosen": 0.00185136660002172, + "eval_rewards/margins": 0.0013589225709438324, + "eval_rewards/rejected": 0.0004924440290778875, + "eval_runtime": 383.5261, + "eval_samples_per_second": 11.222, + "eval_steps_per_second": 1.403, + "step": 3200 + }, + { + "epoch": 0.5530668504479669, + "grad_norm": 2.6868748664855957, + "learning_rate": 1.8160981033762e-08, + "logits/chosen": -2.945176362991333, + "logits/rejected": -2.9206013679504395, + "logps/chosen": -54.455101013183594, + "logps/rejected": -53.40840530395508, + "loss": 0.6912, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.005037089344114065, + "rewards/margins": 0.004074054770171642, + "rewards/rejected": -0.00911114551126957, + "step": 3210 + }, + { + "epoch": 0.554789800137836, + "grad_norm": 2.1434881687164307, + "learning_rate": 1.8143565309773743e-08, + "logits/chosen": -2.9813790321350098, + "logits/rejected": -2.9762043952941895, + "logps/chosen": -51.000160217285156, + "logps/rejected": -53.827186584472656, + "loss": 0.6918, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.006563312374055386, + "rewards/margins": 0.002880085725337267, + "rewards/rejected": -0.009443397633731365, + "step": 3220 + }, + { + "epoch": 0.556512749827705, + "grad_norm": 2.1510605812072754, + "learning_rate": 1.812607594289735e-08, + "logits/chosen": -3.036472797393799, + "logits/rejected": -3.0197601318359375, + "logps/chosen": -54.75006866455078, + "logps/rejected": -57.2293815612793, + "loss": 0.6919, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.005926917772740126, + "rewards/margins": 0.0025798422284424305, + "rewards/rejected": -0.008506760001182556, + "step": 3230 + }, + { + "epoch": 0.5582356995175741, + "grad_norm": 2.268118143081665, + "learning_rate": 1.8108513091290518e-08, + "logits/chosen": -3.157724380493164, + "logits/rejected": -3.1337904930114746, + "logps/chosen": -56.884422302246094, + "logps/rejected": -53.697410583496094, + "loss": 0.6907, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.00647754967212677, + "rewards/margins": 0.005060004070401192, + "rewards/rejected": -0.011537553742527962, + "step": 3240 + }, + { + "epoch": 0.5599586492074431, + "grad_norm": 2.060023546218872, + "learning_rate": 1.8090876913775457e-08, + "logits/chosen": -3.052263021469116, + "logits/rejected": -3.0307984352111816, + "logps/chosen": -53.035064697265625, + "logps/rejected": -54.42890548706055, + "loss": 0.6899, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.004625025205314159, + "rewards/margins": 0.006578563246876001, + "rewards/rejected": -0.011203588917851448, + "step": 3250 + }, + { + "epoch": 0.5616815988973122, + "grad_norm": 2.4339516162872314, + "learning_rate": 1.8073167569837484e-08, + "logits/chosen": -3.063772678375244, + "logits/rejected": -3.046740770339966, + "logps/chosen": -52.22735595703125, + "logps/rejected": -53.87421417236328, + "loss": 0.691, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.006919285748153925, + "rewards/margins": 0.004433135036379099, + "rewards/rejected": -0.011352420784533024, + "step": 3260 + }, + { + "epoch": 0.5634045485871813, + "grad_norm": 2.4903886318206787, + "learning_rate": 1.8055385219623555e-08, + "logits/chosen": -3.120856523513794, + "logits/rejected": -3.116936445236206, + "logps/chosen": -55.896339416503906, + "logps/rejected": -59.06333541870117, + "loss": 0.6912, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.007667438592761755, + "rewards/margins": 0.004086242523044348, + "rewards/rejected": -0.011753681115806103, + "step": 3270 + }, + { + "epoch": 0.5651274982770503, + "grad_norm": 2.4499690532684326, + "learning_rate": 1.8037530023940842e-08, + "logits/chosen": -3.127962112426758, + "logits/rejected": -3.117267370223999, + "logps/chosen": -50.12376022338867, + "logps/rejected": -53.41807174682617, + "loss": 0.6916, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0070305257104337215, + "rewards/margins": 0.003172159194946289, + "rewards/rejected": -0.010202684439718723, + "step": 3280 + }, + { + "epoch": 0.5668504479669194, + "grad_norm": 2.1593992710113525, + "learning_rate": 1.8019602144255244e-08, + "logits/chosen": -3.1752045154571533, + "logits/rejected": -3.143965005874634, + "logps/chosen": -53.74822235107422, + "logps/rejected": -54.3244743347168, + "loss": 0.6911, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.005819928366690874, + "rewards/margins": 0.00419646967202425, + "rewards/rejected": -0.010016398504376411, + "step": 3290 + }, + { + "epoch": 0.5685733976567884, + "grad_norm": 2.6347358226776123, + "learning_rate": 1.800160174268996e-08, + "logits/chosen": -3.050291061401367, + "logits/rejected": -3.027971029281616, + "logps/chosen": -55.40729904174805, + "logps/rejected": -54.66904830932617, + "loss": 0.6902, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.004908127710223198, + "rewards/margins": 0.006011998746544123, + "rewards/rejected": -0.010920126922428608, + "step": 3300 + }, + { + "epoch": 0.5685733976567884, + "eval_logits/chosen": -3.152574300765991, + "eval_logits/rejected": -3.146979331970215, + "eval_logps/chosen": -58.527687072753906, + "eval_logps/rejected": -63.1528434753418, + "eval_loss": 0.6923813223838806, + "eval_rewards/accuracies": 0.5604089498519897, + "eval_rewards/chosen": 0.0018420711858198047, + "eval_rewards/margins": 0.001569233019836247, + "eval_rewards/rejected": 0.0002728381659835577, + "eval_runtime": 383.5849, + "eval_samples_per_second": 11.22, + "eval_steps_per_second": 1.403, + "step": 3300 + }, + { + "epoch": 0.5702963473466575, + "grad_norm": 2.300382137298584, + "learning_rate": 1.7983528982024008e-08, + "logits/chosen": -2.99222731590271, + "logits/rejected": -2.96380615234375, + "logps/chosen": -53.65486526489258, + "logps/rejected": -51.8136100769043, + "loss": 0.6904, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.005560653749853373, + "rewards/margins": 0.00558196846395731, + "rewards/rejected": -0.01114262267947197, + "step": 3310 + }, + { + "epoch": 0.5720192970365265, + "grad_norm": 2.6217761039733887, + "learning_rate": 1.796538402569076e-08, + "logits/chosen": -3.049356698989868, + "logits/rejected": -3.013167142868042, + "logps/chosen": -54.18232345581055, + "logps/rejected": -50.1631965637207, + "loss": 0.6895, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.004746898077428341, + "rewards/margins": 0.007434196770191193, + "rewards/rejected": -0.012181093916296959, + "step": 3320 + }, + { + "epoch": 0.5737422467263956, + "grad_norm": 2.4825403690338135, + "learning_rate": 1.7947167037776444e-08, + "logits/chosen": -3.1506247520446777, + "logits/rejected": -3.1185247898101807, + "logps/chosen": -54.65381622314453, + "logps/rejected": -54.71440887451172, + "loss": 0.6902, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.004404108040034771, + "rewards/margins": 0.005980129353702068, + "rewards/rejected": -0.01038423739373684, + "step": 3330 + }, + { + "epoch": 0.5754651964162646, + "grad_norm": 2.2614500522613525, + "learning_rate": 1.792887818301869e-08, + "logits/chosen": -3.0857772827148438, + "logits/rejected": -3.0680103302001953, + "logps/chosen": -56.436622619628906, + "logps/rejected": -55.13303756713867, + "loss": 0.6901, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.005234843119978905, + "rewards/margins": 0.006301610264927149, + "rewards/rejected": -0.01153645385056734, + "step": 3340 + }, + { + "epoch": 0.5771881461061337, + "grad_norm": 2.3960185050964355, + "learning_rate": 1.791051762680502e-08, + "logits/chosen": -2.980429172515869, + "logits/rejected": -2.9654033184051514, + "logps/chosen": -56.40067672729492, + "logps/rejected": -58.61570358276367, + "loss": 0.6899, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.006220079492777586, + "rewards/margins": 0.006694138050079346, + "rewards/rejected": -0.01291421614587307, + "step": 3350 + }, + { + "epoch": 0.5789110957960028, + "grad_norm": 2.283350944519043, + "learning_rate": 1.789208553517135e-08, + "logits/chosen": -3.103543758392334, + "logits/rejected": -3.068631649017334, + "logps/chosen": -55.334007263183594, + "logps/rejected": -53.504661560058594, + "loss": 0.6895, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005187752656638622, + "rewards/margins": 0.0075442553497850895, + "rewards/rejected": -0.012732008472084999, + "step": 3360 + }, + { + "epoch": 0.5806340454858718, + "grad_norm": 2.334491491317749, + "learning_rate": 1.7873582074800518e-08, + "logits/chosen": -3.034996509552002, + "logits/rejected": -3.0251402854919434, + "logps/chosen": -51.59055709838867, + "logps/rejected": -55.11620330810547, + "loss": 0.6909, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.00635263929143548, + "rewards/margins": 0.004672903101891279, + "rewards/rejected": -0.01102554239332676, + "step": 3370 + }, + { + "epoch": 0.5823569951757409, + "grad_norm": 2.041323184967041, + "learning_rate": 1.785500741302073e-08, + "logits/chosen": -3.1335296630859375, + "logits/rejected": -3.1007940769195557, + "logps/chosen": -55.99006271362305, + "logps/rejected": -51.322906494140625, + "loss": 0.69, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.005291844252496958, + "rewards/margins": 0.006499829702079296, + "rewards/rejected": -0.011791672557592392, + "step": 3380 + }, + { + "epoch": 0.5840799448656099, + "grad_norm": 1.9655297994613647, + "learning_rate": 1.7836361717804083e-08, + "logits/chosen": -3.056028366088867, + "logits/rejected": -3.0294790267944336, + "logps/chosen": -54.32973098754883, + "logps/rejected": -52.4959716796875, + "loss": 0.6909, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.00602052453905344, + "rewards/margins": 0.0046034911647439, + "rewards/rejected": -0.010624016635119915, + "step": 3390 + }, + { + "epoch": 0.585802894555479, + "grad_norm": 2.251234531402588, + "learning_rate": 1.7817645157765035e-08, + "logits/chosen": -3.109879970550537, + "logits/rejected": -3.0702693462371826, + "logps/chosen": -53.470672607421875, + "logps/rejected": -52.68741989135742, + "loss": 0.6898, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006139514502137899, + "rewards/margins": 0.006888681091368198, + "rewards/rejected": -0.01302819512784481, + "step": 3400 + }, + { + "epoch": 0.585802894555479, + "eval_logits/chosen": -3.151777982711792, + "eval_logits/rejected": -3.1461565494537354, + "eval_logps/chosen": -58.51345443725586, + "eval_logps/rejected": -63.1519660949707, + "eval_loss": 0.69231778383255, + "eval_rewards/accuracies": 0.5601765513420105, + "eval_rewards/chosen": 0.001984409289434552, + "eval_rewards/margins": 0.0017028645379468799, + "eval_rewards/rejected": 0.0002815446350723505, + "eval_runtime": 383.5062, + "eval_samples_per_second": 11.223, + "eval_steps_per_second": 1.403, + "step": 3400 + }, + { + "epoch": 0.587525844245348, + "grad_norm": 2.122176170349121, + "learning_rate": 1.7798857902158887e-08, + "logits/chosen": -2.9969916343688965, + "logits/rejected": -2.967726230621338, + "logps/chosen": -50.218971252441406, + "logps/rejected": -48.38374710083008, + "loss": 0.6899, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.006457244511693716, + "rewards/margins": 0.006697861943393946, + "rewards/rejected": -0.013155105523765087, + "step": 3410 + }, + { + "epoch": 0.5892487939352171, + "grad_norm": 2.6450655460357666, + "learning_rate": 1.7780000120880232e-08, + "logits/chosen": -3.017392873764038, + "logits/rejected": -2.9929404258728027, + "logps/chosen": -53.33781051635742, + "logps/rejected": -54.46907424926758, + "loss": 0.6902, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.005704882554709911, + "rewards/margins": 0.005997374653816223, + "rewards/rejected": -0.01170225627720356, + "step": 3420 + }, + { + "epoch": 0.5909717436250862, + "grad_norm": 1.989274024963379, + "learning_rate": 1.7761071984461438e-08, + "logits/chosen": -3.1018154621124268, + "logits/rejected": -3.081432342529297, + "logps/chosen": -51.916282653808594, + "logps/rejected": -55.77009963989258, + "loss": 0.6904, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.005753933917731047, + "rewards/margins": 0.005630741361528635, + "rewards/rejected": -0.011384674347937107, + "step": 3430 + }, + { + "epoch": 0.5926946933149552, + "grad_norm": 2.477759838104248, + "learning_rate": 1.7742073664071095e-08, + "logits/chosen": -3.0237717628479004, + "logits/rejected": -3.0075230598449707, + "logps/chosen": -53.5707893371582, + "logps/rejected": -54.196929931640625, + "loss": 0.6912, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0099791893735528, + "rewards/margins": 0.003979781176894903, + "rewards/rejected": -0.01395897101610899, + "step": 3440 + }, + { + "epoch": 0.5944176430048242, + "grad_norm": 2.5092668533325195, + "learning_rate": 1.772300533151249e-08, + "logits/chosen": -3.272890090942383, + "logits/rejected": -3.2318217754364014, + "logps/chosen": -59.927818298339844, + "logps/rejected": -56.14251708984375, + "loss": 0.69, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.00548345223069191, + "rewards/margins": 0.006471781525760889, + "rewards/rejected": -0.01195523515343666, + "step": 3450 + }, + { + "epoch": 0.5961405926946933, + "grad_norm": 2.2644803524017334, + "learning_rate": 1.7703867159222012e-08, + "logits/chosen": -3.0503110885620117, + "logits/rejected": -3.037585496902466, + "logps/chosen": -52.72737503051758, + "logps/rejected": -54.55059051513672, + "loss": 0.6912, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.005479804240167141, + "rewards/margins": 0.004024847410619259, + "rewards/rejected": -0.009504652582108974, + "step": 3460 + }, + { + "epoch": 0.5978635423845624, + "grad_norm": 1.9865294694900513, + "learning_rate": 1.768465932026763e-08, + "logits/chosen": -3.1293208599090576, + "logits/rejected": -3.1130614280700684, + "logps/chosen": -56.37324905395508, + "logps/rejected": -55.44392013549805, + "loss": 0.6905, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.005630441941320896, + "rewards/margins": 0.005450661294162273, + "rewards/rejected": -0.011081104166805744, + "step": 3470 + }, + { + "epoch": 0.5995864920744314, + "grad_norm": 2.379979133605957, + "learning_rate": 1.766538198834731e-08, + "logits/chosen": -3.0560507774353027, + "logits/rejected": -3.0245845317840576, + "logps/chosen": -55.585777282714844, + "logps/rejected": -53.092987060546875, + "loss": 0.6895, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.004711954854428768, + "rewards/margins": 0.007386787328869104, + "rewards/rejected": -0.01209874078631401, + "step": 3480 + }, + { + "epoch": 0.6013094417643005, + "grad_norm": 2.4309921264648438, + "learning_rate": 1.7646035337787454e-08, + "logits/chosen": -3.0650832653045654, + "logits/rejected": -3.0358080863952637, + "logps/chosen": -54.878990173339844, + "logps/rejected": -55.72333908081055, + "loss": 0.6903, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.006096069701015949, + "rewards/margins": 0.005908667109906673, + "rewards/rejected": -0.012004735879600048, + "step": 3490 + }, + { + "epoch": 0.6030323914541695, + "grad_norm": 2.2524032592773438, + "learning_rate": 1.7626619543541304e-08, + "logits/chosen": -3.01064395904541, + "logits/rejected": -3.0026509761810303, + "logps/chosen": -52.7326774597168, + "logps/rejected": -56.00465774536133, + "loss": 0.6902, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0072999633848667145, + "rewards/margins": 0.0061295004561543465, + "rewards/rejected": -0.01342946570366621, + "step": 3500 + }, + { + "epoch": 0.6030323914541695, + "eval_logits/chosen": -3.1511001586914062, + "eval_logits/rejected": -3.145463228225708, + "eval_logps/chosen": -58.5220947265625, + "eval_logps/rejected": -63.16736602783203, + "eval_loss": 0.6922861337661743, + "eval_rewards/accuracies": 0.5532063245773315, + "eval_rewards/chosen": 0.001898013986647129, + "eval_rewards/margins": 0.0017704860074445605, + "eval_rewards/rejected": 0.00012752779002767056, + "eval_runtime": 383.5783, + "eval_samples_per_second": 11.221, + "eval_steps_per_second": 1.403, + "step": 3500 + }, + { + "epoch": 0.6047553411440386, + "grad_norm": 2.344438076019287, + "learning_rate": 1.760713478118739e-08, + "logits/chosen": -2.9544014930725098, + "logits/rejected": -2.928922414779663, + "logps/chosen": -54.55061721801758, + "logps/rejected": -52.584266662597656, + "loss": 0.6909, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.00983361341059208, + "rewards/margins": 0.004564939998090267, + "rewards/rejected": -0.014398554340004921, + "step": 3510 + }, + { + "epoch": 0.6064782908339077, + "grad_norm": 2.2484140396118164, + "learning_rate": 1.758758122692791e-08, + "logits/chosen": -2.988053321838379, + "logits/rejected": -2.9679315090179443, + "logps/chosen": -56.056297302246094, + "logps/rejected": -53.2197265625, + "loss": 0.6911, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.007219976745545864, + "rewards/margins": 0.004322125110775232, + "rewards/rejected": -0.011542101390659809, + "step": 3520 + }, + { + "epoch": 0.6082012405237767, + "grad_norm": 2.396355152130127, + "learning_rate": 1.756795905758717e-08, + "logits/chosen": -3.1577303409576416, + "logits/rejected": -3.1141624450683594, + "logps/chosen": -56.49724578857422, + "logps/rejected": -52.197303771972656, + "loss": 0.6895, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.005397644359618425, + "rewards/margins": 0.007435487117618322, + "rewards/rejected": -0.012833130545914173, + "step": 3530 + }, + { + "epoch": 0.6099241902136457, + "grad_norm": 2.5322744846343994, + "learning_rate": 1.754826845060995e-08, + "logits/chosen": -2.97660493850708, + "logits/rejected": -2.951645851135254, + "logps/chosen": -54.54118728637695, + "logps/rejected": -54.915924072265625, + "loss": 0.6896, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.006284528411924839, + "rewards/margins": 0.007204174064099789, + "rewards/rejected": -0.013488702476024628, + "step": 3540 + }, + { + "epoch": 0.6116471399035148, + "grad_norm": 2.43735671043396, + "learning_rate": 1.752850958405993e-08, + "logits/chosen": -3.067127227783203, + "logits/rejected": -3.0576090812683105, + "logps/chosen": -55.544654846191406, + "logps/rejected": -53.7586784362793, + "loss": 0.6921, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.009815131314098835, + "rewards/margins": 0.0022593550384044647, + "rewards/rejected": -0.0120744863525033, + "step": 3550 + }, + { + "epoch": 0.6133700895933839, + "grad_norm": 2.1326327323913574, + "learning_rate": 1.7508682636618058e-08, + "logits/chosen": -3.0019993782043457, + "logits/rejected": -2.9740567207336426, + "logps/chosen": -53.49755096435547, + "logps/rejected": -53.9032096862793, + "loss": 0.6894, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.005975979380309582, + "rewards/margins": 0.007574207149446011, + "rewards/rejected": -0.013550187461078167, + "step": 3560 + }, + { + "epoch": 0.6150930392832529, + "grad_norm": 2.697509288787842, + "learning_rate": 1.7488787787580952e-08, + "logits/chosen": -3.1488282680511475, + "logits/rejected": -3.113802194595337, + "logps/chosen": -57.558563232421875, + "logps/rejected": -54.47620391845703, + "loss": 0.6904, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.006858462933450937, + "rewards/margins": 0.005681387148797512, + "rewards/rejected": -0.012539848685264587, + "step": 3570 + }, + { + "epoch": 0.616815988973122, + "grad_norm": 2.5877275466918945, + "learning_rate": 1.746882521685926e-08, + "logits/chosen": -3.0800981521606445, + "logits/rejected": -3.0567736625671387, + "logps/chosen": -58.673004150390625, + "logps/rejected": -58.826385498046875, + "loss": 0.6888, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.004782065749168396, + "rewards/margins": 0.008766787126660347, + "rewards/rejected": -0.013548852875828743, + "step": 3580 + }, + { + "epoch": 0.618538938662991, + "grad_norm": 2.1750683784484863, + "learning_rate": 1.7448795104976046e-08, + "logits/chosen": -3.068450450897217, + "logits/rejected": -3.0522539615631104, + "logps/chosen": -55.2479133605957, + "logps/rejected": -55.32890701293945, + "loss": 0.6906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008229101076722145, + "rewards/margins": 0.005200072657316923, + "rewards/rejected": -0.013429174199700356, + "step": 3590 + }, + { + "epoch": 0.6202618883528601, + "grad_norm": 2.4608330726623535, + "learning_rate": 1.7428697633065155e-08, + "logits/chosen": -3.0483107566833496, + "logits/rejected": -3.0238070487976074, + "logps/chosen": -55.4265251159668, + "logps/rejected": -52.878150939941406, + "loss": 0.6905, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0077946423552930355, + "rewards/margins": 0.005378765985369682, + "rewards/rejected": -0.013173406943678856, + "step": 3600 + }, + { + "epoch": 0.6202618883528601, + "eval_logits/chosen": -3.15022611618042, + "eval_logits/rejected": -3.144578218460083, + "eval_logps/chosen": -58.52936935424805, + "eval_logps/rejected": -63.18168258666992, + "eval_loss": 0.6922528743743896, + "eval_rewards/accuracies": 0.5697026252746582, + "eval_rewards/chosen": 0.0018253130838274956, + "eval_rewards/margins": 0.0018409350886940956, + "eval_rewards/rejected": -1.5622024875483476e-05, + "eval_runtime": 383.6418, + "eval_samples_per_second": 11.219, + "eval_steps_per_second": 1.402, + "step": 3600 + }, + { + "epoch": 0.6219848380427292, + "grad_norm": 1.8933156728744507, + "learning_rate": 1.7408532982869573e-08, + "logits/chosen": -3.015439510345459, + "logits/rejected": -2.9991238117218018, + "logps/chosen": -52.52251052856445, + "logps/rejected": -50.87234115600586, + "loss": 0.6912, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.01029304601252079, + "rewards/margins": 0.004010824952274561, + "rewards/rejected": -0.014303868636488914, + "step": 3610 + }, + { + "epoch": 0.6237077877325982, + "grad_norm": 2.2738919258117676, + "learning_rate": 1.7388301336739784e-08, + "logits/chosen": -2.9741413593292236, + "logits/rejected": -2.9491591453552246, + "logps/chosen": -55.929954528808594, + "logps/rejected": -54.73632049560547, + "loss": 0.6897, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.004220867063850164, + "rewards/margins": 0.0071715391241014, + "rewards/rejected": -0.011392408050596714, + "step": 3620 + }, + { + "epoch": 0.6254307374224672, + "grad_norm": 2.115520715713501, + "learning_rate": 1.736800287763212e-08, + "logits/chosen": -3.1313865184783936, + "logits/rejected": -3.1247239112854004, + "logps/chosen": -51.954750061035156, + "logps/rejected": -57.22846221923828, + "loss": 0.6896, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.008140425197780132, + "rewards/margins": 0.007292865309864283, + "rewards/rejected": -0.015433289110660553, + "step": 3630 + }, + { + "epoch": 0.6271536871123363, + "grad_norm": 2.4922537803649902, + "learning_rate": 1.7347637789107115e-08, + "logits/chosen": -3.081291437149048, + "logits/rejected": -3.0664103031158447, + "logps/chosen": -55.62919998168945, + "logps/rejected": -57.412513732910156, + "loss": 0.69, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005860840901732445, + "rewards/margins": 0.0064689526334404945, + "rewards/rejected": -0.01232979353517294, + "step": 3640 + }, + { + "epoch": 0.6288766368022054, + "grad_norm": 1.9521703720092773, + "learning_rate": 1.7327206255327825e-08, + "logits/chosen": -3.031919002532959, + "logits/rejected": -3.0375239849090576, + "logps/chosen": -51.86109161376953, + "logps/rejected": -55.51893997192383, + "loss": 0.6922, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0077956244349479675, + "rewards/margins": 0.0020641677547246218, + "rewards/rejected": -0.00985979288816452, + "step": 3650 + }, + { + "epoch": 0.6305995864920745, + "grad_norm": 2.2140133380889893, + "learning_rate": 1.730670846105819e-08, + "logits/chosen": -3.091434955596924, + "logits/rejected": -3.0613198280334473, + "logps/chosen": -54.4542236328125, + "logps/rejected": -54.953651428222656, + "loss": 0.6891, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.005954755935817957, + "rewards/margins": 0.008188659325242043, + "rewards/rejected": -0.014143416658043861, + "step": 3660 + }, + { + "epoch": 0.6323225361819435, + "grad_norm": 2.279637336730957, + "learning_rate": 1.7286144591661338e-08, + "logits/chosen": -3.048067569732666, + "logits/rejected": -3.019761562347412, + "logps/chosen": -53.457313537597656, + "logps/rejected": -53.82349395751953, + "loss": 0.6899, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.006061294116079807, + "rewards/margins": 0.006697321776300669, + "rewards/rejected": -0.012758615426719189, + "step": 3670 + }, + { + "epoch": 0.6340454858718125, + "grad_norm": 2.239985704421997, + "learning_rate": 1.7265514833097923e-08, + "logits/chosen": -3.0665507316589355, + "logits/rejected": -3.0252902507781982, + "logps/chosen": -55.358665466308594, + "logps/rejected": -51.566871643066406, + "loss": 0.6893, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0068284133449196815, + "rewards/margins": 0.007831481285393238, + "rewards/rejected": -0.01465989463031292, + "step": 3680 + }, + { + "epoch": 0.6357684355616816, + "grad_norm": 2.6258862018585205, + "learning_rate": 1.724481937192444e-08, + "logits/chosen": -3.000847339630127, + "logits/rejected": -3.009695529937744, + "logps/chosen": -52.84056854248047, + "logps/rejected": -60.19794464111328, + "loss": 0.6915, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.007468428462743759, + "rewards/margins": 0.0035681980662047863, + "rewards/rejected": -0.011036626063287258, + "step": 3690 + }, + { + "epoch": 0.6374913852515507, + "grad_norm": 2.4174911975860596, + "learning_rate": 1.7224058395291544e-08, + "logits/chosen": -3.04533052444458, + "logits/rejected": -3.0184431076049805, + "logps/chosen": -58.0278205871582, + "logps/rejected": -60.93510055541992, + "loss": 0.6877, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.0043028187938034534, + "rewards/margins": 0.011179441586136818, + "rewards/rejected": -0.015482261776924133, + "step": 3700 + }, + { + "epoch": 0.6374913852515507, + "eval_logits/chosen": -3.149414539337158, + "eval_logits/rejected": -3.1437571048736572, + "eval_logps/chosen": -58.51812744140625, + "eval_logps/rejected": -63.18494415283203, + "eval_loss": 0.6921834945678711, + "eval_rewards/accuracies": 0.574117124080658, + "eval_rewards/chosen": 0.0019376871641725302, + "eval_rewards/margins": 0.0019858963787555695, + "eval_rewards/rejected": -4.820928006665781e-05, + "eval_runtime": 383.5204, + "eval_samples_per_second": 11.222, + "eval_steps_per_second": 1.403, + "step": 3700 + }, + { + "epoch": 0.6392143349414197, + "grad_norm": 2.4837942123413086, + "learning_rate": 1.7203232090942337e-08, + "logits/chosen": -3.080009937286377, + "logits/rejected": -3.046736240386963, + "logps/chosen": -54.5723762512207, + "logps/rejected": -52.51799392700195, + "loss": 0.6893, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.007036884780973196, + "rewards/margins": 0.007810269482433796, + "rewards/rejected": -0.01484715472906828, + "step": 3710 + }, + { + "epoch": 0.6409372846312887, + "grad_norm": 2.339407205581665, + "learning_rate": 1.7182340647210696e-08, + "logits/chosen": -3.0932888984680176, + "logits/rejected": -3.0756850242614746, + "logps/chosen": -52.47334671020508, + "logps/rejected": -54.8570556640625, + "loss": 0.6888, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.005765864159911871, + "rewards/margins": 0.008951535448431969, + "rewards/rejected": -0.014717401936650276, + "step": 3720 + }, + { + "epoch": 0.6426602343211578, + "grad_norm": 2.783074378967285, + "learning_rate": 1.7161384253019558e-08, + "logits/chosen": -3.0552356243133545, + "logits/rejected": -3.0320777893066406, + "logps/chosen": -55.36650848388672, + "logps/rejected": -52.16785430908203, + "loss": 0.6902, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.008488858118653297, + "rewards/margins": 0.0061333938501775265, + "rewards/rejected": -0.014622251503169537, + "step": 3730 + }, + { + "epoch": 0.6443831840110269, + "grad_norm": 2.47733473777771, + "learning_rate": 1.7140363097879206e-08, + "logits/chosen": -3.0521280765533447, + "logits/rejected": -3.0277328491210938, + "logps/chosen": -54.155914306640625, + "logps/rejected": -57.747825622558594, + "loss": 0.6902, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.007208681665360928, + "rewards/margins": 0.006174913141876459, + "rewards/rejected": -0.0133835943415761, + "step": 3740 + }, + { + "epoch": 0.646106133700896, + "grad_norm": 2.5525078773498535, + "learning_rate": 1.7119277371885565e-08, + "logits/chosen": -3.1250386238098145, + "logits/rejected": -3.096428394317627, + "logps/chosen": -53.9883918762207, + "logps/rejected": -54.46564865112305, + "loss": 0.6897, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006697263568639755, + "rewards/margins": 0.007167118135839701, + "rewards/rejected": -0.013864380307495594, + "step": 3750 + }, + { + "epoch": 0.647829083390765, + "grad_norm": 2.3697919845581055, + "learning_rate": 1.709812726571848e-08, + "logits/chosen": -3.0818190574645996, + "logits/rejected": -3.078629732131958, + "logps/chosen": -56.815521240234375, + "logps/rejected": -57.3564338684082, + "loss": 0.6915, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.009418672882020473, + "rewards/margins": 0.0035493075847625732, + "rewards/rejected": -0.012967979535460472, + "step": 3760 + }, + { + "epoch": 0.649552033080634, + "grad_norm": 2.4745736122131348, + "learning_rate": 1.707691297063999e-08, + "logits/chosen": -3.04585337638855, + "logits/rejected": -3.025336742401123, + "logps/chosen": -54.28108596801758, + "logps/rejected": -55.88178253173828, + "loss": 0.6903, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.007393890526145697, + "rewards/margins": 0.00592414103448391, + "rewards/rejected": -0.013318032026290894, + "step": 3770 + }, + { + "epoch": 0.6512749827705031, + "grad_norm": 2.3536436557769775, + "learning_rate": 1.7055634678492594e-08, + "logits/chosen": -3.0982823371887207, + "logits/rejected": -3.058579206466675, + "logps/chosen": -54.78825759887695, + "logps/rejected": -50.85487365722656, + "loss": 0.6895, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.007096471730619669, + "rewards/margins": 0.007441540714353323, + "rewards/rejected": -0.014538010582327843, + "step": 3780 + }, + { + "epoch": 0.6529979324603722, + "grad_norm": 2.3600430488586426, + "learning_rate": 1.7034292581697533e-08, + "logits/chosen": -3.0493836402893066, + "logits/rejected": -3.019918203353882, + "logps/chosen": -54.718711853027344, + "logps/rejected": -52.50199508666992, + "loss": 0.6893, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.007305916398763657, + "rewards/margins": 0.007807619869709015, + "rewards/rejected": -0.015113537199795246, + "step": 3790 + }, + { + "epoch": 0.6547208821502413, + "grad_norm": 2.946199893951416, + "learning_rate": 1.701288687325303e-08, + "logits/chosen": -3.066213369369507, + "logits/rejected": -3.033952236175537, + "logps/chosen": -58.41338348388672, + "logps/rejected": -54.03316116333008, + "loss": 0.691, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.007201158907264471, + "rewards/margins": 0.0044540828093886375, + "rewards/rejected": -0.01165524311363697, + "step": 3800 + }, + { + "epoch": 0.6547208821502413, + "eval_logits/chosen": -3.1486449241638184, + "eval_logits/rejected": -3.143003225326538, + "eval_logps/chosen": -58.519344329833984, + "eval_logps/rejected": -63.19419479370117, + "eval_loss": 0.6921459436416626, + "eval_rewards/accuracies": 0.5676115155220032, + "eval_rewards/chosen": 0.0019254968501627445, + "eval_rewards/margins": 0.0020662089809775352, + "eval_rewards/rejected": -0.0001407122181262821, + "eval_runtime": 383.5365, + "eval_samples_per_second": 11.222, + "eval_steps_per_second": 1.403, + "step": 3800 + }, + { + "epoch": 0.6564438318401102, + "grad_norm": 2.0473248958587646, + "learning_rate": 1.699141774673255e-08, + "logits/chosen": -3.0938503742218018, + "logits/rejected": -3.0491273403167725, + "logps/chosen": -58.0572509765625, + "logps/rejected": -53.581703186035156, + "loss": 0.6886, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.005891457665711641, + "rewards/margins": 0.009286518208682537, + "rewards/rejected": -0.015177974477410316, + "step": 3810 + }, + { + "epoch": 0.6581667815299793, + "grad_norm": 2.378174304962158, + "learning_rate": 1.696988539628306e-08, + "logits/chosen": -3.0598483085632324, + "logits/rejected": -3.0423831939697266, + "logps/chosen": -54.69315719604492, + "logps/rejected": -54.343544006347656, + "loss": 0.6899, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.00594948697835207, + "rewards/margins": 0.006609291769564152, + "rewards/rejected": -0.012558777816593647, + "step": 3820 + }, + { + "epoch": 0.6598897312198484, + "grad_norm": 2.2672643661499023, + "learning_rate": 1.6948290016623267e-08, + "logits/chosen": -3.106572151184082, + "logits/rejected": -3.0605735778808594, + "logps/chosen": -59.068077087402344, + "logps/rejected": -52.881927490234375, + "loss": 0.6876, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.003909780643880367, + "rewards/margins": 0.011306321248412132, + "rewards/rejected": -0.0152161018922925, + "step": 3830 + }, + { + "epoch": 0.6616126809097175, + "grad_norm": 2.378580331802368, + "learning_rate": 1.6926631803041846e-08, + "logits/chosen": -3.029109477996826, + "logits/rejected": -3.015733242034912, + "logps/chosen": -54.55817413330078, + "logps/rejected": -60.2359619140625, + "loss": 0.6896, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.004717822652310133, + "rewards/margins": 0.0073250653222203255, + "rewards/rejected": -0.01204288937151432, + "step": 3840 + }, + { + "epoch": 0.6633356305995864, + "grad_norm": 2.3673667907714844, + "learning_rate": 1.690491095139569e-08, + "logits/chosen": -3.157423496246338, + "logits/rejected": -3.1420907974243164, + "logps/chosen": -58.070655822753906, + "logps/rejected": -54.941322326660156, + "loss": 0.6912, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.007695481181144714, + "rewards/margins": 0.00405865628272295, + "rewards/rejected": -0.011754137463867664, + "step": 3850 + }, + { + "epoch": 0.6650585802894555, + "grad_norm": 2.7017436027526855, + "learning_rate": 1.688312765810814e-08, + "logits/chosen": -3.1684272289276123, + "logits/rejected": -3.1415889263153076, + "logps/chosen": -55.018394470214844, + "logps/rejected": -54.544158935546875, + "loss": 0.6894, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.008379653096199036, + "rewards/margins": 0.0076343291439116, + "rewards/rejected": -0.01601398177444935, + "step": 3860 + }, + { + "epoch": 0.6667815299793246, + "grad_norm": 3.0593020915985107, + "learning_rate": 1.6861282120167186e-08, + "logits/chosen": -3.058176040649414, + "logits/rejected": -3.0332953929901123, + "logps/chosen": -59.28391647338867, + "logps/rejected": -56.78931427001953, + "loss": 0.6898, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.004310892894864082, + "rewards/margins": 0.006942380219697952, + "rewards/rejected": -0.011253273114562035, + "step": 3870 + }, + { + "epoch": 0.6685044796691937, + "grad_norm": 2.492913246154785, + "learning_rate": 1.6839374535123718e-08, + "logits/chosen": -3.087202548980713, + "logits/rejected": -3.0917575359344482, + "logps/chosen": -52.11659622192383, + "logps/rejected": -54.79553985595703, + "loss": 0.6929, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.012375886552035809, + "rewards/margins": 0.0006632144213654101, + "rewards/rejected": -0.013039101846516132, + "step": 3880 + }, + { + "epoch": 0.6702274293590628, + "grad_norm": 2.2675533294677734, + "learning_rate": 1.6817405101089707e-08, + "logits/chosen": -3.136599063873291, + "logits/rejected": -3.108509063720703, + "logps/chosen": -54.788795471191406, + "logps/rejected": -53.5931510925293, + "loss": 0.689, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.006023922935128212, + "rewards/margins": 0.008436523377895355, + "rewards/rejected": -0.014460447244346142, + "step": 3890 + }, + { + "epoch": 0.6719503790489317, + "grad_norm": 2.471661329269409, + "learning_rate": 1.679537401673644e-08, + "logits/chosen": -3.106663227081299, + "logits/rejected": -3.050253391265869, + "logps/chosen": -57.400482177734375, + "logps/rejected": -51.8661003112793, + "loss": 0.6881, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.005719014443457127, + "rewards/margins": 0.010319101624190807, + "rewards/rejected": -0.016038116067647934, + "step": 3900 + }, + { + "epoch": 0.6719503790489317, + "eval_logits/chosen": -3.1476454734802246, + "eval_logits/rejected": -3.1420326232910156, + "eval_logps/chosen": -58.53263473510742, + "eval_logps/rejected": -63.2182502746582, + "eval_loss": 0.6920942068099976, + "eval_rewards/accuracies": 0.5638940334320068, + "eval_rewards/chosen": 0.0017926108557730913, + "eval_rewards/margins": 0.002173854038119316, + "eval_rewards/rejected": -0.000381243386073038, + "eval_runtime": 383.5692, + "eval_samples_per_second": 11.221, + "eval_steps_per_second": 1.403, + "step": 3900 + }, + { + "epoch": 0.6736733287388008, + "grad_norm": 2.3394501209259033, + "learning_rate": 1.6773281481292708e-08, + "logits/chosen": -3.083597421646118, + "logits/rejected": -3.067528009414673, + "logps/chosen": -56.4756965637207, + "logps/rejected": -56.2137336730957, + "loss": 0.6894, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.00834328681230545, + "rewards/margins": 0.007735086139291525, + "rewards/rejected": -0.016078371554613113, + "step": 3910 + }, + { + "epoch": 0.6753962784286699, + "grad_norm": 2.520785331726074, + "learning_rate": 1.6751127694543012e-08, + "logits/chosen": -3.1565117835998535, + "logits/rejected": -3.1098198890686035, + "logps/chosen": -59.21630096435547, + "logps/rejected": -51.98161697387695, + "loss": 0.6884, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005468897521495819, + "rewards/margins": 0.009867525659501553, + "rewards/rejected": -0.015336424112319946, + "step": 3920 + }, + { + "epoch": 0.677119228118539, + "grad_norm": 2.314502716064453, + "learning_rate": 1.6728912856825752e-08, + "logits/chosen": -3.1580872535705566, + "logits/rejected": -3.1532976627349854, + "logps/chosen": -52.03582000732422, + "logps/rejected": -54.057891845703125, + "loss": 0.6907, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.007780077867209911, + "rewards/margins": 0.004977663513273001, + "rewards/rejected": -0.012757742777466774, + "step": 3930 + }, + { + "epoch": 0.6788421778084079, + "grad_norm": 2.5148684978485107, + "learning_rate": 1.6706637169031412e-08, + "logits/chosen": -2.9767425060272217, + "logits/rejected": -2.9571597576141357, + "logps/chosen": -54.011024475097656, + "logps/rejected": -55.11913299560547, + "loss": 0.6887, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.005110305733978748, + "rewards/margins": 0.00910019502043724, + "rewards/rejected": -0.014210501685738564, + "step": 3940 + }, + { + "epoch": 0.680565127498277, + "grad_norm": 2.334064483642578, + "learning_rate": 1.6684300832600752e-08, + "logits/chosen": -2.980774164199829, + "logits/rejected": -2.9327759742736816, + "logps/chosen": -56.086326599121094, + "logps/rejected": -52.7585563659668, + "loss": 0.6872, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.006129746790975332, + "rewards/margins": 0.011988668702542782, + "rewards/rejected": -0.018118415027856827, + "step": 3950 + }, + { + "epoch": 0.6822880771881461, + "grad_norm": 2.4945201873779297, + "learning_rate": 1.6661904049522985e-08, + "logits/chosen": -3.0870680809020996, + "logits/rejected": -3.089069128036499, + "logps/chosen": -53.5789794921875, + "logps/rejected": -60.57684326171875, + "loss": 0.6917, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.008758168667554855, + "rewards/margins": 0.0030608586966991425, + "rewards/rejected": -0.011819026432931423, + "step": 3960 + }, + { + "epoch": 0.6840110268780152, + "grad_norm": 2.3911170959472656, + "learning_rate": 1.663944702233395e-08, + "logits/chosen": -3.0987579822540283, + "logits/rejected": -3.0742454528808594, + "logps/chosen": -54.832969665527344, + "logps/rejected": -54.51105880737305, + "loss": 0.6895, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.007600725628435612, + "rewards/margins": 0.007472677621990442, + "rewards/rejected": -0.015073401853442192, + "step": 3970 + }, + { + "epoch": 0.6857339765678843, + "grad_norm": 2.2145421504974365, + "learning_rate": 1.6616929954114263e-08, + "logits/chosen": -2.976238965988159, + "logits/rejected": -2.9565927982330322, + "logps/chosen": -54.43210983276367, + "logps/rejected": -54.672218322753906, + "loss": 0.6913, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.010925976559519768, + "rewards/margins": 0.003819962264969945, + "rewards/rejected": -0.014745938591659069, + "step": 3980 + }, + { + "epoch": 0.6874569262577532, + "grad_norm": 2.3381612300872803, + "learning_rate": 1.659435304848751e-08, + "logits/chosen": -3.0916008949279785, + "logits/rejected": -3.0771727561950684, + "logps/chosen": -53.97601318359375, + "logps/rejected": -56.700538635253906, + "loss": 0.6899, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.007593357469886541, + "rewards/margins": 0.0067450194619596004, + "rewards/rejected": -0.014338378794491291, + "step": 3990 + }, + { + "epoch": 0.6891798759476223, + "grad_norm": 2.3654754161834717, + "learning_rate": 1.6571716509618385e-08, + "logits/chosen": -3.156153440475464, + "logits/rejected": -3.1207895278930664, + "logps/chosen": -57.9157600402832, + "logps/rejected": -54.3126335144043, + "loss": 0.6891, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0071152858436107635, + "rewards/margins": 0.008385889232158661, + "rewards/rejected": -0.015501176007091999, + "step": 4000 + }, + { + "epoch": 0.6891798759476223, + "eval_logits/chosen": -3.1464505195617676, + "eval_logits/rejected": -3.140815258026123, + "eval_logps/chosen": -58.53483200073242, + "eval_logps/rejected": -63.2358283996582, + "eval_loss": 0.6920202374458313, + "eval_rewards/accuracies": 0.5727230310440063, + "eval_rewards/chosen": 0.0017705905484035611, + "eval_rewards/margins": 0.0023276114370673895, + "eval_rewards/rejected": -0.000557021121494472, + "eval_runtime": 383.5374, + "eval_samples_per_second": 11.222, + "eval_steps_per_second": 1.403, + "step": 4000 + }, + { + "epoch": 0.6909028256374914, + "grad_norm": 2.257662773132324, + "learning_rate": 1.6549020542210858e-08, + "logits/chosen": -3.0420236587524414, + "logits/rejected": -3.0328893661499023, + "logps/chosen": -48.96164321899414, + "logps/rejected": -54.50152587890625, + "loss": 0.6905, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.009422359988093376, + "rewards/margins": 0.005418227985501289, + "rewards/rejected": -0.014840586110949516, + "step": 4010 + }, + { + "epoch": 0.6926257753273605, + "grad_norm": 2.3167202472686768, + "learning_rate": 1.6526265351506302e-08, + "logits/chosen": -3.0502841472625732, + "logits/rejected": -3.0362343788146973, + "logps/chosen": -53.02349853515625, + "logps/rejected": -54.821495056152344, + "loss": 0.689, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.008379830047488213, + "rewards/margins": 0.008577833883464336, + "rewards/rejected": -0.016957664862275124, + "step": 4020 + }, + { + "epoch": 0.6943487250172296, + "grad_norm": 2.379713535308838, + "learning_rate": 1.6503451143281665e-08, + "logits/chosen": -3.0468204021453857, + "logits/rejected": -3.0248570442199707, + "logps/chosen": -56.70612335205078, + "logps/rejected": -57.12060546875, + "loss": 0.6899, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.006521374918520451, + "rewards/margins": 0.006764276418834925, + "rewards/rejected": -0.013285649940371513, + "step": 4030 + }, + { + "epoch": 0.6960716747070985, + "grad_norm": 2.7433881759643555, + "learning_rate": 1.6480578123847584e-08, + "logits/chosen": -3.1389083862304688, + "logits/rejected": -3.126983642578125, + "logps/chosen": -57.09613800048828, + "logps/rejected": -55.38570022583008, + "loss": 0.6911, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.009039239026606083, + "rewards/margins": 0.00432856660336256, + "rewards/rejected": -0.013367804698646069, + "step": 4040 + }, + { + "epoch": 0.6977946243969676, + "grad_norm": 2.4082491397857666, + "learning_rate": 1.6457646500046536e-08, + "logits/chosen": -3.0439090728759766, + "logits/rejected": -3.026900053024292, + "logps/chosen": -51.30855178833008, + "logps/rejected": -54.536094665527344, + "loss": 0.6884, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.007431273814290762, + "rewards/margins": 0.009696152061223984, + "rewards/rejected": -0.017127424478530884, + "step": 4050 + }, + { + "epoch": 0.6995175740868367, + "grad_norm": 2.4375593662261963, + "learning_rate": 1.643465647925096e-08, + "logits/chosen": -3.0453343391418457, + "logits/rejected": -3.0121757984161377, + "logps/chosen": -57.543304443359375, + "logps/rejected": -54.09287643432617, + "loss": 0.6895, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.011836276389658451, + "rewards/margins": 0.007690160069614649, + "rewards/rejected": -0.01952643319964409, + "step": 4060 + }, + { + "epoch": 0.7012405237767058, + "grad_norm": 2.122170925140381, + "learning_rate": 1.6411608269361393e-08, + "logits/chosen": -3.05137300491333, + "logits/rejected": -3.028947353363037, + "logps/chosen": -55.06595993041992, + "logps/rejected": -54.700538635253906, + "loss": 0.6903, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.012340652756392956, + "rewards/margins": 0.005884417332708836, + "rewards/rejected": -0.01822507008910179, + "step": 4070 + }, + { + "epoch": 0.7029634734665747, + "grad_norm": 2.809166669845581, + "learning_rate": 1.638850207880456e-08, + "logits/chosen": -3.0260517597198486, + "logits/rejected": -3.018556594848633, + "logps/chosen": -52.71803665161133, + "logps/rejected": -55.4732666015625, + "loss": 0.6901, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.008182106539607048, + "rewards/margins": 0.006329337600618601, + "rewards/rejected": -0.014511443674564362, + "step": 4080 + }, + { + "epoch": 0.7046864231564438, + "grad_norm": 2.3355367183685303, + "learning_rate": 1.6365338116531524e-08, + "logits/chosen": -3.100362777709961, + "logits/rejected": -3.0696163177490234, + "logps/chosen": -56.38579177856445, + "logps/rejected": -56.2817497253418, + "loss": 0.689, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.009085236117243767, + "rewards/margins": 0.008473975583910942, + "rewards/rejected": -0.01755921170115471, + "step": 4090 + }, + { + "epoch": 0.7064093728463129, + "grad_norm": 2.1862552165985107, + "learning_rate": 1.6342116592015784e-08, + "logits/chosen": -3.0656888484954834, + "logits/rejected": -3.036451816558838, + "logps/chosen": -53.445716857910156, + "logps/rejected": -50.91322708129883, + "loss": 0.688, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0060226572677493095, + "rewards/margins": 0.010662797838449478, + "rewards/rejected": -0.016685456037521362, + "step": 4100 + }, + { + "epoch": 0.7064093728463129, + "eval_logits/chosen": -3.1452784538269043, + "eval_logits/rejected": -3.139664888381958, + "eval_logps/chosen": -58.53339767456055, + "eval_logps/rejected": -63.248939514160156, + "eval_loss": 0.6919506788253784, + "eval_rewards/accuracies": 0.5694702863693237, + "eval_rewards/chosen": 0.0017849754076451063, + "eval_rewards/margins": 0.0024732158053666353, + "eval_rewards/rejected": -0.0006882402813062072, + "eval_runtime": 383.3008, + "eval_samples_per_second": 11.229, + "eval_steps_per_second": 1.404, + "step": 4100 + }, + { + "epoch": 0.708132322536182, + "grad_norm": 2.4810664653778076, + "learning_rate": 1.631883771525137e-08, + "logits/chosen": -3.0311903953552246, + "logits/rejected": -3.019279956817627, + "logps/chosen": -57.1802864074707, + "logps/rejected": -53.1908073425293, + "loss": 0.6904, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.011178601533174515, + "rewards/margins": 0.005697404034435749, + "rewards/rejected": -0.01687600649893284, + "step": 4110 + }, + { + "epoch": 0.709855272226051, + "grad_norm": 2.73134183883667, + "learning_rate": 1.6295501696750958e-08, + "logits/chosen": -2.9880242347717285, + "logits/rejected": -2.9914722442626953, + "logps/chosen": -51.12857437133789, + "logps/rejected": -56.45573806762695, + "loss": 0.6908, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.01163684856146574, + "rewards/margins": 0.005046077072620392, + "rewards/rejected": -0.016682926565408707, + "step": 4120 + }, + { + "epoch": 0.71157822191592, + "grad_norm": 2.448148727416992, + "learning_rate": 1.6272108747543964e-08, + "logits/chosen": -3.089820384979248, + "logits/rejected": -3.066537618637085, + "logps/chosen": -54.83890914916992, + "logps/rejected": -54.20136260986328, + "loss": 0.6893, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.008709724061191082, + "rewards/margins": 0.0079735042527318, + "rewards/rejected": -0.016683228313922882, + "step": 4130 + }, + { + "epoch": 0.7133011716057891, + "grad_norm": 2.247288942337036, + "learning_rate": 1.6248659079174624e-08, + "logits/chosen": -3.0946431159973145, + "logits/rejected": -3.066021680831909, + "logps/chosen": -56.27561569213867, + "logps/rejected": -52.206207275390625, + "loss": 0.6895, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.007499805651605129, + "rewards/margins": 0.0075314841233193874, + "rewards/rejected": -0.015031290240585804, + "step": 4140 + }, + { + "epoch": 0.7150241212956582, + "grad_norm": 2.0884854793548584, + "learning_rate": 1.6225152903700093e-08, + "logits/chosen": -3.1154227256774902, + "logits/rejected": -3.088381052017212, + "logps/chosen": -58.83393478393555, + "logps/rejected": -53.803932189941406, + "loss": 0.6884, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.009922737255692482, + "rewards/margins": 0.009877922013401985, + "rewards/rejected": -0.019800657406449318, + "step": 4150 + }, + { + "epoch": 0.7167470709855273, + "grad_norm": 2.234891653060913, + "learning_rate": 1.6201590433688532e-08, + "logits/chosen": -3.0078649520874023, + "logits/rejected": -2.9760546684265137, + "logps/chosen": -52.66447830200195, + "logps/rejected": -51.263343811035156, + "loss": 0.6875, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.009862758219242096, + "rewards/margins": 0.011652237735688686, + "rewards/rejected": -0.021514996886253357, + "step": 4160 + }, + { + "epoch": 0.7184700206753962, + "grad_norm": 2.4819843769073486, + "learning_rate": 1.617797188221717e-08, + "logits/chosen": -3.06020188331604, + "logits/rejected": -3.0522642135620117, + "logps/chosen": -52.52134323120117, + "logps/rejected": -54.958274841308594, + "loss": 0.692, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.011253321543335915, + "rewards/margins": 0.0024759075604379177, + "rewards/rejected": -0.01372922956943512, + "step": 4170 + }, + { + "epoch": 0.7201929703652653, + "grad_norm": 2.3897159099578857, + "learning_rate": 1.6154297462870378e-08, + "logits/chosen": -3.0610311031341553, + "logits/rejected": -3.0461132526397705, + "logps/chosen": -56.082275390625, + "logps/rejected": -57.79096221923828, + "loss": 0.6895, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.007366887293756008, + "rewards/margins": 0.007653279695659876, + "rewards/rejected": -0.015020167455077171, + "step": 4180 + }, + { + "epoch": 0.7219159200551344, + "grad_norm": 2.1994800567626953, + "learning_rate": 1.6130567389737767e-08, + "logits/chosen": -3.041149616241455, + "logits/rejected": -3.0273966789245605, + "logps/chosen": -51.95328903198242, + "logps/rejected": -54.303009033203125, + "loss": 0.6888, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.008851468563079834, + "rewards/margins": 0.008974619209766388, + "rewards/rejected": -0.017826087772846222, + "step": 4190 + }, + { + "epoch": 0.7236388697450035, + "grad_norm": 2.5199713706970215, + "learning_rate": 1.6106781877412207e-08, + "logits/chosen": -3.092040538787842, + "logits/rejected": -3.0750718116760254, + "logps/chosen": -56.37440872192383, + "logps/rejected": -54.04875564575195, + "loss": 0.6893, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.006679283920675516, + "rewards/margins": 0.007919451221823692, + "rewards/rejected": -0.014598734676837921, + "step": 4200 + }, + { + "epoch": 0.7236388697450035, + "eval_logits/chosen": -3.1445956230163574, + "eval_logits/rejected": -3.1389715671539307, + "eval_logps/chosen": -58.557437896728516, + "eval_logps/rejected": -63.27348709106445, + "eval_loss": 0.6919510364532471, + "eval_rewards/accuracies": 0.5685408711433411, + "eval_rewards/chosen": 0.0015445526223629713, + "eval_rewards/margins": 0.0024782144464552402, + "eval_rewards/rejected": -0.0009336618822999299, + "eval_runtime": 382.6488, + "eval_samples_per_second": 11.248, + "eval_steps_per_second": 1.406, + "step": 4200 + }, + { + "epoch": 0.7253618194348725, + "grad_norm": 2.560533046722412, + "learning_rate": 1.6082941140987916e-08, + "logits/chosen": -3.0729806423187256, + "logits/rejected": -3.049401044845581, + "logps/chosen": -56.40735626220703, + "logps/rejected": -54.78361892700195, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011106094345450401, + "rewards/margins": 0.00898011215031147, + "rewards/rejected": -0.020086204633116722, + "step": 4210 + }, + { + "epoch": 0.7270847691247415, + "grad_norm": 2.338407039642334, + "learning_rate": 1.6059045396058517e-08, + "logits/chosen": -2.9267725944519043, + "logits/rejected": -2.907881498336792, + "logps/chosen": -53.28325271606445, + "logps/rejected": -54.56683349609375, + "loss": 0.6893, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.010645734146237373, + "rewards/margins": 0.00797154288738966, + "rewards/rejected": -0.018617277964949608, + "step": 4220 + }, + { + "epoch": 0.7288077188146106, + "grad_norm": 2.2222158908843994, + "learning_rate": 1.603509485871506e-08, + "logits/chosen": -3.1128008365631104, + "logits/rejected": -3.0919594764709473, + "logps/chosen": -56.30914306640625, + "logps/rejected": -55.95566940307617, + "loss": 0.689, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.010030779056251049, + "rewards/margins": 0.008529409766197205, + "rewards/rejected": -0.018560189753770828, + "step": 4230 + }, + { + "epoch": 0.7305306685044797, + "grad_norm": 2.292361259460449, + "learning_rate": 1.601108974554411e-08, + "logits/chosen": -3.0141403675079346, + "logits/rejected": -2.9893059730529785, + "logps/chosen": -55.5947151184082, + "logps/rejected": -58.04345703125, + "loss": 0.6871, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.00850595161318779, + "rewards/margins": 0.012388696894049644, + "rewards/rejected": -0.020894650369882584, + "step": 4240 + }, + { + "epoch": 0.7322536181943488, + "grad_norm": 2.362673044204712, + "learning_rate": 1.5987030273625747e-08, + "logits/chosen": -3.03678297996521, + "logits/rejected": -3.020172595977783, + "logps/chosen": -56.116294860839844, + "logps/rejected": -54.58025360107422, + "loss": 0.6891, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.01067496370524168, + "rewards/margins": 0.008443078026175499, + "rewards/rejected": -0.019118044525384903, + "step": 4250 + }, + { + "epoch": 0.7339765678842178, + "grad_norm": 2.441284656524658, + "learning_rate": 1.596291666053163e-08, + "logits/chosen": -2.9980833530426025, + "logits/rejected": -2.9824585914611816, + "logps/chosen": -55.41083908081055, + "logps/rejected": -56.44206619262695, + "loss": 0.6898, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.008826795034110546, + "rewards/margins": 0.0070566474460065365, + "rewards/rejected": -0.015883442014455795, + "step": 4260 + }, + { + "epoch": 0.7356995175740868, + "grad_norm": 2.271895170211792, + "learning_rate": 1.5938749124323017e-08, + "logits/chosen": -3.0179824829101562, + "logits/rejected": -2.9922022819519043, + "logps/chosen": -52.23370361328125, + "logps/rejected": -52.155426025390625, + "loss": 0.6884, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.00625257333740592, + "rewards/margins": 0.009830532595515251, + "rewards/rejected": -0.01608310639858246, + "step": 4270 + }, + { + "epoch": 0.7374224672639559, + "grad_norm": 2.187901496887207, + "learning_rate": 1.5914527883548804e-08, + "logits/chosen": -2.9919097423553467, + "logits/rejected": -2.9648940563201904, + "logps/chosen": -52.847511291503906, + "logps/rejected": -54.09342575073242, + "loss": 0.6887, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011299841105937958, + "rewards/margins": 0.009080270305275917, + "rewards/rejected": -0.020380113273859024, + "step": 4280 + }, + { + "epoch": 0.739145416953825, + "grad_norm": 2.2882347106933594, + "learning_rate": 1.5890253157243527e-08, + "logits/chosen": -3.045542001724243, + "logits/rejected": -3.0208449363708496, + "logps/chosen": -56.22462844848633, + "logps/rejected": -53.41437911987305, + "loss": 0.6892, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009847553446888924, + "rewards/margins": 0.008151333779096603, + "rewards/rejected": -0.017998887225985527, + "step": 4290 + }, + { + "epoch": 0.740868366643694, + "grad_norm": 2.245983839035034, + "learning_rate": 1.5865925164925415e-08, + "logits/chosen": -3.059497117996216, + "logits/rejected": -3.0487923622131348, + "logps/chosen": -53.6651496887207, + "logps/rejected": -54.894622802734375, + "loss": 0.6897, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011088307946920395, + "rewards/margins": 0.0072076646611094475, + "rewards/rejected": -0.018295975401997566, + "step": 4300 + }, + { + "epoch": 0.740868366643694, + "eval_logits/chosen": -3.143899440765381, + "eval_logits/rejected": -3.1382787227630615, + "eval_logps/chosen": -58.56075668334961, + "eval_logps/rejected": -63.2966423034668, + "eval_loss": 0.691855251789093, + "eval_rewards/accuracies": 0.5748141407966614, + "eval_rewards/chosen": 0.001511368784122169, + "eval_rewards/margins": 0.0026765998918563128, + "eval_rewards/rejected": -0.0011652313405647874, + "eval_runtime": 383.1116, + "eval_samples_per_second": 11.234, + "eval_steps_per_second": 1.404, + "step": 4300 + }, + { + "epoch": 0.742591316333563, + "grad_norm": 2.5445520877838135, + "learning_rate": 1.5841544126594372e-08, + "logits/chosen": -3.0100371837615967, + "logits/rejected": -2.9904136657714844, + "logps/chosen": -54.983856201171875, + "logps/rejected": -53.272987365722656, + "loss": 0.6895, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.009319139644503593, + "rewards/margins": 0.007537718862295151, + "rewards/rejected": -0.016856860369443893, + "step": 4310 + }, + { + "epoch": 0.7443142660234321, + "grad_norm": 2.088826894760132, + "learning_rate": 1.581711026273e-08, + "logits/chosen": -3.0851564407348633, + "logits/rejected": -3.05715012550354, + "logps/chosen": -59.47802734375, + "logps/rejected": -56.493492126464844, + "loss": 0.6884, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.008360291831195354, + "rewards/margins": 0.009802700951695442, + "rewards/rejected": -0.018162991851568222, + "step": 4320 + }, + { + "epoch": 0.7460372157133012, + "grad_norm": 2.4881863594055176, + "learning_rate": 1.579262379428962e-08, + "logits/chosen": -2.991785764694214, + "logits/rejected": -2.9598193168640137, + "logps/chosen": -56.84458541870117, + "logps/rejected": -51.7958869934082, + "loss": 0.69, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.013537961058318615, + "rewards/margins": 0.006494508590549231, + "rewards/rejected": -0.020032469183206558, + "step": 4330 + }, + { + "epoch": 0.7477601654031703, + "grad_norm": 2.1231768131256104, + "learning_rate": 1.5768084942706245e-08, + "logits/chosen": -2.9963862895965576, + "logits/rejected": -2.9773504734039307, + "logps/chosen": -50.41028594970703, + "logps/rejected": -52.8140754699707, + "loss": 0.6888, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.014139607548713684, + "rewards/margins": 0.00901852734386921, + "rewards/rejected": -0.023158136755228043, + "step": 4340 + }, + { + "epoch": 0.7494831150930393, + "grad_norm": 2.1576578617095947, + "learning_rate": 1.5743493929886602e-08, + "logits/chosen": -3.0088493824005127, + "logits/rejected": -2.994492530822754, + "logps/chosen": -51.812225341796875, + "logps/rejected": -54.839447021484375, + "loss": 0.6891, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.008023882284760475, + "rewards/margins": 0.008411007933318615, + "rewards/rejected": -0.016434891149401665, + "step": 4350 + }, + { + "epoch": 0.7512060647829083, + "grad_norm": 2.4116005897521973, + "learning_rate": 1.5718850978209113e-08, + "logits/chosen": -3.0605316162109375, + "logits/rejected": -3.038637638092041, + "logps/chosen": -55.0179328918457, + "logps/rejected": -53.5815544128418, + "loss": 0.6904, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.01346288900822401, + "rewards/margins": 0.005852080415934324, + "rewards/rejected": -0.019314970821142197, + "step": 4360 + }, + { + "epoch": 0.7529290144727774, + "grad_norm": 2.157750368118286, + "learning_rate": 1.5694156310521886e-08, + "logits/chosen": -3.0732007026672363, + "logits/rejected": -3.0582518577575684, + "logps/chosen": -55.9976921081543, + "logps/rejected": -56.61333084106445, + "loss": 0.6894, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.010390983894467354, + "rewards/margins": 0.007754699792712927, + "rewards/rejected": -0.018145684152841568, + "step": 4370 + }, + { + "epoch": 0.7546519641626465, + "grad_norm": 2.26912260055542, + "learning_rate": 1.5669410150140707e-08, + "logits/chosen": -3.0933172702789307, + "logits/rejected": -3.069687604904175, + "logps/chosen": -55.12248611450195, + "logps/rejected": -53.76482391357422, + "loss": 0.6894, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.008189195767045021, + "rewards/margins": 0.007664737291634083, + "rewards/rejected": -0.01585393212735653, + "step": 4380 + }, + { + "epoch": 0.7563749138525155, + "grad_norm": 2.0165369510650635, + "learning_rate": 1.5644612720847002e-08, + "logits/chosen": -3.018369197845459, + "logits/rejected": -3.0045838356018066, + "logps/chosen": -54.61598587036133, + "logps/rejected": -55.37036895751953, + "loss": 0.6908, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.013793488033115864, + "rewards/margins": 0.0049360147677361965, + "rewards/rejected": -0.018729500472545624, + "step": 4390 + }, + { + "epoch": 0.7580978635423845, + "grad_norm": 2.3872110843658447, + "learning_rate": 1.5619764246885842e-08, + "logits/chosen": -3.0393893718719482, + "logits/rejected": -3.0217764377593994, + "logps/chosen": -57.86515426635742, + "logps/rejected": -56.952545166015625, + "loss": 0.6904, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.009598755277693272, + "rewards/margins": 0.005718299653381109, + "rewards/rejected": -0.015317055396735668, + "step": 4400 + }, + { + "epoch": 0.7580978635423845, + "eval_logits/chosen": -3.1430068016052246, + "eval_logits/rejected": -3.137352466583252, + "eval_logps/chosen": -58.58723449707031, + "eval_logps/rejected": -63.3355598449707, + "eval_loss": 0.691795825958252, + "eval_rewards/accuracies": 0.571096658706665, + "eval_rewards/chosen": 0.001246647210791707, + "eval_rewards/margins": 0.0028010986279696226, + "eval_rewards/rejected": -0.00155445106793195, + "eval_runtime": 383.1918, + "eval_samples_per_second": 11.232, + "eval_steps_per_second": 1.404, + "step": 4400 + }, + { + "epoch": 0.7598208132322536, + "grad_norm": 2.294480800628662, + "learning_rate": 1.5594864952963885e-08, + "logits/chosen": -3.0046401023864746, + "logits/rejected": -2.98811411857605, + "logps/chosen": -56.121124267578125, + "logps/rejected": -57.08354568481445, + "loss": 0.6892, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.012294664047658443, + "rewards/margins": 0.008234361186623573, + "rewards/rejected": -0.02052902616560459, + "step": 4410 + }, + { + "epoch": 0.7615437629221227, + "grad_norm": 2.1563189029693604, + "learning_rate": 1.5569915064247365e-08, + "logits/chosen": -3.1245226860046387, + "logits/rejected": -3.0795650482177734, + "logps/chosen": -55.503578186035156, + "logps/rejected": -52.61016845703125, + "loss": 0.6861, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0068757846020162106, + "rewards/margins": 0.01452686171978712, + "rewards/rejected": -0.021402645856142044, + "step": 4420 + }, + { + "epoch": 0.7632667126119917, + "grad_norm": 1.9755196571350098, + "learning_rate": 1.5544914806360043e-08, + "logits/chosen": -3.0168216228485107, + "logits/rejected": -2.985079288482666, + "logps/chosen": -55.10762405395508, + "logps/rejected": -53.182945251464844, + "loss": 0.6906, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.013249601237475872, + "rewards/margins": 0.005370932165533304, + "rewards/rejected": -0.018620532006025314, + "step": 4430 + }, + { + "epoch": 0.7649896623018608, + "grad_norm": 2.288050889968872, + "learning_rate": 1.5519864405381183e-08, + "logits/chosen": -3.083771228790283, + "logits/rejected": -3.0520148277282715, + "logps/chosen": -60.474449157714844, + "logps/rejected": -56.98888397216797, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.010724170133471489, + "rewards/margins": 0.008934767916798592, + "rewards/rejected": -0.01965893805027008, + "step": 4440 + }, + { + "epoch": 0.7667126119917298, + "grad_norm": 2.294389486312866, + "learning_rate": 1.5494764087843482e-08, + "logits/chosen": -2.9569332599639893, + "logits/rejected": -2.944169521331787, + "logps/chosen": -56.9466552734375, + "logps/rejected": -55.96522903442383, + "loss": 0.6906, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.012313870713114738, + "rewards/margins": 0.005467808805406094, + "rewards/rejected": -0.017781678587198257, + "step": 4450 + }, + { + "epoch": 0.7684355616815989, + "grad_norm": 2.432860851287842, + "learning_rate": 1.5469614080731053e-08, + "logits/chosen": -3.069728136062622, + "logits/rejected": -3.061816692352295, + "logps/chosen": -54.156227111816406, + "logps/rejected": -54.633888244628906, + "loss": 0.6906, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0142423827201128, + "rewards/margins": 0.005499526392668486, + "rewards/rejected": -0.019741909578442574, + "step": 4460 + }, + { + "epoch": 0.770158511371468, + "grad_norm": 2.1848111152648926, + "learning_rate": 1.544441461147734e-08, + "logits/chosen": -3.105034351348877, + "logits/rejected": -3.074484348297119, + "logps/chosen": -57.09598922729492, + "logps/rejected": -52.336395263671875, + "loss": 0.6887, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.011739978566765785, + "rewards/margins": 0.009237302467226982, + "rewards/rejected": -0.020977279171347618, + "step": 4470 + }, + { + "epoch": 0.771881461061337, + "grad_norm": 2.3228683471679688, + "learning_rate": 1.5419165907963085e-08, + "logits/chosen": -3.133382797241211, + "logits/rejected": -3.126979351043701, + "logps/chosen": -52.30253219604492, + "logps/rejected": -54.35466766357422, + "loss": 0.6892, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.01291743665933609, + "rewards/margins": 0.00816321186721325, + "rewards/rejected": -0.02108065038919449, + "step": 4480 + }, + { + "epoch": 0.7736044107512061, + "grad_norm": 2.3531270027160645, + "learning_rate": 1.5393868198514258e-08, + "logits/chosen": -3.023160457611084, + "logits/rejected": -2.9955711364746094, + "logps/chosen": -52.2995719909668, + "logps/rejected": -51.951698303222656, + "loss": 0.6885, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011587420478463173, + "rewards/margins": 0.009556153789162636, + "rewards/rejected": -0.02114357426762581, + "step": 4490 + }, + { + "epoch": 0.7753273604410751, + "grad_norm": 2.6675174236297607, + "learning_rate": 1.5368521711899994e-08, + "logits/chosen": -3.038281202316284, + "logits/rejected": -3.0235447883605957, + "logps/chosen": -57.523460388183594, + "logps/rejected": -54.69983673095703, + "loss": 0.6905, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.011881532147526741, + "rewards/margins": 0.005483219865709543, + "rewards/rejected": -0.017364751547574997, + "step": 4500 + }, + { + "epoch": 0.7753273604410751, + "eval_logits/chosen": -3.142470598220825, + "eval_logits/rejected": -3.136869430541992, + "eval_logps/chosen": -58.58580780029297, + "eval_logps/rejected": -63.34257507324219, + "eval_loss": 0.6917560696601868, + "eval_rewards/accuracies": 0.5850371718406677, + "eval_rewards/chosen": 0.0012609114637598395, + "eval_rewards/margins": 0.0028854578267782927, + "eval_rewards/rejected": -0.0016245462466031313, + "eval_runtime": 383.2687, + "eval_samples_per_second": 11.23, + "eval_steps_per_second": 1.404, + "step": 4500 + }, + { + "epoch": 0.7770503101309442, + "grad_norm": 2.6927618980407715, + "learning_rate": 1.5343126677330526e-08, + "logits/chosen": -3.154592990875244, + "logits/rejected": -3.126269817352295, + "logps/chosen": -54.230247497558594, + "logps/rejected": -57.4571418762207, + "loss": 0.6881, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00797832477837801, + "rewards/margins": 0.010417604818940163, + "rewards/rejected": -0.018395930528640747, + "step": 4510 + }, + { + "epoch": 0.7787732598208132, + "grad_norm": 2.582130193710327, + "learning_rate": 1.5317683324455104e-08, + "logits/chosen": -3.07936429977417, + "logits/rejected": -3.0577187538146973, + "logps/chosen": -54.154029846191406, + "logps/rejected": -52.17194747924805, + "loss": 0.6906, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.015559755265712738, + "rewards/margins": 0.005351835396140814, + "rewards/rejected": -0.02091159299015999, + "step": 4520 + }, + { + "epoch": 0.7804962095106823, + "grad_norm": 2.4246294498443604, + "learning_rate": 1.5292191883359924e-08, + "logits/chosen": -3.0756242275238037, + "logits/rejected": -3.066664695739746, + "logps/chosen": -55.20574188232422, + "logps/rejected": -55.2439079284668, + "loss": 0.6916, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.01431342214345932, + "rewards/margins": 0.0033062086440622807, + "rewards/rejected": -0.017619632184505463, + "step": 4530 + }, + { + "epoch": 0.7822191592005513, + "grad_norm": 2.6444544792175293, + "learning_rate": 1.5266652584566056e-08, + "logits/chosen": -2.9866771697998047, + "logits/rejected": -2.9747414588928223, + "logps/chosen": -53.885520935058594, + "logps/rejected": -53.13618850708008, + "loss": 0.6881, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.011537250131368637, + "rewards/margins": 0.010475369170308113, + "rewards/rejected": -0.0220126211643219, + "step": 4540 + }, + { + "epoch": 0.7839421088904204, + "grad_norm": 2.567601442337036, + "learning_rate": 1.5241065659027345e-08, + "logits/chosen": -3.092729091644287, + "logits/rejected": -3.0693888664245605, + "logps/chosen": -57.39257049560547, + "logps/rejected": -55.48276901245117, + "loss": 0.688, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.010340984910726547, + "rewards/margins": 0.010571248829364777, + "rewards/rejected": -0.020912233740091324, + "step": 4550 + }, + { + "epoch": 0.7856650585802895, + "grad_norm": 2.5469632148742676, + "learning_rate": 1.5215431338128326e-08, + "logits/chosen": -2.986642360687256, + "logits/rejected": -2.949063777923584, + "logps/chosen": -56.87488555908203, + "logps/rejected": -54.90007781982422, + "loss": 0.6875, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.006578305270522833, + "rewards/margins": 0.011532245203852654, + "rewards/rejected": -0.018110549077391624, + "step": 4560 + }, + { + "epoch": 0.7873880082701585, + "grad_norm": 2.250145435333252, + "learning_rate": 1.5189749853682138e-08, + "logits/chosen": -3.014840602874756, + "logits/rejected": -2.9897007942199707, + "logps/chosen": -52.40979766845703, + "logps/rejected": -53.9889030456543, + "loss": 0.6887, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0131943728774786, + "rewards/margins": 0.009161809459328651, + "rewards/rejected": -0.02235618606209755, + "step": 4570 + }, + { + "epoch": 0.7891109579600276, + "grad_norm": 2.483966827392578, + "learning_rate": 1.5164021437928424e-08, + "logits/chosen": -2.990060567855835, + "logits/rejected": -2.9630320072174072, + "logps/chosen": -55.05839920043945, + "logps/rejected": -53.426841735839844, + "loss": 0.6889, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.007987894117832184, + "rewards/margins": 0.008664386346936226, + "rewards/rejected": -0.01665228046476841, + "step": 4580 + }, + { + "epoch": 0.7908339076498966, + "grad_norm": 2.7395739555358887, + "learning_rate": 1.5138246323531224e-08, + "logits/chosen": -3.0871968269348145, + "logits/rejected": -3.0587024688720703, + "logps/chosen": -56.29444122314453, + "logps/rejected": -54.183837890625, + "loss": 0.6905, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.014101634733378887, + "rewards/margins": 0.005672593601047993, + "rewards/rejected": -0.01977423205971718, + "step": 4590 + }, + { + "epoch": 0.7925568573397657, + "grad_norm": 2.2159547805786133, + "learning_rate": 1.5112424743576885e-08, + "logits/chosen": -2.964893102645874, + "logits/rejected": -2.945138931274414, + "logps/chosen": -54.72895431518555, + "logps/rejected": -56.71080780029297, + "loss": 0.6883, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.013638514094054699, + "rewards/margins": 0.009959360584616661, + "rewards/rejected": -0.023597871884703636, + "step": 4600 + }, + { + "epoch": 0.7925568573397657, + "eval_logits/chosen": -3.141345262527466, + "eval_logits/rejected": -3.1357266902923584, + "eval_logps/chosen": -58.60505294799805, + "eval_logps/rejected": -63.36589431762695, + "eval_loss": 0.6917376518249512, + "eval_rewards/accuracies": 0.5787639617919922, + "eval_rewards/chosen": 0.0010684671578928828, + "eval_rewards/margins": 0.0029261959716677666, + "eval_rewards/rejected": -0.0018577290466055274, + "eval_runtime": 383.4911, + "eval_samples_per_second": 11.223, + "eval_steps_per_second": 1.403, + "step": 4600 + }, + { + "epoch": 0.7942798070296347, + "grad_norm": 2.280055046081543, + "learning_rate": 1.5086556931571946e-08, + "logits/chosen": -2.999232292175293, + "logits/rejected": -2.988621711730957, + "logps/chosen": -52.64866256713867, + "logps/rejected": -53.27161407470703, + "loss": 0.6892, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.015107172541320324, + "rewards/margins": 0.008137322030961514, + "rewards/rejected": -0.023244492709636688, + "step": 4610 + }, + { + "epoch": 0.7960027567195038, + "grad_norm": 2.420598030090332, + "learning_rate": 1.5060643121441017e-08, + "logits/chosen": -3.0278878211975098, + "logits/rejected": -3.004408836364746, + "logps/chosen": -56.786781311035156, + "logps/rejected": -57.46771240234375, + "loss": 0.6889, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.01068052463233471, + "rewards/margins": 0.0088728042319417, + "rewards/rejected": -0.019553329795598984, + "step": 4620 + }, + { + "epoch": 0.7977257064093728, + "grad_norm": 2.28881573677063, + "learning_rate": 1.503468354752468e-08, + "logits/chosen": -3.0719847679138184, + "logits/rejected": -3.0378081798553467, + "logps/chosen": -58.28291702270508, + "logps/rejected": -58.635284423828125, + "loss": 0.6881, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009992343373596668, + "rewards/margins": 0.010409261099994183, + "rewards/rejected": -0.0204016026109457, + "step": 4630 + }, + { + "epoch": 0.7994486560992419, + "grad_norm": 2.432915449142456, + "learning_rate": 1.5008678444577368e-08, + "logits/chosen": -3.0757288932800293, + "logits/rejected": -3.060554265975952, + "logps/chosen": -56.30283737182617, + "logps/rejected": -57.6944694519043, + "loss": 0.6903, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.01548223476856947, + "rewards/margins": 0.005951850675046444, + "rewards/rejected": -0.021434085443615913, + "step": 4640 + }, + { + "epoch": 0.801171605789111, + "grad_norm": 2.3404457569122314, + "learning_rate": 1.4982628047765213e-08, + "logits/chosen": -3.073173999786377, + "logits/rejected": -3.0398802757263184, + "logps/chosen": -55.227325439453125, + "logps/rejected": -52.969627380371094, + "loss": 0.6877, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.010765276849269867, + "rewards/margins": 0.011142291128635406, + "rewards/rejected": -0.021907567977905273, + "step": 4650 + }, + { + "epoch": 0.80289455547898, + "grad_norm": 2.2700555324554443, + "learning_rate": 1.495653259266398e-08, + "logits/chosen": -3.0625596046447754, + "logits/rejected": -3.0436787605285645, + "logps/chosen": -55.32490158081055, + "logps/rejected": -56.171669006347656, + "loss": 0.6889, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.010176388546824455, + "rewards/margins": 0.008753242902457714, + "rewards/rejected": -0.018929632380604744, + "step": 4660 + }, + { + "epoch": 0.8046175051688491, + "grad_norm": 2.1048812866210938, + "learning_rate": 1.493039231525686e-08, + "logits/chosen": -3.0612926483154297, + "logits/rejected": -3.036155939102173, + "logps/chosen": -53.7221565246582, + "logps/rejected": -51.17532730102539, + "loss": 0.6894, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.012948019430041313, + "rewards/margins": 0.00783008337020874, + "rewards/rejected": -0.020778100937604904, + "step": 4670 + }, + { + "epoch": 0.8063404548587181, + "grad_norm": 2.1982691287994385, + "learning_rate": 1.4904207451932403e-08, + "logits/chosen": -3.0380797386169434, + "logits/rejected": -3.0065929889678955, + "logps/chosen": -53.59992218017578, + "logps/rejected": -53.799102783203125, + "loss": 0.6883, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.011093436740338802, + "rewards/margins": 0.009997878223657608, + "rewards/rejected": -0.021091314032673836, + "step": 4680 + }, + { + "epoch": 0.8080634045485872, + "grad_norm": 2.743126630783081, + "learning_rate": 1.4877978239482345e-08, + "logits/chosen": -3.0848495960235596, + "logits/rejected": -3.062772750854492, + "logps/chosen": -56.99140167236328, + "logps/rejected": -56.22808837890625, + "loss": 0.6898, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.012282797135412693, + "rewards/margins": 0.006996271200478077, + "rewards/rejected": -0.01927906833589077, + "step": 4690 + }, + { + "epoch": 0.8097863542384562, + "grad_norm": 2.436103105545044, + "learning_rate": 1.4851704915099474e-08, + "logits/chosen": -3.060211420059204, + "logits/rejected": -3.0525429248809814, + "logps/chosen": -53.092872619628906, + "logps/rejected": -57.76226806640625, + "loss": 0.6897, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.013585333712399006, + "rewards/margins": 0.0071394420228898525, + "rewards/rejected": -0.02072477713227272, + "step": 4700 + }, + { + "epoch": 0.8097863542384562, + "eval_logits/chosen": -3.1409170627593994, + "eval_logits/rejected": -3.135312557220459, + "eval_logps/chosen": -58.612998962402344, + "eval_logps/rejected": -63.39475631713867, + "eval_loss": 0.6916364431381226, + "eval_rewards/accuracies": 0.574117124080658, + "eval_rewards/chosen": 0.0009889440843835473, + "eval_rewards/margins": 0.0031352676451206207, + "eval_rewards/rejected": -0.0021463236771523952, + "eval_runtime": 383.168, + "eval_samples_per_second": 11.233, + "eval_steps_per_second": 1.404, + "step": 4700 + }, + { + "epoch": 0.8115093039283253, + "grad_norm": 2.3831241130828857, + "learning_rate": 1.482538771637548e-08, + "logits/chosen": -3.1120047569274902, + "logits/rejected": -3.073157787322998, + "logps/chosen": -59.07976150512695, + "logps/rejected": -54.81146240234375, + "loss": 0.6869, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.011325079016387463, + "rewards/margins": 0.012960456311702728, + "rewards/rejected": -0.024285534396767616, + "step": 4710 + }, + { + "epoch": 0.8132322536181944, + "grad_norm": 2.280550241470337, + "learning_rate": 1.4799026881298825e-08, + "logits/chosen": -2.985053539276123, + "logits/rejected": -2.953970432281494, + "logps/chosen": -57.7352294921875, + "logps/rejected": -54.659141540527344, + "loss": 0.6869, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.009329278953373432, + "rewards/margins": 0.012923975475132465, + "rewards/rejected": -0.02225325256586075, + "step": 4720 + }, + { + "epoch": 0.8149552033080634, + "grad_norm": 2.3526196479797363, + "learning_rate": 1.4772622648252565e-08, + "logits/chosen": -2.976370096206665, + "logits/rejected": -2.9506354331970215, + "logps/chosen": -55.18701934814453, + "logps/rejected": -53.34539031982422, + "loss": 0.6899, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.014407898299396038, + "rewards/margins": 0.006814153399318457, + "rewards/rejected": -0.021222051233053207, + "step": 4730 + }, + { + "epoch": 0.8166781529979324, + "grad_norm": 2.533383846282959, + "learning_rate": 1.4746175256012212e-08, + "logits/chosen": -3.0675313472747803, + "logits/rejected": -3.056377410888672, + "logps/chosen": -55.36817169189453, + "logps/rejected": -55.598167419433594, + "loss": 0.6893, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009266135282814503, + "rewards/margins": 0.008008824661374092, + "rewards/rejected": -0.01727495715022087, + "step": 4740 + }, + { + "epoch": 0.8184011026878015, + "grad_norm": 2.477605104446411, + "learning_rate": 1.4719684943743575e-08, + "logits/chosen": -3.036411762237549, + "logits/rejected": -3.016793966293335, + "logps/chosen": -56.173614501953125, + "logps/rejected": -56.49778366088867, + "loss": 0.6877, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.007815537974238396, + "rewards/margins": 0.011227364651858807, + "rewards/rejected": -0.019042903557419777, + "step": 4750 + }, + { + "epoch": 0.8201240523776706, + "grad_norm": 2.4582791328430176, + "learning_rate": 1.4693151951000583e-08, + "logits/chosen": -3.0335280895233154, + "logits/rejected": -3.011873245239258, + "logps/chosen": -53.954620361328125, + "logps/rejected": -53.71638870239258, + "loss": 0.6899, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.014825527556240559, + "rewards/margins": 0.006758248899132013, + "rewards/rejected": -0.02158377692103386, + "step": 4760 + }, + { + "epoch": 0.8218470020675396, + "grad_norm": 2.687391519546509, + "learning_rate": 1.4666576517723136e-08, + "logits/chosen": -3.0979321002960205, + "logits/rejected": -3.0764148235321045, + "logps/chosen": -60.7008056640625, + "logps/rejected": -56.8785400390625, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011106926016509533, + "rewards/margins": 0.005686748772859573, + "rewards/rejected": -0.01679367572069168, + "step": 4770 + }, + { + "epoch": 0.8235699517574087, + "grad_norm": 2.817535400390625, + "learning_rate": 1.4639958884234921e-08, + "logits/chosen": -2.9717934131622314, + "logits/rejected": -2.953327178955078, + "logps/chosen": -52.80674362182617, + "logps/rejected": -56.35276412963867, + "loss": 0.688, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.01336643099784851, + "rewards/margins": 0.01072400901466608, + "rewards/rejected": -0.024090442806482315, + "step": 4780 + }, + { + "epoch": 0.8252929014472777, + "grad_norm": 2.144557476043701, + "learning_rate": 1.4613299291241247e-08, + "logits/chosen": -3.074733257293701, + "logits/rejected": -3.048335552215576, + "logps/chosen": -59.228851318359375, + "logps/rejected": -54.978431701660156, + "loss": 0.6896, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.014375475235283375, + "rewards/margins": 0.007547792978584766, + "rewards/rejected": -0.021923266351222992, + "step": 4790 + }, + { + "epoch": 0.8270158511371468, + "grad_norm": 2.7041478157043457, + "learning_rate": 1.458659797982687e-08, + "logits/chosen": -2.9979748725891113, + "logits/rejected": -2.9825730323791504, + "logps/chosen": -53.27587127685547, + "logps/rejected": -52.17603302001953, + "loss": 0.6905, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.012802990153431892, + "rewards/margins": 0.00566075649112463, + "rewards/rejected": -0.018463749438524246, + "step": 4800 + }, + { + "epoch": 0.8270158511371468, + "eval_logits/chosen": -3.14040207862854, + "eval_logits/rejected": -3.1347618103027344, + "eval_logps/chosen": -58.63165283203125, + "eval_logps/rejected": -63.4159049987793, + "eval_loss": 0.6916272640228271, + "eval_rewards/accuracies": 0.5748141407966614, + "eval_rewards/chosen": 0.0008023965056054294, + "eval_rewards/margins": 0.003160183085128665, + "eval_rewards/rejected": -0.0023577865213155746, + "eval_runtime": 383.65, + "eval_samples_per_second": 11.219, + "eval_steps_per_second": 1.402, + "step": 4800 + }, + { + "epoch": 0.8287388008270159, + "grad_norm": 2.4247682094573975, + "learning_rate": 1.45598551914538e-08, + "logits/chosen": -3.009613513946533, + "logits/rejected": -2.974290370941162, + "logps/chosen": -58.334434509277344, + "logps/rejected": -54.773475646972656, + "loss": 0.687, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.011181493289768696, + "rewards/margins": 0.012812025845050812, + "rewards/rejected": -0.023993518203496933, + "step": 4810 + }, + { + "epoch": 0.8304617505168849, + "grad_norm": 2.3951213359832764, + "learning_rate": 1.453307116795913e-08, + "logits/chosen": -3.044926404953003, + "logits/rejected": -3.035269260406494, + "logps/chosen": -52.56196212768555, + "logps/rejected": -54.002899169921875, + "loss": 0.6894, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.014045966789126396, + "rewards/margins": 0.007807609625160694, + "rewards/rejected": -0.021853577345609665, + "step": 4820 + }, + { + "epoch": 0.832184700206754, + "grad_norm": 2.170640230178833, + "learning_rate": 1.4506246151552857e-08, + "logits/chosen": -3.054115056991577, + "logits/rejected": -3.0311291217803955, + "logps/chosen": -54.9075813293457, + "logps/rejected": -54.56113815307617, + "loss": 0.6903, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.013306808657944202, + "rewards/margins": 0.006027604453265667, + "rewards/rejected": -0.01933441497385502, + "step": 4830 + }, + { + "epoch": 0.833907649896623, + "grad_norm": 2.660403251647949, + "learning_rate": 1.447938038481566e-08, + "logits/chosen": -3.012082576751709, + "logits/rejected": -2.9815993309020996, + "logps/chosen": -57.541893005371094, + "logps/rejected": -54.8646240234375, + "loss": 0.6879, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.013715410605072975, + "rewards/margins": 0.01079997606575489, + "rewards/rejected": -0.024515386670827866, + "step": 4840 + }, + { + "epoch": 0.8356305995864921, + "grad_norm": 2.3371341228485107, + "learning_rate": 1.4452474110696739e-08, + "logits/chosen": -3.090893268585205, + "logits/rejected": -3.0729451179504395, + "logps/chosen": -55.47370147705078, + "logps/rejected": -55.9431266784668, + "loss": 0.6884, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.013396288268268108, + "rewards/margins": 0.009913275018334389, + "rewards/rejected": -0.023309562355279922, + "step": 4850 + }, + { + "epoch": 0.8373535492763611, + "grad_norm": 2.4334421157836914, + "learning_rate": 1.4425527572511602e-08, + "logits/chosen": -3.095104932785034, + "logits/rejected": -3.0613903999328613, + "logps/chosen": -60.0928840637207, + "logps/rejected": -52.79871368408203, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01635749265551567, + "rewards/margins": 0.0058675603941082954, + "rewards/rejected": -0.02222505584359169, + "step": 4860 + }, + { + "epoch": 0.8390764989662302, + "grad_norm": 2.731940507888794, + "learning_rate": 1.4398541013939869e-08, + "logits/chosen": -3.11464524269104, + "logits/rejected": -3.0886950492858887, + "logps/chosen": -58.54974365234375, + "logps/rejected": -53.7392463684082, + "loss": 0.6875, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.010801544412970543, + "rewards/margins": 0.011670037172734737, + "rewards/rejected": -0.022471582517027855, + "step": 4870 + }, + { + "epoch": 0.8407994486560992, + "grad_norm": 2.099777936935425, + "learning_rate": 1.4371514679023067e-08, + "logits/chosen": -2.9958763122558594, + "logits/rejected": -2.959946870803833, + "logps/chosen": -54.3371696472168, + "logps/rejected": -52.98571014404297, + "loss": 0.6873, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.011563587002456188, + "rewards/margins": 0.01208436954766512, + "rewards/rejected": -0.023647956550121307, + "step": 4880 + }, + { + "epoch": 0.8425223983459683, + "grad_norm": 2.4163782596588135, + "learning_rate": 1.4344448812162429e-08, + "logits/chosen": -3.053267002105713, + "logits/rejected": -3.0314173698425293, + "logps/chosen": -56.69382858276367, + "logps/rejected": -55.77009201049805, + "loss": 0.6886, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.012732396833598614, + "rewards/margins": 0.00963148195296526, + "rewards/rejected": -0.022363876923918724, + "step": 4890 + }, + { + "epoch": 0.8442453480358374, + "grad_norm": 2.075690507888794, + "learning_rate": 1.4317343658116666e-08, + "logits/chosen": -3.020969867706299, + "logits/rejected": -2.987406015396118, + "logps/chosen": -54.62163162231445, + "logps/rejected": -52.847877502441406, + "loss": 0.6875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.014192362315952778, + "rewards/margins": 0.01184868160635233, + "rewards/rejected": -0.026041042059659958, + "step": 4900 + }, + { + "epoch": 0.8442453480358374, + "eval_logits/chosen": -3.139631509780884, + "eval_logits/rejected": -3.1339633464813232, + "eval_logps/chosen": -58.65803527832031, + "eval_logps/rejected": -63.45630645751953, + "eval_loss": 0.6915606260299683, + "eval_rewards/accuracies": 0.5773698687553406, + "eval_rewards/chosen": 0.0005385760450735688, + "eval_rewards/margins": 0.003300320589914918, + "eval_rewards/rejected": -0.0027617441955953836, + "eval_runtime": 384.2265, + "eval_samples_per_second": 11.202, + "eval_steps_per_second": 1.4, + "step": 4900 + }, + { + "epoch": 0.8459682977257064, + "grad_norm": 2.4822423458099365, + "learning_rate": 1.4290199461999776e-08, + "logits/chosen": -3.165937900543213, + "logits/rejected": -3.1338515281677246, + "logps/chosen": -57.39813232421875, + "logps/rejected": -54.81819534301758, + "loss": 0.6867, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.010589056648314, + "rewards/margins": 0.013249260373413563, + "rewards/rejected": -0.023838315159082413, + "step": 4910 + }, + { + "epoch": 0.8476912474155754, + "grad_norm": 2.523733615875244, + "learning_rate": 1.4263016469278812e-08, + "logits/chosen": -2.9909253120422363, + "logits/rejected": -2.9729251861572266, + "logps/chosen": -55.359779357910156, + "logps/rejected": -56.16680145263672, + "loss": 0.6874, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.015199231915175915, + "rewards/margins": 0.011813652701675892, + "rewards/rejected": -0.027012884616851807, + "step": 4920 + }, + { + "epoch": 0.8494141971054445, + "grad_norm": 2.3042807579040527, + "learning_rate": 1.4235794925771672e-08, + "logits/chosen": -3.1721012592315674, + "logits/rejected": -3.160557746887207, + "logps/chosen": -54.805763244628906, + "logps/rejected": -60.66535186767578, + "loss": 0.6884, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.016626281663775444, + "rewards/margins": 0.009994568303227425, + "rewards/rejected": -0.02662084996700287, + "step": 4930 + }, + { + "epoch": 0.8511371467953136, + "grad_norm": 2.6536920070648193, + "learning_rate": 1.420853507764487e-08, + "logits/chosen": -3.094923496246338, + "logits/rejected": -3.064950704574585, + "logps/chosen": -57.57648849487305, + "logps/rejected": -55.926292419433594, + "loss": 0.6881, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.016546418890357018, + "rewards/margins": 0.010448496788740158, + "rewards/rejected": -0.026994913816452026, + "step": 4940 + }, + { + "epoch": 0.8528600964851827, + "grad_norm": 2.630887508392334, + "learning_rate": 1.4181237171411314e-08, + "logits/chosen": -2.8983561992645264, + "logits/rejected": -2.88315749168396, + "logps/chosen": -55.97765350341797, + "logps/rejected": -56.04487228393555, + "loss": 0.6895, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.01150443498045206, + "rewards/margins": 0.0077610015869140625, + "rewards/rejected": -0.019265437498688698, + "step": 4950 + }, + { + "epoch": 0.8545830461750517, + "grad_norm": 2.179489850997925, + "learning_rate": 1.4153901453928069e-08, + "logits/chosen": -3.0133819580078125, + "logits/rejected": -3.0152859687805176, + "logps/chosen": -54.046836853027344, + "logps/rejected": -58.50445556640625, + "loss": 0.6918, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.017930559813976288, + "rewards/margins": 0.002923080464825034, + "rewards/rejected": -0.020853638648986816, + "step": 4960 + }, + { + "epoch": 0.8563059958649207, + "grad_norm": 2.817776679992676, + "learning_rate": 1.4126528172394132e-08, + "logits/chosen": -3.0035629272460938, + "logits/rejected": -2.993034839630127, + "logps/chosen": -53.23530960083008, + "logps/rejected": -54.26251983642578, + "loss": 0.6906, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.018618527799844742, + "rewards/margins": 0.005449257791042328, + "rewards/rejected": -0.02406778372824192, + "step": 4970 + }, + { + "epoch": 0.8580289455547898, + "grad_norm": 2.0556752681732178, + "learning_rate": 1.40991175743482e-08, + "logits/chosen": -3.0782289505004883, + "logits/rejected": -3.0726828575134277, + "logps/chosen": -54.139183044433594, + "logps/rejected": -56.74431228637695, + "loss": 0.6904, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.012816699221730232, + "rewards/margins": 0.005773487966507673, + "rewards/rejected": -0.018590185791254044, + "step": 4980 + }, + { + "epoch": 0.8597518952446589, + "grad_norm": 2.202556848526001, + "learning_rate": 1.4071669907666415e-08, + "logits/chosen": -2.9854166507720947, + "logits/rejected": -2.9775357246398926, + "logps/chosen": -53.2756462097168, + "logps/rejected": -57.1760139465332, + "loss": 0.6894, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.014912595972418785, + "rewards/margins": 0.007976751774549484, + "rewards/rejected": -0.02288934774696827, + "step": 4990 + }, + { + "epoch": 0.8614748449345279, + "grad_norm": 2.384925603866577, + "learning_rate": 1.4044185420560144e-08, + "logits/chosen": -3.0189459323883057, + "logits/rejected": -2.9987950325012207, + "logps/chosen": -57.17552947998047, + "logps/rejected": -54.762413024902344, + "loss": 0.6899, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.013715590350329876, + "rewards/margins": 0.006854689680039883, + "rewards/rejected": -0.02057028003036976, + "step": 5000 + }, + { + "epoch": 0.8614748449345279, + "eval_logits/chosen": -3.138385534286499, + "eval_logits/rejected": -3.1327359676361084, + "eval_logps/chosen": -58.66404724121094, + "eval_logps/rejected": -63.465179443359375, + "eval_loss": 0.6915493011474609, + "eval_rewards/accuracies": 0.5769051909446716, + "eval_rewards/chosen": 0.0004785024793818593, + "eval_rewards/margins": 0.0033290009014308453, + "eval_rewards/rejected": -0.002850498305633664, + "eval_runtime": 384.5021, + "eval_samples_per_second": 11.194, + "eval_steps_per_second": 1.399, + "step": 5000 + }, + { + "epoch": 0.8631977946243969, + "grad_norm": 2.560671091079712, + "learning_rate": 1.4016664361573723e-08, + "logits/chosen": -3.0151009559631348, + "logits/rejected": -2.9882473945617676, + "logps/chosen": -54.525062561035156, + "logps/rejected": -54.10304641723633, + "loss": 0.6858, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.014742674306035042, + "rewards/margins": 0.015148031525313854, + "rewards/rejected": -0.02989070676267147, + "step": 5010 + }, + { + "epoch": 0.864920744314266, + "grad_norm": 2.4198813438415527, + "learning_rate": 1.3989106979582206e-08, + "logits/chosen": -3.0126874446868896, + "logits/rejected": -2.9866137504577637, + "logps/chosen": -54.9820671081543, + "logps/rejected": -54.1724967956543, + "loss": 0.6899, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.014826476573944092, + "rewards/margins": 0.00693404208868742, + "rewards/rejected": -0.021760519593954086, + "step": 5020 + }, + { + "epoch": 0.8666436940041351, + "grad_norm": 2.369276523590088, + "learning_rate": 1.3961513523789117e-08, + "logits/chosen": -2.9598562717437744, + "logits/rejected": -2.9502501487731934, + "logps/chosen": -54.18781661987305, + "logps/rejected": -56.25947952270508, + "loss": 0.6897, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.015706423670053482, + "rewards/margins": 0.007301941514015198, + "rewards/rejected": -0.02300836518406868, + "step": 5030 + }, + { + "epoch": 0.8683666436940042, + "grad_norm": 2.4504623413085938, + "learning_rate": 1.3933884243724207e-08, + "logits/chosen": -3.1910994052886963, + "logits/rejected": -3.148806095123291, + "logps/chosen": -58.43614959716797, + "logps/rejected": -52.756126403808594, + "loss": 0.6882, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.012786917388439178, + "rewards/margins": 0.010306078009307384, + "rewards/rejected": -0.02309299446642399, + "step": 5040 + }, + { + "epoch": 0.8700895933838731, + "grad_norm": 2.484468936920166, + "learning_rate": 1.3906219389241175e-08, + "logits/chosen": -3.059654712677002, + "logits/rejected": -3.035466194152832, + "logps/chosen": -56.96913528442383, + "logps/rejected": -57.09128952026367, + "loss": 0.6883, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.013648102059960365, + "rewards/margins": 0.010198203846812248, + "rewards/rejected": -0.023846307769417763, + "step": 5050 + }, + { + "epoch": 0.8718125430737422, + "grad_norm": 2.1926894187927246, + "learning_rate": 1.3878519210515435e-08, + "logits/chosen": -2.9194982051849365, + "logits/rejected": -2.9135661125183105, + "logps/chosen": -52.278892517089844, + "logps/rejected": -54.48607635498047, + "loss": 0.6901, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.018070057034492493, + "rewards/margins": 0.006437377072870731, + "rewards/rejected": -0.024507436901330948, + "step": 5060 + }, + { + "epoch": 0.8735354927636113, + "grad_norm": 2.3249335289001465, + "learning_rate": 1.3850783958041834e-08, + "logits/chosen": -3.032139778137207, + "logits/rejected": -3.0054965019226074, + "logps/chosen": -54.1693229675293, + "logps/rejected": -50.52428436279297, + "loss": 0.6884, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.014210036024451256, + "rewards/margins": 0.009761094115674496, + "rewards/rejected": -0.023971129208803177, + "step": 5070 + }, + { + "epoch": 0.8752584424534804, + "grad_norm": 2.3921754360198975, + "learning_rate": 1.38230138826324e-08, + "logits/chosen": -3.223688840866089, + "logits/rejected": -3.1855413913726807, + "logps/chosen": -58.59296798706055, + "logps/rejected": -53.28046417236328, + "loss": 0.6869, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.010885847732424736, + "rewards/margins": 0.012997889891266823, + "rewards/rejected": -0.023883739486336708, + "step": 5080 + }, + { + "epoch": 0.8769813921433495, + "grad_norm": 2.5459678173065186, + "learning_rate": 1.379520923541406e-08, + "logits/chosen": -3.0902998447418213, + "logits/rejected": -3.05825138092041, + "logps/chosen": -55.68738555908203, + "logps/rejected": -54.36516189575195, + "loss": 0.686, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.01396668516099453, + "rewards/margins": 0.014680743217468262, + "rewards/rejected": -0.02864742837846279, + "step": 5090 + }, + { + "epoch": 0.8787043418332184, + "grad_norm": 2.2755610942840576, + "learning_rate": 1.376737026782638e-08, + "logits/chosen": -2.925417900085449, + "logits/rejected": -2.889483690261841, + "logps/chosen": -56.4536247253418, + "logps/rejected": -55.12589645385742, + "loss": 0.6864, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.013867422938346863, + "rewards/margins": 0.014166000299155712, + "rewards/rejected": -0.02803342044353485, + "step": 5100 + }, + { + "epoch": 0.8787043418332184, + "eval_logits/chosen": -3.1375465393066406, + "eval_logits/rejected": -3.131910562515259, + "eval_logps/chosen": -58.683929443359375, + "eval_logps/rejected": -63.488773345947266, + "eval_loss": 0.6915342807769775, + "eval_rewards/accuracies": 0.5683085322380066, + "eval_rewards/chosen": 0.0002796630433294922, + "eval_rewards/margins": 0.0033661844208836555, + "eval_rewards/rejected": -0.0030865215230733156, + "eval_runtime": 384.6471, + "eval_samples_per_second": 11.189, + "eval_steps_per_second": 1.399, + "step": 5100 + }, + { + "epoch": 0.8804272915230875, + "grad_norm": 2.5277225971221924, + "learning_rate": 1.373949723161929e-08, + "logits/chosen": -3.0719993114471436, + "logits/rejected": -3.0354537963867188, + "logps/chosen": -59.154884338378906, + "logps/rejected": -56.359588623046875, + "loss": 0.6871, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.012820270843803883, + "rewards/margins": 0.012380128726363182, + "rewards/rejected": -0.02520040050148964, + "step": 5110 + }, + { + "epoch": 0.8821502412129566, + "grad_norm": 2.3103935718536377, + "learning_rate": 1.3711590378850797e-08, + "logits/chosen": -3.1260557174682617, + "logits/rejected": -3.0945322513580322, + "logps/chosen": -55.463409423828125, + "logps/rejected": -52.56806564331055, + "loss": 0.6899, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.015900244936347008, + "rewards/margins": 0.006916800979524851, + "rewards/rejected": -0.02281704545021057, + "step": 5120 + }, + { + "epoch": 0.8838731909028257, + "grad_norm": 2.3789403438568115, + "learning_rate": 1.3683649961884723e-08, + "logits/chosen": -3.146933078765869, + "logits/rejected": -3.112955093383789, + "logps/chosen": -54.97711944580078, + "logps/rejected": -52.694297790527344, + "loss": 0.6868, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.014110149815678596, + "rewards/margins": 0.013062229380011559, + "rewards/rejected": -0.027172381058335304, + "step": 5130 + }, + { + "epoch": 0.8855961405926946, + "grad_norm": 2.3451788425445557, + "learning_rate": 1.365567623338841e-08, + "logits/chosen": -3.054654359817505, + "logits/rejected": -3.0173521041870117, + "logps/chosen": -58.7443962097168, + "logps/rejected": -55.3597297668457, + "loss": 0.686, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.009260999038815498, + "rewards/margins": 0.014625395648181438, + "rewards/rejected": -0.023886393755674362, + "step": 5140 + }, + { + "epoch": 0.8873190902825637, + "grad_norm": 2.1813039779663086, + "learning_rate": 1.362766944633044e-08, + "logits/chosen": -3.0976474285125732, + "logits/rejected": -3.088894844055176, + "logps/chosen": -54.575950622558594, + "logps/rejected": -57.33098220825195, + "loss": 0.6883, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.014461587183177471, + "rewards/margins": 0.010113890282809734, + "rewards/rejected": -0.024575477465987206, + "step": 5150 + }, + { + "epoch": 0.8890420399724328, + "grad_norm": 2.3312623500823975, + "learning_rate": 1.3599629853978341e-08, + "logits/chosen": -3.06577205657959, + "logits/rejected": -3.029139757156372, + "logps/chosen": -53.272178649902344, + "logps/rejected": -50.97618865966797, + "loss": 0.6878, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.01855347864329815, + "rewards/margins": 0.011086962185800076, + "rewards/rejected": -0.02964043989777565, + "step": 5160 + }, + { + "epoch": 0.8907649896623019, + "grad_norm": 2.2718193531036377, + "learning_rate": 1.357155770989631e-08, + "logits/chosen": -3.05924129486084, + "logits/rejected": -3.0357818603515625, + "logps/chosen": -56.48089599609375, + "logps/rejected": -53.92249298095703, + "loss": 0.6889, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011843241751194, + "rewards/margins": 0.008908586576581001, + "rewards/rejected": -0.020751826465129852, + "step": 5170 + }, + { + "epoch": 0.892487939352171, + "grad_norm": 2.8206381797790527, + "learning_rate": 1.3543453267942905e-08, + "logits/chosen": -3.149730682373047, + "logits/rejected": -3.1397929191589355, + "logps/chosen": -56.902320861816406, + "logps/rejected": -56.74182891845703, + "loss": 0.6897, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.014993062242865562, + "rewards/margins": 0.0073389047756791115, + "rewards/rejected": -0.0223319660872221, + "step": 5180 + }, + { + "epoch": 0.8942108890420399, + "grad_norm": 2.555413007736206, + "learning_rate": 1.3515316782268756e-08, + "logits/chosen": -3.028125047683716, + "logits/rejected": -3.0172619819641113, + "logps/chosen": -54.56239700317383, + "logps/rejected": -56.15587615966797, + "loss": 0.6906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.017221994698047638, + "rewards/margins": 0.005434014368802309, + "rewards/rejected": -0.02265600860118866, + "step": 5190 + }, + { + "epoch": 0.895933838731909, + "grad_norm": 2.364535093307495, + "learning_rate": 1.3487148507314273e-08, + "logits/chosen": -3.088724136352539, + "logits/rejected": -3.0658886432647705, + "logps/chosen": -53.71836471557617, + "logps/rejected": -57.087257385253906, + "loss": 0.6865, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.01639769785106182, + "rewards/margins": 0.01384388841688633, + "rewards/rejected": -0.030241584405303, + "step": 5200 + }, + { + "epoch": 0.895933838731909, + "eval_logits/chosen": -3.1371073722839355, + "eval_logits/rejected": -3.131441116333008, + "eval_logps/chosen": -58.70653533935547, + "eval_logps/rejected": -63.53398895263672, + "eval_loss": 0.6914243698120117, + "eval_rewards/accuracies": 0.5734200477600098, + "eval_rewards/chosen": 5.36102379555814e-05, + "eval_rewards/margins": 0.00359228253364563, + "eval_rewards/rejected": -0.003538672346621752, + "eval_runtime": 384.4039, + "eval_samples_per_second": 11.197, + "eval_steps_per_second": 1.4, + "step": 5200 + }, + { + "epoch": 0.8976567884217781, + "grad_norm": 2.1429319381713867, + "learning_rate": 1.3458948697807336e-08, + "logits/chosen": -3.0572848320007324, + "logits/rejected": -3.042999744415283, + "logps/chosen": -53.3560791015625, + "logps/rejected": -52.3207893371582, + "loss": 0.6903, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.017168376594781876, + "rewards/margins": 0.006034437101334333, + "rewards/rejected": -0.023202812299132347, + "step": 5210 + }, + { + "epoch": 0.8993797381116472, + "grad_norm": 2.7378227710723877, + "learning_rate": 1.3430717608760991e-08, + "logits/chosen": -3.082184314727783, + "logits/rejected": -3.0691020488739014, + "logps/chosen": -54.03383255004883, + "logps/rejected": -58.534889221191406, + "loss": 0.6877, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.014227539300918579, + "rewards/margins": 0.011310763657093048, + "rewards/rejected": -0.025538304820656776, + "step": 5220 + }, + { + "epoch": 0.9011026878015161, + "grad_norm": 2.590153217315674, + "learning_rate": 1.3402455495471153e-08, + "logits/chosen": -3.0239996910095215, + "logits/rejected": -3.012888193130493, + "logps/chosen": -56.472503662109375, + "logps/rejected": -58.494102478027344, + "loss": 0.6875, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.012096828781068325, + "rewards/margins": 0.011721945367753506, + "rewards/rejected": -0.02381877228617668, + "step": 5230 + }, + { + "epoch": 0.9028256374913852, + "grad_norm": 2.373455047607422, + "learning_rate": 1.3374162613514285e-08, + "logits/chosen": -3.0243659019470215, + "logits/rejected": -3.001631498336792, + "logps/chosen": -54.14702606201172, + "logps/rejected": -55.911949157714844, + "loss": 0.6874, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.013740400783717632, + "rewards/margins": 0.011903162114322186, + "rewards/rejected": -0.025643562898039818, + "step": 5240 + }, + { + "epoch": 0.9045485871812543, + "grad_norm": 2.0493698120117188, + "learning_rate": 1.3345839218745101e-08, + "logits/chosen": -2.9678304195404053, + "logits/rejected": -2.953367233276367, + "logps/chosen": -52.9145393371582, + "logps/rejected": -53.31523895263672, + "loss": 0.6909, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.019368145614862442, + "rewards/margins": 0.004988770931959152, + "rewards/rejected": -0.024356918409466743, + "step": 5250 + }, + { + "epoch": 0.9062715368711234, + "grad_norm": 2.544506311416626, + "learning_rate": 1.3317485567294238e-08, + "logits/chosen": -3.062175750732422, + "logits/rejected": -3.0207974910736084, + "logps/chosen": -58.17012405395508, + "logps/rejected": -54.205718994140625, + "loss": 0.6868, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.013847528025507927, + "rewards/margins": 0.013102750293910503, + "rewards/rejected": -0.026950281113386154, + "step": 5260 + }, + { + "epoch": 0.9079944865609925, + "grad_norm": 2.3220739364624023, + "learning_rate": 1.3289101915565951e-08, + "logits/chosen": -3.1599390506744385, + "logits/rejected": -3.14455246925354, + "logps/chosen": -53.253761291503906, + "logps/rejected": -53.868431091308594, + "loss": 0.6883, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.016995316371321678, + "rewards/margins": 0.010229960083961487, + "rewards/rejected": -0.027225274592638016, + "step": 5270 + }, + { + "epoch": 0.9097174362508614, + "grad_norm": 2.4071993827819824, + "learning_rate": 1.3260688520235785e-08, + "logits/chosen": -3.0632424354553223, + "logits/rejected": -3.032189130783081, + "logps/chosen": -56.436180114746094, + "logps/rejected": -53.795684814453125, + "loss": 0.6882, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.020667383447289467, + "rewards/margins": 0.010586929507553577, + "rewards/rejected": -0.03125431388616562, + "step": 5280 + }, + { + "epoch": 0.9114403859407305, + "grad_norm": 2.3140807151794434, + "learning_rate": 1.3232245638248262e-08, + "logits/chosen": -3.0068345069885254, + "logits/rejected": -2.96124529838562, + "logps/chosen": -56.988670349121094, + "logps/rejected": -54.59224319458008, + "loss": 0.6863, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.011329452507197857, + "rewards/margins": 0.01419984083622694, + "rewards/rejected": -0.025529295206069946, + "step": 5290 + }, + { + "epoch": 0.9131633356305996, + "grad_norm": 2.47336745262146, + "learning_rate": 1.3203773526814558e-08, + "logits/chosen": -3.0152244567871094, + "logits/rejected": -3.0013296604156494, + "logps/chosen": -56.43061065673828, + "logps/rejected": -57.6225471496582, + "loss": 0.6877, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.013324270024895668, + "rewards/margins": 0.011425389908254147, + "rewards/rejected": -0.02474966086447239, + "step": 5300 + }, + { + "epoch": 0.9131633356305996, + "eval_logits/chosen": -3.136502265930176, + "eval_logits/rejected": -3.1308670043945312, + "eval_logps/chosen": -58.71966552734375, + "eval_logps/rejected": -63.566688537597656, + "eval_loss": 0.691329836845398, + "eval_rewards/accuracies": 0.5736523866653442, + "eval_rewards/chosen": -7.772997923893854e-05, + "eval_rewards/margins": 0.003787950612604618, + "eval_rewards/rejected": -0.003865680657327175, + "eval_runtime": 384.1169, + "eval_samples_per_second": 11.205, + "eval_steps_per_second": 1.401, + "step": 5300 + }, + { + "epoch": 0.9148862853204687, + "grad_norm": 2.492147922515869, + "learning_rate": 1.3175272443410165e-08, + "logits/chosen": -3.148709774017334, + "logits/rejected": -3.1363625526428223, + "logps/chosen": -59.37726974487305, + "logps/rejected": -54.50934600830078, + "loss": 0.689, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.012435202486813068, + "rewards/margins": 0.008691241964697838, + "rewards/rejected": -0.02112644538283348, + "step": 5310 + }, + { + "epoch": 0.9166092350103378, + "grad_norm": 2.2255496978759766, + "learning_rate": 1.3146742645772576e-08, + "logits/chosen": -3.0580496788024902, + "logits/rejected": -3.0039610862731934, + "logps/chosen": -58.974510192871094, + "logps/rejected": -53.189048767089844, + "loss": 0.6858, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.014206658117473125, + "rewards/margins": 0.015097076073288918, + "rewards/rejected": -0.02930373325943947, + "step": 5320 + }, + { + "epoch": 0.9183321847002067, + "grad_norm": 2.464592695236206, + "learning_rate": 1.311818439189895e-08, + "logits/chosen": -2.996553421020508, + "logits/rejected": -3.0002400875091553, + "logps/chosen": -52.74189376831055, + "logps/rejected": -55.937255859375, + "loss": 0.6903, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.018881823867559433, + "rewards/margins": 0.006236328277736902, + "rewards/rejected": -0.025118151679635048, + "step": 5330 + }, + { + "epoch": 0.9200551343900758, + "grad_norm": 2.629176139831543, + "learning_rate": 1.3089597940043773e-08, + "logits/chosen": -3.035466432571411, + "logits/rejected": -2.9821648597717285, + "logps/chosen": -57.51744842529297, + "logps/rejected": -54.27968215942383, + "loss": 0.6847, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.011555047705769539, + "rewards/margins": 0.017491517588496208, + "rewards/rejected": -0.02904656156897545, + "step": 5340 + }, + { + "epoch": 0.9217780840799449, + "grad_norm": 2.4826347827911377, + "learning_rate": 1.3060983548716533e-08, + "logits/chosen": -2.9875071048736572, + "logits/rejected": -2.9504916667938232, + "logps/chosen": -54.7081184387207, + "logps/rejected": -55.256614685058594, + "loss": 0.6873, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.020656753331422806, + "rewards/margins": 0.012195435352623463, + "rewards/rejected": -0.032852184027433395, + "step": 5350 + }, + { + "epoch": 0.923501033769814, + "grad_norm": 2.558962345123291, + "learning_rate": 1.3032341476679368e-08, + "logits/chosen": -3.1059539318084717, + "logits/rejected": -3.0991828441619873, + "logps/chosen": -54.63978958129883, + "logps/rejected": -58.72779083251953, + "loss": 0.6893, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.014256837777793407, + "rewards/margins": 0.008205600082874298, + "rewards/rejected": -0.02246243879199028, + "step": 5360 + }, + { + "epoch": 0.9252239834596829, + "grad_norm": 2.460012674331665, + "learning_rate": 1.3003671982944747e-08, + "logits/chosen": -3.004504680633545, + "logits/rejected": -2.9867444038391113, + "logps/chosen": -53.31267166137695, + "logps/rejected": -55.93389129638672, + "loss": 0.6877, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.013879385776817799, + "rewards/margins": 0.011418366804718971, + "rewards/rejected": -0.025297755375504494, + "step": 5370 + }, + { + "epoch": 0.926946933149552, + "grad_norm": 2.2511675357818604, + "learning_rate": 1.2974975326773106e-08, + "logits/chosen": -3.1150755882263184, + "logits/rejected": -3.075209617614746, + "logps/chosen": -58.38878631591797, + "logps/rejected": -54.537193298339844, + "loss": 0.6838, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.009906591847538948, + "rewards/margins": 0.019162429496645927, + "rewards/rejected": -0.029069025069475174, + "step": 5380 + }, + { + "epoch": 0.9286698828394211, + "grad_norm": 2.384746551513672, + "learning_rate": 1.2946251767670519e-08, + "logits/chosen": -3.032280445098877, + "logits/rejected": -2.9894490242004395, + "logps/chosen": -60.061767578125, + "logps/rejected": -54.47789764404297, + "loss": 0.6865, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.014944732189178467, + "rewards/margins": 0.013701597228646278, + "rewards/rejected": -0.028646331280469894, + "step": 5390 + }, + { + "epoch": 0.9303928325292902, + "grad_norm": 2.1436920166015625, + "learning_rate": 1.2917501565386343e-08, + "logits/chosen": -3.0589873790740967, + "logits/rejected": -3.041414976119995, + "logps/chosen": -52.97267532348633, + "logps/rejected": -56.66045379638672, + "loss": 0.6889, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.018663279712200165, + "rewards/margins": 0.009023270569741726, + "rewards/rejected": -0.027686551213264465, + "step": 5400 + }, + { + "epoch": 0.9303928325292902, + "eval_logits/chosen": -3.1357803344726562, + "eval_logits/rejected": -3.130146026611328, + "eval_logps/chosen": -58.737403869628906, + "eval_logps/rejected": -63.59604263305664, + "eval_loss": 0.6912763714790344, + "eval_rewards/accuracies": 0.5759758353233337, + "eval_rewards/chosen": -0.00025504513178020716, + "eval_rewards/margins": 0.0039042264688760042, + "eval_rewards/rejected": -0.004159271717071533, + "eval_runtime": 384.3865, + "eval_samples_per_second": 11.197, + "eval_steps_per_second": 1.4, + "step": 5400 + }, + { + "epoch": 0.9321157822191593, + "grad_norm": 2.2527265548706055, + "learning_rate": 1.2888724979910867e-08, + "logits/chosen": -3.071611166000366, + "logits/rejected": -3.059826374053955, + "logps/chosen": -58.34067916870117, + "logps/rejected": -56.316001892089844, + "loss": 0.6908, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.017013002187013626, + "rewards/margins": 0.005270760972052813, + "rewards/rejected": -0.022283760830760002, + "step": 5410 + }, + { + "epoch": 0.9338387319090282, + "grad_norm": 2.507885456085205, + "learning_rate": 1.2859922271472968e-08, + "logits/chosen": -3.124504566192627, + "logits/rejected": -3.101989984512329, + "logps/chosen": -55.222572326660156, + "logps/rejected": -56.50590133666992, + "loss": 0.6871, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.015043241903185844, + "rewards/margins": 0.01249424833804369, + "rewards/rejected": -0.02753749117255211, + "step": 5420 + }, + { + "epoch": 0.9355616815988973, + "grad_norm": 2.130812644958496, + "learning_rate": 1.2831093700537764e-08, + "logits/chosen": -3.0423922538757324, + "logits/rejected": -3.0314245223999023, + "logps/chosen": -55.79705810546875, + "logps/rejected": -54.9379768371582, + "loss": 0.6884, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.021989721804857254, + "rewards/margins": 0.009882936254143715, + "rewards/rejected": -0.03187265619635582, + "step": 5430 + }, + { + "epoch": 0.9372846312887664, + "grad_norm": 2.3937151432037354, + "learning_rate": 1.2802239527804237e-08, + "logits/chosen": -3.056756019592285, + "logits/rejected": -3.0396242141723633, + "logps/chosen": -57.8969841003418, + "logps/rejected": -58.131507873535156, + "loss": 0.6875, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.016955677419900894, + "rewards/margins": 0.011915793642401695, + "rewards/rejected": -0.028871476650238037, + "step": 5440 + }, + { + "epoch": 0.9390075809786355, + "grad_norm": 2.4363255500793457, + "learning_rate": 1.2773360014202888e-08, + "logits/chosen": -2.983828067779541, + "logits/rejected": -2.9721152782440186, + "logps/chosen": -57.15266799926758, + "logps/rejected": -56.26232147216797, + "loss": 0.6874, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0157354436814785, + "rewards/margins": 0.012112580239772797, + "rewards/rejected": -0.027848023921251297, + "step": 5450 + }, + { + "epoch": 0.9407305306685044, + "grad_norm": 2.7043750286102295, + "learning_rate": 1.2744455420893392e-08, + "logits/chosen": -3.0711090564727783, + "logits/rejected": -3.0553476810455322, + "logps/chosen": -55.708465576171875, + "logps/rejected": -54.60699462890625, + "loss": 0.6897, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.018650764599442482, + "rewards/margins": 0.007309816777706146, + "rewards/rejected": -0.025960583239793777, + "step": 5460 + }, + { + "epoch": 0.9424534803583735, + "grad_norm": 2.703200578689575, + "learning_rate": 1.2715526009262208e-08, + "logits/chosen": -3.0238287448883057, + "logits/rejected": -3.003737211227417, + "logps/chosen": -53.43670654296875, + "logps/rejected": -53.303321838378906, + "loss": 0.6897, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.020780716091394424, + "rewards/margins": 0.007408404257148504, + "rewards/rejected": -0.028189118951559067, + "step": 5470 + }, + { + "epoch": 0.9441764300482426, + "grad_norm": 2.4181151390075684, + "learning_rate": 1.268657204092023e-08, + "logits/chosen": -3.0498404502868652, + "logits/rejected": -3.0245485305786133, + "logps/chosen": -54.807106018066406, + "logps/rejected": -54.9182014465332, + "loss": 0.6864, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.018639078363776207, + "rewards/margins": 0.014108446426689625, + "rewards/rejected": -0.032747525721788406, + "step": 5480 + }, + { + "epoch": 0.9458993797381117, + "grad_norm": 2.315063238143921, + "learning_rate": 1.2657593777700424e-08, + "logits/chosen": -3.0332398414611816, + "logits/rejected": -3.0053722858428955, + "logps/chosen": -57.3248405456543, + "logps/rejected": -54.80067825317383, + "loss": 0.6892, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.017597036436200142, + "rewards/margins": 0.008288295939564705, + "rewards/rejected": -0.025885334238409996, + "step": 5490 + }, + { + "epoch": 0.9476223294279807, + "grad_norm": 2.1240878105163574, + "learning_rate": 1.2628591481655457e-08, + "logits/chosen": -3.1073358058929443, + "logits/rejected": -3.091235637664795, + "logps/chosen": -55.46843719482422, + "logps/rejected": -57.39680862426758, + "loss": 0.688, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.01657593622803688, + "rewards/margins": 0.010735424235463142, + "rewards/rejected": -0.027311360463500023, + "step": 5500 + }, + { + "epoch": 0.9476223294279807, + "eval_logits/chosen": -3.1351189613342285, + "eval_logits/rejected": -3.1294331550598145, + "eval_logps/chosen": -58.75161361694336, + "eval_logps/rejected": -63.61307907104492, + "eval_loss": 0.6912639737129211, + "eval_rewards/accuracies": 0.5659851431846619, + "eval_rewards/chosen": -0.0003971691185142845, + "eval_rewards/margins": 0.003932336810976267, + "eval_rewards/rejected": -0.004329506773501635, + "eval_runtime": 383.8331, + "eval_samples_per_second": 11.213, + "eval_steps_per_second": 1.402, + "step": 5500 + }, + { + "epoch": 0.9493452791178497, + "grad_norm": 2.3148903846740723, + "learning_rate": 1.2599565415055328e-08, + "logits/chosen": -3.0147597789764404, + "logits/rejected": -2.9981188774108887, + "logps/chosen": -54.94285202026367, + "logps/rejected": -55.143272399902344, + "loss": 0.6899, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.01715438812971115, + "rewards/margins": 0.0069595216773450375, + "rewards/rejected": -0.024113908410072327, + "step": 5510 + }, + { + "epoch": 0.9510682288077188, + "grad_norm": 2.79364013671875, + "learning_rate": 1.2570515840384984e-08, + "logits/chosen": -3.0089077949523926, + "logits/rejected": -2.9828317165374756, + "logps/chosen": -57.03174591064453, + "logps/rejected": -54.21330642700195, + "loss": 0.6861, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.01207288634032011, + "rewards/margins": 0.014618280343711376, + "rewards/rejected": -0.026691168546676636, + "step": 5520 + }, + { + "epoch": 0.9527911784975879, + "grad_norm": 2.2197608947753906, + "learning_rate": 1.2541443020341975e-08, + "logits/chosen": -3.067873477935791, + "logits/rejected": -3.032580852508545, + "logps/chosen": -60.96553421020508, + "logps/rejected": -55.5814094543457, + "loss": 0.6861, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.01058810856193304, + "rewards/margins": 0.014492152258753777, + "rewards/rejected": -0.025080259889364243, + "step": 5530 + }, + { + "epoch": 0.954514128187457, + "grad_norm": 2.2949342727661133, + "learning_rate": 1.2512347217834042e-08, + "logits/chosen": -3.0294876098632812, + "logits/rejected": -3.0352654457092285, + "logps/chosen": -53.901390075683594, + "logps/rejected": -59.26971435546875, + "loss": 0.6883, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.015541985630989075, + "rewards/margins": 0.010141951031982899, + "rewards/rejected": -0.025683939456939697, + "step": 5540 + }, + { + "epoch": 0.956237077877326, + "grad_norm": 2.465346336364746, + "learning_rate": 1.2483228695976776e-08, + "logits/chosen": -2.912057876586914, + "logits/rejected": -2.89572811126709, + "logps/chosen": -54.469642639160156, + "logps/rejected": -56.864906311035156, + "loss": 0.6888, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.01851799711585045, + "rewards/margins": 0.009365186095237732, + "rewards/rejected": -0.02788318134844303, + "step": 5550 + }, + { + "epoch": 0.957960027567195, + "grad_norm": 2.667961835861206, + "learning_rate": 1.2454087718091208e-08, + "logits/chosen": -3.0371270179748535, + "logits/rejected": -3.0104763507843018, + "logps/chosen": -53.86272048950195, + "logps/rejected": -52.576141357421875, + "loss": 0.6879, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.019759749993681908, + "rewards/margins": 0.011099927127361298, + "rewards/rejected": -0.030859675258398056, + "step": 5560 + }, + { + "epoch": 0.9596829772570641, + "grad_norm": 2.3370180130004883, + "learning_rate": 1.2424924547701442e-08, + "logits/chosen": -3.081906795501709, + "logits/rejected": -3.0722689628601074, + "logps/chosen": -52.50896453857422, + "logps/rejected": -59.51946258544922, + "loss": 0.6898, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.018424561247229576, + "rewards/margins": 0.0074028694070875645, + "rewards/rejected": -0.025827426463365555, + "step": 5570 + }, + { + "epoch": 0.9614059269469332, + "grad_norm": 2.0538763999938965, + "learning_rate": 1.239573944853228e-08, + "logits/chosen": -2.9546151161193848, + "logits/rejected": -2.94108247756958, + "logps/chosen": -55.21215057373047, + "logps/rejected": -56.68113327026367, + "loss": 0.6891, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.02034585550427437, + "rewards/margins": 0.00860525481402874, + "rewards/rejected": -0.02895110473036766, + "step": 5580 + }, + { + "epoch": 0.9631288766368022, + "grad_norm": 2.415463447570801, + "learning_rate": 1.2366532684506815e-08, + "logits/chosen": -3.1263980865478516, + "logits/rejected": -3.0816597938537598, + "logps/chosen": -57.99749755859375, + "logps/rejected": -58.05116653442383, + "loss": 0.6848, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.011663533747196198, + "rewards/margins": 0.017326349392533302, + "rewards/rejected": -0.02898988500237465, + "step": 5590 + }, + { + "epoch": 0.9648518263266712, + "grad_norm": 2.2387213706970215, + "learning_rate": 1.2337304519744066e-08, + "logits/chosen": -3.0889410972595215, + "logits/rejected": -3.086632013320923, + "logps/chosen": -54.502777099609375, + "logps/rejected": -60.0045051574707, + "loss": 0.6899, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.019988704472780228, + "rewards/margins": 0.0069748698733747005, + "rewards/rejected": -0.026963572949171066, + "step": 5600 + }, + { + "epoch": 0.9648518263266712, + "eval_logits/chosen": -3.134329080581665, + "eval_logits/rejected": -3.128695011138916, + "eval_logps/chosen": -58.77082443237305, + "eval_logps/rejected": -63.63037109375, + "eval_loss": 0.6912762522697449, + "eval_rewards/accuracies": 0.5745818018913269, + "eval_rewards/chosen": -0.0005893177003599703, + "eval_rewards/margins": 0.0039131660014390945, + "eval_rewards/rejected": -0.004502483177930117, + "eval_runtime": 384.5923, + "eval_samples_per_second": 11.191, + "eval_steps_per_second": 1.399, + "step": 5600 + }, + { + "epoch": 0.9665747760165403, + "grad_norm": 2.2621214389801025, + "learning_rate": 1.2308055218556577e-08, + "logits/chosen": -3.098680019378662, + "logits/rejected": -3.080854892730713, + "logps/chosen": -53.97863006591797, + "logps/rejected": -59.511802673339844, + "loss": 0.6865, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.015781071037054062, + "rewards/margins": 0.013751788064837456, + "rewards/rejected": -0.02953285537660122, + "step": 5610 + }, + { + "epoch": 0.9682977257064094, + "grad_norm": 2.7723278999328613, + "learning_rate": 1.2278785045448034e-08, + "logits/chosen": -3.0287671089172363, + "logits/rejected": -3.006072998046875, + "logps/chosen": -56.87371826171875, + "logps/rejected": -53.5859260559082, + "loss": 0.6874, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.018540427088737488, + "rewards/margins": 0.012070056982338428, + "rewards/rejected": -0.03061048686504364, + "step": 5620 + }, + { + "epoch": 0.9700206753962785, + "grad_norm": 2.5379652976989746, + "learning_rate": 1.2249494265110862e-08, + "logits/chosen": -3.07861065864563, + "logits/rejected": -3.0483670234680176, + "logps/chosen": -57.4000129699707, + "logps/rejected": -53.848663330078125, + "loss": 0.6874, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.014923269860446453, + "rewards/margins": 0.012017721310257912, + "rewards/rejected": -0.02694099023938179, + "step": 5630 + }, + { + "epoch": 0.9717436250861475, + "grad_norm": 2.558870792388916, + "learning_rate": 1.222018314242384e-08, + "logits/chosen": -3.0515987873077393, + "logits/rejected": -3.038257122039795, + "logps/chosen": -55.17596435546875, + "logps/rejected": -58.388694763183594, + "loss": 0.6881, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.012300265952944756, + "rewards/margins": 0.010648714378476143, + "rewards/rejected": -0.02294897846877575, + "step": 5640 + }, + { + "epoch": 0.9734665747760165, + "grad_norm": 2.609940528869629, + "learning_rate": 1.2190851942449712e-08, + "logits/chosen": -2.959313154220581, + "logits/rejected": -2.9530630111694336, + "logps/chosen": -54.18489456176758, + "logps/rejected": -55.13713455200195, + "loss": 0.6896, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.0217437781393528, + "rewards/margins": 0.00759219890460372, + "rewards/rejected": -0.029335975646972656, + "step": 5650 + }, + { + "epoch": 0.9751895244658856, + "grad_norm": 2.476722002029419, + "learning_rate": 1.2161500930432778e-08, + "logits/chosen": -2.9682631492614746, + "logits/rejected": -2.965312957763672, + "logps/chosen": -54.02378463745117, + "logps/rejected": -53.51348876953125, + "loss": 0.6917, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.021599246188998222, + "rewards/margins": 0.0035273456014692783, + "rewards/rejected": -0.025126595050096512, + "step": 5660 + }, + { + "epoch": 0.9769124741557547, + "grad_norm": 2.6098222732543945, + "learning_rate": 1.2132130371796499e-08, + "logits/chosen": -3.014533519744873, + "logits/rejected": -2.9931082725524902, + "logps/chosen": -55.737632751464844, + "logps/rejected": -55.68427276611328, + "loss": 0.6877, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.016952062025666237, + "rewards/margins": 0.011545151472091675, + "rewards/rejected": -0.02849721349775791, + "step": 5670 + }, + { + "epoch": 0.9786354238456237, + "grad_norm": 2.264927387237549, + "learning_rate": 1.2102740532141101e-08, + "logits/chosen": -3.0722849369049072, + "logits/rejected": -3.0306591987609863, + "logps/chosen": -56.12786102294922, + "logps/rejected": -54.975341796875, + "loss": 0.6855, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.017493674531579018, + "rewards/margins": 0.015774184837937355, + "rewards/rejected": -0.033267855644226074, + "step": 5680 + }, + { + "epoch": 0.9803583735354927, + "grad_norm": 2.7673423290252686, + "learning_rate": 1.207333167724116e-08, + "logits/chosen": -3.095992088317871, + "logits/rejected": -3.0646653175354004, + "logps/chosen": -59.16156005859375, + "logps/rejected": -54.520599365234375, + "loss": 0.6849, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.011630335822701454, + "rewards/margins": 0.01691516861319542, + "rewards/rejected": -0.028545504435896873, + "step": 5690 + }, + { + "epoch": 0.9820813232253618, + "grad_norm": 2.370692729949951, + "learning_rate": 1.2043904073043222e-08, + "logits/chosen": -2.9002363681793213, + "logits/rejected": -2.881931781768799, + "logps/chosen": -59.330894470214844, + "logps/rejected": -56.132362365722656, + "loss": 0.687, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.01545041799545288, + "rewards/margins": 0.012800877913832664, + "rewards/rejected": -0.028251299634575844, + "step": 5700 + }, + { + "epoch": 0.9820813232253618, + "eval_logits/chosen": -3.1337404251098633, + "eval_logits/rejected": -3.128058910369873, + "eval_logps/chosen": -58.77233123779297, + "eval_logps/rejected": -63.66284942626953, + "eval_loss": 0.691124677658081, + "eval_rewards/accuracies": 0.5787639617919922, + "eval_rewards/chosen": -0.000604349363129586, + "eval_rewards/margins": 0.004222996532917023, + "eval_rewards/rejected": -0.004827346187084913, + "eval_runtime": 384.0163, + "eval_samples_per_second": 11.208, + "eval_steps_per_second": 1.401, + "step": 5700 + }, + { + "epoch": 0.9838042729152309, + "grad_norm": 2.2526307106018066, + "learning_rate": 1.2014457985663371e-08, + "logits/chosen": -3.086958885192871, + "logits/rejected": -3.057468891143799, + "logps/chosen": -56.86411666870117, + "logps/rejected": -55.9247932434082, + "loss": 0.6855, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.017509115859866142, + "rewards/margins": 0.015910452231764793, + "rewards/rejected": -0.033419571816921234, + "step": 5710 + }, + { + "epoch": 0.9855272226051, + "grad_norm": 2.3755736351013184, + "learning_rate": 1.1984993681384845e-08, + "logits/chosen": -3.0592103004455566, + "logits/rejected": -3.035421133041382, + "logps/chosen": -53.76625442504883, + "logps/rejected": -53.1986083984375, + "loss": 0.6854, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.01754259690642357, + "rewards/margins": 0.016087189316749573, + "rewards/rejected": -0.03362978622317314, + "step": 5720 + }, + { + "epoch": 0.987250172294969, + "grad_norm": 2.365424394607544, + "learning_rate": 1.1955511426655622e-08, + "logits/chosen": -3.1162123680114746, + "logits/rejected": -3.104675769805908, + "logps/chosen": -53.531227111816406, + "logps/rejected": -54.61872482299805, + "loss": 0.6887, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01806630939245224, + "rewards/margins": 0.009289136156439781, + "rewards/rejected": -0.02735544741153717, + "step": 5730 + }, + { + "epoch": 0.988973121984838, + "grad_norm": 2.421537399291992, + "learning_rate": 1.1926011488085994e-08, + "logits/chosen": -3.0126664638519287, + "logits/rejected": -2.9804635047912598, + "logps/chosen": -61.017852783203125, + "logps/rejected": -57.800559997558594, + "loss": 0.6857, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.018092989921569824, + "rewards/margins": 0.015476164408028126, + "rewards/rejected": -0.033569153398275375, + "step": 5740 + }, + { + "epoch": 0.9906960716747071, + "grad_norm": 2.609236001968384, + "learning_rate": 1.189649413244618e-08, + "logits/chosen": -3.150195360183716, + "logits/rejected": -3.1196980476379395, + "logps/chosen": -58.42035675048828, + "logps/rejected": -53.55390167236328, + "loss": 0.6858, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.01756112650036812, + "rewards/margins": 0.015267576090991497, + "rewards/rejected": -0.03282870352268219, + "step": 5750 + }, + { + "epoch": 0.9924190213645762, + "grad_norm": 2.3347179889678955, + "learning_rate": 1.1866959626663902e-08, + "logits/chosen": -3.0212275981903076, + "logits/rejected": -2.9948441982269287, + "logps/chosen": -58.001365661621094, + "logps/rejected": -58.6787109375, + "loss": 0.6859, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.01591596007347107, + "rewards/margins": 0.015181079506874084, + "rewards/rejected": -0.031097035855054855, + "step": 5760 + }, + { + "epoch": 0.9941419710544452, + "grad_norm": 2.569519281387329, + "learning_rate": 1.183740823782197e-08, + "logits/chosen": -2.986656665802002, + "logits/rejected": -2.9676034450531006, + "logps/chosen": -53.551719665527344, + "logps/rejected": -56.345191955566406, + "loss": 0.6903, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.016970595344901085, + "rewards/margins": 0.00601046672090888, + "rewards/rejected": -0.022981060668826103, + "step": 5770 + }, + { + "epoch": 0.9958649207443143, + "grad_norm": 2.448632001876831, + "learning_rate": 1.1807840233155862e-08, + "logits/chosen": -3.0169689655303955, + "logits/rejected": -2.9971630573272705, + "logps/chosen": -53.976890563964844, + "logps/rejected": -56.612152099609375, + "loss": 0.6878, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.01513325423002243, + "rewards/margins": 0.011307798326015472, + "rewards/rejected": -0.026441048830747604, + "step": 5780 + }, + { + "epoch": 0.9975878704341833, + "grad_norm": 2.341364860534668, + "learning_rate": 1.1778255880051325e-08, + "logits/chosen": -2.9556498527526855, + "logits/rejected": -2.921391725540161, + "logps/chosen": -53.52280807495117, + "logps/rejected": -56.76100540161133, + "loss": 0.6896, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.020487099885940552, + "rewards/margins": 0.007790186908096075, + "rewards/rejected": -0.028277289122343063, + "step": 5790 + }, + { + "epoch": 0.9993108201240524, + "grad_norm": 2.5893890857696533, + "learning_rate": 1.1748655446041944e-08, + "logits/chosen": -3.0420236587524414, + "logits/rejected": -3.0131123065948486, + "logps/chosen": -52.47263717651367, + "logps/rejected": -55.4683952331543, + "loss": 0.6857, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.017217550426721573, + "rewards/margins": 0.015593932941555977, + "rewards/rejected": -0.0328114852309227, + "step": 5800 + }, + { + "epoch": 0.9993108201240524, + "eval_logits/chosen": -3.1334035396575928, + "eval_logits/rejected": -3.127761125564575, + "eval_logps/chosen": -58.7999267578125, + "eval_logps/rejected": -63.68791198730469, + "eval_loss": 0.6911415457725525, + "eval_rewards/accuracies": 0.5713289976119995, + "eval_rewards/chosen": -0.0008803335367701948, + "eval_rewards/margins": 0.004197545349597931, + "eval_rewards/rejected": -0.005077878944575787, + "eval_runtime": 383.943, + "eval_samples_per_second": 11.21, + "eval_steps_per_second": 1.401, + "step": 5800 + }, + { + "epoch": 1.0010337698139213, + "grad_norm": 2.1232244968414307, + "learning_rate": 1.171903919880672e-08, + "logits/chosen": -3.1008832454681396, + "logits/rejected": -3.0849082469940186, + "logps/chosen": -55.886863708496094, + "logps/rejected": -56.52936553955078, + "loss": 0.6878, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.018158026039600372, + "rewards/margins": 0.011155323125422001, + "rewards/rejected": -0.029313350096344948, + "step": 5810 + }, + { + "epoch": 1.0027567195037905, + "grad_norm": 2.475459098815918, + "learning_rate": 1.1689407406167661e-08, + "logits/chosen": -3.106663465499878, + "logits/rejected": -3.072197675704956, + "logps/chosen": -54.98704147338867, + "logps/rejected": -52.68647384643555, + "loss": 0.6864, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.019610222429037094, + "rewards/margins": 0.013955632224678993, + "rewards/rejected": -0.03356585651636124, + "step": 5820 + }, + { + "epoch": 1.0044796691936595, + "grad_norm": 2.4394052028656006, + "learning_rate": 1.1659760336087344e-08, + "logits/chosen": -2.99074649810791, + "logits/rejected": -2.958397150039673, + "logps/chosen": -54.87736129760742, + "logps/rejected": -55.85693359375, + "loss": 0.6854, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.013808014802634716, + "rewards/margins": 0.01611264795064926, + "rewards/rejected": -0.029920663684606552, + "step": 5830 + }, + { + "epoch": 1.0062026188835287, + "grad_norm": 2.4401886463165283, + "learning_rate": 1.1630098256666513e-08, + "logits/chosen": -3.0222973823547363, + "logits/rejected": -3.0010902881622314, + "logps/chosen": -54.44184494018555, + "logps/rejected": -57.95893478393555, + "loss": 0.6868, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.02053814008831978, + "rewards/margins": 0.013348013162612915, + "rewards/rejected": -0.03388615697622299, + "step": 5840 + }, + { + "epoch": 1.0079255685733977, + "grad_norm": 2.449007272720337, + "learning_rate": 1.160042143614163e-08, + "logits/chosen": -3.091636896133423, + "logits/rejected": -3.070950508117676, + "logps/chosen": -52.696617126464844, + "logps/rejected": -57.5743408203125, + "loss": 0.687, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.017818491905927658, + "rewards/margins": 0.012944323010742664, + "rewards/rejected": -0.030762815847992897, + "step": 5850 + }, + { + "epoch": 1.0096485182632666, + "grad_norm": 2.2463791370391846, + "learning_rate": 1.157073014288247e-08, + "logits/chosen": -2.9557888507843018, + "logits/rejected": -2.9458224773406982, + "logps/chosen": -53.57053756713867, + "logps/rejected": -56.65851974487305, + "loss": 0.6853, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.017971809953451157, + "rewards/margins": 0.01625947281718254, + "rewards/rejected": -0.0342312827706337, + "step": 5860 + }, + { + "epoch": 1.0113714679531358, + "grad_norm": 2.7064640522003174, + "learning_rate": 1.1541024645389687e-08, + "logits/chosen": -2.9869303703308105, + "logits/rejected": -2.9656178951263428, + "logps/chosen": -58.08588790893555, + "logps/rejected": -58.493927001953125, + "loss": 0.6857, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.01905050501227379, + "rewards/margins": 0.015620703808963299, + "rewards/rejected": -0.03467120975255966, + "step": 5870 + }, + { + "epoch": 1.0130944176430048, + "grad_norm": 2.561938524246216, + "learning_rate": 1.1511305212292376e-08, + "logits/chosen": -3.068009853363037, + "logits/rejected": -3.0549399852752686, + "logps/chosen": -56.194244384765625, + "logps/rejected": -55.732421875, + "loss": 0.6876, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.014522912912070751, + "rewards/margins": 0.011498660780489445, + "rewards/rejected": -0.026021573692560196, + "step": 5880 + }, + { + "epoch": 1.014817367332874, + "grad_norm": 2.3319783210754395, + "learning_rate": 1.1481572112345666e-08, + "logits/chosen": -3.092992067337036, + "logits/rejected": -3.0787858963012695, + "logps/chosen": -56.552085876464844, + "logps/rejected": -60.93707275390625, + "loss": 0.6867, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.01386711560189724, + "rewards/margins": 0.01341196894645691, + "rewards/rejected": -0.02727908454835415, + "step": 5890 + }, + { + "epoch": 1.016540317022743, + "grad_norm": 2.4611449241638184, + "learning_rate": 1.1451825614428266e-08, + "logits/chosen": -2.9957923889160156, + "logits/rejected": -2.975466251373291, + "logps/chosen": -57.44199752807617, + "logps/rejected": -57.857276916503906, + "loss": 0.6864, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.01775974966585636, + "rewards/margins": 0.014119607396423817, + "rewards/rejected": -0.03187935799360275, + "step": 5900 + }, + { + "epoch": 1.016540317022743, + "eval_logits/chosen": -3.1327173709869385, + "eval_logits/rejected": -3.1270527839660645, + "eval_logps/chosen": -58.829933166503906, + "eval_logps/rejected": -63.7348747253418, + "eval_loss": 0.6910606026649475, + "eval_rewards/accuracies": 0.5787639617919922, + "eval_rewards/chosen": -0.001180329010821879, + "eval_rewards/margins": 0.004367194604128599, + "eval_rewards/rejected": -0.005547523498535156, + "eval_runtime": 384.3077, + "eval_samples_per_second": 11.199, + "eval_steps_per_second": 1.4, + "step": 5900 + }, + { + "epoch": 1.018263266712612, + "grad_norm": 2.4447977542877197, + "learning_rate": 1.1422065987540045e-08, + "logits/chosen": -2.992640972137451, + "logits/rejected": -2.9968771934509277, + "logps/chosen": -54.487457275390625, + "logps/rejected": -60.301239013671875, + "loss": 0.6904, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.01974978856742382, + "rewards/margins": 0.006094303913414478, + "rewards/rejected": -0.025844091549515724, + "step": 5910 + }, + { + "epoch": 1.019986216402481, + "grad_norm": 2.307249069213867, + "learning_rate": 1.1392293500799604e-08, + "logits/chosen": -3.047121047973633, + "logits/rejected": -3.0284416675567627, + "logps/chosen": -53.3023681640625, + "logps/rejected": -56.572227478027344, + "loss": 0.6872, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.018186841160058975, + "rewards/margins": 0.012462997809052467, + "rewards/rejected": -0.030649837106466293, + "step": 5920 + }, + { + "epoch": 1.02170916609235, + "grad_norm": 2.171379566192627, + "learning_rate": 1.1362508423441831e-08, + "logits/chosen": -3.050178289413452, + "logits/rejected": -3.0439326763153076, + "logps/chosen": -52.685142517089844, + "logps/rejected": -54.45989227294922, + "loss": 0.6892, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.024569392204284668, + "rewards/margins": 0.008357712998986244, + "rewards/rejected": -0.03292710706591606, + "step": 5930 + }, + { + "epoch": 1.0234321157822193, + "grad_norm": 2.4548449516296387, + "learning_rate": 1.1332711024815471e-08, + "logits/chosen": -3.0522618293762207, + "logits/rejected": -3.0109293460845947, + "logps/chosen": -55.6926155090332, + "logps/rejected": -56.47785186767578, + "loss": 0.6837, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.014554401859641075, + "rewards/margins": 0.019560344517230988, + "rewards/rejected": -0.03411474451422691, + "step": 5940 + }, + { + "epoch": 1.0251550654720882, + "grad_norm": 2.706624746322632, + "learning_rate": 1.1302901574380701e-08, + "logits/chosen": -2.9206995964050293, + "logits/rejected": -2.917583465576172, + "logps/chosen": -55.23699951171875, + "logps/rejected": -56.68768310546875, + "loss": 0.6874, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.015318076126277447, + "rewards/margins": 0.011847314424812794, + "rewards/rejected": -0.02716539241373539, + "step": 5950 + }, + { + "epoch": 1.0268780151619572, + "grad_norm": 2.4179177284240723, + "learning_rate": 1.1273080341706672e-08, + "logits/chosen": -2.978672742843628, + "logits/rejected": -2.9258780479431152, + "logps/chosen": -59.678466796875, + "logps/rejected": -54.84111404418945, + "loss": 0.685, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.013551495969295502, + "rewards/margins": 0.016849249601364136, + "rewards/rejected": -0.030400747433304787, + "step": 5960 + }, + { + "epoch": 1.0286009648518264, + "grad_norm": 3.122251033782959, + "learning_rate": 1.1243247596469087e-08, + "logits/chosen": -2.9761157035827637, + "logits/rejected": -2.958291530609131, + "logps/chosen": -54.1254997253418, + "logps/rejected": -53.59160614013672, + "loss": 0.6861, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.022281497716903687, + "rewards/margins": 0.014488357119262218, + "rewards/rejected": -0.03676985576748848, + "step": 5970 + }, + { + "epoch": 1.0303239145416954, + "grad_norm": 2.5445525646209717, + "learning_rate": 1.1213403608447758e-08, + "logits/chosen": -2.977578639984131, + "logits/rejected": -2.977252244949341, + "logps/chosen": -54.674766540527344, + "logps/rejected": -59.23362350463867, + "loss": 0.6904, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.0210475604981184, + "rewards/margins": 0.0061415620148181915, + "rewards/rejected": -0.02718912623822689, + "step": 5980 + }, + { + "epoch": 1.0320468642315643, + "grad_norm": 2.410888910293579, + "learning_rate": 1.1183548647524173e-08, + "logits/chosen": -3.0514755249023438, + "logits/rejected": -3.0170586109161377, + "logps/chosen": -57.01215744018555, + "logps/rejected": -55.18632125854492, + "loss": 0.6872, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.01939382217824459, + "rewards/margins": 0.012516381219029427, + "rewards/rejected": -0.03191020339727402, + "step": 5990 + }, + { + "epoch": 1.0337698139214335, + "grad_norm": 1.9471714496612549, + "learning_rate": 1.1153682983679035e-08, + "logits/chosen": -3.0283169746398926, + "logits/rejected": -3.0108890533447266, + "logps/chosen": -54.46110916137695, + "logps/rejected": -55.54545211791992, + "loss": 0.6888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.023570090532302856, + "rewards/margins": 0.00914863869547844, + "rewards/rejected": -0.032718725502491, + "step": 6000 + }, + { + "epoch": 1.0337698139214335, + "eval_logits/chosen": -3.1315500736236572, + "eval_logits/rejected": -3.1258907318115234, + "eval_logps/chosen": -58.85401916503906, + "eval_logps/rejected": -63.76580047607422, + "eval_loss": 0.6910278797149658, + "eval_rewards/accuracies": 0.5789963006973267, + "eval_rewards/chosen": -0.0014212463283911347, + "eval_rewards/margins": 0.004435404669493437, + "eval_rewards/rejected": -0.005856651347130537, + "eval_runtime": 383.555, + "eval_samples_per_second": 11.221, + "eval_steps_per_second": 1.403, + "step": 6000 + }, + { + "epoch": 1.0354927636113025, + "grad_norm": 2.3859405517578125, + "learning_rate": 1.1123806886989844e-08, + "logits/chosen": -3.02860426902771, + "logits/rejected": -3.010640859603882, + "logps/chosen": -55.074371337890625, + "logps/rejected": -54.42387771606445, + "loss": 0.6885, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.020391542464494705, + "rewards/margins": 0.009787985123693943, + "rewards/rejected": -0.030179524794220924, + "step": 6010 + }, + { + "epoch": 1.0372157133011717, + "grad_norm": 2.363797664642334, + "learning_rate": 1.1093920627628442e-08, + "logits/chosen": -3.2017006874084473, + "logits/rejected": -3.1685476303100586, + "logps/chosen": -56.03865432739258, + "logps/rejected": -53.24171829223633, + "loss": 0.6851, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.016513699665665627, + "rewards/margins": 0.01690778322517872, + "rewards/rejected": -0.033421482890844345, + "step": 6020 + }, + { + "epoch": 1.0389386629910407, + "grad_norm": 2.2829158306121826, + "learning_rate": 1.1064024475858577e-08, + "logits/chosen": -2.854118824005127, + "logits/rejected": -2.8442769050598145, + "logps/chosen": -52.7369384765625, + "logps/rejected": -53.722251892089844, + "loss": 0.691, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02898944541811943, + "rewards/margins": 0.005069796461611986, + "rewards/rejected": -0.03405924141407013, + "step": 6030 + }, + { + "epoch": 1.0406616126809096, + "grad_norm": 2.114177703857422, + "learning_rate": 1.1034118702033446e-08, + "logits/chosen": -3.052936553955078, + "logits/rejected": -3.0325207710266113, + "logps/chosen": -55.636146545410156, + "logps/rejected": -55.01488494873047, + "loss": 0.6864, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.01927899941802025, + "rewards/margins": 0.014106673188507557, + "rewards/rejected": -0.03338567540049553, + "step": 6040 + }, + { + "epoch": 1.0423845623707788, + "grad_norm": 2.4536662101745605, + "learning_rate": 1.1004203576593268e-08, + "logits/chosen": -2.949414014816284, + "logits/rejected": -2.9219253063201904, + "logps/chosen": -61.292701721191406, + "logps/rejected": -57.094200134277344, + "loss": 0.6862, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.019505826756358147, + "rewards/margins": 0.014441567473113537, + "rewards/rejected": -0.03394739329814911, + "step": 6050 + }, + { + "epoch": 1.0441075120606478, + "grad_norm": 2.0977792739868164, + "learning_rate": 1.0974279370062827e-08, + "logits/chosen": -3.0255239009857178, + "logits/rejected": -3.002349853515625, + "logps/chosen": -55.601539611816406, + "logps/rejected": -55.09684371948242, + "loss": 0.6883, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.021067507565021515, + "rewards/margins": 0.010256089270114899, + "rewards/rejected": -0.031323596835136414, + "step": 6060 + }, + { + "epoch": 1.045830461750517, + "grad_norm": 2.427433729171753, + "learning_rate": 1.0944346353049023e-08, + "logits/chosen": -3.0318515300750732, + "logits/rejected": -3.010066509246826, + "logps/chosen": -55.89380645751953, + "logps/rejected": -55.183677673339844, + "loss": 0.6859, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.019514724612236023, + "rewards/margins": 0.015096470713615417, + "rewards/rejected": -0.03461119160056114, + "step": 6070 + }, + { + "epoch": 1.047553411440386, + "grad_norm": 2.407802104949951, + "learning_rate": 1.0914404796238437e-08, + "logits/chosen": -2.998624801635742, + "logits/rejected": -2.977358102798462, + "logps/chosen": -60.79319381713867, + "logps/rejected": -56.703033447265625, + "loss": 0.6864, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.017175931483507156, + "rewards/margins": 0.01408243179321289, + "rewards/rejected": -0.03125835955142975, + "step": 6080 + }, + { + "epoch": 1.049276361130255, + "grad_norm": 2.5689022541046143, + "learning_rate": 1.088445497039487e-08, + "logits/chosen": -3.0697948932647705, + "logits/rejected": -3.0521798133850098, + "logps/chosen": -53.399330139160156, + "logps/rejected": -54.737457275390625, + "loss": 0.6865, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.019912278279662132, + "rewards/margins": 0.013947946950793266, + "rewards/rejected": -0.0338602289557457, + "step": 6090 + }, + { + "epoch": 1.050999310820124, + "grad_norm": 2.353585720062256, + "learning_rate": 1.0854497146356908e-08, + "logits/chosen": -3.035552978515625, + "logits/rejected": -3.032215118408203, + "logps/chosen": -58.44176483154297, + "logps/rejected": -57.085479736328125, + "loss": 0.6857, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.016585614532232285, + "rewards/margins": 0.01556225586682558, + "rewards/rejected": -0.03214786946773529, + "step": 6100 + }, + { + "epoch": 1.050999310820124, + "eval_logits/chosen": -3.131176471710205, + "eval_logits/rejected": -3.1255154609680176, + "eval_logps/chosen": -58.87300109863281, + "eval_logps/rejected": -63.80307388305664, + "eval_loss": 0.690942108631134, + "eval_rewards/accuracies": 0.5794609785079956, + "eval_rewards/chosen": -0.0016110517317429185, + "eval_rewards/margins": 0.004618438426405191, + "eval_rewards/rejected": -0.006229490041732788, + "eval_runtime": 383.3806, + "eval_samples_per_second": 11.226, + "eval_steps_per_second": 1.403, + "step": 6100 + }, + { + "epoch": 1.052722260509993, + "grad_norm": 2.3778581619262695, + "learning_rate": 1.0824531595035451e-08, + "logits/chosen": -3.0413858890533447, + "logits/rejected": -3.0262722969055176, + "logps/chosen": -53.48925018310547, + "logps/rejected": -56.27558135986328, + "loss": 0.6884, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.01790313422679901, + "rewards/margins": 0.009967513382434845, + "rewards/rejected": -0.027870649471879005, + "step": 6110 + }, + { + "epoch": 1.0544452101998623, + "grad_norm": 2.639493465423584, + "learning_rate": 1.0794558587411295e-08, + "logits/chosen": -3.1496880054473877, + "logits/rejected": -3.1011807918548584, + "logps/chosen": -55.5837516784668, + "logps/rejected": -56.8000373840332, + "loss": 0.6833, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.012620387598872185, + "rewards/margins": 0.020604535937309265, + "rewards/rejected": -0.0332249216735363, + "step": 6120 + }, + { + "epoch": 1.0561681598897312, + "grad_norm": 2.5370519161224365, + "learning_rate": 1.0764578394532654e-08, + "logits/chosen": -3.0457675457000732, + "logits/rejected": -3.020132541656494, + "logps/chosen": -56.27012252807617, + "logps/rejected": -57.2618408203125, + "loss": 0.6862, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.018784884363412857, + "rewards/margins": 0.014378098770976067, + "rewards/rejected": -0.033162981271743774, + "step": 6130 + }, + { + "epoch": 1.0578911095796002, + "grad_norm": 2.4076688289642334, + "learning_rate": 1.0734591287512721e-08, + "logits/chosen": -3.081873893737793, + "logits/rejected": -3.0696253776550293, + "logps/chosen": -55.290489196777344, + "logps/rejected": -55.50571823120117, + "loss": 0.6894, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.022115176543593407, + "rewards/margins": 0.00814732350409031, + "rewards/rejected": -0.030262500047683716, + "step": 6140 + }, + { + "epoch": 1.0596140592694694, + "grad_norm": 2.379582166671753, + "learning_rate": 1.0704597537527212e-08, + "logits/chosen": -2.981661319732666, + "logits/rejected": -2.966520071029663, + "logps/chosen": -55.33698654174805, + "logps/rejected": -53.2189826965332, + "loss": 0.6889, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.02329077199101448, + "rewards/margins": 0.009010384790599346, + "rewards/rejected": -0.0323011577129364, + "step": 6150 + }, + { + "epoch": 1.0613370089593384, + "grad_norm": 2.3428122997283936, + "learning_rate": 1.067459741581192e-08, + "logits/chosen": -2.9435131549835205, + "logits/rejected": -2.9327847957611084, + "logps/chosen": -52.50434494018555, + "logps/rejected": -55.611785888671875, + "loss": 0.6858, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.015285499393939972, + "rewards/margins": 0.015342341735959053, + "rewards/rejected": -0.030627842992544174, + "step": 6160 + }, + { + "epoch": 1.0630599586492075, + "grad_norm": 2.60860538482666, + "learning_rate": 1.0644591193660252e-08, + "logits/chosen": -3.0668625831604004, + "logits/rejected": -3.0502090454101562, + "logps/chosen": -61.2022819519043, + "logps/rejected": -59.66276168823242, + "loss": 0.6862, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.01741643063724041, + "rewards/margins": 0.01449237484484911, + "rewards/rejected": -0.031908802688121796, + "step": 6170 + }, + { + "epoch": 1.0647829083390765, + "grad_norm": 2.6758952140808105, + "learning_rate": 1.0614579142420786e-08, + "logits/chosen": -3.082418918609619, + "logits/rejected": -3.0575006008148193, + "logps/chosen": -59.830848693847656, + "logps/rejected": -56.3403434753418, + "loss": 0.6851, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.016757013276219368, + "rewards/margins": 0.016851117834448814, + "rewards/rejected": -0.03360813111066818, + "step": 6180 + }, + { + "epoch": 1.0665058580289455, + "grad_norm": 2.2758195400238037, + "learning_rate": 1.0584561533494817e-08, + "logits/chosen": -3.04453706741333, + "logits/rejected": -3.0198044776916504, + "logps/chosen": -59.79877471923828, + "logps/rejected": -56.4383544921875, + "loss": 0.6844, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.016228018328547478, + "rewards/margins": 0.018129851669073105, + "rewards/rejected": -0.03435786813497543, + "step": 6190 + }, + { + "epoch": 1.0682288077188147, + "grad_norm": 2.3243348598480225, + "learning_rate": 1.0554538638333888e-08, + "logits/chosen": -2.933687925338745, + "logits/rejected": -2.9139504432678223, + "logps/chosen": -56.769676208496094, + "logps/rejected": -58.42494583129883, + "loss": 0.6889, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.022551879286766052, + "rewards/margins": 0.009325524792075157, + "rewards/rejected": -0.03187740594148636, + "step": 6200 + }, + { + "epoch": 1.0682288077188147, + "eval_logits/chosen": -3.1305487155914307, + "eval_logits/rejected": -3.124844789505005, + "eval_logps/chosen": -58.903194427490234, + "eval_logps/rejected": -63.83762741088867, + "eval_loss": 0.6909228563308716, + "eval_rewards/accuracies": 0.5764405131340027, + "eval_rewards/chosen": -0.0019130135187879205, + "eval_rewards/margins": 0.00466204434633255, + "eval_rewards/rejected": -0.006575057283043861, + "eval_runtime": 384.0941, + "eval_samples_per_second": 11.206, + "eval_steps_per_second": 1.401, + "step": 6200 + }, + { + "epoch": 1.0699517574086836, + "grad_norm": 2.357759952545166, + "learning_rate": 1.0524510728437354e-08, + "logits/chosen": -3.0520224571228027, + "logits/rejected": -3.0220112800598145, + "logps/chosen": -56.74927520751953, + "logps/rejected": -56.363197326660156, + "loss": 0.6858, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.019240161404013634, + "rewards/margins": 0.015431523323059082, + "rewards/rejected": -0.034671682864427567, + "step": 6210 + }, + { + "epoch": 1.0716747070985528, + "grad_norm": 2.284916877746582, + "learning_rate": 1.049447807534992e-08, + "logits/chosen": -3.0356788635253906, + "logits/rejected": -3.0147993564605713, + "logps/chosen": -54.05214309692383, + "logps/rejected": -55.98185348510742, + "loss": 0.6859, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.019131021574139595, + "rewards/margins": 0.015217870473861694, + "rewards/rejected": -0.03434889391064644, + "step": 6220 + }, + { + "epoch": 1.0733976567884218, + "grad_norm": 2.3051514625549316, + "learning_rate": 1.0464440950659173e-08, + "logits/chosen": -3.175098419189453, + "logits/rejected": -3.156182289123535, + "logps/chosen": -60.29032516479492, + "logps/rejected": -59.4665641784668, + "loss": 0.6862, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.014576122164726257, + "rewards/margins": 0.014566306956112385, + "rewards/rejected": -0.029142428189516068, + "step": 6230 + }, + { + "epoch": 1.0751206064782908, + "grad_norm": 2.666886568069458, + "learning_rate": 1.043439962599315e-08, + "logits/chosen": -2.989060640335083, + "logits/rejected": -2.956812620162964, + "logps/chosen": -58.37445831298828, + "logps/rejected": -56.85335159301758, + "loss": 0.6877, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.019876528531312943, + "rewards/margins": 0.011631477624177933, + "rewards/rejected": -0.031508006155490875, + "step": 6240 + }, + { + "epoch": 1.07684355616816, + "grad_norm": 2.515204429626465, + "learning_rate": 1.0404354373017859e-08, + "logits/chosen": -3.07537841796875, + "logits/rejected": -3.052424907684326, + "logps/chosen": -57.4842529296875, + "logps/rejected": -56.97715377807617, + "loss": 0.6868, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01756465807557106, + "rewards/margins": 0.01326189748942852, + "rewards/rejected": -0.03082655929028988, + "step": 6250 + }, + { + "epoch": 1.078566505858029, + "grad_norm": 2.4106087684631348, + "learning_rate": 1.037430546343484e-08, + "logits/chosen": -2.953723192214966, + "logits/rejected": -2.932874917984009, + "logps/chosen": -56.1456413269043, + "logps/rejected": -54.07300567626953, + "loss": 0.6862, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.02221234142780304, + "rewards/margins": 0.014708032831549644, + "rewards/rejected": -0.03692037612199783, + "step": 6260 + }, + { + "epoch": 1.080289455547898, + "grad_norm": 2.4935922622680664, + "learning_rate": 1.0344253168978695e-08, + "logits/chosen": -3.2191452980041504, + "logits/rejected": -3.218228816986084, + "logps/chosen": -54.688499450683594, + "logps/rejected": -57.696754455566406, + "loss": 0.6871, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01787818782031536, + "rewards/margins": 0.012707440182566643, + "rewards/rejected": -0.030585628002882004, + "step": 6270 + }, + { + "epoch": 1.082012405237767, + "grad_norm": 2.4286909103393555, + "learning_rate": 1.0314197761414636e-08, + "logits/chosen": -2.9016292095184326, + "logits/rejected": -2.876361846923828, + "logps/chosen": -56.48183059692383, + "logps/rejected": -56.19874954223633, + "loss": 0.6861, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.020695989951491356, + "rewards/margins": 0.014859400689601898, + "rewards/rejected": -0.0355553925037384, + "step": 6280 + }, + { + "epoch": 1.083735354927636, + "grad_norm": 2.36930775642395, + "learning_rate": 1.0284139512536028e-08, + "logits/chosen": -2.980339765548706, + "logits/rejected": -2.9492430686950684, + "logps/chosen": -53.1446418762207, + "logps/rejected": -56.06641387939453, + "loss": 0.6838, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.017662178725004196, + "rewards/margins": 0.019239947199821472, + "rewards/rejected": -0.03690212219953537, + "step": 6290 + }, + { + "epoch": 1.0854583046175053, + "grad_norm": 2.4871668815612793, + "learning_rate": 1.0254078694161929e-08, + "logits/chosen": -3.057957887649536, + "logits/rejected": -3.0294573307037354, + "logps/chosen": -54.77833938598633, + "logps/rejected": -55.77891159057617, + "loss": 0.6865, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.023420926183462143, + "rewards/margins": 0.013916957192122936, + "rewards/rejected": -0.037337884306907654, + "step": 6300 + }, + { + "epoch": 1.0854583046175053, + "eval_logits/chosen": -3.1302549839019775, + "eval_logits/rejected": -3.1245484352111816, + "eval_logps/chosen": -58.927452087402344, + "eval_logps/rejected": -63.87955856323242, + "eval_loss": 0.6908382177352905, + "eval_rewards/accuracies": 0.5787639617919922, + "eval_rewards/chosen": -0.0021555284038186073, + "eval_rewards/margins": 0.0048387921415269375, + "eval_rewards/rejected": -0.006994321011006832, + "eval_runtime": 383.6447, + "eval_samples_per_second": 11.219, + "eval_steps_per_second": 1.402, + "step": 6300 + }, + { + "epoch": 1.0871812543073742, + "grad_norm": 2.4470012187957764, + "learning_rate": 1.0224015578134633e-08, + "logits/chosen": -3.0433483123779297, + "logits/rejected": -3.012185573577881, + "logps/chosen": -52.22063446044922, + "logps/rejected": -54.65047073364258, + "loss": 0.6867, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01763722486793995, + "rewards/margins": 0.013348887674510479, + "rewards/rejected": -0.030986111611127853, + "step": 6310 + }, + { + "epoch": 1.0889042039972432, + "grad_norm": 2.340951442718506, + "learning_rate": 1.019395043631722e-08, + "logits/chosen": -2.988467216491699, + "logits/rejected": -2.9664227962493896, + "logps/chosen": -56.86391067504883, + "logps/rejected": -56.512306213378906, + "loss": 0.6882, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.021419556811451912, + "rewards/margins": 0.0104674668982625, + "rewards/rejected": -0.03188702464103699, + "step": 6320 + }, + { + "epoch": 1.0906271536871124, + "grad_norm": 2.3158609867095947, + "learning_rate": 1.0163883540591075e-08, + "logits/chosen": -3.0004405975341797, + "logits/rejected": -2.9805476665496826, + "logps/chosen": -55.39896774291992, + "logps/rejected": -57.84352493286133, + "loss": 0.6829, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.021089566871523857, + "rewards/margins": 0.021264133974909782, + "rewards/rejected": -0.04235369712114334, + "step": 6330 + }, + { + "epoch": 1.0923501033769814, + "grad_norm": 2.5407843589782715, + "learning_rate": 1.0133815162853452e-08, + "logits/chosen": -3.0075252056121826, + "logits/rejected": -2.98484468460083, + "logps/chosen": -57.53838348388672, + "logps/rejected": -55.88555908203125, + "loss": 0.6881, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.019510172307491302, + "rewards/margins": 0.010738825425505638, + "rewards/rejected": -0.03024899959564209, + "step": 6340 + }, + { + "epoch": 1.0940730530668505, + "grad_norm": 2.110309362411499, + "learning_rate": 1.010374557501501e-08, + "logits/chosen": -2.989995241165161, + "logits/rejected": -2.9829235076904297, + "logps/chosen": -57.7088508605957, + "logps/rejected": -59.38560104370117, + "loss": 0.6891, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.02194030024111271, + "rewards/margins": 0.008906031958758831, + "rewards/rejected": -0.030846333131194115, + "step": 6350 + }, + { + "epoch": 1.0957960027567195, + "grad_norm": 2.4076998233795166, + "learning_rate": 1.0073675048997344e-08, + "logits/chosen": -3.0212626457214355, + "logits/rejected": -3.0192079544067383, + "logps/chosen": -55.68696212768555, + "logps/rejected": -57.60249710083008, + "loss": 0.6894, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.02218361385166645, + "rewards/margins": 0.008003572933375835, + "rewards/rejected": -0.03018718585371971, + "step": 6360 + }, + { + "epoch": 1.0975189524465885, + "grad_norm": 2.4917333126068115, + "learning_rate": 1.004360385673054e-08, + "logits/chosen": -3.0473618507385254, + "logits/rejected": -3.0467488765716553, + "logps/chosen": -55.46171188354492, + "logps/rejected": -57.114402770996094, + "loss": 0.6917, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.023173917084932327, + "rewards/margins": 0.0035490102600306273, + "rewards/rejected": -0.026722926646471024, + "step": 6370 + }, + { + "epoch": 1.0992419021364577, + "grad_norm": 2.7534730434417725, + "learning_rate": 1.0013532270150699e-08, + "logits/chosen": -3.0758779048919678, + "logits/rejected": -3.064692974090576, + "logps/chosen": -54.99663543701172, + "logps/rejected": -59.77294158935547, + "loss": 0.6885, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.022621020674705505, + "rewards/margins": 0.009905163198709488, + "rewards/rejected": -0.03252618387341499, + "step": 6380 + }, + { + "epoch": 1.1009648518263266, + "grad_norm": 2.4464833736419678, + "learning_rate": 9.983460561197496e-09, + "logits/chosen": -3.1145012378692627, + "logits/rejected": -3.0785086154937744, + "logps/chosen": -57.59659957885742, + "logps/rejected": -54.70061492919922, + "loss": 0.6826, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.015128199942409992, + "rewards/margins": 0.021856555715203285, + "rewards/rejected": -0.03698475658893585, + "step": 6390 + }, + { + "epoch": 1.1026878015161956, + "grad_norm": 2.506380319595337, + "learning_rate": 9.953389001811714e-09, + "logits/chosen": -3.0488944053649902, + "logits/rejected": -3.024442434310913, + "logps/chosen": -59.581787109375, + "logps/rejected": -59.423805236816406, + "loss": 0.6884, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.018883857876062393, + "rewards/margins": 0.009957622736692429, + "rewards/rejected": -0.028841480612754822, + "step": 6400 + }, + { + "epoch": 1.1026878015161956, + "eval_logits/chosen": -3.1286780834198, + "eval_logits/rejected": -3.1230478286743164, + "eval_logps/chosen": -58.95232391357422, + "eval_logps/rejected": -63.89405059814453, + "eval_loss": 0.6908935308456421, + "eval_rewards/accuracies": 0.5748141407966614, + "eval_rewards/chosen": -0.002404270227998495, + "eval_rewards/margins": 0.00473501393571496, + "eval_rewards/rejected": -0.007139283698052168, + "eval_runtime": 383.8443, + "eval_samples_per_second": 11.213, + "eval_steps_per_second": 1.402, + "step": 6400 + }, + { + "epoch": 1.1044107512060648, + "grad_norm": 2.3390676975250244, + "learning_rate": 9.923317863932776e-09, + "logits/chosen": -3.1495254039764404, + "logits/rejected": -3.1230266094207764, + "logps/chosen": -56.2658576965332, + "logps/rejected": -55.08391571044922, + "loss": 0.6862, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.019014541059732437, + "rewards/margins": 0.014427746646106243, + "rewards/rejected": -0.033442284911870956, + "step": 6410 + }, + { + "epoch": 1.1061337008959338, + "grad_norm": 2.3376822471618652, + "learning_rate": 9.8932474194963e-09, + "logits/chosen": -3.0909719467163086, + "logits/rejected": -3.0730559825897217, + "logps/chosen": -56.99043655395508, + "logps/rejected": -57.393707275390625, + "loss": 0.6872, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.021999698132276535, + "rewards/margins": 0.012688428163528442, + "rewards/rejected": -0.03468812629580498, + "step": 6420 + }, + { + "epoch": 1.107856650585803, + "grad_norm": 2.4933223724365234, + "learning_rate": 9.863177940431631e-09, + "logits/chosen": -3.000549793243408, + "logits/rejected": -2.986180543899536, + "logps/chosen": -52.349082946777344, + "logps/rejected": -51.32086181640625, + "loss": 0.6879, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.01957535557448864, + "rewards/margins": 0.01128177996724844, + "rewards/rejected": -0.030857134610414505, + "step": 6430 + }, + { + "epoch": 1.109579600275672, + "grad_norm": 2.119002103805542, + "learning_rate": 9.83310969865938e-09, + "logits/chosen": -2.9967103004455566, + "logits/rejected": -2.978527784347534, + "logps/chosen": -55.7023811340332, + "logps/rejected": -56.34955978393555, + "loss": 0.6899, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.024433055892586708, + "rewards/margins": 0.007135553751140833, + "rewards/rejected": -0.031568609178066254, + "step": 6440 + }, + { + "epoch": 1.111302549965541, + "grad_norm": 2.5312013626098633, + "learning_rate": 9.803042966088975e-09, + "logits/chosen": -3.0206658840179443, + "logits/rejected": -2.986959218978882, + "logps/chosen": -58.1439208984375, + "logps/rejected": -55.67827224731445, + "loss": 0.6853, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.01819276250898838, + "rewards/margins": 0.016308221966028214, + "rewards/rejected": -0.034500982612371445, + "step": 6450 + }, + { + "epoch": 1.11302549965541, + "grad_norm": 2.3651509284973145, + "learning_rate": 9.77297801461619e-09, + "logits/chosen": -2.984637498855591, + "logits/rejected": -2.98801851272583, + "logps/chosen": -53.89622116088867, + "logps/rejected": -61.230064392089844, + "loss": 0.687, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.020203817635774612, + "rewards/margins": 0.0128660649061203, + "rewards/rejected": -0.03306988254189491, + "step": 6460 + }, + { + "epoch": 1.114748449345279, + "grad_norm": 2.3542640209198, + "learning_rate": 9.742915116120702e-09, + "logits/chosen": -2.943131685256958, + "logits/rejected": -2.920292377471924, + "logps/chosen": -54.866676330566406, + "logps/rejected": -55.22333908081055, + "loss": 0.6866, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.021034687757492065, + "rewards/margins": 0.013601104728877544, + "rewards/rejected": -0.03463580086827278, + "step": 6470 + }, + { + "epoch": 1.1164713990351482, + "grad_norm": 2.671103000640869, + "learning_rate": 9.71285454246361e-09, + "logits/chosen": -3.0327906608581543, + "logits/rejected": -2.995478868484497, + "logps/chosen": -58.048858642578125, + "logps/rejected": -53.625083923339844, + "loss": 0.685, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.014800253324210644, + "rewards/margins": 0.016874242573976517, + "rewards/rejected": -0.031674496829509735, + "step": 6480 + }, + { + "epoch": 1.1181943487250172, + "grad_norm": 2.5911669731140137, + "learning_rate": 9.682796565485007e-09, + "logits/chosen": -3.2142341136932373, + "logits/rejected": -3.2014338970184326, + "logps/chosen": -55.8908576965332, + "logps/rejected": -57.41044998168945, + "loss": 0.6885, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.022705888375639915, + "rewards/margins": 0.010103853419423103, + "rewards/rejected": -0.03280974552035332, + "step": 6490 + }, + { + "epoch": 1.1199172984148862, + "grad_norm": 2.531972885131836, + "learning_rate": 9.65274145700148e-09, + "logits/chosen": -3.047877550125122, + "logits/rejected": -3.04103946685791, + "logps/chosen": -58.6346549987793, + "logps/rejected": -55.967201232910156, + "loss": 0.6893, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.024279167875647545, + "rewards/margins": 0.008274078369140625, + "rewards/rejected": -0.03255324810743332, + "step": 6500 + }, + { + "epoch": 1.1199172984148862, + "eval_logits/chosen": -3.1286838054656982, + "eval_logits/rejected": -3.123046875, + "eval_logps/chosen": -58.967567443847656, + "eval_logps/rejected": -63.92682647705078, + "eval_loss": 0.6908088326454163, + "eval_rewards/accuracies": 0.5813196897506714, + "eval_rewards/chosen": -0.0025567305274307728, + "eval_rewards/margins": 0.004910381976515055, + "eval_rewards/rejected": -0.00746711203828454, + "eval_runtime": 383.4046, + "eval_samples_per_second": 11.226, + "eval_steps_per_second": 1.403, + "step": 6500 + }, + { + "epoch": 1.1216402481047554, + "grad_norm": 2.5683422088623047, + "learning_rate": 9.622689488803698e-09, + "logits/chosen": -3.073943614959717, + "logits/rejected": -3.0271260738372803, + "logps/chosen": -58.1553955078125, + "logps/rejected": -56.480018615722656, + "loss": 0.6829, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.01371106505393982, + "rewards/margins": 0.02100132405757904, + "rewards/rejected": -0.03471238911151886, + "step": 6510 + }, + { + "epoch": 1.1233631977946243, + "grad_norm": 2.425645112991333, + "learning_rate": 9.592640932653922e-09, + "logits/chosen": -3.009979248046875, + "logits/rejected": -2.9836318492889404, + "logps/chosen": -55.428565979003906, + "logps/rejected": -54.707984924316406, + "loss": 0.6863, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0181711558252573, + "rewards/margins": 0.014331887476146221, + "rewards/rejected": -0.03250304237008095, + "step": 6520 + }, + { + "epoch": 1.1250861474844935, + "grad_norm": 2.4266066551208496, + "learning_rate": 9.562596060283558e-09, + "logits/chosen": -2.986335515975952, + "logits/rejected": -2.9570603370666504, + "logps/chosen": -55.52927780151367, + "logps/rejected": -55.080787658691406, + "loss": 0.6848, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.01905311644077301, + "rewards/margins": 0.017313417047262192, + "rewards/rejected": -0.0363665409386158, + "step": 6530 + }, + { + "epoch": 1.1268090971743625, + "grad_norm": 2.459099054336548, + "learning_rate": 9.532555143390696e-09, + "logits/chosen": -3.044667959213257, + "logits/rejected": -3.0172994136810303, + "logps/chosen": -59.32032012939453, + "logps/rejected": -59.080108642578125, + "loss": 0.6893, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.01945970579981804, + "rewards/margins": 0.008291837759315968, + "rewards/rejected": -0.02775154635310173, + "step": 6540 + }, + { + "epoch": 1.1285320468642315, + "grad_norm": 2.381169080734253, + "learning_rate": 9.502518453637671e-09, + "logits/chosen": -2.93902850151062, + "logits/rejected": -2.9150500297546387, + "logps/chosen": -54.03925323486328, + "logps/rejected": -55.1716194152832, + "loss": 0.6885, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.02493315376341343, + "rewards/margins": 0.00981233362108469, + "rewards/rejected": -0.034745484590530396, + "step": 6550 + }, + { + "epoch": 1.1302549965541007, + "grad_norm": 2.125992774963379, + "learning_rate": 9.472486262648568e-09, + "logits/chosen": -3.074824810028076, + "logits/rejected": -3.032547950744629, + "logps/chosen": -57.7005729675293, + "logps/rejected": -55.79645538330078, + "loss": 0.6823, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.016963675618171692, + "rewards/margins": 0.02236434444785118, + "rewards/rejected": -0.03932802379131317, + "step": 6560 + }, + { + "epoch": 1.1319779462439696, + "grad_norm": 2.3872108459472656, + "learning_rate": 9.442458842006816e-09, + "logits/chosen": -3.052938938140869, + "logits/rejected": -3.0232064723968506, + "logps/chosen": -55.656700134277344, + "logps/rejected": -56.49686813354492, + "loss": 0.685, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.020926889032125473, + "rewards/margins": 0.017068570479750633, + "rewards/rejected": -0.037995465099811554, + "step": 6570 + }, + { + "epoch": 1.1337008959338388, + "grad_norm": 2.2589311599731445, + "learning_rate": 9.412436463252682e-09, + "logits/chosen": -3.0725693702697754, + "logits/rejected": -3.0329666137695312, + "logps/chosen": -56.81243896484375, + "logps/rejected": -53.0896110534668, + "loss": 0.6869, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0178411602973938, + "rewards/margins": 0.013036638498306274, + "rewards/rejected": -0.030877795070409775, + "step": 6580 + }, + { + "epoch": 1.1354238456237078, + "grad_norm": 2.367614507675171, + "learning_rate": 9.382419397880853e-09, + "logits/chosen": -3.015820026397705, + "logits/rejected": -2.992987871170044, + "logps/chosen": -55.18305206298828, + "logps/rejected": -56.52252197265625, + "loss": 0.6899, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02343105897307396, + "rewards/margins": 0.006984441541135311, + "rewards/rejected": -0.030415501445531845, + "step": 6590 + }, + { + "epoch": 1.1371467953135768, + "grad_norm": 2.300924777984619, + "learning_rate": 9.35240791733796e-09, + "logits/chosen": -3.112875461578369, + "logits/rejected": -3.0967743396759033, + "logps/chosen": -58.0617561340332, + "logps/rejected": -57.01247024536133, + "loss": 0.6886, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0242941714823246, + "rewards/margins": 0.009952034801244736, + "rewards/rejected": -0.03424619883298874, + "step": 6600 + }, + { + "epoch": 1.1371467953135768, + "eval_logits/chosen": -3.127255916595459, + "eval_logits/rejected": -3.1215784549713135, + "eval_logps/chosen": -59.00900650024414, + "eval_logps/rejected": -63.97233581542969, + "eval_loss": 0.6907920241355896, + "eval_rewards/accuracies": 0.5748141407966614, + "eval_rewards/chosen": -0.0029711031820625067, + "eval_rewards/margins": 0.004950948059558868, + "eval_rewards/rejected": -0.007922051474452019, + "eval_runtime": 383.9688, + "eval_samples_per_second": 11.209, + "eval_steps_per_second": 1.401, + "step": 6600 + }, + { + "epoch": 1.138869745003446, + "grad_norm": 2.4028170108795166, + "learning_rate": 9.322402293020136e-09, + "logits/chosen": -3.066715717315674, + "logits/rejected": -3.046630382537842, + "logps/chosen": -54.22076416015625, + "logps/rejected": -55.5111083984375, + "loss": 0.6859, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.022545434534549713, + "rewards/margins": 0.015024189837276936, + "rewards/rejected": -0.037569623440504074, + "step": 6610 + }, + { + "epoch": 1.140592694693315, + "grad_norm": 2.5848283767700195, + "learning_rate": 9.292402796270548e-09, + "logits/chosen": -3.031874179840088, + "logits/rejected": -3.0092434883117676, + "logps/chosen": -55.2226448059082, + "logps/rejected": -55.86328125, + "loss": 0.6891, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.028317946940660477, + "rewards/margins": 0.008727455511689186, + "rewards/rejected": -0.037045400589704514, + "step": 6620 + }, + { + "epoch": 1.1423156443831841, + "grad_norm": 2.1752817630767822, + "learning_rate": 9.262409698376958e-09, + "logits/chosen": -3.014620780944824, + "logits/rejected": -2.9869723320007324, + "logps/chosen": -55.143951416015625, + "logps/rejected": -52.85613250732422, + "loss": 0.6843, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.01832455024123192, + "rewards/margins": 0.018313277512788773, + "rewards/rejected": -0.03663782775402069, + "step": 6630 + }, + { + "epoch": 1.144038594073053, + "grad_norm": 2.098524332046509, + "learning_rate": 9.23242327056926e-09, + "logits/chosen": -3.063933849334717, + "logits/rejected": -3.0252301692962646, + "logps/chosen": -55.5810432434082, + "logps/rejected": -53.812339782714844, + "loss": 0.6849, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.02475198730826378, + "rewards/margins": 0.017390215769410133, + "rewards/rejected": -0.04214220121502876, + "step": 6640 + }, + { + "epoch": 1.145761543762922, + "grad_norm": 2.544950246810913, + "learning_rate": 9.202443784017025e-09, + "logits/chosen": -3.094503879547119, + "logits/rejected": -3.0721499919891357, + "logps/chosen": -54.26481246948242, + "logps/rejected": -57.11799240112305, + "loss": 0.6854, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.021512877196073532, + "rewards/margins": 0.016131866723299026, + "rewards/rejected": -0.03764474391937256, + "step": 6650 + }, + { + "epoch": 1.1474844934527912, + "grad_norm": 2.321054220199585, + "learning_rate": 9.172471509827065e-09, + "logits/chosen": -3.13427472114563, + "logits/rejected": -3.114708423614502, + "logps/chosen": -53.75007247924805, + "logps/rejected": -57.930686950683594, + "loss": 0.6846, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.019893547520041466, + "rewards/margins": 0.017774097621440887, + "rewards/rejected": -0.0376676470041275, + "step": 6660 + }, + { + "epoch": 1.1492074431426602, + "grad_norm": 2.4262032508850098, + "learning_rate": 9.142506719040958e-09, + "logits/chosen": -3.0133087635040283, + "logits/rejected": -3.0175650119781494, + "logps/chosen": -52.8214111328125, + "logps/rejected": -58.95885467529297, + "loss": 0.6898, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.029145341366529465, + "rewards/margins": 0.007524232380092144, + "rewards/rejected": -0.03666957467794418, + "step": 6670 + }, + { + "epoch": 1.1509303928325294, + "grad_norm": 2.388805389404297, + "learning_rate": 9.112549682632617e-09, + "logits/chosen": -3.0323078632354736, + "logits/rejected": -3.0010530948638916, + "logps/chosen": -55.79850387573242, + "logps/rejected": -55.65300750732422, + "loss": 0.6854, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.019732866436243057, + "rewards/margins": 0.0161424670368433, + "rewards/rejected": -0.03587533161044121, + "step": 6680 + }, + { + "epoch": 1.1526533425223984, + "grad_norm": 2.479785203933716, + "learning_rate": 9.082600671505824e-09, + "logits/chosen": -3.018188953399658, + "logits/rejected": -2.996375799179077, + "logps/chosen": -59.262962341308594, + "logps/rejected": -59.838768005371094, + "loss": 0.684, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.021032914519309998, + "rewards/margins": 0.019005469977855682, + "rewards/rejected": -0.04003838449716568, + "step": 6690 + }, + { + "epoch": 1.1543762922122673, + "grad_norm": 2.6877920627593994, + "learning_rate": 9.052659956491801e-09, + "logits/chosen": -3.037140369415283, + "logits/rejected": -3.02573561668396, + "logps/chosen": -57.954002380371094, + "logps/rejected": -59.13984298706055, + "loss": 0.6865, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01825765147805214, + "rewards/margins": 0.014165830798447132, + "rewards/rejected": -0.032423485070466995, + "step": 6700 + }, + { + "epoch": 1.1543762922122673, + "eval_logits/chosen": -3.127453088760376, + "eval_logits/rejected": -3.1217737197875977, + "eval_logps/chosen": -59.034568786621094, + "eval_logps/rejected": -64.0009994506836, + "eval_loss": 0.6907802224159241, + "eval_rewards/accuracies": 0.5803903341293335, + "eval_rewards/chosen": -0.003226715140044689, + "eval_rewards/margins": 0.00498205004259944, + "eval_rewards/rejected": -0.008208764716982841, + "eval_runtime": 383.9771, + "eval_samples_per_second": 11.209, + "eval_steps_per_second": 1.401, + "step": 6700 + }, + { + "epoch": 1.1560992419021365, + "grad_norm": 2.5995121002197266, + "learning_rate": 9.02272780834673e-09, + "logits/chosen": -3.029447078704834, + "logits/rejected": -3.0037617683410645, + "logps/chosen": -58.503143310546875, + "logps/rejected": -57.007843017578125, + "loss": 0.6841, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.01577948033809662, + "rewards/margins": 0.018815193325281143, + "rewards/rejected": -0.03459467366337776, + "step": 6710 + }, + { + "epoch": 1.1578221915920055, + "grad_norm": 2.3430838584899902, + "learning_rate": 8.992804497749343e-09, + "logits/chosen": -3.024191379547119, + "logits/rejected": -3.0123202800750732, + "logps/chosen": -53.38079833984375, + "logps/rejected": -55.08259201049805, + "loss": 0.6857, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01965912990272045, + "rewards/margins": 0.01567569002509117, + "rewards/rejected": -0.03533482179045677, + "step": 6720 + }, + { + "epoch": 1.1595451412818747, + "grad_norm": 2.730286121368408, + "learning_rate": 8.96289029529843e-09, + "logits/chosen": -2.9752981662750244, + "logits/rejected": -2.959228754043579, + "logps/chosen": -58.8833122253418, + "logps/rejected": -58.656455993652344, + "loss": 0.6876, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.021824505180120468, + "rewards/margins": 0.011972033418715, + "rewards/rejected": -0.033796537667512894, + "step": 6730 + }, + { + "epoch": 1.1612680909717437, + "grad_norm": 2.4725494384765625, + "learning_rate": 8.932985471510436e-09, + "logits/chosen": -3.0475223064422607, + "logits/rejected": -3.0392818450927734, + "logps/chosen": -56.54149627685547, + "logps/rejected": -59.19397735595703, + "loss": 0.6885, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.02108325995504856, + "rewards/margins": 0.009946262463927269, + "rewards/rejected": -0.03102952241897583, + "step": 6740 + }, + { + "epoch": 1.1629910406616126, + "grad_norm": 2.4905450344085693, + "learning_rate": 8.903090296816975e-09, + "logits/chosen": -3.088040828704834, + "logits/rejected": -3.0729029178619385, + "logps/chosen": -55.22167205810547, + "logps/rejected": -57.266319274902344, + "loss": 0.6894, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.027635056525468826, + "rewards/margins": 0.00815325602889061, + "rewards/rejected": -0.03578830882906914, + "step": 6750 + }, + { + "epoch": 1.1647139903514818, + "grad_norm": 2.5701723098754883, + "learning_rate": 8.873205041562426e-09, + "logits/chosen": -3.0328335762023926, + "logits/rejected": -3.0046257972717285, + "logps/chosen": -54.555458068847656, + "logps/rejected": -52.52616500854492, + "loss": 0.6871, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.025020277127623558, + "rewards/margins": 0.01287126075476408, + "rewards/rejected": -0.037891536951065063, + "step": 6760 + }, + { + "epoch": 1.1664369400413508, + "grad_norm": 2.2472474575042725, + "learning_rate": 8.843329976001443e-09, + "logits/chosen": -3.045372486114502, + "logits/rejected": -3.039490222930908, + "logps/chosen": -55.7740364074707, + "logps/rejected": -58.88713455200195, + "loss": 0.6869, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.019391192123293877, + "rewards/margins": 0.013176657259464264, + "rewards/rejected": -0.03256785124540329, + "step": 6770 + }, + { + "epoch": 1.1681598897312198, + "grad_norm": 2.712940216064453, + "learning_rate": 8.813465370296552e-09, + "logits/chosen": -3.0446479320526123, + "logits/rejected": -3.01298189163208, + "logps/chosen": -59.089202880859375, + "logps/rejected": -57.0594482421875, + "loss": 0.6878, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.02587306872010231, + "rewards/margins": 0.011457438580691814, + "rewards/rejected": -0.0373305082321167, + "step": 6780 + }, + { + "epoch": 1.169882839421089, + "grad_norm": 2.4926705360412598, + "learning_rate": 8.783611494515675e-09, + "logits/chosen": -3.0511624813079834, + "logits/rejected": -3.035194158554077, + "logps/chosen": -53.268836975097656, + "logps/rejected": -57.05846405029297, + "loss": 0.6875, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.027448957785964012, + "rewards/margins": 0.011941693723201752, + "rewards/rejected": -0.03939065709710121, + "step": 6790 + }, + { + "epoch": 1.171605789110958, + "grad_norm": 2.2441673278808594, + "learning_rate": 8.753768618629716e-09, + "logits/chosen": -2.967897891998291, + "logits/rejected": -2.9408040046691895, + "logps/chosen": -55.0565185546875, + "logps/rejected": -54.068275451660156, + "loss": 0.6868, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.026381511241197586, + "rewards/margins": 0.013342035934329033, + "rewards/rejected": -0.03972354903817177, + "step": 6800 + }, + { + "epoch": 1.171605789110958, + "eval_logits/chosen": -3.126094102859497, + "eval_logits/rejected": -3.1204111576080322, + "eval_logps/chosen": -59.046142578125, + "eval_logps/rejected": -64.0239028930664, + "eval_loss": 0.6907273530960083, + "eval_rewards/accuracies": 0.5836431384086609, + "eval_rewards/chosen": -0.0033424401190131903, + "eval_rewards/margins": 0.005095373373478651, + "eval_rewards/rejected": -0.008437813259661198, + "eval_runtime": 383.2996, + "eval_samples_per_second": 11.229, + "eval_steps_per_second": 1.404, + "step": 6800 + }, + { + "epoch": 1.173328738800827, + "grad_norm": 2.347003936767578, + "learning_rate": 8.723937012510093e-09, + "logits/chosen": -3.0215110778808594, + "logits/rejected": -3.0046989917755127, + "logps/chosen": -57.366981506347656, + "logps/rejected": -55.16851806640625, + "loss": 0.6847, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01663302257657051, + "rewards/margins": 0.01758481189608574, + "rewards/rejected": -0.03421783447265625, + "step": 6810 + }, + { + "epoch": 1.175051688490696, + "grad_norm": 2.6514339447021484, + "learning_rate": 8.694116945926324e-09, + "logits/chosen": -3.1206183433532715, + "logits/rejected": -3.089256763458252, + "logps/chosen": -56.93294143676758, + "logps/rejected": -57.380699157714844, + "loss": 0.6813, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.01909453049302101, + "rewards/margins": 0.02453770488500595, + "rewards/rejected": -0.043632231652736664, + "step": 6820 + }, + { + "epoch": 1.176774638180565, + "grad_norm": 2.3778793811798096, + "learning_rate": 8.66430868854356e-09, + "logits/chosen": -3.127379894256592, + "logits/rejected": -3.1207189559936523, + "logps/chosen": -56.305023193359375, + "logps/rejected": -56.054443359375, + "loss": 0.6903, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.024015625938773155, + "rewards/margins": 0.006403226405382156, + "rewards/rejected": -0.03041885420680046, + "step": 6830 + }, + { + "epoch": 1.1784975878704342, + "grad_norm": 2.765775442123413, + "learning_rate": 8.634512509920175e-09, + "logits/chosen": -3.090207576751709, + "logits/rejected": -3.064380407333374, + "logps/chosen": -58.844017028808594, + "logps/rejected": -58.45856857299805, + "loss": 0.6877, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01913430169224739, + "rewards/margins": 0.011462162248790264, + "rewards/rejected": -0.03059646487236023, + "step": 6840 + }, + { + "epoch": 1.1802205375603032, + "grad_norm": 2.216803550720215, + "learning_rate": 8.604728679505301e-09, + "logits/chosen": -2.9346718788146973, + "logits/rejected": -2.9027819633483887, + "logps/chosen": -56.310997009277344, + "logps/rejected": -57.774322509765625, + "loss": 0.6836, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.021466486155986786, + "rewards/margins": 0.019867608323693275, + "rewards/rejected": -0.04133410006761551, + "step": 6850 + }, + { + "epoch": 1.1819434872501722, + "grad_norm": 2.47184157371521, + "learning_rate": 8.574957466636408e-09, + "logits/chosen": -3.0480828285217285, + "logits/rejected": -3.010148525238037, + "logps/chosen": -60.95463943481445, + "logps/rejected": -56.18805694580078, + "loss": 0.6835, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.018399445340037346, + "rewards/margins": 0.020264366641640663, + "rewards/rejected": -0.03866381198167801, + "step": 6860 + }, + { + "epoch": 1.1836664369400414, + "grad_norm": 2.0314888954162598, + "learning_rate": 8.545199140536875e-09, + "logits/chosen": -2.967435359954834, + "logits/rejected": -2.956160545349121, + "logps/chosen": -54.10022735595703, + "logps/rejected": -55.425926208496094, + "loss": 0.6903, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.030147861689329147, + "rewards/margins": 0.006414397154003382, + "rewards/rejected": -0.03656225651502609, + "step": 6870 + }, + { + "epoch": 1.1853893866299103, + "grad_norm": 2.3606247901916504, + "learning_rate": 8.515453970313526e-09, + "logits/chosen": -3.0911428928375244, + "logits/rejected": -3.0608937740325928, + "logps/chosen": -56.398406982421875, + "logps/rejected": -55.0906867980957, + "loss": 0.6846, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02135491371154785, + "rewards/margins": 0.01793890818953514, + "rewards/rejected": -0.03929382562637329, + "step": 6880 + }, + { + "epoch": 1.1871123363197795, + "grad_norm": 2.494527816772461, + "learning_rate": 8.485722224954237e-09, + "logits/chosen": -2.9907267093658447, + "logits/rejected": -2.96384859085083, + "logps/chosen": -54.1063346862793, + "logps/rejected": -56.15788650512695, + "loss": 0.6837, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01796005479991436, + "rewards/margins": 0.019725376740098, + "rewards/rejected": -0.03768543154001236, + "step": 6890 + }, + { + "epoch": 1.1888352860096485, + "grad_norm": 2.524660348892212, + "learning_rate": 8.456004173325458e-09, + "logits/chosen": -3.0648722648620605, + "logits/rejected": -3.043199300765991, + "logps/chosen": -55.88935089111328, + "logps/rejected": -56.82563400268555, + "loss": 0.6882, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.023347685113549232, + "rewards/margins": 0.010543139651417732, + "rewards/rejected": -0.033890821039676666, + "step": 6900 + }, + { + "epoch": 1.1888352860096485, + "eval_logits/chosen": -3.1254613399505615, + "eval_logits/rejected": -3.119804859161377, + "eval_logps/chosen": -59.08448028564453, + "eval_logps/rejected": -64.06681060791016, + "eval_loss": 0.6907079815864563, + "eval_rewards/accuracies": 0.5810873508453369, + "eval_rewards/chosen": -0.0037258744705468416, + "eval_rewards/margins": 0.0051410011947155, + "eval_rewards/rejected": -0.008866875432431698, + "eval_runtime": 384.0766, + "eval_samples_per_second": 11.206, + "eval_steps_per_second": 1.401, + "step": 6900 + }, + { + "epoch": 1.1905582356995175, + "grad_norm": 2.2871227264404297, + "learning_rate": 8.42630008416983e-09, + "logits/chosen": -3.098883867263794, + "logits/rejected": -3.0734448432922363, + "logps/chosen": -59.03523635864258, + "logps/rejected": -58.82707595825195, + "loss": 0.6861, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.02149212174117565, + "rewards/margins": 0.014713278040289879, + "rewards/rejected": -0.03620540350675583, + "step": 6910 + }, + { + "epoch": 1.1922811853893867, + "grad_norm": 2.33981990814209, + "learning_rate": 8.396610226103705e-09, + "logits/chosen": -3.1584877967834473, + "logits/rejected": -3.127455472946167, + "logps/chosen": -58.35736846923828, + "logps/rejected": -57.65404510498047, + "loss": 0.6854, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.022533800452947617, + "rewards/margins": 0.01630527526140213, + "rewards/rejected": -0.038839079439640045, + "step": 6920 + }, + { + "epoch": 1.1940041350792556, + "grad_norm": 2.3817827701568604, + "learning_rate": 8.366934867614771e-09, + "logits/chosen": -2.964742422103882, + "logits/rejected": -2.9373018741607666, + "logps/chosen": -59.388710021972656, + "logps/rejected": -57.4041633605957, + "loss": 0.6865, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.01754842884838581, + "rewards/margins": 0.013974052853882313, + "rewards/rejected": -0.0315224826335907, + "step": 6930 + }, + { + "epoch": 1.1957270847691248, + "grad_norm": 2.468682050704956, + "learning_rate": 8.337274277059565e-09, + "logits/chosen": -2.956246852874756, + "logits/rejected": -2.9091484546661377, + "logps/chosen": -59.93339157104492, + "logps/rejected": -54.3348274230957, + "loss": 0.6846, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0197045486420393, + "rewards/margins": 0.017733246088027954, + "rewards/rejected": -0.037437792867422104, + "step": 6940 + }, + { + "epoch": 1.1974500344589938, + "grad_norm": 2.5347440242767334, + "learning_rate": 8.307628722661104e-09, + "logits/chosen": -3.024871349334717, + "logits/rejected": -3.0062155723571777, + "logps/chosen": -55.86821746826172, + "logps/rejected": -56.56782913208008, + "loss": 0.6854, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.020114390179514885, + "rewards/margins": 0.016181785613298416, + "rewards/rejected": -0.03629617765545845, + "step": 6950 + }, + { + "epoch": 1.1991729841488628, + "grad_norm": 2.37861704826355, + "learning_rate": 8.277998472506412e-09, + "logits/chosen": -3.1921026706695557, + "logits/rejected": -3.151654005050659, + "logps/chosen": -57.934608459472656, + "logps/rejected": -51.96177291870117, + "loss": 0.6857, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.026161780580878258, + "rewards/margins": 0.01564834825694561, + "rewards/rejected": -0.04181012138724327, + "step": 6960 + }, + { + "epoch": 1.200895933838732, + "grad_norm": 2.133126735687256, + "learning_rate": 8.248383794544126e-09, + "logits/chosen": -3.103128433227539, + "logits/rejected": -3.068071126937866, + "logps/chosen": -56.252784729003906, + "logps/rejected": -54.36753463745117, + "loss": 0.6823, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.017109837383031845, + "rewards/margins": 0.022667525336146355, + "rewards/rejected": -0.03977736085653305, + "step": 6970 + }, + { + "epoch": 1.202618883528601, + "grad_norm": 2.298814058303833, + "learning_rate": 8.218784956582052e-09, + "logits/chosen": -3.0041556358337402, + "logits/rejected": -2.973860502243042, + "logps/chosen": -56.36700439453125, + "logps/rejected": -56.94713592529297, + "loss": 0.6845, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.021416518837213516, + "rewards/margins": 0.017982427030801773, + "rewards/rejected": -0.03939894586801529, + "step": 6980 + }, + { + "epoch": 1.20434183321847, + "grad_norm": 2.5716981887817383, + "learning_rate": 8.18920222628477e-09, + "logits/chosen": -2.923178195953369, + "logits/rejected": -2.9134066104888916, + "logps/chosen": -59.46036911010742, + "logps/rejected": -57.71564865112305, + "loss": 0.6886, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.024769535288214684, + "rewards/margins": 0.00976070761680603, + "rewards/rejected": -0.034530241042375565, + "step": 6990 + }, + { + "epoch": 1.206064782908339, + "grad_norm": 2.42403244972229, + "learning_rate": 8.15963587117118e-09, + "logits/chosen": -3.2224109172821045, + "logits/rejected": -3.1947338581085205, + "logps/chosen": -58.848487854003906, + "logps/rejected": -58.787452697753906, + "loss": 0.6859, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.025541281327605247, + "rewards/margins": 0.015429399907588959, + "rewards/rejected": -0.04097067937254906, + "step": 7000 + }, + { + "epoch": 1.206064782908339, + "eval_logits/chosen": -3.1261141300201416, + "eval_logits/rejected": -3.120389461517334, + "eval_logps/chosen": -59.12325668334961, + "eval_logps/rejected": -64.10928344726562, + "eval_loss": 0.6906958222389221, + "eval_rewards/accuracies": 0.5796933174133301, + "eval_rewards/chosen": -0.004113591741770506, + "eval_rewards/margins": 0.00517794955521822, + "eval_rewards/rejected": -0.009291541762650013, + "eval_runtime": 383.792, + "eval_samples_per_second": 11.214, + "eval_steps_per_second": 1.402, + "step": 7000 + }, + { + "epoch": 1.207787732598208, + "grad_norm": 2.603954315185547, + "learning_rate": 8.130086158612116e-09, + "logits/chosen": -3.0315804481506348, + "logits/rejected": -3.0040690898895264, + "logps/chosen": -58.850257873535156, + "logps/rejected": -57.16217803955078, + "loss": 0.6842, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.024250095710158348, + "rewards/margins": 0.01874765008687973, + "rewards/rejected": -0.04299774393439293, + "step": 7010 + }, + { + "epoch": 1.2095106822880772, + "grad_norm": 2.428187847137451, + "learning_rate": 8.100553355827896e-09, + "logits/chosen": -3.019529342651367, + "logits/rejected": -2.9929378032684326, + "logps/chosen": -53.589073181152344, + "logps/rejected": -56.43495559692383, + "loss": 0.6844, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.022542601451277733, + "rewards/margins": 0.018220920115709305, + "rewards/rejected": -0.040763527154922485, + "step": 7020 + }, + { + "epoch": 1.2112336319779462, + "grad_norm": 2.319085121154785, + "learning_rate": 8.071037729885937e-09, + "logits/chosen": -3.0574421882629395, + "logits/rejected": -3.0418524742126465, + "logps/chosen": -55.7623405456543, + "logps/rejected": -58.62355422973633, + "loss": 0.6892, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.026621636003255844, + "rewards/margins": 0.00847454834729433, + "rewards/rejected": -0.0350961834192276, + "step": 7030 + }, + { + "epoch": 1.2129565816678154, + "grad_norm": 2.6634323596954346, + "learning_rate": 8.041539547698307e-09, + "logits/chosen": -2.998723268508911, + "logits/rejected": -2.977909564971924, + "logps/chosen": -57.95904541015625, + "logps/rejected": -60.370140075683594, + "loss": 0.6836, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.018822792917490005, + "rewards/margins": 0.01990441419184208, + "rewards/rejected": -0.038727205246686935, + "step": 7040 + }, + { + "epoch": 1.2146795313576844, + "grad_norm": 2.6832826137542725, + "learning_rate": 8.01205907601935e-09, + "logits/chosen": -2.9719934463500977, + "logits/rejected": -2.9498307704925537, + "logps/chosen": -53.96051788330078, + "logps/rejected": -57.45134735107422, + "loss": 0.6856, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.025047386065125465, + "rewards/margins": 0.015683867037296295, + "rewards/rejected": -0.04073125496506691, + "step": 7050 + }, + { + "epoch": 1.2164024810475533, + "grad_norm": 2.773050308227539, + "learning_rate": 7.982596581443237e-09, + "logits/chosen": -3.0882060527801514, + "logits/rejected": -3.0739293098449707, + "logps/chosen": -55.9157829284668, + "logps/rejected": -57.05158615112305, + "loss": 0.6864, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.027055267244577408, + "rewards/margins": 0.014333643019199371, + "rewards/rejected": -0.04138891398906708, + "step": 7060 + }, + { + "epoch": 1.2181254307374225, + "grad_norm": 2.327251672744751, + "learning_rate": 7.953152330401568e-09, + "logits/chosen": -3.051583766937256, + "logits/rejected": -3.023996114730835, + "logps/chosen": -56.63190841674805, + "logps/rejected": -56.517967224121094, + "loss": 0.6873, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.020288683474063873, + "rewards/margins": 0.012412209063768387, + "rewards/rejected": -0.03270088881254196, + "step": 7070 + }, + { + "epoch": 1.2198483804272915, + "grad_norm": 2.5216662883758545, + "learning_rate": 7.923726589160985e-09, + "logits/chosen": -3.1125307083129883, + "logits/rejected": -3.0817744731903076, + "logps/chosen": -56.150787353515625, + "logps/rejected": -58.22904586791992, + "loss": 0.6856, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.02682647481560707, + "rewards/margins": 0.015813734382390976, + "rewards/rejected": -0.04264020919799805, + "step": 7080 + }, + { + "epoch": 1.2215713301171607, + "grad_norm": 2.3830454349517822, + "learning_rate": 7.894319623820721e-09, + "logits/chosen": -3.141756296157837, + "logits/rejected": -3.13077712059021, + "logps/chosen": -56.7417106628418, + "logps/rejected": -56.47627639770508, + "loss": 0.6883, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03223857656121254, + "rewards/margins": 0.010424691252410412, + "rewards/rejected": -0.042663268744945526, + "step": 7090 + }, + { + "epoch": 1.2232942798070296, + "grad_norm": 2.4901022911071777, + "learning_rate": 7.864931700310235e-09, + "logits/chosen": -3.0033276081085205, + "logits/rejected": -2.9819869995117188, + "logps/chosen": -59.11650466918945, + "logps/rejected": -60.6047248840332, + "loss": 0.685, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.020432157441973686, + "rewards/margins": 0.0171308945864439, + "rewards/rejected": -0.037563055753707886, + "step": 7100 + }, + { + "epoch": 1.2232942798070296, + "eval_logits/chosen": -3.123748540878296, + "eval_logits/rejected": -3.1180479526519775, + "eval_logps/chosen": -59.15984344482422, + "eval_logps/rejected": -64.1565170288086, + "eval_loss": 0.6906439065933228, + "eval_rewards/accuracies": 0.5796933174133301, + "eval_rewards/chosen": -0.004479521419852972, + "eval_rewards/margins": 0.005284461192786694, + "eval_rewards/rejected": -0.009763982146978378, + "eval_runtime": 384.0905, + "eval_samples_per_second": 11.206, + "eval_steps_per_second": 1.401, + "step": 7100 + }, + { + "epoch": 1.2250172294968986, + "grad_norm": 2.255007266998291, + "learning_rate": 7.835563084386777e-09, + "logits/chosen": -3.0600950717926025, + "logits/rejected": -3.0214476585388184, + "logps/chosen": -56.0528678894043, + "logps/rejected": -54.752723693847656, + "loss": 0.6875, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.025800343602895737, + "rewards/margins": 0.011899485252797604, + "rewards/rejected": -0.037699829787015915, + "step": 7110 + }, + { + "epoch": 1.2267401791867678, + "grad_norm": 2.3676490783691406, + "learning_rate": 7.806214041633009e-09, + "logits/chosen": -3.045886278152466, + "logits/rejected": -3.00185489654541, + "logps/chosen": -63.60943603515625, + "logps/rejected": -56.4583854675293, + "loss": 0.6821, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.016231974586844444, + "rewards/margins": 0.022754736244678497, + "rewards/rejected": -0.03898671269416809, + "step": 7120 + }, + { + "epoch": 1.2284631288766368, + "grad_norm": 2.478337049484253, + "learning_rate": 7.776884837454573e-09, + "logits/chosen": -2.924165964126587, + "logits/rejected": -2.8969979286193848, + "logps/chosen": -58.44608688354492, + "logps/rejected": -56.87275314331055, + "loss": 0.6856, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.02104850485920906, + "rewards/margins": 0.015894543379545212, + "rewards/rejected": -0.03694305196404457, + "step": 7130 + }, + { + "epoch": 1.230186078566506, + "grad_norm": 2.278496742248535, + "learning_rate": 7.747575737077732e-09, + "logits/chosen": -3.0394952297210693, + "logits/rejected": -3.0347137451171875, + "logps/chosen": -54.013336181640625, + "logps/rejected": -55.49956512451172, + "loss": 0.6893, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.026956558227539062, + "rewards/margins": 0.008593494072556496, + "rewards/rejected": -0.03555005416274071, + "step": 7140 + }, + { + "epoch": 1.231909028256375, + "grad_norm": 2.5096144676208496, + "learning_rate": 7.71828700554693e-09, + "logits/chosen": -3.113610029220581, + "logits/rejected": -3.0898070335388184, + "logps/chosen": -58.9386100769043, + "logps/rejected": -58.98060989379883, + "loss": 0.6869, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.025510985404253006, + "rewards/margins": 0.013395940884947777, + "rewards/rejected": -0.038906924426555634, + "step": 7150 + }, + { + "epoch": 1.233631977946244, + "grad_norm": 2.4192092418670654, + "learning_rate": 7.689018907722429e-09, + "logits/chosen": -3.0234131813049316, + "logits/rejected": -2.98473858833313, + "logps/chosen": -55.985321044921875, + "logps/rejected": -58.2840461730957, + "loss": 0.6841, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02323567308485508, + "rewards/margins": 0.018828080967068672, + "rewards/rejected": -0.04206375032663345, + "step": 7160 + }, + { + "epoch": 1.235354927636113, + "grad_norm": 2.607609748840332, + "learning_rate": 7.659771708277883e-09, + "logits/chosen": -2.985973834991455, + "logits/rejected": -2.9590420722961426, + "logps/chosen": -58.46117401123047, + "logps/rejected": -57.82086181640625, + "loss": 0.6879, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.02894872985780239, + "rewards/margins": 0.011171415448188782, + "rewards/rejected": -0.04012014716863632, + "step": 7170 + }, + { + "epoch": 1.237077877325982, + "grad_norm": 2.5801990032196045, + "learning_rate": 7.630545671697975e-09, + "logits/chosen": -3.1181468963623047, + "logits/rejected": -3.10412335395813, + "logps/chosen": -57.78798294067383, + "logps/rejected": -58.09895706176758, + "loss": 0.6878, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.024857694283127785, + "rewards/margins": 0.011409392580389977, + "rewards/rejected": -0.03626708686351776, + "step": 7180 + }, + { + "epoch": 1.2388008270158513, + "grad_norm": 2.26977276802063, + "learning_rate": 7.601341062275997e-09, + "logits/chosen": -2.966801643371582, + "logits/rejected": -2.942594051361084, + "logps/chosen": -55.58942413330078, + "logps/rejected": -57.827667236328125, + "loss": 0.6872, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02918897196650505, + "rewards/margins": 0.012870155274868011, + "rewards/rejected": -0.04205913096666336, + "step": 7190 + }, + { + "epoch": 1.2405237767057202, + "grad_norm": 2.7323532104492188, + "learning_rate": 7.57215814411149e-09, + "logits/chosen": -2.9832592010498047, + "logits/rejected": -2.9655752182006836, + "logps/chosen": -55.5822639465332, + "logps/rejected": -60.1941032409668, + "loss": 0.6858, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.02484903857111931, + "rewards/margins": 0.015351449139416218, + "rewards/rejected": -0.04020049050450325, + "step": 7200 + }, + { + "epoch": 1.2405237767057202, + "eval_logits/chosen": -3.124135971069336, + "eval_logits/rejected": -3.1184306144714355, + "eval_logps/chosen": -59.1701774597168, + "eval_logps/rejected": -64.19103240966797, + "eval_loss": 0.6905274391174316, + "eval_rewards/accuracies": 0.5820167064666748, + "eval_rewards/chosen": -0.004582811146974564, + "eval_rewards/margins": 0.005526235792785883, + "eval_rewards/rejected": -0.010109047405421734, + "eval_runtime": 383.8514, + "eval_samples_per_second": 11.213, + "eval_steps_per_second": 1.402, + "step": 7200 + }, + { + "epoch": 1.2422467263955892, + "grad_norm": 2.333134651184082, + "learning_rate": 7.54299718110782e-09, + "logits/chosen": -3.1046881675720215, + "logits/rejected": -3.0822250843048096, + "logps/chosen": -59.1954460144043, + "logps/rejected": -56.4441032409668, + "loss": 0.6873, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02166753076016903, + "rewards/margins": 0.012477634474635124, + "rewards/rejected": -0.03414516523480415, + "step": 7210 + }, + { + "epoch": 1.2439696760854584, + "grad_norm": 2.4416794776916504, + "learning_rate": 7.51385843696983e-09, + "logits/chosen": -2.9671475887298584, + "logits/rejected": -2.931851387023926, + "logps/chosen": -57.426719665527344, + "logps/rejected": -55.91858673095703, + "loss": 0.6867, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.028967272490262985, + "rewards/margins": 0.013575015589594841, + "rewards/rejected": -0.04254228621721268, + "step": 7220 + }, + { + "epoch": 1.2456926257753274, + "grad_norm": 2.446187973022461, + "learning_rate": 7.484742175201417e-09, + "logits/chosen": -3.0023951530456543, + "logits/rejected": -2.9816293716430664, + "logps/chosen": -56.86414337158203, + "logps/rejected": -56.91185760498047, + "loss": 0.6871, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.026427680626511574, + "rewards/margins": 0.012890076264739037, + "rewards/rejected": -0.03931775689125061, + "step": 7230 + }, + { + "epoch": 1.2474155754651963, + "grad_norm": 2.3552913665771484, + "learning_rate": 7.455648659103191e-09, + "logits/chosen": -3.0364127159118652, + "logits/rejected": -3.0254123210906982, + "logps/chosen": -60.1931037902832, + "logps/rejected": -58.56923294067383, + "loss": 0.6883, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02746332809329033, + "rewards/margins": 0.010513190180063248, + "rewards/rejected": -0.03797651827335358, + "step": 7240 + }, + { + "epoch": 1.2491385251550655, + "grad_norm": 2.817966938018799, + "learning_rate": 7.426578151770047e-09, + "logits/chosen": -3.0863070487976074, + "logits/rejected": -3.054816722869873, + "logps/chosen": -57.13684844970703, + "logps/rejected": -56.9018440246582, + "loss": 0.6861, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.025691917166113853, + "rewards/margins": 0.014780798926949501, + "rewards/rejected": -0.04047270864248276, + "step": 7250 + }, + { + "epoch": 1.2508614748449345, + "grad_norm": 2.2241945266723633, + "learning_rate": 7.397530916088828e-09, + "logits/chosen": -3.081535816192627, + "logits/rejected": -3.047715663909912, + "logps/chosen": -55.83784866333008, + "logps/rejected": -56.54887008666992, + "loss": 0.6834, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.02152073010802269, + "rewards/margins": 0.020243234932422638, + "rewards/rejected": -0.04176396131515503, + "step": 7260 + }, + { + "epoch": 1.2525844245348035, + "grad_norm": 2.390984058380127, + "learning_rate": 7.36850721473592e-09, + "logits/chosen": -3.097338914871216, + "logits/rejected": -3.0662691593170166, + "logps/chosen": -56.4255256652832, + "logps/rejected": -56.92626953125, + "loss": 0.6826, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.021098647266626358, + "rewards/margins": 0.02181319333612919, + "rewards/rejected": -0.0429118387401104, + "step": 7270 + }, + { + "epoch": 1.2543073742246726, + "grad_norm": 2.712801218032837, + "learning_rate": 7.339507310174884e-09, + "logits/chosen": -3.126049041748047, + "logits/rejected": -3.091052770614624, + "logps/chosen": -59.09241485595703, + "logps/rejected": -57.259437561035156, + "loss": 0.6844, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.025226742029190063, + "rewards/margins": 0.018264181911945343, + "rewards/rejected": -0.043490923941135406, + "step": 7280 + }, + { + "epoch": 1.2560303239145416, + "grad_norm": 2.2661290168762207, + "learning_rate": 7.3105314646541e-09, + "logits/chosen": -2.9083609580993652, + "logits/rejected": -2.8951635360717773, + "logps/chosen": -57.48980712890625, + "logps/rejected": -59.64112091064453, + "loss": 0.6881, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.026984844356775284, + "rewards/margins": 0.010795495472848415, + "rewards/rejected": -0.037780337035655975, + "step": 7290 + }, + { + "epoch": 1.2577532736044108, + "grad_norm": 2.620009183883667, + "learning_rate": 7.281579940204361e-09, + "logits/chosen": -2.8676297664642334, + "logits/rejected": -2.8505687713623047, + "logps/chosen": -53.25835037231445, + "logps/rejected": -55.6039924621582, + "loss": 0.6905, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.029975971207022667, + "rewards/margins": 0.005989503115415573, + "rewards/rejected": -0.03596547618508339, + "step": 7300 + }, + { + "epoch": 1.2577532736044108, + "eval_logits/chosen": -3.1238722801208496, + "eval_logits/rejected": -3.1181890964508057, + "eval_logps/chosen": -59.20159912109375, + "eval_logps/rejected": -64.22035217285156, + "eval_loss": 0.6905403137207031, + "eval_rewards/accuracies": 0.5803903341293335, + "eval_rewards/chosen": -0.004897048696875572, + "eval_rewards/margins": 0.005505240522325039, + "eval_rewards/rejected": -0.010402288287878036, + "eval_runtime": 383.6551, + "eval_samples_per_second": 11.218, + "eval_steps_per_second": 1.402, + "step": 7300 + }, + { + "epoch": 1.2594762232942798, + "grad_norm": 2.822331190109253, + "learning_rate": 7.25265299863654e-09, + "logits/chosen": -3.069945812225342, + "logits/rejected": -3.046442747116089, + "logps/chosen": -57.70795440673828, + "logps/rejected": -55.729942321777344, + "loss": 0.6876, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02648942545056343, + "rewards/margins": 0.01195268053561449, + "rewards/rejected": -0.03844210132956505, + "step": 7310 + }, + { + "epoch": 1.2611991729841487, + "grad_norm": 2.468334674835205, + "learning_rate": 7.22375090153919e-09, + "logits/chosen": -3.0881543159484863, + "logits/rejected": -3.061934471130371, + "logps/chosen": -57.00069046020508, + "logps/rejected": -55.181861877441406, + "loss": 0.6868, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.026471266523003578, + "rewards/margins": 0.013788128271698952, + "rewards/rejected": -0.04025938734412193, + "step": 7320 + }, + { + "epoch": 1.262922122674018, + "grad_norm": 2.297668933868408, + "learning_rate": 7.194873910276204e-09, + "logits/chosen": -3.0014100074768066, + "logits/rejected": -2.9748117923736572, + "logps/chosen": -55.19213104248047, + "logps/rejected": -55.83556365966797, + "loss": 0.6829, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02129554934799671, + "rewards/margins": 0.021344564855098724, + "rewards/rejected": -0.04264011234045029, + "step": 7330 + }, + { + "epoch": 1.264645072363887, + "grad_norm": 2.6001136302948, + "learning_rate": 7.166022285984437e-09, + "logits/chosen": -3.061702251434326, + "logits/rejected": -3.0339019298553467, + "logps/chosen": -57.834259033203125, + "logps/rejected": -58.05975341796875, + "loss": 0.6851, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.024531777948141098, + "rewards/margins": 0.016910618171095848, + "rewards/rejected": -0.0414423942565918, + "step": 7340 + }, + { + "epoch": 1.266368022053756, + "grad_norm": 2.4237160682678223, + "learning_rate": 7.13719628957135e-09, + "logits/chosen": -3.1121439933776855, + "logits/rejected": -3.0826239585876465, + "logps/chosen": -58.967552185058594, + "logps/rejected": -55.390892028808594, + "loss": 0.6857, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.020481975749135017, + "rewards/margins": 0.01573684811592102, + "rewards/rejected": -0.03621882572770119, + "step": 7350 + }, + { + "epoch": 1.268090971743625, + "grad_norm": 2.4601356983184814, + "learning_rate": 7.108396181712643e-09, + "logits/chosen": -3.018209934234619, + "logits/rejected": -2.997499704360962, + "logps/chosen": -56.858489990234375, + "logps/rejected": -56.35799026489258, + "loss": 0.6855, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02560586854815483, + "rewards/margins": 0.01612265780568123, + "rewards/rejected": -0.04172852635383606, + "step": 7360 + }, + { + "epoch": 1.269813921433494, + "grad_norm": 2.7204806804656982, + "learning_rate": 7.079622222849917e-09, + "logits/chosen": -2.9083404541015625, + "logits/rejected": -2.8869762420654297, + "logps/chosen": -55.68623733520508, + "logps/rejected": -55.17841339111328, + "loss": 0.6883, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.028797179460525513, + "rewards/margins": 0.010576505213975906, + "rewards/rejected": -0.03937368467450142, + "step": 7370 + }, + { + "epoch": 1.2715368711233632, + "grad_norm": 2.232482433319092, + "learning_rate": 7.05087467318829e-09, + "logits/chosen": -3.0181708335876465, + "logits/rejected": -2.996931552886963, + "logps/chosen": -55.63871383666992, + "logps/rejected": -57.555015563964844, + "loss": 0.6865, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.024709437042474747, + "rewards/margins": 0.013970698229968548, + "rewards/rejected": -0.03868013620376587, + "step": 7380 + }, + { + "epoch": 1.2732598208132322, + "grad_norm": 2.4573562145233154, + "learning_rate": 7.022153792694073e-09, + "logits/chosen": -2.982329845428467, + "logits/rejected": -2.9620840549468994, + "logps/chosen": -54.878746032714844, + "logps/rejected": -57.192298889160156, + "loss": 0.686, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02665008045732975, + "rewards/margins": 0.015081726014614105, + "rewards/rejected": -0.041731808334589005, + "step": 7390 + }, + { + "epoch": 1.2749827705031014, + "grad_norm": 2.2985384464263916, + "learning_rate": 6.993459841092396e-09, + "logits/chosen": -2.9905054569244385, + "logits/rejected": -2.9512736797332764, + "logps/chosen": -58.15850067138672, + "logps/rejected": -54.942726135253906, + "loss": 0.6852, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.0270744226872921, + "rewards/margins": 0.016645211726427078, + "rewards/rejected": -0.04371963441371918, + "step": 7400 + }, + { + "epoch": 1.2749827705031014, + "eval_logits/chosen": -3.123706579208374, + "eval_logits/rejected": -3.1180148124694824, + "eval_logps/chosen": -59.22597885131836, + "eval_logps/rejected": -64.24317169189453, + "eval_loss": 0.6905500888824463, + "eval_rewards/accuracies": 0.5789963006973267, + "eval_rewards/chosen": -0.005140796769410372, + "eval_rewards/margins": 0.005489727016538382, + "eval_rewards/rejected": -0.010630524717271328, + "eval_runtime": 384.0969, + "eval_samples_per_second": 11.206, + "eval_steps_per_second": 1.401, + "step": 7400 + }, + { + "epoch": 1.2767057201929704, + "grad_norm": 2.598616361618042, + "learning_rate": 6.964793077864876e-09, + "logits/chosen": -2.9563305377960205, + "logits/rejected": -2.9255309104919434, + "logps/chosen": -56.2224006652832, + "logps/rejected": -57.120323181152344, + "loss": 0.6854, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.02466033585369587, + "rewards/margins": 0.01632571592926979, + "rewards/rejected": -0.04098604992032051, + "step": 7410 + }, + { + "epoch": 1.2784286698828393, + "grad_norm": 2.6292760372161865, + "learning_rate": 6.936153762247254e-09, + "logits/chosen": -2.8979477882385254, + "logits/rejected": -2.8714406490325928, + "logps/chosen": -58.17467498779297, + "logps/rejected": -56.69647216796875, + "loss": 0.6855, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.020067289471626282, + "rewards/margins": 0.01622116006910801, + "rewards/rejected": -0.03628844767808914, + "step": 7420 + }, + { + "epoch": 1.2801516195727085, + "grad_norm": 2.500096321105957, + "learning_rate": 6.907542153227073e-09, + "logits/chosen": -2.9541492462158203, + "logits/rejected": -2.9250648021698, + "logps/chosen": -56.53348922729492, + "logps/rejected": -56.31572341918945, + "loss": 0.6869, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.031292449682950974, + "rewards/margins": 0.013092470355331898, + "rewards/rejected": -0.044384922832250595, + "step": 7430 + }, + { + "epoch": 1.2818745692625775, + "grad_norm": 2.392944812774658, + "learning_rate": 6.878958509541311e-09, + "logits/chosen": -3.0708324909210205, + "logits/rejected": -3.0418925285339355, + "logps/chosen": -58.69022750854492, + "logps/rejected": -58.29036331176758, + "loss": 0.6824, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.026854073628783226, + "rewards/margins": 0.02240685746073723, + "rewards/rejected": -0.049260932952165604, + "step": 7440 + }, + { + "epoch": 1.2835975189524467, + "grad_norm": 2.7048799991607666, + "learning_rate": 6.850403089674067e-09, + "logits/chosen": -3.134178876876831, + "logits/rejected": -3.1060986518859863, + "logps/chosen": -58.184654235839844, + "logps/rejected": -56.81584930419922, + "loss": 0.6839, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.02258859947323799, + "rewards/margins": 0.01950298808515072, + "rewards/rejected": -0.04209158942103386, + "step": 7450 + }, + { + "epoch": 1.2853204686423156, + "grad_norm": 2.316319704055786, + "learning_rate": 6.8218761518541916e-09, + "logits/chosen": -2.9367308616638184, + "logits/rejected": -2.9404594898223877, + "logps/chosen": -53.09021759033203, + "logps/rejected": -57.01831817626953, + "loss": 0.688, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02927972748875618, + "rewards/margins": 0.011012665927410126, + "rewards/rejected": -0.040292393416166306, + "step": 7460 + }, + { + "epoch": 1.2870434183321846, + "grad_norm": 2.487236499786377, + "learning_rate": 6.793377954052989e-09, + "logits/chosen": -3.029714584350586, + "logits/rejected": -3.012195587158203, + "logps/chosen": -57.121620178222656, + "logps/rejected": -55.000083923339844, + "loss": 0.6875, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.02466987445950508, + "rewards/margins": 0.012196965515613556, + "rewards/rejected": -0.03686683997511864, + "step": 7470 + }, + { + "epoch": 1.2887663680220538, + "grad_norm": 2.2682740688323975, + "learning_rate": 6.764908753981844e-09, + "logits/chosen": -2.9855077266693115, + "logits/rejected": -2.9578702449798584, + "logps/chosen": -57.35419845581055, + "logps/rejected": -53.1810417175293, + "loss": 0.6873, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.027218962088227272, + "rewards/margins": 0.0126141756772995, + "rewards/rejected": -0.03983313590288162, + "step": 7480 + }, + { + "epoch": 1.2904893177119228, + "grad_norm": 2.6248104572296143, + "learning_rate": 6.7364688090899395e-09, + "logits/chosen": -2.955197811126709, + "logits/rejected": -2.941403865814209, + "logps/chosen": -55.307899475097656, + "logps/rejected": -57.485992431640625, + "loss": 0.6856, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.027474090456962585, + "rewards/margins": 0.015908868983387947, + "rewards/rejected": -0.04338296130299568, + "step": 7490 + }, + { + "epoch": 1.292212267401792, + "grad_norm": 2.2009220123291016, + "learning_rate": 6.708058376561879e-09, + "logits/chosen": -2.9785373210906982, + "logits/rejected": -2.9540696144104004, + "logps/chosen": -54.05745315551758, + "logps/rejected": -55.798377990722656, + "loss": 0.6873, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.025832027196884155, + "rewards/margins": 0.012369804084300995, + "rewards/rejected": -0.03820183128118515, + "step": 7500 + }, + { + "epoch": 1.292212267401792, + "eval_logits/chosen": -3.123053789138794, + "eval_logits/rejected": -3.1173923015594482, + "eval_logps/chosen": -59.25996017456055, + "eval_logps/rejected": -64.27454376220703, + "eval_loss": 0.6905677318572998, + "eval_rewards/accuracies": 0.5859665274620056, + "eval_rewards/chosen": -0.0054806615225970745, + "eval_rewards/margins": 0.005463543813675642, + "eval_rewards/rejected": -0.010944206267595291, + "eval_runtime": 384.1014, + "eval_samples_per_second": 11.205, + "eval_steps_per_second": 1.401, + "step": 7500 + }, + { + "epoch": 1.293935217091661, + "grad_norm": 2.403428554534912, + "learning_rate": 6.6796777133153885e-09, + "logits/chosen": -3.0138533115386963, + "logits/rejected": -2.9797167778015137, + "logps/chosen": -59.300132751464844, + "logps/rejected": -55.43413162231445, + "loss": 0.6828, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.020390545949339867, + "rewards/margins": 0.021740619093179703, + "rewards/rejected": -0.04213116317987442, + "step": 7510 + }, + { + "epoch": 1.29565816678153, + "grad_norm": 2.5295517444610596, + "learning_rate": 6.651327075999e-09, + "logits/chosen": -2.9840564727783203, + "logits/rejected": -2.963972330093384, + "logps/chosen": -55.64642333984375, + "logps/rejected": -57.811431884765625, + "loss": 0.6874, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02718658745288849, + "rewards/margins": 0.01235284935683012, + "rewards/rejected": -0.039539434015750885, + "step": 7520 + }, + { + "epoch": 1.297381116471399, + "grad_norm": 2.5545949935913086, + "learning_rate": 6.623006720989699e-09, + "logits/chosen": -2.927830934524536, + "logits/rejected": -2.926427125930786, + "logps/chosen": -54.93427276611328, + "logps/rejected": -57.257972717285156, + "loss": 0.6881, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02716946043074131, + "rewards/margins": 0.011054610833525658, + "rewards/rejected": -0.03822406753897667, + "step": 7530 + }, + { + "epoch": 1.299104066161268, + "grad_norm": 2.114025592803955, + "learning_rate": 6.594716904390648e-09, + "logits/chosen": -3.1067121028900146, + "logits/rejected": -3.099963903427124, + "logps/chosen": -53.91367721557617, + "logps/rejected": -56.411949157714844, + "loss": 0.6866, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.02373148314654827, + "rewards/margins": 0.013630990870296955, + "rewards/rejected": -0.0373624712228775, + "step": 7540 + }, + { + "epoch": 1.3008270158511372, + "grad_norm": 2.553532838821411, + "learning_rate": 6.566457882028829e-09, + "logits/chosen": -3.0206680297851562, + "logits/rejected": -3.0002551078796387, + "logps/chosen": -54.42189407348633, + "logps/rejected": -56.299537658691406, + "loss": 0.6843, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.026311367750167847, + "rewards/margins": 0.018551740795373917, + "rewards/rejected": -0.04486311227083206, + "step": 7550 + }, + { + "epoch": 1.3025499655410062, + "grad_norm": 2.7336137294769287, + "learning_rate": 6.5382299094527595e-09, + "logits/chosen": -3.0959246158599854, + "logits/rejected": -3.076599359512329, + "logps/chosen": -54.67638397216797, + "logps/rejected": -59.42815399169922, + "loss": 0.6837, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.02101433463394642, + "rewards/margins": 0.019638245925307274, + "rewards/rejected": -0.04065258055925369, + "step": 7560 + }, + { + "epoch": 1.3042729152308752, + "grad_norm": 2.2857635021209717, + "learning_rate": 6.510033241930166e-09, + "logits/chosen": -3.0978853702545166, + "logits/rejected": -3.079770565032959, + "logps/chosen": -59.495140075683594, + "logps/rejected": -56.71014404296875, + "loss": 0.6849, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0269808117300272, + "rewards/margins": 0.017586104571819305, + "rewards/rejected": -0.04456691816449165, + "step": 7570 + }, + { + "epoch": 1.3059958649207444, + "grad_norm": 2.645477533340454, + "learning_rate": 6.48186813444569e-09, + "logits/chosen": -3.0349011421203613, + "logits/rejected": -3.0162124633789062, + "logps/chosen": -57.695037841796875, + "logps/rejected": -58.27811813354492, + "loss": 0.6838, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.016813058406114578, + "rewards/margins": 0.019382277503609657, + "rewards/rejected": -0.036195337772369385, + "step": 7580 + }, + { + "epoch": 1.3077188146106133, + "grad_norm": 2.9784748554229736, + "learning_rate": 6.4537348416985586e-09, + "logits/chosen": -3.082174777984619, + "logits/rejected": -3.0377144813537598, + "logps/chosen": -60.8484992980957, + "logps/rejected": -56.35919189453125, + "loss": 0.6816, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.025256508961319923, + "rewards/margins": 0.02421494387090206, + "rewards/rejected": -0.04947146028280258, + "step": 7590 + }, + { + "epoch": 1.3094417643004825, + "grad_norm": 2.295466184616089, + "learning_rate": 6.425633618100315e-09, + "logits/chosen": -3.045626163482666, + "logits/rejected": -3.0147712230682373, + "logps/chosen": -55.79792404174805, + "logps/rejected": -53.15778350830078, + "loss": 0.6871, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.03260722756385803, + "rewards/margins": 0.013006548397243023, + "rewards/rejected": -0.04561377689242363, + "step": 7600 + }, + { + "epoch": 1.3094417643004825, + "eval_logits/chosen": -3.122342586517334, + "eval_logits/rejected": -3.1166090965270996, + "eval_logps/chosen": -59.26426696777344, + "eval_logps/rejected": -64.30005645751953, + "eval_loss": 0.6904650926589966, + "eval_rewards/accuracies": 0.5829461216926575, + "eval_rewards/chosen": -0.005523705389350653, + "eval_rewards/margins": 0.005675605032593012, + "eval_rewards/rejected": -0.011199310421943665, + "eval_runtime": 383.9608, + "eval_samples_per_second": 11.209, + "eval_steps_per_second": 1.401, + "step": 7600 + }, + { + "epoch": 1.3111647139903515, + "grad_norm": 2.2035515308380127, + "learning_rate": 6.397564717772479e-09, + "logits/chosen": -2.997570753097534, + "logits/rejected": -2.9652457237243652, + "logps/chosen": -55.637603759765625, + "logps/rejected": -54.816688537597656, + "loss": 0.6842, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.027262642979621887, + "rewards/margins": 0.018677236512303352, + "rewards/rejected": -0.04593988507986069, + "step": 7610 + }, + { + "epoch": 1.3128876636802205, + "grad_norm": 2.3824641704559326, + "learning_rate": 6.369528394544282e-09, + "logits/chosen": -3.0388424396514893, + "logits/rejected": -3.010739803314209, + "logps/chosen": -59.684471130371094, + "logps/rejected": -57.155296325683594, + "loss": 0.6876, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.02171112224459648, + "rewards/margins": 0.01191165205091238, + "rewards/rejected": -0.033622775226831436, + "step": 7620 + }, + { + "epoch": 1.3146106133700897, + "grad_norm": 2.6214728355407715, + "learning_rate": 6.341524901950352e-09, + "logits/chosen": -2.972496509552002, + "logits/rejected": -2.986039400100708, + "logps/chosen": -52.67688751220703, + "logps/rejected": -57.54160690307617, + "loss": 0.6902, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.028438786044716835, + "rewards/margins": 0.0066140033304691315, + "rewards/rejected": -0.03505278751254082, + "step": 7630 + }, + { + "epoch": 1.3163335630599586, + "grad_norm": 2.7644217014312744, + "learning_rate": 6.3135544932284304e-09, + "logits/chosen": -2.9409775733947754, + "logits/rejected": -2.917985677719116, + "logps/chosen": -60.52558135986328, + "logps/rejected": -56.77556610107422, + "loss": 0.6848, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02607950009405613, + "rewards/margins": 0.01758510060608387, + "rewards/rejected": -0.0436645969748497, + "step": 7640 + }, + { + "epoch": 1.3180565127498278, + "grad_norm": 2.5142641067504883, + "learning_rate": 6.2856174213170735e-09, + "logits/chosen": -3.073223114013672, + "logits/rejected": -3.048750162124634, + "logps/chosen": -61.158958435058594, + "logps/rejected": -60.50822067260742, + "loss": 0.6838, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.027626430615782738, + "rewards/margins": 0.019644903019070625, + "rewards/rejected": -0.04727133363485336, + "step": 7650 + }, + { + "epoch": 1.3197794624396968, + "grad_norm": 2.8654909133911133, + "learning_rate": 6.25771393885338e-09, + "logits/chosen": -3.0456790924072266, + "logits/rejected": -3.005376100540161, + "logps/chosen": -59.436973571777344, + "logps/rejected": -59.17437744140625, + "loss": 0.6836, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.024066317826509476, + "rewards/margins": 0.019956592470407486, + "rewards/rejected": -0.04402291029691696, + "step": 7660 + }, + { + "epoch": 1.3215024121295658, + "grad_norm": 2.4010403156280518, + "learning_rate": 6.229844298170681e-09, + "logits/chosen": -2.9991161823272705, + "logits/rejected": -2.9706664085388184, + "logps/chosen": -60.234466552734375, + "logps/rejected": -57.38881301879883, + "loss": 0.6849, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.026901472359895706, + "rewards/margins": 0.017608607187867165, + "rewards/rejected": -0.04451008141040802, + "step": 7670 + }, + { + "epoch": 1.323225361819435, + "grad_norm": 2.2777979373931885, + "learning_rate": 6.202008751296293e-09, + "logits/chosen": -3.074632167816162, + "logits/rejected": -3.056203842163086, + "logps/chosen": -53.30518341064453, + "logps/rejected": -53.90324783325195, + "loss": 0.6859, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.026688244193792343, + "rewards/margins": 0.015266289934515953, + "rewards/rejected": -0.041954535990953445, + "step": 7680 + }, + { + "epoch": 1.324948311509304, + "grad_norm": 2.355191230773926, + "learning_rate": 6.174207549949205e-09, + "logits/chosen": -3.1611850261688232, + "logits/rejected": -3.1127707958221436, + "logps/chosen": -56.6767463684082, + "logps/rejected": -52.805686950683594, + "loss": 0.6833, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.016922805458307266, + "rewards/margins": 0.020582620054483414, + "rewards/rejected": -0.03750542551279068, + "step": 7690 + }, + { + "epoch": 1.3266712611991731, + "grad_norm": 2.563619613647461, + "learning_rate": 6.146440945537821e-09, + "logits/chosen": -3.015319585800171, + "logits/rejected": -2.9848389625549316, + "logps/chosen": -56.29510498046875, + "logps/rejected": -57.7125244140625, + "loss": 0.6865, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02584684267640114, + "rewards/margins": 0.01425609178841114, + "rewards/rejected": -0.04010293632745743, + "step": 7700 + }, + { + "epoch": 1.3266712611991731, + "eval_logits/chosen": -3.121819496154785, + "eval_logits/rejected": -3.1161410808563232, + "eval_logps/chosen": -59.28498458862305, + "eval_logps/rejected": -64.3291015625, + "eval_loss": 0.690426766872406, + "eval_rewards/accuracies": 0.5845724940299988, + "eval_rewards/chosen": -0.005730922799557447, + "eval_rewards/margins": 0.005758913233876228, + "eval_rewards/rejected": -0.011489835567772388, + "eval_runtime": 384.1336, + "eval_samples_per_second": 11.204, + "eval_steps_per_second": 1.401, + "step": 7700 + }, + { + "epoch": 1.328394210889042, + "grad_norm": 2.249321699142456, + "learning_rate": 6.1187091891576855e-09, + "logits/chosen": -3.061793565750122, + "logits/rejected": -3.0465798377990723, + "logps/chosen": -54.94254684448242, + "logps/rejected": -55.14897918701172, + "loss": 0.6893, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.033700019121170044, + "rewards/margins": 0.008526310324668884, + "rewards/rejected": -0.04222633317112923, + "step": 7710 + }, + { + "epoch": 1.330117160578911, + "grad_norm": 2.452054023742676, + "learning_rate": 6.091012531589198e-09, + "logits/chosen": -3.036054849624634, + "logits/rejected": -2.9991581439971924, + "logps/chosen": -59.057716369628906, + "logps/rejected": -55.31732940673828, + "loss": 0.6857, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.029151711612939835, + "rewards/margins": 0.01590500771999359, + "rewards/rejected": -0.045056719332933426, + "step": 7720 + }, + { + "epoch": 1.33184011026878, + "grad_norm": 2.3000831604003906, + "learning_rate": 6.063351223295377e-09, + "logits/chosen": -2.992392063140869, + "logits/rejected": -2.9694530963897705, + "logps/chosen": -56.845069885253906, + "logps/rejected": -56.6755256652832, + "loss": 0.6861, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.029227161779999733, + "rewards/margins": 0.014832262881100178, + "rewards/rejected": -0.044059425592422485, + "step": 7730 + }, + { + "epoch": 1.3335630599586492, + "grad_norm": 2.3779852390289307, + "learning_rate": 6.035725514419554e-09, + "logits/chosen": -2.9875688552856445, + "logits/rejected": -2.971534013748169, + "logps/chosen": -54.39215087890625, + "logps/rejected": -58.52937698364258, + "loss": 0.6833, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.02687232568860054, + "rewards/margins": 0.020616920664906502, + "rewards/rejected": -0.047489240765571594, + "step": 7740 + }, + { + "epoch": 1.3352860096485184, + "grad_norm": 2.277998447418213, + "learning_rate": 6.008135654783151e-09, + "logits/chosen": -3.0029854774475098, + "logits/rejected": -2.968958854675293, + "logps/chosen": -56.569854736328125, + "logps/rejected": -57.06427001953125, + "loss": 0.6825, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.026270601898431778, + "rewards/margins": 0.02241623029112816, + "rewards/rejected": -0.04868683964014053, + "step": 7750 + }, + { + "epoch": 1.3370089593383874, + "grad_norm": 2.386118173599243, + "learning_rate": 5.980581893883383e-09, + "logits/chosen": -2.9068267345428467, + "logits/rejected": -2.8905227184295654, + "logps/chosen": -53.168365478515625, + "logps/rejected": -55.826568603515625, + "loss": 0.6862, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.026616433635354042, + "rewards/margins": 0.01475044060498476, + "rewards/rejected": -0.04136687144637108, + "step": 7760 + }, + { + "epoch": 1.3387319090282563, + "grad_norm": 2.473367214202881, + "learning_rate": 5.95306448089104e-09, + "logits/chosen": -2.953372001647949, + "logits/rejected": -2.928014039993286, + "logps/chosen": -54.80479049682617, + "logps/rejected": -56.73512649536133, + "loss": 0.6858, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.026885952800512314, + "rewards/margins": 0.015338996425271034, + "rewards/rejected": -0.0422249510884285, + "step": 7770 + }, + { + "epoch": 1.3404548587181253, + "grad_norm": 2.2375106811523438, + "learning_rate": 5.925583664648201e-09, + "logits/chosen": -3.116488456726074, + "logits/rejected": -3.0995609760284424, + "logps/chosen": -54.901405334472656, + "logps/rejected": -58.4135856628418, + "loss": 0.6878, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.033624183386564255, + "rewards/margins": 0.011648926883935928, + "rewards/rejected": -0.04527311399579048, + "step": 7780 + }, + { + "epoch": 1.3421778084079945, + "grad_norm": 2.5394904613494873, + "learning_rate": 5.898139693666007e-09, + "logits/chosen": -3.045893907546997, + "logits/rejected": -3.030491352081299, + "logps/chosen": -54.684173583984375, + "logps/rejected": -56.65555953979492, + "loss": 0.6853, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.031137654557824135, + "rewards/margins": 0.016448350623250008, + "rewards/rejected": -0.047586001455783844, + "step": 7790 + }, + { + "epoch": 1.3439007580978635, + "grad_norm": 2.456921100616455, + "learning_rate": 5.870732816122394e-09, + "logits/chosen": -3.0814521312713623, + "logits/rejected": -3.0761094093322754, + "logps/chosen": -57.13507843017578, + "logps/rejected": -58.82529830932617, + "loss": 0.6888, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03154601901769638, + "rewards/margins": 0.009393574669957161, + "rewards/rejected": -0.040939588099718094, + "step": 7800 + }, + { + "epoch": 1.3439007580978635, + "eval_logits/chosen": -3.1217851638793945, + "eval_logits/rejected": -3.1161201000213623, + "eval_logps/chosen": -59.319210052490234, + "eval_logps/rejected": -64.35895538330078, + "eval_loss": 0.6904501914978027, + "eval_rewards/accuracies": 0.5820167064666748, + "eval_rewards/chosen": -0.006073120515793562, + "eval_rewards/margins": 0.005715163890272379, + "eval_rewards/rejected": -0.01178828440606594, + "eval_runtime": 383.7253, + "eval_samples_per_second": 11.216, + "eval_steps_per_second": 1.402, + "step": 7800 + }, + { + "epoch": 1.3456237077877327, + "grad_norm": 2.6416800022125244, + "learning_rate": 5.843363279859875e-09, + "logits/chosen": -3.0954296588897705, + "logits/rejected": -3.067091464996338, + "logps/chosen": -61.6202507019043, + "logps/rejected": -58.48479080200195, + "loss": 0.6879, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.026026686653494835, + "rewards/margins": 0.011260555125772953, + "rewards/rejected": -0.03728724271059036, + "step": 7810 + }, + { + "epoch": 1.3473466574776016, + "grad_norm": 2.322619676589966, + "learning_rate": 5.816031332383267e-09, + "logits/chosen": -3.060593605041504, + "logits/rejected": -3.0286638736724854, + "logps/chosen": -58.142189025878906, + "logps/rejected": -59.22846603393555, + "loss": 0.6829, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.01776704005897045, + "rewards/margins": 0.02139130048453808, + "rewards/rejected": -0.03915834426879883, + "step": 7820 + }, + { + "epoch": 1.3490696071674706, + "grad_norm": 2.551548957824707, + "learning_rate": 5.788737220857479e-09, + "logits/chosen": -2.9640679359436035, + "logits/rejected": -2.9546852111816406, + "logps/chosen": -52.7246208190918, + "logps/rejected": -58.02629852294922, + "loss": 0.6852, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.029245715588331223, + "rewards/margins": 0.016737982630729675, + "rewards/rejected": -0.045983701944351196, + "step": 7830 + }, + { + "epoch": 1.3507925568573398, + "grad_norm": 2.1704752445220947, + "learning_rate": 5.76148119210526e-09, + "logits/chosen": -2.958341360092163, + "logits/rejected": -2.936945676803589, + "logps/chosen": -56.746986389160156, + "logps/rejected": -58.57280731201172, + "loss": 0.6878, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.031794894486665726, + "rewards/margins": 0.011606425978243351, + "rewards/rejected": -0.0434013195335865, + "step": 7840 + }, + { + "epoch": 1.3525155065472088, + "grad_norm": 2.541645050048828, + "learning_rate": 5.734263492604981e-09, + "logits/chosen": -2.9532153606414795, + "logits/rejected": -2.91933012008667, + "logps/chosen": -58.54957962036133, + "logps/rejected": -52.94072341918945, + "loss": 0.6856, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.029725080356001854, + "rewards/margins": 0.01591680757701397, + "rewards/rejected": -0.04564188793301582, + "step": 7850 + }, + { + "epoch": 1.354238456237078, + "grad_norm": 2.3668036460876465, + "learning_rate": 5.70708436848839e-09, + "logits/chosen": -2.987510919570923, + "logits/rejected": -2.9515819549560547, + "logps/chosen": -55.26435089111328, + "logps/rejected": -57.63298416137695, + "loss": 0.6805, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.023679155856370926, + "rewards/margins": 0.02612263336777687, + "rewards/rejected": -0.0498017817735672, + "step": 7860 + }, + { + "epoch": 1.355961405926947, + "grad_norm": 2.4281444549560547, + "learning_rate": 5.679944065538403e-09, + "logits/chosen": -3.0099899768829346, + "logits/rejected": -2.9925038814544678, + "logps/chosen": -58.16576385498047, + "logps/rejected": -58.61614227294922, + "loss": 0.6847, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02491823211312294, + "rewards/margins": 0.01760999858379364, + "rewards/rejected": -0.04252823442220688, + "step": 7870 + }, + { + "epoch": 1.3576843556168159, + "grad_norm": 2.4042530059814453, + "learning_rate": 5.652842829186866e-09, + "logits/chosen": -3.0708887577056885, + "logits/rejected": -3.0550119876861572, + "logps/chosen": -55.79817581176758, + "logps/rejected": -56.93279266357422, + "loss": 0.685, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.026845788583159447, + "rewards/margins": 0.01735367253422737, + "rewards/rejected": -0.04419945925474167, + "step": 7880 + }, + { + "epoch": 1.359407305306685, + "grad_norm": 2.446213722229004, + "learning_rate": 5.625780904512352e-09, + "logits/chosen": -3.010209560394287, + "logits/rejected": -2.9910619258880615, + "logps/chosen": -56.8497428894043, + "logps/rejected": -58.90478515625, + "loss": 0.6877, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.024736952036619186, + "rewards/margins": 0.011801651678979397, + "rewards/rejected": -0.036538608372211456, + "step": 7890 + }, + { + "epoch": 1.361130254996554, + "grad_norm": 2.7450051307678223, + "learning_rate": 5.598758536237917e-09, + "logits/chosen": -2.999788284301758, + "logits/rejected": -2.996474504470825, + "logps/chosen": -55.325157165527344, + "logps/rejected": -58.1741943359375, + "loss": 0.6868, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.031489770859479904, + "rewards/margins": 0.013761959969997406, + "rewards/rejected": -0.04525173455476761, + "step": 7900 + }, + { + "epoch": 1.361130254996554, + "eval_logits/chosen": -3.1220314502716064, + "eval_logits/rejected": -3.11635160446167, + "eval_logps/chosen": -59.33341979980469, + "eval_logps/rejected": -64.38567352294922, + "eval_loss": 0.6903904676437378, + "eval_rewards/accuracies": 0.5845724940299988, + "eval_rewards/chosen": -0.006215228233486414, + "eval_rewards/margins": 0.005840308964252472, + "eval_rewards/rejected": -0.012055537663400173, + "eval_runtime": 383.7646, + "eval_samples_per_second": 11.215, + "eval_steps_per_second": 1.402, + "step": 7900 + }, + { + "epoch": 1.3628532046864232, + "grad_norm": 2.488607883453369, + "learning_rate": 5.571775968728934e-09, + "logits/chosen": -3.006234645843506, + "logits/rejected": -2.9830057621002197, + "logps/chosen": -59.74515914916992, + "logps/rejected": -57.955780029296875, + "loss": 0.6867, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.029217317700386047, + "rewards/margins": 0.013838951475918293, + "rewards/rejected": -0.043056271970272064, + "step": 7910 + }, + { + "epoch": 1.3645761543762922, + "grad_norm": 2.591261863708496, + "learning_rate": 5.544833445990827e-09, + "logits/chosen": -3.0004377365112305, + "logits/rejected": -2.9761016368865967, + "logps/chosen": -57.81911087036133, + "logps/rejected": -55.67632293701172, + "loss": 0.6874, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.02998395264148712, + "rewards/margins": 0.012168223969638348, + "rewards/rejected": -0.042152177542448044, + "step": 7920 + }, + { + "epoch": 1.3662991040661612, + "grad_norm": 2.607875347137451, + "learning_rate": 5.517931211666907e-09, + "logits/chosen": -3.067615270614624, + "logits/rejected": -3.0388498306274414, + "logps/chosen": -60.940086364746094, + "logps/rejected": -56.713706970214844, + "loss": 0.6851, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.026055265218019485, + "rewards/margins": 0.017093230038881302, + "rewards/rejected": -0.04314848780632019, + "step": 7930 + }, + { + "epoch": 1.3680220537560304, + "grad_norm": 2.8139894008636475, + "learning_rate": 5.491069509036151e-09, + "logits/chosen": -2.9540796279907227, + "logits/rejected": -2.9387428760528564, + "logps/chosen": -61.240760803222656, + "logps/rejected": -59.60124588012695, + "loss": 0.6851, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.026804599910974503, + "rewards/margins": 0.01700502447783947, + "rewards/rejected": -0.04380962252616882, + "step": 7940 + }, + { + "epoch": 1.3697450034458993, + "grad_norm": 2.6647346019744873, + "learning_rate": 5.464248581011002e-09, + "logits/chosen": -2.9159152507781982, + "logits/rejected": -2.9074153900146484, + "logps/chosen": -54.17744827270508, + "logps/rejected": -57.369102478027344, + "loss": 0.6864, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.02671412192285061, + "rewards/margins": 0.014292205683887005, + "rewards/rejected": -0.04100632667541504, + "step": 7950 + }, + { + "epoch": 1.3714679531357685, + "grad_norm": 2.25141978263855, + "learning_rate": 5.4374686701351815e-09, + "logits/chosen": -2.9339842796325684, + "logits/rejected": -2.905107021331787, + "logps/chosen": -53.44514846801758, + "logps/rejected": -56.796424865722656, + "loss": 0.6869, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.029661059379577637, + "rewards/margins": 0.013453202322125435, + "rewards/rejected": -0.04311426356434822, + "step": 7960 + }, + { + "epoch": 1.3731909028256375, + "grad_norm": 2.0555899143218994, + "learning_rate": 5.410730018581482e-09, + "logits/chosen": -3.035153865814209, + "logits/rejected": -3.003742218017578, + "logps/chosen": -58.52336502075195, + "logps/rejected": -56.82777786254883, + "loss": 0.6855, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.031260039657354355, + "rewards/margins": 0.016300015151500702, + "rewards/rejected": -0.047560058534145355, + "step": 7970 + }, + { + "epoch": 1.3749138525155065, + "grad_norm": 2.8370964527130127, + "learning_rate": 5.384032868149595e-09, + "logits/chosen": -3.0796780586242676, + "logits/rejected": -3.0659830570220947, + "logps/chosen": -59.4162483215332, + "logps/rejected": -60.17753219604492, + "loss": 0.6859, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.02536647394299507, + "rewards/margins": 0.015350818634033203, + "rewards/rejected": -0.040717292577028275, + "step": 7980 + }, + { + "epoch": 1.3766368022053757, + "grad_norm": 2.404433488845825, + "learning_rate": 5.357377460263893e-09, + "logits/chosen": -3.1086089611053467, + "logits/rejected": -3.0962395668029785, + "logps/chosen": -53.78315353393555, + "logps/rejected": -55.634429931640625, + "loss": 0.6892, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.031159456819295883, + "rewards/margins": 0.008595505729317665, + "rewards/rejected": -0.0397549606859684, + "step": 7990 + }, + { + "epoch": 1.3783597518952446, + "grad_norm": 2.7585387229919434, + "learning_rate": 5.330764035971298e-09, + "logits/chosen": -2.9658491611480713, + "logits/rejected": -2.9562182426452637, + "logps/chosen": -57.27336502075195, + "logps/rejected": -60.18293380737305, + "loss": 0.6876, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.02755850926041603, + "rewards/margins": 0.011846770532429218, + "rewards/rejected": -0.039405278861522675, + "step": 8000 + }, + { + "epoch": 1.3783597518952446, + "eval_logits/chosen": -3.1204283237457275, + "eval_logits/rejected": -3.1147513389587402, + "eval_logps/chosen": -59.34055709838867, + "eval_logps/rejected": -64.40652465820312, + "eval_loss": 0.6903232932090759, + "eval_rewards/accuracies": 0.5838754773139954, + "eval_rewards/chosen": -0.0062866369262337685, + "eval_rewards/margins": 0.005977442022413015, + "eval_rewards/rejected": -0.012264078482985497, + "eval_runtime": 383.7449, + "eval_samples_per_second": 11.216, + "eval_steps_per_second": 1.402, + "step": 8000 + }, + { + "epoch": 1.3800827015851138, + "grad_norm": 2.6549692153930664, + "learning_rate": 5.3041928359390415e-09, + "logits/chosen": -2.9890923500061035, + "logits/rejected": -2.9675803184509277, + "logps/chosen": -59.91522216796875, + "logps/rejected": -56.33888626098633, + "loss": 0.6879, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.027553830295801163, + "rewards/margins": 0.011601470410823822, + "rewards/rejected": -0.039155296981334686, + "step": 8010 + }, + { + "epoch": 1.3818056512749828, + "grad_norm": 2.279928684234619, + "learning_rate": 5.277664100452546e-09, + "logits/chosen": -3.0127570629119873, + "logits/rejected": -2.9724864959716797, + "logps/chosen": -63.12485885620117, + "logps/rejected": -59.47083282470703, + "loss": 0.6845, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.028870245441794395, + "rewards/margins": 0.0184099730104208, + "rewards/rejected": -0.047280218452215195, + "step": 8020 + }, + { + "epoch": 1.3835286009648518, + "grad_norm": 2.4220051765441895, + "learning_rate": 5.251178069413196e-09, + "logits/chosen": -3.0053482055664062, + "logits/rejected": -2.9898781776428223, + "logps/chosen": -53.86786651611328, + "logps/rejected": -57.864479064941406, + "loss": 0.685, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02724533900618553, + "rewards/margins": 0.01722780428826809, + "rewards/rejected": -0.04447314515709877, + "step": 8030 + }, + { + "epoch": 1.385251550654721, + "grad_norm": 2.450390100479126, + "learning_rate": 5.224734982336216e-09, + "logits/chosen": -2.96675443649292, + "logits/rejected": -2.934190511703491, + "logps/chosen": -57.38630294799805, + "logps/rejected": -56.2953987121582, + "loss": 0.6871, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.029596591368317604, + "rewards/margins": 0.013075938448309898, + "rewards/rejected": -0.0426725298166275, + "step": 8040 + }, + { + "epoch": 1.38697450034459, + "grad_norm": 2.611423969268799, + "learning_rate": 5.198335078348475e-09, + "logits/chosen": -3.0320591926574707, + "logits/rejected": -3.0146846771240234, + "logps/chosen": -57.07909393310547, + "logps/rejected": -59.59125900268555, + "loss": 0.6847, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.019715677946805954, + "rewards/margins": 0.017734985798597336, + "rewards/rejected": -0.03745066374540329, + "step": 8050 + }, + { + "epoch": 1.388697450034459, + "grad_norm": 2.637826919555664, + "learning_rate": 5.171978596186342e-09, + "logits/chosen": -3.0777766704559326, + "logits/rejected": -3.0379884243011475, + "logps/chosen": -57.865684509277344, + "logps/rejected": -55.034271240234375, + "loss": 0.6816, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.025270912796258926, + "rewards/margins": 0.024199113249778748, + "rewards/rejected": -0.049470026046037674, + "step": 8060 + }, + { + "epoch": 1.390420399724328, + "grad_norm": 2.563366651535034, + "learning_rate": 5.145665774193511e-09, + "logits/chosen": -2.9853625297546387, + "logits/rejected": -2.954200267791748, + "logps/chosen": -56.14939498901367, + "logps/rejected": -54.62565231323242, + "loss": 0.6838, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02691097930073738, + "rewards/margins": 0.01969815418124199, + "rewards/rejected": -0.04660913720726967, + "step": 8070 + }, + { + "epoch": 1.392143349414197, + "grad_norm": 2.5079445838928223, + "learning_rate": 5.1193968503188584e-09, + "logits/chosen": -2.965848445892334, + "logits/rejected": -2.9610848426818848, + "logps/chosen": -55.743553161621094, + "logps/rejected": -61.7056884765625, + "loss": 0.6887, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.02757110632956028, + "rewards/margins": 0.009862509556114674, + "rewards/rejected": -0.03743361681699753, + "step": 8080 + }, + { + "epoch": 1.3938662991040662, + "grad_norm": 2.365835428237915, + "learning_rate": 5.093172062114284e-09, + "logits/chosen": -2.9592156410217285, + "logits/rejected": -2.9305264949798584, + "logps/chosen": -55.8453369140625, + "logps/rejected": -56.141029357910156, + "loss": 0.6838, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.027269789949059486, + "rewards/margins": 0.019605975598096848, + "rewards/rejected": -0.046875763684511185, + "step": 8090 + }, + { + "epoch": 1.3955892487939352, + "grad_norm": 2.5415031909942627, + "learning_rate": 5.066991646732575e-09, + "logits/chosen": -3.0153441429138184, + "logits/rejected": -3.0083537101745605, + "logps/chosen": -56.55889892578125, + "logps/rejected": -61.67368698120117, + "loss": 0.688, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.024770287796854973, + "rewards/margins": 0.011040473356842995, + "rewards/rejected": -0.03581076115369797, + "step": 8100 + }, + { + "epoch": 1.3955892487939352, + "eval_logits/chosen": -3.120063304901123, + "eval_logits/rejected": -3.1143927574157715, + "eval_logps/chosen": -59.36701583862305, + "eval_logps/rejected": -64.42515563964844, + "eval_loss": 0.6903651356697083, + "eval_rewards/accuracies": 0.5831784605979919, + "eval_rewards/chosen": -0.006551176775246859, + "eval_rewards/margins": 0.0058991494588553905, + "eval_rewards/rejected": -0.01245032623410225, + "eval_runtime": 384.1938, + "eval_samples_per_second": 11.203, + "eval_steps_per_second": 1.4, + "step": 8100 + }, + { + "epoch": 1.3973121984838044, + "grad_norm": 2.5754048824310303, + "learning_rate": 5.040855840925227e-09, + "logits/chosen": -3.0326080322265625, + "logits/rejected": -2.9937872886657715, + "logps/chosen": -58.87183380126953, + "logps/rejected": -55.71844482421875, + "loss": 0.6843, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.028102803975343704, + "rewards/margins": 0.01863877847790718, + "rewards/rejected": -0.04674157872796059, + "step": 8110 + }, + { + "epoch": 1.3990351481736734, + "grad_norm": 2.5665032863616943, + "learning_rate": 5.014764881040364e-09, + "logits/chosen": -2.9887478351593018, + "logits/rejected": -2.9701955318450928, + "logps/chosen": -56.5562744140625, + "logps/rejected": -56.73915481567383, + "loss": 0.6865, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.024538787081837654, + "rewards/margins": 0.014023144729435444, + "rewards/rejected": -0.038561929017305374, + "step": 8120 + }, + { + "epoch": 1.4007580978635423, + "grad_norm": 2.2508840560913086, + "learning_rate": 4.98871900302053e-09, + "logits/chosen": -3.002256155014038, + "logits/rejected": -2.98468279838562, + "logps/chosen": -57.396080017089844, + "logps/rejected": -54.70935821533203, + "loss": 0.6859, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02575969137251377, + "rewards/margins": 0.015312038362026215, + "rewards/rejected": -0.041071731597185135, + "step": 8130 + }, + { + "epoch": 1.4024810475534115, + "grad_norm": 2.453939199447632, + "learning_rate": 4.962718442400611e-09, + "logits/chosen": -2.9657180309295654, + "logits/rejected": -2.939147710800171, + "logps/chosen": -55.78546142578125, + "logps/rejected": -56.93006134033203, + "loss": 0.6883, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.030585378408432007, + "rewards/margins": 0.010639393702149391, + "rewards/rejected": -0.04122477397322655, + "step": 8140 + }, + { + "epoch": 1.4042039972432805, + "grad_norm": 2.562664747238159, + "learning_rate": 4.9367634343056786e-09, + "logits/chosen": -3.0493216514587402, + "logits/rejected": -3.028538703918457, + "logps/chosen": -55.07560348510742, + "logps/rejected": -57.47492599487305, + "loss": 0.6862, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.029278188943862915, + "rewards/margins": 0.014732874929904938, + "rewards/rejected": -0.04401106387376785, + "step": 8150 + }, + { + "epoch": 1.4059269469331497, + "grad_norm": 2.32053804397583, + "learning_rate": 4.91085421344887e-09, + "logits/chosen": -2.979808807373047, + "logits/rejected": -2.959838628768921, + "logps/chosen": -59.30604934692383, + "logps/rejected": -57.034095764160156, + "loss": 0.6864, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.032929785549640656, + "rewards/margins": 0.014530050568282604, + "rewards/rejected": -0.047459833323955536, + "step": 8160 + }, + { + "epoch": 1.4076498966230186, + "grad_norm": 2.6533477306365967, + "learning_rate": 4.884991014129263e-09, + "logits/chosen": -3.0457491874694824, + "logits/rejected": -3.0203018188476562, + "logps/chosen": -62.3807487487793, + "logps/rejected": -55.70471954345703, + "loss": 0.6881, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.026699107140302658, + "rewards/margins": 0.010825890116393566, + "rewards/rejected": -0.0375249981880188, + "step": 8170 + }, + { + "epoch": 1.4093728463128876, + "grad_norm": 2.831326961517334, + "learning_rate": 4.8591740702297614e-09, + "logits/chosen": -3.03529691696167, + "logits/rejected": -3.0137627124786377, + "logps/chosen": -58.8499641418457, + "logps/rejected": -59.05854415893555, + "loss": 0.6861, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.02557712234556675, + "rewards/margins": 0.014896447770297527, + "rewards/rejected": -0.040473572909832, + "step": 8180 + }, + { + "epoch": 1.4110957960027566, + "grad_norm": 2.409442186355591, + "learning_rate": 4.8334036152149805e-09, + "logits/chosen": -2.9958674907684326, + "logits/rejected": -2.9718363285064697, + "logps/chosen": -60.311866760253906, + "logps/rejected": -58.06665802001953, + "loss": 0.6842, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.02333252504467964, + "rewards/margins": 0.01871083490550518, + "rewards/rejected": -0.04204336181282997, + "step": 8190 + }, + { + "epoch": 1.4128187456926258, + "grad_norm": 2.4591057300567627, + "learning_rate": 4.807679882129118e-09, + "logits/chosen": -3.0280957221984863, + "logits/rejected": -2.993908405303955, + "logps/chosen": -57.98280715942383, + "logps/rejected": -57.11616897583008, + "loss": 0.6858, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.029876669868826866, + "rewards/margins": 0.015772178769111633, + "rewards/rejected": -0.04564885050058365, + "step": 8200 + }, + { + "epoch": 1.4128187456926258, + "eval_logits/chosen": -3.119671583175659, + "eval_logits/rejected": -3.1139838695526123, + "eval_logps/chosen": -59.38847351074219, + "eval_logps/rejected": -64.45050811767578, + "eval_loss": 0.6903483271598816, + "eval_rewards/accuracies": 0.578066885471344, + "eval_rewards/chosen": -0.006765724625438452, + "eval_rewards/margins": 0.005938132759183645, + "eval_rewards/rejected": -0.012703859247267246, + "eval_runtime": 384.1597, + "eval_samples_per_second": 11.204, + "eval_steps_per_second": 1.4, + "step": 8200 + }, + { + "epoch": 1.414541695382495, + "grad_norm": 2.437087059020996, + "learning_rate": 4.782003103593887e-09, + "logits/chosen": -2.9005608558654785, + "logits/rejected": -2.882728099822998, + "logps/chosen": -57.88407516479492, + "logps/rejected": -60.46735382080078, + "loss": 0.6859, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.027791589498519897, + "rewards/margins": 0.015335688367486, + "rewards/rejected": -0.04312727600336075, + "step": 8210 + }, + { + "epoch": 1.416264645072364, + "grad_norm": 2.3415608406066895, + "learning_rate": 4.756373511806359e-09, + "logits/chosen": -3.0372555255889893, + "logits/rejected": -3.0118002891540527, + "logps/chosen": -56.182525634765625, + "logps/rejected": -56.98543167114258, + "loss": 0.6894, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.030894294381141663, + "rewards/margins": 0.008487719111144543, + "rewards/rejected": -0.03938201442360878, + "step": 8220 + }, + { + "epoch": 1.417987594762233, + "grad_norm": 2.3519725799560547, + "learning_rate": 4.73079133853692e-09, + "logits/chosen": -2.9884510040283203, + "logits/rejected": -2.969163417816162, + "logps/chosen": -57.80139923095703, + "logps/rejected": -55.51936721801758, + "loss": 0.6854, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02552812732756138, + "rewards/margins": 0.016272902488708496, + "rewards/rejected": -0.04180102422833443, + "step": 8230 + }, + { + "epoch": 1.4197105444521019, + "grad_norm": 2.396422863006592, + "learning_rate": 4.705256815127122e-09, + "logits/chosen": -3.105541706085205, + "logits/rejected": -3.075623035430908, + "logps/chosen": -58.82453155517578, + "logps/rejected": -55.772682189941406, + "loss": 0.6872, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.026508769020438194, + "rewards/margins": 0.012797790579497814, + "rewards/rejected": -0.039306558668613434, + "step": 8240 + }, + { + "epoch": 1.421433494141971, + "grad_norm": 2.6921234130859375, + "learning_rate": 4.679770172487632e-09, + "logits/chosen": -3.02769136428833, + "logits/rejected": -3.0033533573150635, + "logps/chosen": -60.10667037963867, + "logps/rejected": -58.4813117980957, + "loss": 0.6851, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.024572988972067833, + "rewards/margins": 0.017005886882543564, + "rewards/rejected": -0.041578877717256546, + "step": 8250 + }, + { + "epoch": 1.42315644383184, + "grad_norm": 2.8120920658111572, + "learning_rate": 4.6543316410961176e-09, + "logits/chosen": -3.1258349418640137, + "logits/rejected": -3.104653835296631, + "logps/chosen": -58.88138961791992, + "logps/rejected": -59.416297912597656, + "loss": 0.6852, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.024667982012033463, + "rewards/margins": 0.016674166545271873, + "rewards/rejected": -0.041342150419950485, + "step": 8260 + }, + { + "epoch": 1.4248793935217092, + "grad_norm": 2.539137840270996, + "learning_rate": 4.62894145099518e-09, + "logits/chosen": -3.0966219902038574, + "logits/rejected": -3.085365056991577, + "logps/chosen": -56.49332809448242, + "logps/rejected": -59.21570587158203, + "loss": 0.6898, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.030976910144090652, + "rewards/margins": 0.007497703190892935, + "rewards/rejected": -0.038474611937999725, + "step": 8270 + }, + { + "epoch": 1.4266023432115782, + "grad_norm": 2.646667242050171, + "learning_rate": 4.603599831790262e-09, + "logits/chosen": -2.9852712154388428, + "logits/rejected": -2.974323272705078, + "logps/chosen": -56.3748893737793, + "logps/rejected": -57.87810516357422, + "loss": 0.6886, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.03997837379574776, + "rewards/margins": 0.01005262229591608, + "rewards/rejected": -0.05003099516034126, + "step": 8280 + }, + { + "epoch": 1.4283252929014472, + "grad_norm": 2.5989837646484375, + "learning_rate": 4.578307012647578e-09, + "logits/chosen": -2.975891590118408, + "logits/rejected": -2.92683744430542, + "logps/chosen": -61.94519805908203, + "logps/rejected": -58.292625427246094, + "loss": 0.6831, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.020089568570256233, + "rewards/margins": 0.020902033895254135, + "rewards/rejected": -0.04099160432815552, + "step": 8290 + }, + { + "epoch": 1.4300482425913164, + "grad_norm": 2.4585771560668945, + "learning_rate": 4.553063222292038e-09, + "logits/chosen": -3.154768466949463, + "logits/rejected": -3.13096284866333, + "logps/chosen": -60.180511474609375, + "logps/rejected": -59.13969802856445, + "loss": 0.6836, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.026011323556303978, + "rewards/margins": 0.02008582092821598, + "rewards/rejected": -0.04609714448451996, + "step": 8300 + }, + { + "epoch": 1.4300482425913164, + "eval_logits/chosen": -3.1194801330566406, + "eval_logits/rejected": -3.1138548851013184, + "eval_logps/chosen": -59.40495300292969, + "eval_logps/rejected": -64.46597290039062, + "eval_loss": 0.6903538703918457, + "eval_rewards/accuracies": 0.5822490453720093, + "eval_rewards/chosen": -0.006930571049451828, + "eval_rewards/margins": 0.005927949212491512, + "eval_rewards/rejected": -0.01285852026194334, + "eval_runtime": 384.0017, + "eval_samples_per_second": 11.208, + "eval_steps_per_second": 1.401, + "step": 8300 + }, + { + "epoch": 1.4317711922811853, + "grad_norm": 2.729517936706543, + "learning_rate": 4.5278686890051835e-09, + "logits/chosen": -2.925302743911743, + "logits/rejected": -2.894500494003296, + "logps/chosen": -58.59355926513672, + "logps/rejected": -55.01383590698242, + "loss": 0.6859, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.023244168609380722, + "rewards/margins": 0.015363660641014576, + "rewards/rejected": -0.03860782831907272, + "step": 8310 + }, + { + "epoch": 1.4334941419710545, + "grad_norm": 2.2358431816101074, + "learning_rate": 4.502723640623117e-09, + "logits/chosen": -3.0099167823791504, + "logits/rejected": -2.9926342964172363, + "logps/chosen": -55.30101776123047, + "logps/rejected": -58.07609939575195, + "loss": 0.6874, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.027875792235136032, + "rewards/margins": 0.012339317239820957, + "rewards/rejected": -0.04021511226892471, + "step": 8320 + }, + { + "epoch": 1.4352170916609235, + "grad_norm": 2.591007947921753, + "learning_rate": 4.477628304534454e-09, + "logits/chosen": -3.0072147846221924, + "logits/rejected": -2.997791051864624, + "logps/chosen": -51.999046325683594, + "logps/rejected": -58.81654739379883, + "loss": 0.6858, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03384193778038025, + "rewards/margins": 0.015689820051193237, + "rewards/rejected": -0.049531761556863785, + "step": 8330 + }, + { + "epoch": 1.4369400413507925, + "grad_norm": 2.1845362186431885, + "learning_rate": 4.45258290767824e-09, + "logits/chosen": -3.0681591033935547, + "logits/rejected": -3.0523247718811035, + "logps/chosen": -57.44211959838867, + "logps/rejected": -56.03607940673828, + "loss": 0.6868, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02998577430844307, + "rewards/margins": 0.013758832588791847, + "rewards/rejected": -0.043744608759880066, + "step": 8340 + }, + { + "epoch": 1.4386629910406616, + "grad_norm": 2.7989895343780518, + "learning_rate": 4.427587676541932e-09, + "logits/chosen": -2.91628360748291, + "logits/rejected": -2.8969550132751465, + "logps/chosen": -56.94103240966797, + "logps/rejected": -57.142295837402344, + "loss": 0.6898, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.037700019776821136, + "rewards/margins": 0.0077774375677108765, + "rewards/rejected": -0.04547745734453201, + "step": 8350 + }, + { + "epoch": 1.4403859407305306, + "grad_norm": 2.4125726222991943, + "learning_rate": 4.4026428371593305e-09, + "logits/chosen": -2.9135327339172363, + "logits/rejected": -2.877890110015869, + "logps/chosen": -56.788597106933594, + "logps/rejected": -56.7236328125, + "loss": 0.6807, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.022345153614878654, + "rewards/margins": 0.0258797500282526, + "rewards/rejected": -0.048224903643131256, + "step": 8360 + }, + { + "epoch": 1.4421088904203998, + "grad_norm": 2.557354211807251, + "learning_rate": 4.377748615108539e-09, + "logits/chosen": -2.9914212226867676, + "logits/rejected": -2.973506450653076, + "logps/chosen": -54.44336700439453, + "logps/rejected": -57.72014617919922, + "loss": 0.69, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0338708758354187, + "rewards/margins": 0.007198970764875412, + "rewards/rejected": -0.04106984660029411, + "step": 8370 + }, + { + "epoch": 1.4438318401102688, + "grad_norm": 2.258762836456299, + "learning_rate": 4.352905235509924e-09, + "logits/chosen": -3.111560344696045, + "logits/rejected": -3.0924363136291504, + "logps/chosen": -53.72526168823242, + "logps/rejected": -58.379425048828125, + "loss": 0.6848, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.024840399622917175, + "rewards/margins": 0.017487522214651108, + "rewards/rejected": -0.04232792183756828, + "step": 8380 + }, + { + "epoch": 1.4455547898001377, + "grad_norm": 2.6358630657196045, + "learning_rate": 4.328112923024079e-09, + "logits/chosen": -3.059661865234375, + "logits/rejected": -3.0261569023132324, + "logps/chosen": -59.1878547668457, + "logps/rejected": -61.03777313232422, + "loss": 0.6815, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.024916106835007668, + "rewards/margins": 0.024374935775995255, + "rewards/rejected": -0.049291037023067474, + "step": 8390 + }, + { + "epoch": 1.447277739490007, + "grad_norm": 2.550302028656006, + "learning_rate": 4.303371901849797e-09, + "logits/chosen": -3.0383198261260986, + "logits/rejected": -3.016228437423706, + "logps/chosen": -53.170433044433594, + "logps/rejected": -55.09442138671875, + "loss": 0.6863, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.028643745929002762, + "rewards/margins": 0.01448802649974823, + "rewards/rejected": -0.04313177242875099, + "step": 8400 + }, + { + "epoch": 1.447277739490007, + "eval_logits/chosen": -3.1202504634857178, + "eval_logits/rejected": -3.1146037578582764, + "eval_logps/chosen": -59.42176055908203, + "eval_logps/rejected": -64.49677276611328, + "eval_loss": 0.6902862787246704, + "eval_rewards/accuracies": 0.5829461216926575, + "eval_rewards/chosen": -0.007098623551428318, + "eval_rewards/margins": 0.006067925598472357, + "eval_rewards/rejected": -0.013166549615561962, + "eval_runtime": 383.7451, + "eval_samples_per_second": 11.216, + "eval_steps_per_second": 1.402, + "step": 8400 + }, + { + "epoch": 1.449000689179876, + "grad_norm": 2.573391914367676, + "learning_rate": 4.278682395722035e-09, + "logits/chosen": -2.9906022548675537, + "logits/rejected": -2.9558684825897217, + "logps/chosen": -60.6536979675293, + "logps/rejected": -57.44050216674805, + "loss": 0.6805, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.029932618141174316, + "rewards/margins": 0.02632906101644039, + "rewards/rejected": -0.05626168102025986, + "step": 8410 + }, + { + "epoch": 1.450723638869745, + "grad_norm": 2.531142473220825, + "learning_rate": 4.2540446279099024e-09, + "logits/chosen": -2.878419876098633, + "logits/rejected": -2.8656375408172607, + "logps/chosen": -54.85551071166992, + "logps/rejected": -55.78571319580078, + "loss": 0.687, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03203719109296799, + "rewards/margins": 0.013183876872062683, + "rewards/rejected": -0.04522106796503067, + "step": 8420 + }, + { + "epoch": 1.452446588559614, + "grad_norm": 2.613314390182495, + "learning_rate": 4.229458821214621e-09, + "logits/chosen": -3.008746385574341, + "logits/rejected": -2.993227005004883, + "logps/chosen": -55.98688888549805, + "logps/rejected": -60.89069366455078, + "loss": 0.6849, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.024954982101917267, + "rewards/margins": 0.017395135015249252, + "rewards/rejected": -0.04235011711716652, + "step": 8430 + }, + { + "epoch": 1.454169538249483, + "grad_norm": 2.6283459663391113, + "learning_rate": 4.2049251979675465e-09, + "logits/chosen": -3.0397636890411377, + "logits/rejected": -3.0217602252960205, + "logps/chosen": -57.668846130371094, + "logps/rejected": -56.13346481323242, + "loss": 0.6849, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.028762226924300194, + "rewards/margins": 0.017270488664507866, + "rewards/rejected": -0.04603271931409836, + "step": 8440 + }, + { + "epoch": 1.4558924879393522, + "grad_norm": 2.6273765563964844, + "learning_rate": 4.1804439800281105e-09, + "logits/chosen": -2.9259116649627686, + "logits/rejected": -2.9209136962890625, + "logps/chosen": -51.028839111328125, + "logps/rejected": -57.476707458496094, + "loss": 0.6858, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03016982041299343, + "rewards/margins": 0.01554443221539259, + "rewards/rejected": -0.045714251697063446, + "step": 8450 + }, + { + "epoch": 1.4576154376292212, + "grad_norm": 2.246420383453369, + "learning_rate": 4.156015388781864e-09, + "logits/chosen": -3.038684606552124, + "logits/rejected": -3.0106842517852783, + "logps/chosen": -55.45277786254883, + "logps/rejected": -58.68220901489258, + "loss": 0.682, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.026461321860551834, + "rewards/margins": 0.02305610477924347, + "rewards/rejected": -0.049517422914505005, + "step": 8460 + }, + { + "epoch": 1.4593383873190904, + "grad_norm": 2.541693687438965, + "learning_rate": 4.131639645138428e-09, + "logits/chosen": -3.0102734565734863, + "logits/rejected": -2.9820914268493652, + "logps/chosen": -59.36157989501953, + "logps/rejected": -57.94231414794922, + "loss": 0.6861, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.032951220870018005, + "rewards/margins": 0.015014281496405602, + "rewards/rejected": -0.047965504229068756, + "step": 8470 + }, + { + "epoch": 1.4610613370089593, + "grad_norm": 2.3055930137634277, + "learning_rate": 4.107316969529535e-09, + "logits/chosen": -3.0146217346191406, + "logits/rejected": -2.983560085296631, + "logps/chosen": -55.6895866394043, + "logps/rejected": -56.18989944458008, + "loss": 0.6839, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.026523908600211143, + "rewards/margins": 0.019494030624628067, + "rewards/rejected": -0.04601794108748436, + "step": 8480 + }, + { + "epoch": 1.4627842866988283, + "grad_norm": 2.613065242767334, + "learning_rate": 4.083047581907013e-09, + "logits/chosen": -3.0532450675964355, + "logits/rejected": -3.0469489097595215, + "logps/chosen": -57.904701232910156, + "logps/rejected": -61.51108932495117, + "loss": 0.6879, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.031647052615880966, + "rewards/margins": 0.01144656352698803, + "rewards/rejected": -0.043093618005514145, + "step": 8490 + }, + { + "epoch": 1.4645072363886975, + "grad_norm": 2.272087335586548, + "learning_rate": 4.0588317017408e-09, + "logits/chosen": -2.9357683658599854, + "logits/rejected": -2.9200870990753174, + "logps/chosen": -57.107269287109375, + "logps/rejected": -58.03228759765625, + "loss": 0.6847, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.028004806488752365, + "rewards/margins": 0.017863240092992783, + "rewards/rejected": -0.04586804285645485, + "step": 8500 + }, + { + "epoch": 1.4645072363886975, + "eval_logits/chosen": -3.1188855171203613, + "eval_logits/rejected": -3.113229990005493, + "eval_logps/chosen": -59.43946838378906, + "eval_logps/rejected": -64.51103973388672, + "eval_loss": 0.6903047561645508, + "eval_rewards/accuracies": 0.5871282815933228, + "eval_rewards/chosen": -0.007275736890733242, + "eval_rewards/margins": 0.006033329293131828, + "eval_rewards/rejected": -0.013309067115187645, + "eval_runtime": 384.2065, + "eval_samples_per_second": 11.202, + "eval_steps_per_second": 1.4, + "step": 8500 + }, + { + "epoch": 1.4662301860785665, + "grad_norm": 2.6604816913604736, + "learning_rate": 4.0346695480169684e-09, + "logits/chosen": -3.0012319087982178, + "logits/rejected": -2.965122699737549, + "logps/chosen": -60.46849822998047, + "logps/rejected": -56.2077751159668, + "loss": 0.6833, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.026192188262939453, + "rewards/margins": 0.020619530230760574, + "rewards/rejected": -0.04681171476840973, + "step": 8510 + }, + { + "epoch": 1.4679531357684357, + "grad_norm": 2.4973299503326416, + "learning_rate": 4.010561339235732e-09, + "logits/chosen": -2.993504524230957, + "logits/rejected": -2.957441806793213, + "logps/chosen": -56.485206604003906, + "logps/rejected": -53.9075813293457, + "loss": 0.6821, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.029152899980545044, + "rewards/margins": 0.022967610508203506, + "rewards/rejected": -0.05212050676345825, + "step": 8520 + }, + { + "epoch": 1.4696760854583046, + "grad_norm": 2.220632314682007, + "learning_rate": 3.98650729340948e-09, + "logits/chosen": -2.9751977920532227, + "logits/rejected": -2.957366943359375, + "logps/chosen": -57.468528747558594, + "logps/rejected": -55.835716247558594, + "loss": 0.6896, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.03581540286540985, + "rewards/margins": 0.00826946645975113, + "rewards/rejected": -0.04408486932516098, + "step": 8530 + }, + { + "epoch": 1.4713990351481736, + "grad_norm": 2.472700834274292, + "learning_rate": 3.962507628060802e-09, + "logits/chosen": -3.1107027530670166, + "logits/rejected": -3.094367742538452, + "logps/chosen": -57.2710075378418, + "logps/rejected": -58.294677734375, + "loss": 0.6871, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.025768334046006203, + "rewards/margins": 0.012841353192925453, + "rewards/rejected": -0.038609687238931656, + "step": 8540 + }, + { + "epoch": 1.4731219848380428, + "grad_norm": 2.5321476459503174, + "learning_rate": 3.938562560220523e-09, + "logits/chosen": -3.028843879699707, + "logits/rejected": -3.0211856365203857, + "logps/chosen": -56.91617965698242, + "logps/rejected": -59.54004669189453, + "loss": 0.688, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.030782613903284073, + "rewards/margins": 0.011320657096803188, + "rewards/rejected": -0.04210326820611954, + "step": 8550 + }, + { + "epoch": 1.4748449345279118, + "grad_norm": 2.368504762649536, + "learning_rate": 3.914672306425727e-09, + "logits/chosen": -3.1164917945861816, + "logits/rejected": -3.093698024749756, + "logps/chosen": -57.331016540527344, + "logps/rejected": -57.33484649658203, + "loss": 0.6842, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.03123745322227478, + "rewards/margins": 0.018766935914754868, + "rewards/rejected": -0.050004392862319946, + "step": 8560 + }, + { + "epoch": 1.476567884217781, + "grad_norm": 2.386122226715088, + "learning_rate": 3.890837082717822e-09, + "logits/chosen": -2.973491668701172, + "logits/rejected": -2.9486851692199707, + "logps/chosen": -57.305992126464844, + "logps/rejected": -56.970916748046875, + "loss": 0.6837, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.02948937751352787, + "rewards/margins": 0.019870325922966003, + "rewards/rejected": -0.04935970902442932, + "step": 8570 + }, + { + "epoch": 1.47829083390765, + "grad_norm": 2.6596150398254395, + "learning_rate": 3.867057104640573e-09, + "logits/chosen": -3.0600357055664062, + "logits/rejected": -3.037853479385376, + "logps/chosen": -56.2053108215332, + "logps/rejected": -59.14296340942383, + "loss": 0.6809, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.020690327510237694, + "rewards/margins": 0.02528388239443302, + "rewards/rejected": -0.045974213629961014, + "step": 8580 + }, + { + "epoch": 1.480013783597519, + "grad_norm": 2.690764904022217, + "learning_rate": 3.843332587238151e-09, + "logits/chosen": -3.0696463584899902, + "logits/rejected": -3.0400075912475586, + "logps/chosen": -58.996612548828125, + "logps/rejected": -57.676673889160156, + "loss": 0.6821, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.020020674914121628, + "rewards/margins": 0.022924596443772316, + "rewards/rejected": -0.04294527322053909, + "step": 8590 + }, + { + "epoch": 1.481736733287388, + "grad_norm": 2.450030565261841, + "learning_rate": 3.819663745053194e-09, + "logits/chosen": -2.987722635269165, + "logits/rejected": -2.9553515911102295, + "logps/chosen": -56.3746223449707, + "logps/rejected": -56.209251403808594, + "loss": 0.6861, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.029363512992858887, + "rewards/margins": 0.015252038836479187, + "rewards/rejected": -0.044615548104047775, + "step": 8600 + }, + { + "epoch": 1.481736733287388, + "eval_logits/chosen": -3.119209051132202, + "eval_logits/rejected": -3.1135194301605225, + "eval_logps/chosen": -59.45771408081055, + "eval_logps/rejected": -64.53620147705078, + "eval_loss": 0.690272867679596, + "eval_rewards/accuracies": 0.5864312052726746, + "eval_rewards/chosen": -0.007458226755261421, + "eval_rewards/margins": 0.006102536339312792, + "eval_rewards/rejected": -0.0135607635602355, + "eval_runtime": 384.2938, + "eval_samples_per_second": 11.2, + "eval_steps_per_second": 1.4, + "step": 8600 + }, + { + "epoch": 1.483459682977257, + "grad_norm": 2.50291109085083, + "learning_rate": 3.796050792124867e-09, + "logits/chosen": -3.011719226837158, + "logits/rejected": -2.989260673522949, + "logps/chosen": -56.86872482299805, + "logps/rejected": -54.85808563232422, + "loss": 0.6857, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.027238333597779274, + "rewards/margins": 0.015780366957187653, + "rewards/rejected": -0.04301869869232178, + "step": 8610 + }, + { + "epoch": 1.4851826326671262, + "grad_norm": 2.6594996452331543, + "learning_rate": 3.772493941986916e-09, + "logits/chosen": -2.934577703475952, + "logits/rejected": -2.9154083728790283, + "logps/chosen": -58.123512268066406, + "logps/rejected": -56.25068283081055, + "loss": 0.6853, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.03349360078573227, + "rewards/margins": 0.016506288200616837, + "rewards/rejected": -0.04999988526105881, + "step": 8620 + }, + { + "epoch": 1.4869055823569952, + "grad_norm": 2.505457878112793, + "learning_rate": 3.7489934076657596e-09, + "logits/chosen": -2.9104361534118652, + "logits/rejected": -2.898397922515869, + "logps/chosen": -54.41759490966797, + "logps/rejected": -56.243125915527344, + "loss": 0.6862, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.027125433087348938, + "rewards/margins": 0.01472453773021698, + "rewards/rejected": -0.041849974542856216, + "step": 8630 + }, + { + "epoch": 1.4886285320468642, + "grad_norm": 2.5061116218566895, + "learning_rate": 3.725549401678525e-09, + "logits/chosen": -3.0604488849639893, + "logits/rejected": -3.0291781425476074, + "logps/chosen": -58.31371307373047, + "logps/rejected": -56.86163330078125, + "loss": 0.686, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02572130598127842, + "rewards/margins": 0.01509423553943634, + "rewards/rejected": -0.04081553965806961, + "step": 8640 + }, + { + "epoch": 1.4903514817367332, + "grad_norm": 2.4240450859069824, + "learning_rate": 3.7021621360311795e-09, + "logits/chosen": -2.9729819297790527, + "logits/rejected": -2.951951503753662, + "logps/chosen": -57.3306999206543, + "logps/rejected": -58.6158561706543, + "loss": 0.6829, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.023774294182658195, + "rewards/margins": 0.021655159071087837, + "rewards/rejected": -0.045429449528455734, + "step": 8650 + }, + { + "epoch": 1.4920744314266023, + "grad_norm": 2.508742332458496, + "learning_rate": 3.6788318222165517e-09, + "logits/chosen": -2.985428810119629, + "logits/rejected": -2.9703781604766846, + "logps/chosen": -56.572967529296875, + "logps/rejected": -56.7482795715332, + "loss": 0.6846, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.032110270112752914, + "rewards/margins": 0.01814529113471508, + "rewards/rejected": -0.050255559384822845, + "step": 8660 + }, + { + "epoch": 1.4937973811164715, + "grad_norm": 2.3349335193634033, + "learning_rate": 3.655558671212481e-09, + "logits/chosen": -3.0807077884674072, + "logits/rejected": -3.0629634857177734, + "logps/chosen": -55.401893615722656, + "logps/rejected": -58.01811599731445, + "loss": 0.6845, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.03406014293432236, + "rewards/margins": 0.018253503367304802, + "rewards/rejected": -0.05231364443898201, + "step": 8670 + }, + { + "epoch": 1.4955203308063405, + "grad_norm": 2.737377405166626, + "learning_rate": 3.6323428934798497e-09, + "logits/chosen": -3.0251212120056152, + "logits/rejected": -3.0183393955230713, + "logps/chosen": -53.94529342651367, + "logps/rejected": -57.3693962097168, + "loss": 0.687, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.03267005831003189, + "rewards/margins": 0.013181130401790142, + "rewards/rejected": -0.045851193368434906, + "step": 8680 + }, + { + "epoch": 1.4972432804962095, + "grad_norm": 2.630596876144409, + "learning_rate": 3.609184698960737e-09, + "logits/chosen": -3.0407185554504395, + "logits/rejected": -3.0107009410858154, + "logps/chosen": -58.4240608215332, + "logps/rejected": -53.02643585205078, + "loss": 0.6845, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.028877729550004005, + "rewards/margins": 0.018301133066415787, + "rewards/rejected": -0.047178857028484344, + "step": 8690 + }, + { + "epoch": 1.4989662301860784, + "grad_norm": 2.498950719833374, + "learning_rate": 3.5860842970764685e-09, + "logits/chosen": -3.0441231727600098, + "logits/rejected": -3.0103659629821777, + "logps/chosen": -59.04729461669922, + "logps/rejected": -56.984161376953125, + "loss": 0.6847, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.02549806609749794, + "rewards/margins": 0.01791895180940628, + "rewards/rejected": -0.04341701418161392, + "step": 8700 + }, + { + "epoch": 1.4989662301860784, + "eval_logits/chosen": -3.1183865070343018, + "eval_logits/rejected": -3.1126930713653564, + "eval_logps/chosen": -59.478599548339844, + "eval_logps/rejected": -64.55989837646484, + "eval_loss": 0.6902616024017334, + "eval_rewards/accuracies": 0.5843401551246643, + "eval_rewards/chosen": -0.007667039055377245, + "eval_rewards/margins": 0.006130703259259462, + "eval_rewards/rejected": -0.013797740451991558, + "eval_runtime": 384.0717, + "eval_samples_per_second": 11.206, + "eval_steps_per_second": 1.401, + "step": 8700 + }, + { + "epoch": 1.5006891798759476, + "grad_norm": 2.500349760055542, + "learning_rate": 3.563041896725762e-09, + "logits/chosen": -3.010810375213623, + "logits/rejected": -2.9961395263671875, + "logps/chosen": -55.51630783081055, + "logps/rejected": -56.80950927734375, + "loss": 0.6886, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.026806961745023727, + "rewards/margins": 0.009826353751122952, + "rewards/rejected": -0.03663332015275955, + "step": 8710 + }, + { + "epoch": 1.5024121295658168, + "grad_norm": 2.6423752307891846, + "learning_rate": 3.5400577062828156e-09, + "logits/chosen": -3.001882553100586, + "logits/rejected": -2.976762294769287, + "logps/chosen": -58.995338439941406, + "logps/rejected": -58.893882751464844, + "loss": 0.6846, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.026477474719285965, + "rewards/margins": 0.017903322353959084, + "rewards/rejected": -0.0443807952105999, + "step": 8720 + }, + { + "epoch": 1.5041350792556858, + "grad_norm": 2.291661024093628, + "learning_rate": 3.5171319335954356e-09, + "logits/chosen": -3.035444736480713, + "logits/rejected": -3.002584457397461, + "logps/chosen": -55.37629318237305, + "logps/rejected": -56.499671936035156, + "loss": 0.6858, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.031050005927681923, + "rewards/margins": 0.01546327956020832, + "rewards/rejected": -0.046513281762599945, + "step": 8730 + }, + { + "epoch": 1.5058580289455548, + "grad_norm": 2.185478925704956, + "learning_rate": 3.4942647859831476e-09, + "logits/chosen": -2.9955713748931885, + "logits/rejected": -2.985081195831299, + "logps/chosen": -54.93894577026367, + "logps/rejected": -57.57527542114258, + "loss": 0.6885, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03534874692559242, + "rewards/margins": 0.010406061075627804, + "rewards/rejected": -0.0457548089325428, + "step": 8740 + }, + { + "epoch": 1.5075809786354237, + "grad_norm": 2.6699836254119873, + "learning_rate": 3.47145647023533e-09, + "logits/chosen": -3.0291950702667236, + "logits/rejected": -3.0053293704986572, + "logps/chosen": -56.25014114379883, + "logps/rejected": -57.870445251464844, + "loss": 0.683, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.025520440191030502, + "rewards/margins": 0.02117043174803257, + "rewards/rejected": -0.04669087380170822, + "step": 8750 + }, + { + "epoch": 1.509303928325293, + "grad_norm": 2.544248104095459, + "learning_rate": 3.4487071926093407e-09, + "logits/chosen": -2.9766876697540283, + "logits/rejected": -2.9653265476226807, + "logps/chosen": -53.982757568359375, + "logps/rejected": -59.586402893066406, + "loss": 0.684, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.02485860511660576, + "rewards/margins": 0.019283946603536606, + "rewards/rejected": -0.044142551720142365, + "step": 8760 + }, + { + "epoch": 1.5110268780151621, + "grad_norm": 2.2586305141448975, + "learning_rate": 3.4260171588286427e-09, + "logits/chosen": -3.024056911468506, + "logits/rejected": -2.9952967166900635, + "logps/chosen": -57.99956512451172, + "logps/rejected": -54.73537063598633, + "loss": 0.6834, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02924189530313015, + "rewards/margins": 0.020510729402303696, + "rewards/rejected": -0.049752626568078995, + "step": 8770 + }, + { + "epoch": 1.512749827705031, + "grad_norm": 2.3685693740844727, + "learning_rate": 3.403386574080961e-09, + "logits/chosen": -3.0541341304779053, + "logits/rejected": -3.0548415184020996, + "logps/chosen": -52.13898468017578, + "logps/rejected": -58.12361526489258, + "loss": 0.6854, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03045285865664482, + "rewards/margins": 0.016288291662931442, + "rewards/rejected": -0.04674115404486656, + "step": 8780 + }, + { + "epoch": 1.5144727773949, + "grad_norm": 2.505659818649292, + "learning_rate": 3.380815643016417e-09, + "logits/chosen": -3.0582375526428223, + "logits/rejected": -3.0293161869049072, + "logps/chosen": -57.86345291137695, + "logps/rejected": -56.745567321777344, + "loss": 0.6848, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.02939668856561184, + "rewards/margins": 0.01776151731610298, + "rewards/rejected": -0.04715820401906967, + "step": 8790 + }, + { + "epoch": 1.516195727084769, + "grad_norm": 2.6972694396972656, + "learning_rate": 3.3583045697456773e-09, + "logits/chosen": -2.946511745452881, + "logits/rejected": -2.9125216007232666, + "logps/chosen": -57.9686164855957, + "logps/rejected": -56.8680534362793, + "loss": 0.6866, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.029695045202970505, + "rewards/margins": 0.01403980515897274, + "rewards/rejected": -0.0437348447740078, + "step": 8800 + }, + { + "epoch": 1.516195727084769, + "eval_logits/chosen": -3.1182050704956055, + "eval_logits/rejected": -3.112541913986206, + "eval_logps/chosen": -59.48351287841797, + "eval_logps/rejected": -64.56842041015625, + "eval_loss": 0.6902456879615784, + "eval_rewards/accuracies": 0.5878252983093262, + "eval_rewards/chosen": -0.007716177962720394, + "eval_rewards/margins": 0.006166784558445215, + "eval_rewards/rejected": -0.013882962986826897, + "eval_runtime": 384.0156, + "eval_samples_per_second": 11.208, + "eval_steps_per_second": 1.401, + "step": 8800 + }, + { + "epoch": 1.5179186767746382, + "grad_norm": 2.4620769023895264, + "learning_rate": 3.335853557838112e-09, + "logits/chosen": -3.1001956462860107, + "logits/rejected": -3.0769476890563965, + "logps/chosen": -54.146240234375, + "logps/rejected": -57.28928756713867, + "loss": 0.6861, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.03185844421386719, + "rewards/margins": 0.014973374083638191, + "rewards/rejected": -0.046831823885440826, + "step": 8810 + }, + { + "epoch": 1.5196416264645074, + "grad_norm": 2.396519899368286, + "learning_rate": 3.3134628103199495e-09, + "logits/chosen": -3.0399370193481445, + "logits/rejected": -3.0198001861572266, + "logps/chosen": -53.8540153503418, + "logps/rejected": -54.6851806640625, + "loss": 0.6863, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03429834544658661, + "rewards/margins": 0.014729444868862629, + "rewards/rejected": -0.04902778938412666, + "step": 8820 + }, + { + "epoch": 1.5213645761543764, + "grad_norm": 2.3248801231384277, + "learning_rate": 3.291132529672444e-09, + "logits/chosen": -2.8968586921691895, + "logits/rejected": -2.875062942504883, + "logps/chosen": -57.6630859375, + "logps/rejected": -57.16463088989258, + "loss": 0.6868, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.030671019107103348, + "rewards/margins": 0.013551396317780018, + "rewards/rejected": -0.04422241449356079, + "step": 8830 + }, + { + "epoch": 1.5230875258442453, + "grad_norm": 2.518972873687744, + "learning_rate": 3.2688629178300435e-09, + "logits/chosen": -2.932861804962158, + "logits/rejected": -2.922234296798706, + "logps/chosen": -55.32551956176758, + "logps/rejected": -57.99912643432617, + "loss": 0.6874, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.029771555215120316, + "rewards/margins": 0.012255150824785233, + "rewards/rejected": -0.04202670603990555, + "step": 8840 + }, + { + "epoch": 1.5248104755341143, + "grad_norm": 2.3851332664489746, + "learning_rate": 3.2466541761785606e-09, + "logits/chosen": -3.122568368911743, + "logits/rejected": -3.1034460067749023, + "logps/chosen": -54.5229606628418, + "logps/rejected": -58.2146110534668, + "loss": 0.6852, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.022725548595190048, + "rewards/margins": 0.01649020053446293, + "rewards/rejected": -0.03921574726700783, + "step": 8850 + }, + { + "epoch": 1.5265334252239835, + "grad_norm": 2.5882325172424316, + "learning_rate": 3.2245065055533616e-09, + "logits/chosen": -3.0242655277252197, + "logits/rejected": -3.002401828765869, + "logps/chosen": -55.985130310058594, + "logps/rejected": -56.9734992980957, + "loss": 0.687, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.028040340170264244, + "rewards/margins": 0.013081741519272327, + "rewards/rejected": -0.041122086346149445, + "step": 8860 + }, + { + "epoch": 1.5282563749138525, + "grad_norm": 2.635909080505371, + "learning_rate": 3.2024201062375256e-09, + "logits/chosen": -3.167145013809204, + "logits/rejected": -3.133671283721924, + "logps/chosen": -59.30853271484375, + "logps/rejected": -59.278480529785156, + "loss": 0.6851, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.02538606896996498, + "rewards/margins": 0.01714247651398182, + "rewards/rejected": -0.04252853989601135, + "step": 8870 + }, + { + "epoch": 1.5299793246037217, + "grad_norm": 2.15626859664917, + "learning_rate": 3.180395177960077e-09, + "logits/chosen": -2.962871789932251, + "logits/rejected": -2.9352777004241943, + "logps/chosen": -57.023284912109375, + "logps/rejected": -56.61933135986328, + "loss": 0.6862, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03572096303105354, + "rewards/margins": 0.014982743188738823, + "rewards/rejected": -0.05070370435714722, + "step": 8880 + }, + { + "epoch": 1.5317022742935906, + "grad_norm": 2.1725003719329834, + "learning_rate": 3.1584319198941235e-09, + "logits/chosen": -2.9969067573547363, + "logits/rejected": -2.9709436893463135, + "logps/chosen": -57.14776611328125, + "logps/rejected": -54.85799026489258, + "loss": 0.6863, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.02653408609330654, + "rewards/margins": 0.014357566833496094, + "rewards/rejected": -0.040891654789447784, + "step": 8890 + }, + { + "epoch": 1.5334252239834596, + "grad_norm": 2.323460578918457, + "learning_rate": 3.1365305306551128e-09, + "logits/chosen": -3.0483028888702393, + "logits/rejected": -3.01611065864563, + "logps/chosen": -57.7647705078125, + "logps/rejected": -57.78180694580078, + "loss": 0.6841, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.02878654934465885, + "rewards/margins": 0.019070129841566086, + "rewards/rejected": -0.04785668104887009, + "step": 8900 + }, + { + "epoch": 1.5334252239834596, + "eval_logits/chosen": -3.118605613708496, + "eval_logits/rejected": -3.1129119396209717, + "eval_logps/chosen": -59.497798919677734, + "eval_logps/rejected": -64.58731079101562, + "eval_loss": 0.6902238726615906, + "eval_rewards/accuracies": 0.5873606204986572, + "eval_rewards/chosen": -0.007859011180698872, + "eval_rewards/margins": 0.006212850101292133, + "eval_rewards/rejected": -0.01407186221331358, + "eval_runtime": 384.1009, + "eval_samples_per_second": 11.205, + "eval_steps_per_second": 1.401, + "step": 8900 + }, + { + "epoch": 1.5351481736733288, + "grad_norm": 2.3581268787384033, + "learning_rate": 3.1146912082989853e-09, + "logits/chosen": -3.007734775543213, + "logits/rejected": -2.9947516918182373, + "logps/chosen": -58.75102615356445, + "logps/rejected": -57.87038040161133, + "loss": 0.6849, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.030753616243600845, + "rewards/margins": 0.017357924953103065, + "rewards/rejected": -0.04811154305934906, + "step": 8910 + }, + { + "epoch": 1.5368711233631978, + "grad_norm": 2.8827884197235107, + "learning_rate": 3.092914150320416e-09, + "logits/chosen": -3.095099687576294, + "logits/rejected": -3.0627694129943848, + "logps/chosen": -59.900108337402344, + "logps/rejected": -57.189720153808594, + "loss": 0.6825, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.028544938191771507, + "rewards/margins": 0.022241828963160515, + "rewards/rejected": -0.050786763429641724, + "step": 8920 + }, + { + "epoch": 1.538594073053067, + "grad_norm": 2.249345064163208, + "learning_rate": 3.0711995536510174e-09, + "logits/chosen": -3.0434162616729736, + "logits/rejected": -3.0114622116088867, + "logps/chosen": -57.410850524902344, + "logps/rejected": -55.26404571533203, + "loss": 0.6839, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.027216464281082153, + "rewards/margins": 0.019718730822205544, + "rewards/rejected": -0.04693519324064255, + "step": 8930 + }, + { + "epoch": 1.540317022742936, + "grad_norm": 2.743708848953247, + "learning_rate": 3.0495476146575608e-09, + "logits/chosen": -3.023430347442627, + "logits/rejected": -3.005967378616333, + "logps/chosen": -58.66585159301758, + "logps/rejected": -59.8930549621582, + "loss": 0.6882, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.030310818925499916, + "rewards/margins": 0.010716510005295277, + "rewards/rejected": -0.04102732241153717, + "step": 8940 + }, + { + "epoch": 1.5420399724328049, + "grad_norm": 2.820741891860962, + "learning_rate": 3.0279585291401956e-09, + "logits/chosen": -2.9350593090057373, + "logits/rejected": -2.9369022846221924, + "logps/chosen": -56.8059196472168, + "logps/rejected": -61.012367248535156, + "loss": 0.6879, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02704184129834175, + "rewards/margins": 0.011341988109052181, + "rewards/rejected": -0.03838383033871651, + "step": 8950 + }, + { + "epoch": 1.5437629221226739, + "grad_norm": 2.4366254806518555, + "learning_rate": 3.006432492330686e-09, + "logits/chosen": -3.056548595428467, + "logits/rejected": -3.0293049812316895, + "logps/chosen": -57.80559158325195, + "logps/rejected": -57.08323287963867, + "loss": 0.684, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03402943164110184, + "rewards/margins": 0.019523415714502335, + "rewards/rejected": -0.05355284735560417, + "step": 8960 + }, + { + "epoch": 1.545485871812543, + "grad_norm": 2.405066728591919, + "learning_rate": 2.9849696988906426e-09, + "logits/chosen": -3.005000591278076, + "logits/rejected": -2.9771170616149902, + "logps/chosen": -56.36186599731445, + "logps/rejected": -55.556846618652344, + "loss": 0.6867, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.030779168009757996, + "rewards/margins": 0.013805484399199486, + "rewards/rejected": -0.04458465427160263, + "step": 8970 + }, + { + "epoch": 1.5472088215024122, + "grad_norm": 2.729218006134033, + "learning_rate": 2.9635703429097495e-09, + "logits/chosen": -3.0958468914031982, + "logits/rejected": -3.0640220642089844, + "logps/chosen": -57.76591873168945, + "logps/rejected": -57.62318801879883, + "loss": 0.6855, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03285207599401474, + "rewards/margins": 0.01638091169297695, + "rewards/rejected": -0.04923298954963684, + "step": 8980 + }, + { + "epoch": 1.5489317711922812, + "grad_norm": 2.8934600353240967, + "learning_rate": 2.942234617904044e-09, + "logits/chosen": -3.0019514560699463, + "logits/rejected": -2.98407244682312, + "logps/chosen": -57.321800231933594, + "logps/rejected": -57.94104766845703, + "loss": 0.6857, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.023721177130937576, + "rewards/margins": 0.015485422685742378, + "rewards/rejected": -0.039206597954034805, + "step": 8990 + }, + { + "epoch": 1.5506547208821502, + "grad_norm": 2.526658058166504, + "learning_rate": 2.9209627168141196e-09, + "logits/chosen": -3.034315586090088, + "logits/rejected": -3.004920482635498, + "logps/chosen": -59.76508712768555, + "logps/rejected": -57.68981170654297, + "loss": 0.6799, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.023506319150328636, + "rewards/margins": 0.027574270963668823, + "rewards/rejected": -0.05108059197664261, + "step": 9000 + }, + { + "epoch": 1.5506547208821502, + "eval_logits/chosen": -3.1180856227874756, + "eval_logits/rejected": -3.1123623847961426, + "eval_logps/chosen": -59.51602554321289, + "eval_logps/rejected": -64.6044921875, + "eval_loss": 0.6902297735214233, + "eval_rewards/accuracies": 0.5857341885566711, + "eval_rewards/chosen": -0.00804129522293806, + "eval_rewards/margins": 0.0062024411745369434, + "eval_rewards/rejected": -0.014243737794458866, + "eval_runtime": 384.1986, + "eval_samples_per_second": 11.203, + "eval_steps_per_second": 1.4, + "step": 9000 + }, + { + "epoch": 1.5523776705720191, + "grad_norm": 2.2845945358276367, + "learning_rate": 2.8997548320034205e-09, + "logits/chosen": -3.0197858810424805, + "logits/rejected": -2.9934756755828857, + "logps/chosen": -58.68003463745117, + "logps/rejected": -57.23857879638672, + "loss": 0.6868, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.030407551676034927, + "rewards/margins": 0.013433225452899933, + "rewards/rejected": -0.04384077712893486, + "step": 9010 + }, + { + "epoch": 1.5541006202618883, + "grad_norm": 2.222851276397705, + "learning_rate": 2.87861115525648e-09, + "logits/chosen": -2.9426398277282715, + "logits/rejected": -2.926056385040283, + "logps/chosen": -57.41417694091797, + "logps/rejected": -56.05034637451172, + "loss": 0.6854, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.03490142524242401, + "rewards/margins": 0.016401633620262146, + "rewards/rejected": -0.05130305141210556, + "step": 9020 + }, + { + "epoch": 1.5558235699517575, + "grad_norm": 2.456385374069214, + "learning_rate": 2.8575318777771964e-09, + "logits/chosen": -2.9651219844818115, + "logits/rejected": -2.9711365699768066, + "logps/chosen": -54.670021057128906, + "logps/rejected": -58.19325637817383, + "loss": 0.6894, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03518029302358627, + "rewards/margins": 0.008600231260061264, + "rewards/rejected": -0.043780528008937836, + "step": 9030 + }, + { + "epoch": 1.5575465196416265, + "grad_norm": 2.4180855751037598, + "learning_rate": 2.836517190187098e-09, + "logits/chosen": -2.981226682662964, + "logits/rejected": -2.958794593811035, + "logps/chosen": -54.088279724121094, + "logps/rejected": -56.5379638671875, + "loss": 0.6855, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.02919362485408783, + "rewards/margins": 0.016179706901311874, + "rewards/rejected": -0.045373331755399704, + "step": 9040 + }, + { + "epoch": 1.5592694693314955, + "grad_norm": 2.4568464756011963, + "learning_rate": 2.8155672825236246e-09, + "logits/chosen": -3.0286495685577393, + "logits/rejected": -3.009220838546753, + "logps/chosen": -57.30841827392578, + "logps/rejected": -57.4796142578125, + "loss": 0.6862, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03550892323255539, + "rewards/margins": 0.014863801188766956, + "rewards/rejected": -0.05037272721529007, + "step": 9050 + }, + { + "epoch": 1.5609924190213644, + "grad_norm": 2.575063467025757, + "learning_rate": 2.7946823442384017e-09, + "logits/chosen": -3.0351195335388184, + "logits/rejected": -3.0088753700256348, + "logps/chosen": -58.00165939331055, + "logps/rejected": -58.076271057128906, + "loss": 0.6857, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.032886531203985214, + "rewards/margins": 0.015846019610762596, + "rewards/rejected": -0.04873254522681236, + "step": 9060 + }, + { + "epoch": 1.5627153687112336, + "grad_norm": 2.5507895946502686, + "learning_rate": 2.7738625641955395e-09, + "logits/chosen": -2.999799966812134, + "logits/rejected": -2.9818906784057617, + "logps/chosen": -57.21845245361328, + "logps/rejected": -59.49110794067383, + "loss": 0.6834, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.026528984308242798, + "rewards/margins": 0.020386185497045517, + "rewards/rejected": -0.046915166079998016, + "step": 9070 + }, + { + "epoch": 1.5644383184011028, + "grad_norm": 2.39579439163208, + "learning_rate": 2.7531081306699013e-09, + "logits/chosen": -2.9876439571380615, + "logits/rejected": -2.984792947769165, + "logps/chosen": -54.6351432800293, + "logps/rejected": -59.41515350341797, + "loss": 0.687, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.026544544845819473, + "rewards/margins": 0.013375622220337391, + "rewards/rejected": -0.03992016613483429, + "step": 9080 + }, + { + "epoch": 1.5661612680909718, + "grad_norm": 2.5818097591400146, + "learning_rate": 2.732419231345441e-09, + "logits/chosen": -2.964087963104248, + "logits/rejected": -2.9281914234161377, + "logps/chosen": -58.411651611328125, + "logps/rejected": -55.20099639892578, + "loss": 0.6853, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.031430650502443314, + "rewards/margins": 0.016425397247076035, + "rewards/rejected": -0.04785604402422905, + "step": 9090 + }, + { + "epoch": 1.5678842177808407, + "grad_norm": 2.6275980472564697, + "learning_rate": 2.7117960533134556e-09, + "logits/chosen": -3.0353236198425293, + "logits/rejected": -3.0079779624938965, + "logps/chosen": -59.2744026184082, + "logps/rejected": -58.063377380371094, + "loss": 0.6832, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.02477165125310421, + "rewards/margins": 0.020760422572493553, + "rewards/rejected": -0.04553207382559776, + "step": 9100 + }, + { + "epoch": 1.5678842177808407, + "eval_logits/chosen": -3.117415189743042, + "eval_logits/rejected": -3.1117069721221924, + "eval_logps/chosen": -59.5157470703125, + "eval_logps/rejected": -64.60614776611328, + "eval_loss": 0.6902238726615906, + "eval_rewards/accuracies": 0.5861988663673401, + "eval_rewards/chosen": -0.008038590662181377, + "eval_rewards/margins": 0.006221668794751167, + "eval_rewards/rejected": -0.01426026038825512, + "eval_runtime": 383.6571, + "eval_samples_per_second": 11.218, + "eval_steps_per_second": 1.402, + "step": 9100 + }, + { + "epoch": 1.5696071674707097, + "grad_norm": 2.2990360260009766, + "learning_rate": 2.691238783070944e-09, + "logits/chosen": -3.040240526199341, + "logits/rejected": -3.0122292041778564, + "logps/chosen": -57.740074157714844, + "logps/rejected": -57.43877029418945, + "loss": 0.682, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.024915913119912148, + "rewards/margins": 0.023519525304436684, + "rewards/rejected": -0.04843544214963913, + "step": 9110 + }, + { + "epoch": 1.571330117160579, + "grad_norm": 2.6483402252197266, + "learning_rate": 2.670747606518872e-09, + "logits/chosen": -3.0556299686431885, + "logits/rejected": -3.020103931427002, + "logps/chosen": -62.59893798828125, + "logps/rejected": -59.377906799316406, + "loss": 0.6838, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.023233408108353615, + "rewards/margins": 0.0195440836250782, + "rewards/rejected": -0.04277748614549637, + "step": 9120 + }, + { + "epoch": 1.573053066850448, + "grad_norm": 2.4812519550323486, + "learning_rate": 2.6503227089605387e-09, + "logits/chosen": -2.9677350521087646, + "logits/rejected": -2.944486618041992, + "logps/chosen": -57.993263244628906, + "logps/rejected": -58.1032600402832, + "loss": 0.6885, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.029743751510977745, + "rewards/margins": 0.010279293172061443, + "rewards/rejected": -0.04002305120229721, + "step": 9130 + }, + { + "epoch": 1.574776016540317, + "grad_norm": 2.4657132625579834, + "learning_rate": 2.6299642750998564e-09, + "logits/chosen": -3.052119731903076, + "logits/rejected": -2.9988605976104736, + "logps/chosen": -59.33286666870117, + "logps/rejected": -54.122520446777344, + "loss": 0.6793, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.02051299437880516, + "rewards/margins": 0.028978755697607994, + "rewards/rejected": -0.0494917556643486, + "step": 9140 + }, + { + "epoch": 1.576498966230186, + "grad_norm": 2.5840601921081543, + "learning_rate": 2.6096724890397127e-09, + "logits/chosen": -3.034421443939209, + "logits/rejected": -3.0066089630126953, + "logps/chosen": -61.0789794921875, + "logps/rejected": -59.18257522583008, + "loss": 0.6839, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02752991020679474, + "rewards/margins": 0.01952740177512169, + "rewards/rejected": -0.04705731198191643, + "step": 9150 + }, + { + "epoch": 1.578221915920055, + "grad_norm": 2.009469509124756, + "learning_rate": 2.5894475342802928e-09, + "logits/chosen": -3.043092966079712, + "logits/rejected": -3.0389745235443115, + "logps/chosen": -52.7302360534668, + "logps/rejected": -57.86452102661133, + "loss": 0.6876, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.035803310573101044, + "rewards/margins": 0.011993474327027798, + "rewards/rejected": -0.047796785831451416, + "step": 9160 + }, + { + "epoch": 1.5799448656099242, + "grad_norm": 2.648082971572876, + "learning_rate": 2.5692895937174175e-09, + "logits/chosen": -3.0427441596984863, + "logits/rejected": -3.0096170902252197, + "logps/chosen": -58.09638595581055, + "logps/rejected": -55.04772186279297, + "loss": 0.6821, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.03254878148436546, + "rewards/margins": 0.023205403238534927, + "rewards/rejected": -0.05575418472290039, + "step": 9170 + }, + { + "epoch": 1.5816678152997934, + "grad_norm": 2.5571298599243164, + "learning_rate": 2.549198849640898e-09, + "logits/chosen": -3.0292630195617676, + "logits/rejected": -3.0079505443573, + "logps/chosen": -58.021263122558594, + "logps/rejected": -56.97618865966797, + "loss": 0.686, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.032903410494327545, + "rewards/margins": 0.015215583145618439, + "rewards/rejected": -0.04811898618936539, + "step": 9180 + }, + { + "epoch": 1.5833907649896624, + "grad_norm": 2.3080177307128906, + "learning_rate": 2.5291754837328786e-09, + "logits/chosen": -3.144592046737671, + "logits/rejected": -3.1382734775543213, + "logps/chosen": -57.0882453918457, + "logps/rejected": -60.96589279174805, + "loss": 0.6881, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.029310131445527077, + "rewards/margins": 0.011074641719460487, + "rewards/rejected": -0.040384769439697266, + "step": 9190 + }, + { + "epoch": 1.5851137146795313, + "grad_norm": 2.7551798820495605, + "learning_rate": 2.5092196770662013e-09, + "logits/chosen": -3.044981002807617, + "logits/rejected": -3.0230705738067627, + "logps/chosen": -56.41364669799805, + "logps/rejected": -56.97838592529297, + "loss": 0.6846, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.035616904497146606, + "rewards/margins": 0.01805807836353779, + "rewards/rejected": -0.053674984723329544, + "step": 9200 + }, + { + "epoch": 1.5851137146795313, + "eval_logits/chosen": -3.1172776222229004, + "eval_logits/rejected": -3.1115829944610596, + "eval_logps/chosen": -59.54096984863281, + "eval_logps/rejected": -64.62459564208984, + "eval_loss": 0.6902568340301514, + "eval_rewards/accuracies": 0.5810873508453369, + "eval_rewards/chosen": -0.008290711790323257, + "eval_rewards/margins": 0.006153962574899197, + "eval_rewards/rejected": -0.014444672502577305, + "eval_runtime": 383.9243, + "eval_samples_per_second": 11.211, + "eval_steps_per_second": 1.401, + "step": 9200 + }, + { + "epoch": 1.5868366643694003, + "grad_norm": 2.5398905277252197, + "learning_rate": 2.4893316101027586e-09, + "logits/chosen": -2.9198031425476074, + "logits/rejected": -2.8921570777893066, + "logps/chosen": -59.11091232299805, + "logps/rejected": -56.6024284362793, + "loss": 0.686, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.03263935446739197, + "rewards/margins": 0.01547915767878294, + "rewards/rejected": -0.04811851307749748, + "step": 9210 + }, + { + "epoch": 1.5885596140592695, + "grad_norm": 2.464064359664917, + "learning_rate": 2.4695114626918715e-09, + "logits/chosen": -3.019033908843994, + "logits/rejected": -2.9961018562316895, + "logps/chosen": -53.05615997314453, + "logps/rejected": -54.33342361450195, + "loss": 0.6849, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.03042665496468544, + "rewards/margins": 0.017417794093489647, + "rewards/rejected": -0.047844450920820236, + "step": 9220 + }, + { + "epoch": 1.5902825637491387, + "grad_norm": 2.680680274963379, + "learning_rate": 2.449759414068662e-09, + "logits/chosen": -2.936624050140381, + "logits/rejected": -2.9228339195251465, + "logps/chosen": -52.629417419433594, + "logps/rejected": -57.511749267578125, + "loss": 0.6869, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.03322712704539299, + "rewards/margins": 0.013344851322472095, + "rewards/rejected": -0.04657197743654251, + "step": 9230 + }, + { + "epoch": 1.5920055134390076, + "grad_norm": 2.6631574630737305, + "learning_rate": 2.430075642852424e-09, + "logits/chosen": -2.96937894821167, + "logits/rejected": -2.9644036293029785, + "logps/chosen": -58.7064208984375, + "logps/rejected": -59.214813232421875, + "loss": 0.6834, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0255623497068882, + "rewards/margins": 0.02064797654747963, + "rewards/rejected": -0.04621032625436783, + "step": 9240 + }, + { + "epoch": 1.5937284631288766, + "grad_norm": 2.4485669136047363, + "learning_rate": 2.4104603270450176e-09, + "logits/chosen": -2.9927477836608887, + "logits/rejected": -2.9674036502838135, + "logps/chosen": -57.21258544921875, + "logps/rejected": -58.436279296875, + "loss": 0.6866, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.024265926331281662, + "rewards/margins": 0.01401626504957676, + "rewards/rejected": -0.03828219324350357, + "step": 9250 + }, + { + "epoch": 1.5954514128187456, + "grad_norm": 2.4200003147125244, + "learning_rate": 2.3909136440292543e-09, + "logits/chosen": -2.9640910625457764, + "logits/rejected": -2.9449548721313477, + "logps/chosen": -52.23038864135742, + "logps/rejected": -54.45415115356445, + "loss": 0.6871, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.04167623072862625, + "rewards/margins": 0.012958998791873455, + "rewards/rejected": -0.05463522672653198, + "step": 9260 + }, + { + "epoch": 1.5971743625086148, + "grad_norm": 2.290677785873413, + "learning_rate": 2.371435770567294e-09, + "logits/chosen": -3.109546184539795, + "logits/rejected": -3.07560133934021, + "logps/chosen": -57.62566375732422, + "logps/rejected": -56.7308464050293, + "loss": 0.6828, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.027324050664901733, + "rewards/margins": 0.021839747205376625, + "rewards/rejected": -0.04916379600763321, + "step": 9270 + }, + { + "epoch": 1.598897312198484, + "grad_norm": 2.8847193717956543, + "learning_rate": 2.3520268827990443e-09, + "logits/chosen": -3.0211873054504395, + "logits/rejected": -3.005167245864868, + "logps/chosen": -56.53217315673828, + "logps/rejected": -58.4986572265625, + "loss": 0.685, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.029618997126817703, + "rewards/margins": 0.017433568835258484, + "rewards/rejected": -0.04705256223678589, + "step": 9280 + }, + { + "epoch": 1.600620261888353, + "grad_norm": 2.370560884475708, + "learning_rate": 2.3326871562405736e-09, + "logits/chosen": -2.9160075187683105, + "logits/rejected": -2.893542528152466, + "logps/chosen": -57.7754020690918, + "logps/rejected": -54.99779510498047, + "loss": 0.6885, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.031204476952552795, + "rewards/margins": 0.010111426003277302, + "rewards/rejected": -0.04131590202450752, + "step": 9290 + }, + { + "epoch": 1.602343211578222, + "grad_norm": 2.956653594970703, + "learning_rate": 2.31341676578252e-09, + "logits/chosen": -3.002432346343994, + "logits/rejected": -2.997676372528076, + "logps/chosen": -57.20106887817383, + "logps/rejected": -61.02766036987305, + "loss": 0.6853, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.03517729789018631, + "rewards/margins": 0.016421284526586533, + "rewards/rejected": -0.05159858614206314, + "step": 9300 + }, + { + "epoch": 1.602343211578222, + "eval_logits/chosen": -3.1176798343658447, + "eval_logits/rejected": -3.111994743347168, + "eval_logps/chosen": -59.54670333862305, + "eval_logps/rejected": -64.63745880126953, + "eval_loss": 0.6902238726615906, + "eval_rewards/accuracies": 0.582713782787323, + "eval_rewards/chosen": -0.00834809523075819, + "eval_rewards/margins": 0.006225237622857094, + "eval_rewards/rejected": -0.014573332853615284, + "eval_runtime": 384.037, + "eval_samples_per_second": 11.207, + "eval_steps_per_second": 1.401, + "step": 9300 + }, + { + "epoch": 1.6040661612680909, + "grad_norm": 2.5189545154571533, + "learning_rate": 2.2942158856884998e-09, + "logits/chosen": -3.0237507820129395, + "logits/rejected": -2.994091510772705, + "logps/chosen": -57.047607421875, + "logps/rejected": -55.44449996948242, + "loss": 0.6848, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.03301980346441269, + "rewards/margins": 0.017662758007645607, + "rewards/rejected": -0.050682563334703445, + "step": 9310 + }, + { + "epoch": 1.60578911095796, + "grad_norm": 2.653944492340088, + "learning_rate": 2.2750846895935627e-09, + "logits/chosen": -3.0877106189727783, + "logits/rejected": -3.0671634674072266, + "logps/chosen": -54.3978157043457, + "logps/rejected": -59.697364807128906, + "loss": 0.685, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.035656653344631195, + "rewards/margins": 0.017190592363476753, + "rewards/rejected": -0.052847255021333694, + "step": 9320 + }, + { + "epoch": 1.607512060647829, + "grad_norm": 2.990072727203369, + "learning_rate": 2.256023350502575e-09, + "logits/chosen": -3.040903091430664, + "logits/rejected": -3.0141682624816895, + "logps/chosen": -59.84055709838867, + "logps/rejected": -60.19636154174805, + "loss": 0.6844, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.032625067979097366, + "rewards/margins": 0.018737614154815674, + "rewards/rejected": -0.05136268213391304, + "step": 9330 + }, + { + "epoch": 1.6092350103376982, + "grad_norm": 2.7256877422332764, + "learning_rate": 2.2370320407887056e-09, + "logits/chosen": -2.959775447845459, + "logits/rejected": -2.947470188140869, + "logps/chosen": -55.50397872924805, + "logps/rejected": -56.908599853515625, + "loss": 0.6855, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.036947958171367645, + "rewards/margins": 0.016250457614660263, + "rewards/rejected": -0.05319841578602791, + "step": 9340 + }, + { + "epoch": 1.6109579600275672, + "grad_norm": 3.4841971397399902, + "learning_rate": 2.2181109321918236e-09, + "logits/chosen": -3.015531063079834, + "logits/rejected": -2.9840874671936035, + "logps/chosen": -55.771270751953125, + "logps/rejected": -57.008934020996094, + "loss": 0.6843, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.03221204876899719, + "rewards/margins": 0.018690291792154312, + "rewards/rejected": -0.0509023442864418, + "step": 9350 + }, + { + "epoch": 1.6126809097174362, + "grad_norm": 2.2437381744384766, + "learning_rate": 2.199260195816971e-09, + "logits/chosen": -3.0981364250183105, + "logits/rejected": -3.058584213256836, + "logps/chosen": -59.319190979003906, + "logps/rejected": -57.41559600830078, + "loss": 0.6826, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02636829949915409, + "rewards/margins": 0.022022128105163574, + "rewards/rejected": -0.048390429466962814, + "step": 9360 + }, + { + "epoch": 1.6144038594073054, + "grad_norm": 2.5757100582122803, + "learning_rate": 2.1804800021328107e-09, + "logits/chosen": -3.1047720909118652, + "logits/rejected": -3.070584774017334, + "logps/chosen": -56.7910041809082, + "logps/rejected": -56.188743591308594, + "loss": 0.6844, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.02951735258102417, + "rewards/margins": 0.0183617752045393, + "rewards/rejected": -0.04787912219762802, + "step": 9370 + }, + { + "epoch": 1.6161268090971743, + "grad_norm": 2.4145779609680176, + "learning_rate": 2.16177052097008e-09, + "logits/chosen": -3.1098639965057373, + "logits/rejected": -3.0928986072540283, + "logps/chosen": -57.31666946411133, + "logps/rejected": -57.55403518676758, + "loss": 0.6892, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.029079124331474304, + "rewards/margins": 0.008899955078959465, + "rewards/rejected": -0.03797907754778862, + "step": 9380 + }, + { + "epoch": 1.6178497587870435, + "grad_norm": 2.444182872772217, + "learning_rate": 2.143131921520055e-09, + "logits/chosen": -2.8663852214813232, + "logits/rejected": -2.8523082733154297, + "logps/chosen": -56.66484832763672, + "logps/rejected": -59.6016845703125, + "loss": 0.6875, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03564850240945816, + "rewards/margins": 0.012243666686117649, + "rewards/rejected": -0.04789217188954353, + "step": 9390 + }, + { + "epoch": 1.6195727084769125, + "grad_norm": 2.6653835773468018, + "learning_rate": 2.12456437233303e-09, + "logits/chosen": -2.9787285327911377, + "logits/rejected": -2.960437059402466, + "logps/chosen": -57.28019332885742, + "logps/rejected": -58.5770378112793, + "loss": 0.6882, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.03212272375822067, + "rewards/margins": 0.010934999212622643, + "rewards/rejected": -0.043057721108198166, + "step": 9400 + }, + { + "epoch": 1.6195727084769125, + "eval_logits/chosen": -3.1168527603149414, + "eval_logits/rejected": -3.1111221313476562, + "eval_logps/chosen": -59.551536560058594, + "eval_logps/rejected": -64.65276336669922, + "eval_loss": 0.6901717782020569, + "eval_rewards/accuracies": 0.5885223150253296, + "eval_rewards/chosen": -0.008396387100219727, + "eval_rewards/margins": 0.006330016069114208, + "eval_rewards/rejected": -0.014726405031979084, + "eval_runtime": 383.6566, + "eval_samples_per_second": 11.218, + "eval_steps_per_second": 1.402, + "step": 9400 + }, + { + "epoch": 1.6212956581667815, + "grad_norm": 2.3467307090759277, + "learning_rate": 2.106068041316781e-09, + "logits/chosen": -2.9938812255859375, + "logits/rejected": -2.9771664142608643, + "logps/chosen": -57.26462936401367, + "logps/rejected": -57.30256271362305, + "loss": 0.6848, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.03390895202755928, + "rewards/margins": 0.017822980880737305, + "rewards/rejected": -0.051731932908296585, + "step": 9410 + }, + { + "epoch": 1.6230186078566504, + "grad_norm": 2.4428353309631348, + "learning_rate": 2.0876430957350466e-09, + "logits/chosen": -3.0373952388763428, + "logits/rejected": -3.017669677734375, + "logps/chosen": -58.389183044433594, + "logps/rejected": -57.229583740234375, + "loss": 0.6852, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.033502496778964996, + "rewards/margins": 0.01688491180539131, + "rewards/rejected": -0.05038740485906601, + "step": 9420 + }, + { + "epoch": 1.6247415575465196, + "grad_norm": 2.479332208633423, + "learning_rate": 2.0692897022060397e-09, + "logits/chosen": -3.0020992755889893, + "logits/rejected": -2.9905407428741455, + "logps/chosen": -56.156005859375, + "logps/rejected": -53.7123908996582, + "loss": 0.6884, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.037239111959934235, + "rewards/margins": 0.010548432357609272, + "rewards/rejected": -0.04778754711151123, + "step": 9430 + }, + { + "epoch": 1.6264645072363888, + "grad_norm": 2.5780246257781982, + "learning_rate": 2.0510080267009023e-09, + "logits/chosen": -2.995959758758545, + "logits/rejected": -2.9901649951934814, + "logps/chosen": -53.41289138793945, + "logps/rejected": -57.17323684692383, + "loss": 0.6889, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.03778151422739029, + "rewards/margins": 0.009339074604213238, + "rewards/rejected": -0.0471205934882164, + "step": 9440 + }, + { + "epoch": 1.6281874569262578, + "grad_norm": 2.6421189308166504, + "learning_rate": 2.032798234542237e-09, + "logits/chosen": -3.1385090351104736, + "logits/rejected": -3.12992787361145, + "logps/chosen": -57.56132125854492, + "logps/rejected": -55.61620330810547, + "loss": 0.6897, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.04172375798225403, + "rewards/margins": 0.008058389648795128, + "rewards/rejected": -0.049782149493694305, + "step": 9450 + }, + { + "epoch": 1.6299104066161267, + "grad_norm": 2.476637601852417, + "learning_rate": 2.0146604904025955e-09, + "logits/chosen": -2.9790234565734863, + "logits/rejected": -2.975015163421631, + "logps/chosen": -55.6374626159668, + "logps/rejected": -59.5582275390625, + "loss": 0.683, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03386567533016205, + "rewards/margins": 0.02146020717918873, + "rewards/rejected": -0.055325884371995926, + "step": 9460 + }, + { + "epoch": 1.6316333563059957, + "grad_norm": 2.6225929260253906, + "learning_rate": 1.996594958302996e-09, + "logits/chosen": -2.8437275886535645, + "logits/rejected": -2.811704397201538, + "logps/chosen": -56.132415771484375, + "logps/rejected": -55.46335983276367, + "loss": 0.6861, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.034872956573963165, + "rewards/margins": 0.015074771828949451, + "rewards/rejected": -0.04994772747159004, + "step": 9470 + }, + { + "epoch": 1.633356305995865, + "grad_norm": 2.4534597396850586, + "learning_rate": 1.978601801611436e-09, + "logits/chosen": -2.9890143871307373, + "logits/rejected": -2.95599627494812, + "logps/chosen": -57.860130310058594, + "logps/rejected": -58.361412048339844, + "loss": 0.6844, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.03867609426379204, + "rewards/margins": 0.018373996019363403, + "rewards/rejected": -0.05705009773373604, + "step": 9480 + }, + { + "epoch": 1.635079255685734, + "grad_norm": 2.2696869373321533, + "learning_rate": 1.9606811830414163e-09, + "logits/chosen": -2.9858505725860596, + "logits/rejected": -2.96655535697937, + "logps/chosen": -57.28430938720703, + "logps/rejected": -57.04707717895508, + "loss": 0.6856, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.03171836584806442, + "rewards/margins": 0.01598692685365677, + "rewards/rejected": -0.04770529270172119, + "step": 9490 + }, + { + "epoch": 1.636802205375603, + "grad_norm": 2.2463831901550293, + "learning_rate": 1.94283326465047e-09, + "logits/chosen": -3.016265869140625, + "logits/rejected": -3.0025973320007324, + "logps/chosen": -56.819480895996094, + "logps/rejected": -59.62360382080078, + "loss": 0.6867, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.03173057362437248, + "rewards/margins": 0.0137772336602211, + "rewards/rejected": -0.04550781100988388, + "step": 9500 + }, + { + "epoch": 1.636802205375603, + "eval_logits/chosen": -3.116647720336914, + "eval_logits/rejected": -3.111027240753174, + "eval_logps/chosen": -59.55279541015625, + "eval_logps/rejected": -64.6480712890625, + "eval_loss": 0.6902014017105103, + "eval_rewards/accuracies": 0.5815520286560059, + "eval_rewards/chosen": -0.008409040048718452, + "eval_rewards/margins": 0.0062704309821128845, + "eval_rewards/rejected": -0.014679470099508762, + "eval_runtime": 384.0968, + "eval_samples_per_second": 11.206, + "eval_steps_per_second": 1.401, + "step": 9500 + }, + { + "epoch": 1.638525155065472, + "grad_norm": 2.787724256515503, + "learning_rate": 1.925058207838699e-09, + "logits/chosen": -2.931727886199951, + "logits/rejected": -2.9154107570648193, + "logps/chosen": -53.7954216003418, + "logps/rejected": -59.11879348754883, + "loss": 0.6823, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.029123950749635696, + "rewards/margins": 0.022644540295004845, + "rewards/rejected": -0.05176848918199539, + "step": 9510 + }, + { + "epoch": 1.640248104755341, + "grad_norm": 2.510998487472534, + "learning_rate": 1.9073561733473088e-09, + "logits/chosen": -3.1452136039733887, + "logits/rejected": -3.1307685375213623, + "logps/chosen": -55.496925354003906, + "logps/rejected": -62.52506637573242, + "loss": 0.6861, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.031591176986694336, + "rewards/margins": 0.015155290253460407, + "rewards/rejected": -0.04674647003412247, + "step": 9520 + }, + { + "epoch": 1.6419710544452102, + "grad_norm": 2.6163759231567383, + "learning_rate": 1.8897273212571643e-09, + "logits/chosen": -3.048109769821167, + "logits/rejected": -3.025350332260132, + "logps/chosen": -56.559532165527344, + "logps/rejected": -55.84375762939453, + "loss": 0.6872, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.03663558140397072, + "rewards/margins": 0.012950601987540722, + "rewards/rejected": -0.04958617687225342, + "step": 9530 + }, + { + "epoch": 1.6436940041350794, + "grad_norm": 2.477329730987549, + "learning_rate": 1.872171810987324e-09, + "logits/chosen": -2.9872732162475586, + "logits/rejected": -2.963569164276123, + "logps/chosen": -57.75830078125, + "logps/rejected": -58.98407745361328, + "loss": 0.6841, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.036397237330675125, + "rewards/margins": 0.019381213933229446, + "rewards/rejected": -0.05577845126390457, + "step": 9540 + }, + { + "epoch": 1.6454169538249483, + "grad_norm": 2.7419731616973877, + "learning_rate": 1.8546898012936297e-09, + "logits/chosen": -3.0292882919311523, + "logits/rejected": -3.032219886779785, + "logps/chosen": -56.93639373779297, + "logps/rejected": -61.3848876953125, + "loss": 0.6892, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.037130676209926605, + "rewards/margins": 0.008765211328864098, + "rewards/rejected": -0.045895881950855255, + "step": 9550 + }, + { + "epoch": 1.6471399035148173, + "grad_norm": 2.5584213733673096, + "learning_rate": 1.8372814502672308e-09, + "logits/chosen": -3.0927374362945557, + "logits/rejected": -3.0865063667297363, + "logps/chosen": -56.66698455810547, + "logps/rejected": -57.6358528137207, + "loss": 0.686, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.03044191002845764, + "rewards/margins": 0.015346214175224304, + "rewards/rejected": -0.04578813165426254, + "step": 9560 + }, + { + "epoch": 1.6488628532046863, + "grad_norm": 2.4465668201446533, + "learning_rate": 1.8199469153331949e-09, + "logits/chosen": -3.003964900970459, + "logits/rejected": -2.9885926246643066, + "logps/chosen": -53.80454635620117, + "logps/rejected": -55.86109161376953, + "loss": 0.6853, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.03552521392703056, + "rewards/margins": 0.0164842177182436, + "rewards/rejected": -0.052009426057338715, + "step": 9570 + }, + { + "epoch": 1.6505858028945555, + "grad_norm": 2.3839058876037598, + "learning_rate": 1.802686353249051e-09, + "logits/chosen": -3.0763354301452637, + "logits/rejected": -3.0583791732788086, + "logps/chosen": -57.54524612426758, + "logps/rejected": -61.7070426940918, + "loss": 0.685, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.030573736876249313, + "rewards/margins": 0.017048945650458336, + "rewards/rejected": -0.0476226843893528, + "step": 9580 + }, + { + "epoch": 1.6523087525844247, + "grad_norm": 2.5615997314453125, + "learning_rate": 1.7854999201033917e-09, + "logits/chosen": -3.1601243019104004, + "logits/rejected": -3.1410508155822754, + "logps/chosen": -59.41094970703125, + "logps/rejected": -60.86851119995117, + "loss": 0.6875, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03314214199781418, + "rewards/margins": 0.012432152405381203, + "rewards/rejected": -0.04557429626584053, + "step": 9590 + }, + { + "epoch": 1.6540317022742936, + "grad_norm": 2.9601426124572754, + "learning_rate": 1.7683877713144559e-09, + "logits/chosen": -2.979534149169922, + "logits/rejected": -2.945681095123291, + "logps/chosen": -58.09977340698242, + "logps/rejected": -57.2403450012207, + "loss": 0.6845, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.030393335968255997, + "rewards/margins": 0.01838875375688076, + "rewards/rejected": -0.048782095313072205, + "step": 9600 + }, + { + "epoch": 1.6540317022742936, + "eval_logits/chosen": -3.1166112422943115, + "eval_logits/rejected": -3.1109259128570557, + "eval_logps/chosen": -59.561134338378906, + "eval_logps/rejected": -64.66480255126953, + "eval_loss": 0.6901615858078003, + "eval_rewards/accuracies": 0.5861988663673401, + "eval_rewards/chosen": -0.00849241204559803, + "eval_rewards/margins": 0.006354290526360273, + "eval_rewards/rejected": -0.014846701174974442, + "eval_runtime": 384.14, + "eval_samples_per_second": 11.204, + "eval_steps_per_second": 1.401, + "step": 9600 + }, + { + "epoch": 1.6557546519641626, + "grad_norm": 2.4565086364746094, + "learning_rate": 1.7513500616287226e-09, + "logits/chosen": -3.0060410499572754, + "logits/rejected": -2.985361099243164, + "logps/chosen": -58.816650390625, + "logps/rejected": -59.23149490356445, + "loss": 0.6847, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.02508339285850525, + "rewards/margins": 0.017644105479121208, + "rewards/rejected": -0.042727500200271606, + "step": 9610 + }, + { + "epoch": 1.6574776016540316, + "grad_norm": 2.263732671737671, + "learning_rate": 1.734386945119515e-09, + "logits/chosen": -3.125436782836914, + "logits/rejected": -3.1177351474761963, + "logps/chosen": -57.58256912231445, + "logps/rejected": -59.40592575073242, + "loss": 0.6886, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.029961705207824707, + "rewards/margins": 0.010090869851410389, + "rewards/rejected": -0.04005257040262222, + "step": 9620 + }, + { + "epoch": 1.6592005513439008, + "grad_norm": 2.6932501792907715, + "learning_rate": 1.7174985751855931e-09, + "logits/chosen": -3.0406494140625, + "logits/rejected": -3.0130789279937744, + "logps/chosen": -59.420860290527344, + "logps/rejected": -57.45256805419922, + "loss": 0.6913, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.03962600976228714, + "rewards/margins": 0.004532286431640387, + "rewards/rejected": -0.044158294796943665, + "step": 9630 + }, + { + "epoch": 1.66092350103377, + "grad_norm": 2.2227652072906494, + "learning_rate": 1.7006851045497996e-09, + "logits/chosen": -3.0980916023254395, + "logits/rejected": -3.0667550563812256, + "logps/chosen": -59.05376052856445, + "logps/rejected": -55.8654899597168, + "loss": 0.6822, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.030568838119506836, + "rewards/margins": 0.02280599996447563, + "rewards/rejected": -0.05337483808398247, + "step": 9640 + }, + { + "epoch": 1.662646450723639, + "grad_norm": 2.3098556995391846, + "learning_rate": 1.6839466852576314e-09, + "logits/chosen": -2.9595775604248047, + "logits/rejected": -2.9384946823120117, + "logps/chosen": -59.28965377807617, + "logps/rejected": -58.59055709838867, + "loss": 0.6897, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0381181500852108, + "rewards/margins": 0.007864254526793957, + "rewards/rejected": -0.04598240926861763, + "step": 9650 + }, + { + "epoch": 1.664369400413508, + "grad_norm": 3.269378185272217, + "learning_rate": 1.667283468675913e-09, + "logits/chosen": -2.989434003829956, + "logits/rejected": -2.983867645263672, + "logps/chosen": -53.243675231933594, + "logps/rejected": -57.77482986450195, + "loss": 0.6862, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.026591796427965164, + "rewards/margins": 0.014660343527793884, + "rewards/rejected": -0.04125214368104935, + "step": 9660 + }, + { + "epoch": 1.6660923501033769, + "grad_norm": 2.3100833892822266, + "learning_rate": 1.6506956054913892e-09, + "logits/chosen": -3.077094793319702, + "logits/rejected": -3.04704213142395, + "logps/chosen": -58.68497848510742, + "logps/rejected": -58.46355056762695, + "loss": 0.681, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02568379044532776, + "rewards/margins": 0.025521423667669296, + "rewards/rejected": -0.051205217838287354, + "step": 9670 + }, + { + "epoch": 1.667815299793246, + "grad_norm": 2.4696285724639893, + "learning_rate": 1.6341832457093853e-09, + "logits/chosen": -3.039754867553711, + "logits/rejected": -3.023983955383301, + "logps/chosen": -57.664756774902344, + "logps/rejected": -56.45305633544922, + "loss": 0.6883, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.03195812553167343, + "rewards/margins": 0.010615061037242413, + "rewards/rejected": -0.04257318750023842, + "step": 9680 + }, + { + "epoch": 1.6695382494831152, + "grad_norm": 2.5291388034820557, + "learning_rate": 1.6177465386524426e-09, + "logits/chosen": -2.9263358116149902, + "logits/rejected": -2.906914472579956, + "logps/chosen": -58.816925048828125, + "logps/rejected": -59.23249053955078, + "loss": 0.6856, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02988823689520359, + "rewards/margins": 0.016320116817951202, + "rewards/rejected": -0.04620835557579994, + "step": 9690 + }, + { + "epoch": 1.6712611991729842, + "grad_norm": 2.3730547428131104, + "learning_rate": 1.6013856329589683e-09, + "logits/chosen": -2.9970877170562744, + "logits/rejected": -2.964787721633911, + "logps/chosen": -56.075889587402344, + "logps/rejected": -57.289794921875, + "loss": 0.6855, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.03369338810443878, + "rewards/margins": 0.01647287793457508, + "rewards/rejected": -0.05016626790165901, + "step": 9700 + }, + { + "epoch": 1.6712611991729842, + "eval_logits/chosen": -3.1167359352111816, + "eval_logits/rejected": -3.1110715866088867, + "eval_logps/chosen": -59.564579010009766, + "eval_logps/rejected": -64.6676254272461, + "eval_loss": 0.6901634335517883, + "eval_rewards/accuracies": 0.5875929594039917, + "eval_rewards/chosen": -0.008526891469955444, + "eval_rewards/margins": 0.006348147988319397, + "eval_rewards/rejected": -0.014875039458274841, + "eval_runtime": 383.9173, + "eval_samples_per_second": 11.211, + "eval_steps_per_second": 1.401, + "step": 9700 + }, + { + "epoch": 1.6729841488628532, + "grad_norm": 2.4649150371551514, + "learning_rate": 1.585100676581892e-09, + "logits/chosen": -2.86004376411438, + "logits/rejected": -2.8453660011291504, + "logps/chosen": -56.4981803894043, + "logps/rejected": -56.37230682373047, + "loss": 0.6857, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.02869611419737339, + "rewards/margins": 0.01581619493663311, + "rewards/rejected": -0.0445123091340065, + "step": 9710 + }, + { + "epoch": 1.6747070985527222, + "grad_norm": 2.7789175510406494, + "learning_rate": 1.568891816787329e-09, + "logits/chosen": -3.03645920753479, + "logits/rejected": -3.022475242614746, + "logps/chosen": -56.657142639160156, + "logps/rejected": -56.0443229675293, + "loss": 0.6867, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.03298606723546982, + "rewards/margins": 0.013968385756015778, + "rewards/rejected": -0.046954452991485596, + "step": 9720 + }, + { + "epoch": 1.6764300482425913, + "grad_norm": 2.554271936416626, + "learning_rate": 1.5527592001532465e-09, + "logits/chosen": -3.0012001991271973, + "logits/rejected": -2.987691879272461, + "logps/chosen": -56.674346923828125, + "logps/rejected": -59.074127197265625, + "loss": 0.6852, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.029376983642578125, + "rewards/margins": 0.016980426385998726, + "rewards/rejected": -0.0463574156165123, + "step": 9730 + }, + { + "epoch": 1.6781529979324605, + "grad_norm": 2.4061145782470703, + "learning_rate": 1.5367029725681403e-09, + "logits/chosen": -3.041320562362671, + "logits/rejected": -3.0223727226257324, + "logps/chosen": -57.524688720703125, + "logps/rejected": -57.442771911621094, + "loss": 0.6879, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.030505716800689697, + "rewards/margins": 0.011272273026406765, + "rewards/rejected": -0.04177799075841904, + "step": 9740 + }, + { + "epoch": 1.6798759476223295, + "grad_norm": 2.8132548332214355, + "learning_rate": 1.5207232792297065e-09, + "logits/chosen": -3.0355067253112793, + "logits/rejected": -2.992421865463257, + "logps/chosen": -58.13688278198242, + "logps/rejected": -54.65366744995117, + "loss": 0.6822, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.030089890584349632, + "rewards/margins": 0.02290867641568184, + "rewards/rejected": -0.05299856513738632, + "step": 9750 + }, + { + "epoch": 1.6815988973121985, + "grad_norm": 2.302518129348755, + "learning_rate": 1.5048202646435528e-09, + "logits/chosen": -3.0535900592803955, + "logits/rejected": -3.0285425186157227, + "logps/chosen": -56.118263244628906, + "logps/rejected": -56.26769256591797, + "loss": 0.6833, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.029349148273468018, + "rewards/margins": 0.020514894276857376, + "rewards/rejected": -0.049864038825035095, + "step": 9760 + }, + { + "epoch": 1.6833218470020674, + "grad_norm": 2.493689775466919, + "learning_rate": 1.4889940726218521e-09, + "logits/chosen": -2.963839292526245, + "logits/rejected": -2.9446358680725098, + "logps/chosen": -57.1699333190918, + "logps/rejected": -58.53014373779297, + "loss": 0.6861, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.030619194731116295, + "rewards/margins": 0.015159929171204567, + "rewards/rejected": -0.04577913135290146, + "step": 9770 + }, + { + "epoch": 1.6850447966919366, + "grad_norm": 2.4732000827789307, + "learning_rate": 1.4732448462820902e-09, + "logits/chosen": -3.0382137298583984, + "logits/rejected": -2.9898781776428223, + "logps/chosen": -63.5068473815918, + "logps/rejected": -56.90974807739258, + "loss": 0.6804, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.026242714375257492, + "rewards/margins": 0.026783952489495277, + "rewards/rejected": -0.05302666500210762, + "step": 9780 + }, + { + "epoch": 1.6867677463818056, + "grad_norm": 2.587778091430664, + "learning_rate": 1.457572728045724e-09, + "logits/chosen": -3.0090510845184326, + "logits/rejected": -3.0093512535095215, + "logps/chosen": -56.01578903198242, + "logps/rejected": -62.18316650390625, + "loss": 0.6837, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.026041844859719276, + "rewards/margins": 0.019954070448875427, + "rewards/rejected": -0.045995913445949554, + "step": 9790 + }, + { + "epoch": 1.6884906960716748, + "grad_norm": 2.735978603363037, + "learning_rate": 1.4419778596369293e-09, + "logits/chosen": -3.040757656097412, + "logits/rejected": -3.0044455528259277, + "logps/chosen": -59.152740478515625, + "logps/rejected": -58.39259719848633, + "loss": 0.682, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.023504730314016342, + "rewards/margins": 0.023418856784701347, + "rewards/rejected": -0.04692358896136284, + "step": 9800 + }, + { + "epoch": 1.6884906960716748, + "eval_logits/chosen": -3.1164276599884033, + "eval_logits/rejected": -3.110750436782837, + "eval_logps/chosen": -59.58139419555664, + "eval_logps/rejected": -64.676513671875, + "eval_loss": 0.6902053952217102, + "eval_rewards/accuracies": 0.586663544178009, + "eval_rewards/chosen": -0.008694971911609173, + "eval_rewards/margins": 0.006268941797316074, + "eval_rewards/rejected": -0.014963913708925247, + "eval_runtime": 383.8504, + "eval_samples_per_second": 11.213, + "eval_steps_per_second": 1.402, + "step": 9800 + }, + { + "epoch": 1.6902136457615438, + "grad_norm": 2.47900390625, + "learning_rate": 1.4264603820813005e-09, + "logits/chosen": -2.9712750911712646, + "logits/rejected": -2.9366753101348877, + "logps/chosen": -58.309791564941406, + "logps/rejected": -59.19429397583008, + "loss": 0.6815, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.02890641614794731, + "rewards/margins": 0.02413240447640419, + "rewards/rejected": -0.0530388243496418, + "step": 9810 + }, + { + "epoch": 1.6919365954514127, + "grad_norm": 2.372298002243042, + "learning_rate": 1.411020435704584e-09, + "logits/chosen": -3.037036180496216, + "logits/rejected": -3.0111517906188965, + "logps/chosen": -56.780792236328125, + "logps/rejected": -59.128944396972656, + "loss": 0.6812, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.026590263471007347, + "rewards/margins": 0.025052299723029137, + "rewards/rejected": -0.05164256691932678, + "step": 9820 + }, + { + "epoch": 1.693659545141282, + "grad_norm": 2.340632677078247, + "learning_rate": 1.3956581601314045e-09, + "logits/chosen": -3.0698554515838623, + "logits/rejected": -3.0650217533111572, + "logps/chosen": -61.13444137573242, + "logps/rejected": -61.3189811706543, + "loss": 0.6895, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.03146522864699364, + "rewards/margins": 0.008265768177807331, + "rewards/rejected": -0.039730995893478394, + "step": 9830 + }, + { + "epoch": 1.6953824948311509, + "grad_norm": 2.3163886070251465, + "learning_rate": 1.3803736942839963e-09, + "logits/chosen": -2.9985289573669434, + "logits/rejected": -2.9663424491882324, + "logps/chosen": -59.07982635498047, + "logps/rejected": -59.1026725769043, + "loss": 0.6844, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.032019712030887604, + "rewards/margins": 0.018637120723724365, + "rewards/rejected": -0.05065683275461197, + "step": 9840 + }, + { + "epoch": 1.69710544452102, + "grad_norm": 2.3132269382476807, + "learning_rate": 1.3651671763809692e-09, + "logits/chosen": -3.017277479171753, + "logits/rejected": -3.005635976791382, + "logps/chosen": -57.1483039855957, + "logps/rejected": -55.78985595703125, + "loss": 0.6857, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.02546665631234646, + "rewards/margins": 0.01578855887055397, + "rewards/rejected": -0.04125521332025528, + "step": 9850 + }, + { + "epoch": 1.698828394210889, + "grad_norm": 2.7721080780029297, + "learning_rate": 1.3500387439360285e-09, + "logits/chosen": -3.083782196044922, + "logits/rejected": -3.0732412338256836, + "logps/chosen": -60.18684005737305, + "logps/rejected": -58.596580505371094, + "loss": 0.6893, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03860507160425186, + "rewards/margins": 0.008516994304955006, + "rewards/rejected": -0.04712206497788429, + "step": 9860 + }, + { + "epoch": 1.700551343900758, + "grad_norm": 2.747910499572754, + "learning_rate": 1.3349885337567613e-09, + "logits/chosen": -3.0581815242767334, + "logits/rejected": -3.030397891998291, + "logps/chosen": -55.94462966918945, + "logps/rejected": -56.11644744873047, + "loss": 0.6836, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.032134730368852615, + "rewards/margins": 0.020175369456410408, + "rewards/rejected": -0.052310097962617874, + "step": 9870 + }, + { + "epoch": 1.7022742935906272, + "grad_norm": 2.2323834896087646, + "learning_rate": 1.3200166819433701e-09, + "logits/chosen": -2.945613145828247, + "logits/rejected": -2.941906213760376, + "logps/chosen": -55.61360549926758, + "logps/rejected": -60.08890914916992, + "loss": 0.6882, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.03548043593764305, + "rewards/margins": 0.010891259647905827, + "rewards/rejected": -0.0463716983795166, + "step": 9880 + }, + { + "epoch": 1.7039972432804962, + "grad_norm": 2.667999267578125, + "learning_rate": 1.305123323887467e-09, + "logits/chosen": -2.9106783866882324, + "logits/rejected": -2.904560089111328, + "logps/chosen": -59.61014938354492, + "logps/rejected": -58.7868537902832, + "loss": 0.6852, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.031924955546855927, + "rewards/margins": 0.01715216599404812, + "rewards/rejected": -0.049077123403549194, + "step": 9890 + }, + { + "epoch": 1.7057201929703654, + "grad_norm": 2.61613392829895, + "learning_rate": 1.2903085942708348e-09, + "logits/chosen": -3.014256715774536, + "logits/rejected": -2.9844601154327393, + "logps/chosen": -54.362159729003906, + "logps/rejected": -54.91518020629883, + "loss": 0.6814, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02655635215342045, + "rewards/margins": 0.024641280993819237, + "rewards/rejected": -0.051197636872529984, + "step": 9900 + }, + { + "epoch": 1.7057201929703654, + "eval_logits/chosen": -3.116485595703125, + "eval_logits/rejected": -3.110800266265869, + "eval_logps/chosen": -59.58063888549805, + "eval_logps/rejected": -64.68134307861328, + "eval_loss": 0.6901763677597046, + "eval_rewards/accuracies": 0.5913103818893433, + "eval_rewards/chosen": -0.008687433786690235, + "eval_rewards/margins": 0.006324751302599907, + "eval_rewards/rejected": -0.015012186020612717, + "eval_runtime": 383.8753, + "eval_samples_per_second": 11.212, + "eval_steps_per_second": 1.401, + "step": 9900 + }, + { + "epoch": 1.7074431426602343, + "grad_norm": 2.469726324081421, + "learning_rate": 1.2755726270642133e-09, + "logits/chosen": -3.070533514022827, + "logits/rejected": -3.0258636474609375, + "logps/chosen": -60.8514404296875, + "logps/rejected": -55.90861129760742, + "loss": 0.6838, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.03283166512846947, + "rewards/margins": 0.01975090429186821, + "rewards/rejected": -0.05258256942033768, + "step": 9910 + }, + { + "epoch": 1.7091660923501033, + "grad_norm": 2.480678081512451, + "learning_rate": 1.260915555526091e-09, + "logits/chosen": -3.094026803970337, + "logits/rejected": -3.0649092197418213, + "logps/chosen": -55.47735595703125, + "logps/rejected": -54.31943893432617, + "loss": 0.6826, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02788475528359413, + "rewards/margins": 0.022009506821632385, + "rewards/rejected": -0.04989425837993622, + "step": 9920 + }, + { + "epoch": 1.7108890420399723, + "grad_norm": 2.630247116088867, + "learning_rate": 1.246337512201492e-09, + "logits/chosen": -3.027109146118164, + "logits/rejected": -3.024893283843994, + "logps/chosen": -55.09284591674805, + "logps/rejected": -59.29688262939453, + "loss": 0.6874, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.031861864030361176, + "rewards/margins": 0.012505242601037025, + "rewards/rejected": -0.04436710476875305, + "step": 9930 + }, + { + "epoch": 1.7126119917298415, + "grad_norm": 2.31946063041687, + "learning_rate": 1.2318386289207862e-09, + "logits/chosen": -3.093456745147705, + "logits/rejected": -3.088930368423462, + "logps/chosen": -57.84920120239258, + "logps/rejected": -64.39171600341797, + "loss": 0.6893, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.02787485346198082, + "rewards/margins": 0.008635136298835278, + "rewards/rejected": -0.03650998696684837, + "step": 9940 + }, + { + "epoch": 1.7143349414197107, + "grad_norm": 2.4557619094848633, + "learning_rate": 1.2174190367984905e-09, + "logits/chosen": -3.057157039642334, + "logits/rejected": -3.0204882621765137, + "logps/chosen": -56.916282653808594, + "logps/rejected": -56.3023681640625, + "loss": 0.6835, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.029519572854042053, + "rewards/margins": 0.02020849846303463, + "rewards/rejected": -0.04972807317972183, + "step": 9950 + }, + { + "epoch": 1.7160578911095796, + "grad_norm": 2.373345136642456, + "learning_rate": 1.203078866232088e-09, + "logits/chosen": -3.090630054473877, + "logits/rejected": -3.073500394821167, + "logps/chosen": -57.05073928833008, + "logps/rejected": -59.31227493286133, + "loss": 0.6883, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.036136444658041, + "rewards/margins": 0.010895757004618645, + "rewards/rejected": -0.047032203525304794, + "step": 9960 + }, + { + "epoch": 1.7177808407994486, + "grad_norm": 2.390754222869873, + "learning_rate": 1.1888182469008457e-09, + "logits/chosen": -3.0301880836486816, + "logits/rejected": -3.0101847648620605, + "logps/chosen": -54.38376998901367, + "logps/rejected": -56.96045684814453, + "loss": 0.6828, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.030236411839723587, + "rewards/margins": 0.021668527275323868, + "rewards/rejected": -0.051904939115047455, + "step": 9970 + }, + { + "epoch": 1.7195037904893176, + "grad_norm": 2.6026532649993896, + "learning_rate": 1.1746373077646366e-09, + "logits/chosen": -2.977801561355591, + "logits/rejected": -2.964418649673462, + "logps/chosen": -56.35908126831055, + "logps/rejected": -58.4140510559082, + "loss": 0.6853, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03563179820775986, + "rewards/margins": 0.016755376011133194, + "rewards/rejected": -0.05238717794418335, + "step": 9980 + }, + { + "epoch": 1.7212267401791868, + "grad_norm": 2.3901596069335938, + "learning_rate": 1.1605361770627943e-09, + "logits/chosen": -3.069767475128174, + "logits/rejected": -3.0197501182556152, + "logps/chosen": -57.231842041015625, + "logps/rejected": -54.852928161621094, + "loss": 0.6864, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.03499136120080948, + "rewards/margins": 0.014375315979123116, + "rewards/rejected": -0.049366679042577744, + "step": 9990 + }, + { + "epoch": 1.722949689869056, + "grad_norm": 2.420463800430298, + "learning_rate": 1.1465149823129207e-09, + "logits/chosen": -3.076472282409668, + "logits/rejected": -3.055633783340454, + "logps/chosen": -58.2899055480957, + "logps/rejected": -61.497642517089844, + "loss": 0.6837, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.025413617491722107, + "rewards/margins": 0.019886162132024765, + "rewards/rejected": -0.045299775898456573, + "step": 10000 + }, + { + "epoch": 1.722949689869056, + "eval_logits/chosen": -3.116307258605957, + "eval_logits/rejected": -3.110671043395996, + "eval_logps/chosen": -59.58539962768555, + "eval_logps/rejected": -64.69255065917969, + "eval_loss": 0.6901459693908691, + "eval_rewards/accuracies": 0.5927044749259949, + "eval_rewards/chosen": -0.00873502902686596, + "eval_rewards/margins": 0.006389264483004808, + "eval_rewards/rejected": -0.015124293975532055, + "eval_runtime": 384.2617, + "eval_samples_per_second": 11.201, + "eval_steps_per_second": 1.4, + "step": 10000 + }, + { + "epoch": 1.724672639558925, + "grad_norm": 2.7465927600860596, + "learning_rate": 1.132573850309767e-09, + "logits/chosen": -3.0503437519073486, + "logits/rejected": -3.029636859893799, + "logps/chosen": -55.57611083984375, + "logps/rejected": -57.7164192199707, + "loss": 0.6849, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0319674126803875, + "rewards/margins": 0.017423031851649284, + "rewards/rejected": -0.04939044266939163, + "step": 10010 + }, + { + "epoch": 1.7263955892487939, + "grad_norm": 2.1733815670013428, + "learning_rate": 1.1187129071240588e-09, + "logits/chosen": -2.9910526275634766, + "logits/rejected": -2.958216428756714, + "logps/chosen": -57.27119827270508, + "logps/rejected": -56.076995849609375, + "loss": 0.6862, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03598593920469284, + "rewards/margins": 0.014988981187343597, + "rewards/rejected": -0.050974924117326736, + "step": 10020 + }, + { + "epoch": 1.7281185389386629, + "grad_norm": 2.466876268386841, + "learning_rate": 1.1049322781013726e-09, + "logits/chosen": -3.0376193523406982, + "logits/rejected": -3.004720449447632, + "logps/chosen": -60.057777404785156, + "logps/rejected": -56.2112922668457, + "loss": 0.6864, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.026665765792131424, + "rewards/margins": 0.014138095080852509, + "rewards/rejected": -0.04080386459827423, + "step": 10030 + }, + { + "epoch": 1.729841488628532, + "grad_norm": 2.6095364093780518, + "learning_rate": 1.0912320878610017e-09, + "logits/chosen": -3.1675872802734375, + "logits/rejected": -3.1530065536499023, + "logps/chosen": -57.85478973388672, + "logps/rejected": -59.01996612548828, + "loss": 0.6858, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.030137866735458374, + "rewards/margins": 0.01570282131433487, + "rewards/rejected": -0.045840684324502945, + "step": 10040 + }, + { + "epoch": 1.7315644383184012, + "grad_norm": 2.6305747032165527, + "learning_rate": 1.0776124602948146e-09, + "logits/chosen": -3.009827136993408, + "logits/rejected": -3.0053369998931885, + "logps/chosen": -57.649658203125, + "logps/rejected": -60.40259552001953, + "loss": 0.6912, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.03677880018949509, + "rewards/margins": 0.004884940572082996, + "rewards/rejected": -0.04166373983025551, + "step": 10050 + }, + { + "epoch": 1.7332873880082702, + "grad_norm": 2.219585418701172, + "learning_rate": 1.06407351856616e-09, + "logits/chosen": -2.963155746459961, + "logits/rejected": -2.9584078788757324, + "logps/chosen": -52.41069793701172, + "logps/rejected": -56.978782653808594, + "loss": 0.689, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0367356613278389, + "rewards/margins": 0.009188707917928696, + "rewards/rejected": -0.04592436924576759, + "step": 10060 + }, + { + "epoch": 1.7350103376981392, + "grad_norm": 3.5949621200561523, + "learning_rate": 1.050615385108722e-09, + "logits/chosen": -2.9376320838928223, + "logits/rejected": -2.927889347076416, + "logps/chosen": -55.84636306762695, + "logps/rejected": -57.358253479003906, + "loss": 0.6862, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.033914633095264435, + "rewards/margins": 0.014869148842990398, + "rewards/rejected": -0.04878378286957741, + "step": 10070 + }, + { + "epoch": 1.7367332873880081, + "grad_norm": 2.440239191055298, + "learning_rate": 1.037238181625446e-09, + "logits/chosen": -3.0707335472106934, + "logits/rejected": -3.0552597045898438, + "logps/chosen": -59.8310432434082, + "logps/rejected": -59.3591423034668, + "loss": 0.6887, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03293101117014885, + "rewards/margins": 0.009801891632378101, + "rewards/rejected": -0.042732901871204376, + "step": 10080 + }, + { + "epoch": 1.7384562370778773, + "grad_norm": 2.5599682331085205, + "learning_rate": 1.0239420290874058e-09, + "logits/chosen": -3.1050057411193848, + "logits/rejected": -3.0850796699523926, + "logps/chosen": -57.10601806640625, + "logps/rejected": -62.16572952270508, + "loss": 0.685, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.027327552437782288, + "rewards/margins": 0.017170695587992668, + "rewards/rejected": -0.044498249888420105, + "step": 10090 + }, + { + "epoch": 1.7401791867677465, + "grad_norm": 2.586700439453125, + "learning_rate": 1.010727047732739e-09, + "logits/chosen": -3.0433762073516846, + "logits/rejected": -3.0341668128967285, + "logps/chosen": -57.99790573120117, + "logps/rejected": -59.83086395263672, + "loss": 0.6821, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.030119875445961952, + "rewards/margins": 0.023133408278226852, + "rewards/rejected": -0.053253281861543655, + "step": 10100 + }, + { + "epoch": 1.7401791867677465, + "eval_logits/chosen": -3.116196870803833, + "eval_logits/rejected": -3.110548257827759, + "eval_logps/chosen": -59.580108642578125, + "eval_logps/rejected": -64.69314575195312, + "eval_loss": 0.6901180148124695, + "eval_rewards/accuracies": 0.5841078162193298, + "eval_rewards/chosen": -0.008682068437337875, + "eval_rewards/margins": 0.006448162719607353, + "eval_rewards/rejected": -0.015130231156945229, + "eval_runtime": 383.9779, + "eval_samples_per_second": 11.209, + "eval_steps_per_second": 1.401, + "step": 10100 + }, + { + "epoch": 1.7419021364576155, + "grad_norm": 2.562401533126831, + "learning_rate": 9.97593357065536e-10, + "logits/chosen": -3.0932211875915527, + "logits/rejected": -3.102454900741577, + "logps/chosen": -54.34955596923828, + "logps/rejected": -59.22336959838867, + "loss": 0.6897, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03776390105485916, + "rewards/margins": 0.008016002364456654, + "rewards/rejected": -0.04577990248799324, + "step": 10110 + }, + { + "epoch": 1.7436250861474845, + "grad_norm": 2.4685213565826416, + "learning_rate": 9.845410758547724e-10, + "logits/chosen": -3.0438075065612793, + "logits/rejected": -3.0213944911956787, + "logps/chosen": -57.01942825317383, + "logps/rejected": -55.703758239746094, + "loss": 0.6846, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0329575315117836, + "rewards/margins": 0.018011055886745453, + "rewards/rejected": -0.05096858739852905, + "step": 10120 + }, + { + "epoch": 1.7453480358373534, + "grad_norm": 2.5328216552734375, + "learning_rate": 9.715703221332328e-10, + "logits/chosen": -3.0835208892822266, + "logits/rejected": -3.063122034072876, + "logps/chosen": -55.781517028808594, + "logps/rejected": -57.80461502075195, + "loss": 0.685, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.024351779371500015, + "rewards/margins": 0.01719535142183304, + "rewards/rejected": -0.041547130793333054, + "step": 10130 + }, + { + "epoch": 1.7470709855272226, + "grad_norm": 2.9124436378479004, + "learning_rate": 9.586812131964429e-10, + "logits/chosen": -3.0506045818328857, + "logits/rejected": -3.0077712535858154, + "logps/chosen": -62.746177673339844, + "logps/rejected": -58.44568634033203, + "loss": 0.6829, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.023203754797577858, + "rewards/margins": 0.02158413641154766, + "rewards/rejected": -0.04478789120912552, + "step": 10140 + }, + { + "epoch": 1.7487939352170918, + "grad_norm": 2.400768518447876, + "learning_rate": 9.458738656016063e-10, + "logits/chosen": -2.9910178184509277, + "logits/rejected": -2.972029685974121, + "logps/chosen": -58.909873962402344, + "logps/rejected": -59.960166931152344, + "loss": 0.6886, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03536384552717209, + "rewards/margins": 0.010202957317233086, + "rewards/rejected": -0.045566804707050323, + "step": 10150 + }, + { + "epoch": 1.7505168849069608, + "grad_norm": 2.4359378814697266, + "learning_rate": 9.331483951665532e-10, + "logits/chosen": -3.0172200202941895, + "logits/rejected": -3.0022635459899902, + "logps/chosen": -55.558441162109375, + "logps/rejected": -57.4552001953125, + "loss": 0.6849, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.03486592322587967, + "rewards/margins": 0.01763620227575302, + "rewards/rejected": -0.05250212550163269, + "step": 10160 + }, + { + "epoch": 1.7522398345968297, + "grad_norm": 2.5012121200561523, + "learning_rate": 9.20504916968693e-10, + "logits/chosen": -3.0220370292663574, + "logits/rejected": -3.0115466117858887, + "logps/chosen": -57.70402145385742, + "logps/rejected": -58.722206115722656, + "loss": 0.686, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.027941424399614334, + "rewards/margins": 0.015233149752020836, + "rewards/rejected": -0.04317457601428032, + "step": 10170 + }, + { + "epoch": 1.7539627842866987, + "grad_norm": 2.5249717235565186, + "learning_rate": 9.079435453439744e-10, + "logits/chosen": -2.958926200866699, + "logits/rejected": -2.930947780609131, + "logps/chosen": -55.34897994995117, + "logps/rejected": -57.743553161621094, + "loss": 0.686, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029070192947983742, + "rewards/margins": 0.01526731252670288, + "rewards/rejected": -0.04433750361204147, + "step": 10180 + }, + { + "epoch": 1.755685733976568, + "grad_norm": 2.646099805831909, + "learning_rate": 8.954643938858431e-10, + "logits/chosen": -3.001873731613159, + "logits/rejected": -2.977242946624756, + "logps/chosen": -53.825775146484375, + "logps/rejected": -55.69243240356445, + "loss": 0.6866, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.02867623046040535, + "rewards/margins": 0.014177098870277405, + "rewards/rejected": -0.042853329330682755, + "step": 10190 + }, + { + "epoch": 1.757408683666437, + "grad_norm": 2.4003872871398926, + "learning_rate": 8.83067575444233e-10, + "logits/chosen": -3.066793441772461, + "logits/rejected": -3.0473647117614746, + "logps/chosen": -56.400489807128906, + "logps/rejected": -56.30890655517578, + "loss": 0.6867, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.039797525852918625, + "rewards/margins": 0.014033086597919464, + "rewards/rejected": -0.05383061245083809, + "step": 10200 + }, + { + "epoch": 1.757408683666437, + "eval_logits/chosen": -3.116189956665039, + "eval_logits/rejected": -3.1104912757873535, + "eval_logps/chosen": -59.59709548950195, + "eval_logps/rejected": -64.70321655273438, + "eval_loss": 0.6901513934135437, + "eval_rewards/accuracies": 0.5815520286560059, + "eval_rewards/chosen": -0.008852045051753521, + "eval_rewards/margins": 0.00637889513745904, + "eval_rewards/rejected": -0.015230940654873848, + "eval_runtime": 384.2513, + "eval_samples_per_second": 11.201, + "eval_steps_per_second": 1.4, + "step": 10200 + }, + { + "epoch": 1.759131633356306, + "grad_norm": 2.68137264251709, + "learning_rate": 8.707532021245213e-10, + "logits/chosen": -3.070094585418701, + "logits/rejected": -3.065809726715088, + "logps/chosen": -57.35698699951172, + "logps/rejected": -62.095924377441406, + "loss": 0.6855, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03344056382775307, + "rewards/margins": 0.016259726136922836, + "rewards/rejected": -0.049700286239385605, + "step": 10210 + }, + { + "epoch": 1.760854583046175, + "grad_norm": 2.283804178237915, + "learning_rate": 8.585213852865415e-10, + "logits/chosen": -3.018273115158081, + "logits/rejected": -3.013718843460083, + "logps/chosen": -52.13434600830078, + "logps/rejected": -55.81071090698242, + "loss": 0.6888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03748854249715805, + "rewards/margins": 0.009387836791574955, + "rewards/rejected": -0.04687637463212013, + "step": 10220 + }, + { + "epoch": 1.762577532736044, + "grad_norm": 2.206413984298706, + "learning_rate": 8.463722355435466e-10, + "logits/chosen": -2.9376513957977295, + "logits/rejected": -2.9067463874816895, + "logps/chosen": -56.89196014404297, + "logps/rejected": -56.93088912963867, + "loss": 0.6855, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.03451027721166611, + "rewards/margins": 0.01620904728770256, + "rewards/rejected": -0.050719328224658966, + "step": 10230 + }, + { + "epoch": 1.7643004824259132, + "grad_norm": 2.3493030071258545, + "learning_rate": 8.343058627612421e-10, + "logits/chosen": -2.991508722305298, + "logits/rejected": -2.969237804412842, + "logps/chosen": -54.86486053466797, + "logps/rejected": -57.530853271484375, + "loss": 0.6849, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.034078199416399, + "rewards/margins": 0.017571553587913513, + "rewards/rejected": -0.05164974927902222, + "step": 10240 + }, + { + "epoch": 1.7660234321157822, + "grad_norm": 2.20393705368042, + "learning_rate": 8.223223760567588e-10, + "logits/chosen": -3.0698676109313965, + "logits/rejected": -3.028144359588623, + "logps/chosen": -59.7209358215332, + "logps/rejected": -54.87019729614258, + "loss": 0.6824, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.029454076662659645, + "rewards/margins": 0.02245365083217621, + "rewards/rejected": -0.051907729357481, + "step": 10250 + }, + { + "epoch": 1.7677463818056514, + "grad_norm": 2.4347379207611084, + "learning_rate": 8.10421883797694e-10, + "logits/chosen": -2.9993271827697754, + "logits/rejected": -3.0037426948547363, + "logps/chosen": -56.778282165527344, + "logps/rejected": -61.5457878112793, + "loss": 0.6878, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03513524681329727, + "rewards/margins": 0.011598859913647175, + "rewards/rejected": -0.04673410579562187, + "step": 10260 + }, + { + "epoch": 1.7694693314955203, + "grad_norm": 3.3981211185455322, + "learning_rate": 7.986044936011149e-10, + "logits/chosen": -3.0045104026794434, + "logits/rejected": -2.9947686195373535, + "logps/chosen": -58.53948211669922, + "logps/rejected": -60.971046447753906, + "loss": 0.6826, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.0310873631387949, + "rewards/margins": 0.022157657891511917, + "rewards/rejected": -0.05324501916766167, + "step": 10270 + }, + { + "epoch": 1.7711922811853893, + "grad_norm": 2.483266592025757, + "learning_rate": 7.868703123325871e-10, + "logits/chosen": -2.9915993213653564, + "logits/rejected": -2.9726366996765137, + "logps/chosen": -57.83294677734375, + "logps/rejected": -57.76380157470703, + "loss": 0.6875, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03503817319869995, + "rewards/margins": 0.012229489162564278, + "rewards/rejected": -0.04726766422390938, + "step": 10280 + }, + { + "epoch": 1.7729152308752585, + "grad_norm": 2.4922218322753906, + "learning_rate": 7.75219446105222e-10, + "logits/chosen": -2.9411327838897705, + "logits/rejected": -2.937307834625244, + "logps/chosen": -54.51689529418945, + "logps/rejected": -60.47893142700195, + "loss": 0.6841, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.030806083232164383, + "rewards/margins": 0.019144952297210693, + "rewards/rejected": -0.049951035529375076, + "step": 10290 + }, + { + "epoch": 1.7746381805651275, + "grad_norm": 2.6811490058898926, + "learning_rate": 7.636520002786928e-10, + "logits/chosen": -3.024101495742798, + "logits/rejected": -3.0088281631469727, + "logps/chosen": -58.08661651611328, + "logps/rejected": -59.34272384643555, + "loss": 0.6867, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03438801318407059, + "rewards/margins": 0.014171205461025238, + "rewards/rejected": -0.048559218645095825, + "step": 10300 + }, + { + "epoch": 1.7746381805651275, + "eval_logits/chosen": -3.116088628768921, + "eval_logits/rejected": -3.1104257106781006, + "eval_logps/chosen": -59.588050842285156, + "eval_logps/rejected": -64.6971664428711, + "eval_loss": 0.6901378035545349, + "eval_rewards/accuracies": 0.5871282815933228, + "eval_rewards/chosen": -0.00876156147569418, + "eval_rewards/margins": 0.0064088571816682816, + "eval_rewards/rejected": -0.015170418657362461, + "eval_runtime": 384.1877, + "eval_samples_per_second": 11.203, + "eval_steps_per_second": 1.4, + "step": 10300 + }, + { + "epoch": 1.7763611302549966, + "grad_norm": 2.2249252796173096, + "learning_rate": 7.521680794583096e-10, + "logits/chosen": -2.943056583404541, + "logits/rejected": -2.917048931121826, + "logps/chosen": -58.94867706298828, + "logps/rejected": -56.73337936401367, + "loss": 0.6807, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.026859750971198082, + "rewards/margins": 0.02593878284096718, + "rewards/rejected": -0.05279853194952011, + "step": 10310 + }, + { + "epoch": 1.7780840799448656, + "grad_norm": 2.479624032974243, + "learning_rate": 7.407677874940477e-10, + "logits/chosen": -3.0229122638702393, + "logits/rejected": -2.990391254425049, + "logps/chosen": -59.6657600402832, + "logps/rejected": -58.287437438964844, + "loss": 0.6843, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02904113195836544, + "rewards/margins": 0.01862339861690998, + "rewards/rejected": -0.04766453057527542, + "step": 10320 + }, + { + "epoch": 1.7798070296347346, + "grad_norm": 2.1177711486816406, + "learning_rate": 7.294512274796261e-10, + "logits/chosen": -3.037933349609375, + "logits/rejected": -3.020717144012451, + "logps/chosen": -55.4109001159668, + "logps/rejected": -59.27170944213867, + "loss": 0.6862, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.032665468752384186, + "rewards/margins": 0.014889764599502087, + "rewards/rejected": -0.04755523055791855, + "step": 10330 + }, + { + "epoch": 1.7815299793246038, + "grad_norm": 2.83569598197937, + "learning_rate": 7.182185017515707e-10, + "logits/chosen": -3.041626453399658, + "logits/rejected": -3.0205533504486084, + "logps/chosen": -59.1336669921875, + "logps/rejected": -59.606727600097656, + "loss": 0.6837, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.030461763963103294, + "rewards/margins": 0.01992267370223999, + "rewards/rejected": -0.050384439527988434, + "step": 10340 + }, + { + "epoch": 1.7832529290144727, + "grad_norm": 2.156789541244507, + "learning_rate": 7.070697118882819e-10, + "logits/chosen": -3.137744903564453, + "logits/rejected": -3.1120965480804443, + "logps/chosen": -56.865562438964844, + "logps/rejected": -55.8765754699707, + "loss": 0.6835, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.027836069464683533, + "rewards/margins": 0.020382162183523178, + "rewards/rejected": -0.04821822792291641, + "step": 10350 + }, + { + "epoch": 1.784975878704342, + "grad_norm": 2.3762242794036865, + "learning_rate": 6.960049587091277e-10, + "logits/chosen": -2.9819459915161133, + "logits/rejected": -2.9459383487701416, + "logps/chosen": -58.7189826965332, + "logps/rejected": -57.51030731201172, + "loss": 0.6839, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02772151306271553, + "rewards/margins": 0.019328957423567772, + "rewards/rejected": -0.04705046862363815, + "step": 10360 + }, + { + "epoch": 1.786698828394211, + "grad_norm": 2.5589816570281982, + "learning_rate": 6.850243422735214e-10, + "logits/chosen": -2.8506741523742676, + "logits/rejected": -2.819082736968994, + "logps/chosen": -54.847389221191406, + "logps/rejected": -58.32719802856445, + "loss": 0.6807, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.028699740767478943, + "rewards/margins": 0.025770461186766624, + "rewards/rejected": -0.05447020009160042, + "step": 10370 + }, + { + "epoch": 1.7884217780840799, + "grad_norm": 2.3986716270446777, + "learning_rate": 6.741279618800234e-10, + "logits/chosen": -3.0086491107940674, + "logits/rejected": -2.996666669845581, + "logps/chosen": -55.70782470703125, + "logps/rejected": -57.116722106933594, + "loss": 0.6888, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03828931599855423, + "rewards/margins": 0.00983067974448204, + "rewards/rejected": -0.04811999946832657, + "step": 10380 + }, + { + "epoch": 1.7901447277739488, + "grad_norm": 2.4709324836730957, + "learning_rate": 6.633159160654411e-10, + "logits/chosen": -3.0464024543762207, + "logits/rejected": -3.016256809234619, + "logps/chosen": -58.832664489746094, + "logps/rejected": -59.042266845703125, + "loss": 0.6841, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.030234787613153458, + "rewards/margins": 0.019277578219771385, + "rewards/rejected": -0.049512363970279694, + "step": 10390 + }, + { + "epoch": 1.791867677463818, + "grad_norm": 2.5254104137420654, + "learning_rate": 6.525883026039358e-10, + "logits/chosen": -3.004934549331665, + "logits/rejected": -2.986795425415039, + "logps/chosen": -55.586334228515625, + "logps/rejected": -60.53899002075195, + "loss": 0.6847, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.03446076437830925, + "rewards/margins": 0.017832253128290176, + "rewards/rejected": -0.052293021231889725, + "step": 10400 + }, + { + "epoch": 1.791867677463818, + "eval_logits/chosen": -3.1158525943756104, + "eval_logits/rejected": -3.110152244567871, + "eval_logps/chosen": -59.59916305541992, + "eval_logps/rejected": -64.68959045410156, + "eval_loss": 0.690230131149292, + "eval_rewards/accuracies": 0.5868958830833435, + "eval_rewards/chosen": -0.008872650563716888, + "eval_rewards/margins": 0.006222100462764502, + "eval_rewards/rejected": -0.015094749629497528, + "eval_runtime": 383.947, + "eval_samples_per_second": 11.21, + "eval_steps_per_second": 1.401, + "step": 10400 + }, + { + "epoch": 1.7935906271536872, + "grad_norm": 2.282289981842041, + "learning_rate": 6.419452185061447e-10, + "logits/chosen": -3.0068869590759277, + "logits/rejected": -2.9670393466949463, + "logps/chosen": -59.276466369628906, + "logps/rejected": -58.21703338623047, + "loss": 0.6817, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.027589306235313416, + "rewards/margins": 0.023819051682949066, + "rewards/rejected": -0.05140835791826248, + "step": 10410 + }, + { + "epoch": 1.7953135768435562, + "grad_norm": 2.372286319732666, + "learning_rate": 6.313867600182932e-10, + "logits/chosen": -3.0341315269470215, + "logits/rejected": -3.006105422973633, + "logps/chosen": -57.386985778808594, + "logps/rejected": -58.883995056152344, + "loss": 0.6861, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.032664235681295395, + "rewards/margins": 0.014921456575393677, + "rewards/rejected": -0.04758569225668907, + "step": 10420 + }, + { + "epoch": 1.7970365265334252, + "grad_norm": 2.8999135494232178, + "learning_rate": 6.209130226213377e-10, + "logits/chosen": -3.0889573097229004, + "logits/rejected": -3.0721592903137207, + "logps/chosen": -57.90520095825195, + "logps/rejected": -57.81034469604492, + "loss": 0.685, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.030346263200044632, + "rewards/margins": 0.0172057393938303, + "rewards/rejected": -0.04755200073122978, + "step": 10430 + }, + { + "epoch": 1.7987594762232941, + "grad_norm": 2.485287666320801, + "learning_rate": 6.105241010300888e-10, + "logits/chosen": -3.089571952819824, + "logits/rejected": -3.0788373947143555, + "logps/chosen": -61.08458709716797, + "logps/rejected": -61.87718963623047, + "loss": 0.6867, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0321456603705883, + "rewards/margins": 0.013696420006453991, + "rewards/rejected": -0.04584207385778427, + "step": 10440 + }, + { + "epoch": 1.8004824259131633, + "grad_norm": 2.1716952323913574, + "learning_rate": 6.002200891923693e-10, + "logits/chosen": -3.045029878616333, + "logits/rejected": -3.0410609245300293, + "logps/chosen": -56.7320442199707, + "logps/rejected": -62.7134895324707, + "loss": 0.6838, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.03110019862651825, + "rewards/margins": 0.019653640687465668, + "rewards/rejected": -0.05075383931398392, + "step": 10450 + }, + { + "epoch": 1.8022053756030325, + "grad_norm": 2.5009636878967285, + "learning_rate": 5.900010802881462e-10, + "logits/chosen": -3.0444865226745605, + "logits/rejected": -3.0173439979553223, + "logps/chosen": -57.57358932495117, + "logps/rejected": -57.37907028198242, + "loss": 0.6869, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.029986029490828514, + "rewards/margins": 0.013526695780456066, + "rewards/rejected": -0.043512724339962006, + "step": 10460 + }, + { + "epoch": 1.8039283252929015, + "grad_norm": 2.530627965927124, + "learning_rate": 5.798671667287059e-10, + "logits/chosen": -2.8856892585754395, + "logits/rejected": -2.862014055252075, + "logps/chosen": -55.18822479248047, + "logps/rejected": -55.52335739135742, + "loss": 0.6863, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.03281798213720322, + "rewards/margins": 0.014614596962928772, + "rewards/rejected": -0.04743257910013199, + "step": 10470 + }, + { + "epoch": 1.8056512749827704, + "grad_norm": 2.694502830505371, + "learning_rate": 5.698184401558093e-10, + "logits/chosen": -3.0464718341827393, + "logits/rejected": -3.008328914642334, + "logps/chosen": -58.82337188720703, + "logps/rejected": -57.099815368652344, + "loss": 0.6845, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.02852488122880459, + "rewards/margins": 0.018450209870934486, + "rewards/rejected": -0.046975087374448776, + "step": 10480 + }, + { + "epoch": 1.8073742246726394, + "grad_norm": 2.5299627780914307, + "learning_rate": 5.598549914408657e-10, + "logits/chosen": -3.0337061882019043, + "logits/rejected": -3.002572774887085, + "logps/chosen": -56.425636291503906, + "logps/rejected": -57.16019821166992, + "loss": 0.6837, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.02615206502377987, + "rewards/margins": 0.01992407813668251, + "rewards/rejected": -0.04607614129781723, + "step": 10490 + }, + { + "epoch": 1.8090971743625086, + "grad_norm": 2.505113124847412, + "learning_rate": 5.499769106841079e-10, + "logits/chosen": -3.029731035232544, + "logits/rejected": -3.016737461090088, + "logps/chosen": -55.8951416015625, + "logps/rejected": -59.8373908996582, + "loss": 0.6861, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.031956687569618225, + "rewards/margins": 0.01530187763273716, + "rewards/rejected": -0.047258567065000534, + "step": 10500 + }, + { + "epoch": 1.8090971743625086, + "eval_logits/chosen": -3.116119623184204, + "eval_logits/rejected": -3.1104183197021484, + "eval_logps/chosen": -59.59355926513672, + "eval_logps/rejected": -64.70457458496094, + "eval_loss": 0.6901285648345947, + "eval_rewards/accuracies": 0.5861988663673401, + "eval_rewards/chosen": -0.008816661313176155, + "eval_rewards/margins": 0.006427871994674206, + "eval_rewards/rejected": -0.015244533307850361, + "eval_runtime": 384.1744, + "eval_samples_per_second": 11.203, + "eval_steps_per_second": 1.4, + "step": 10500 + }, + { + "epoch": 1.8108201240523778, + "grad_norm": 2.4923551082611084, + "learning_rate": 5.401842872137786e-10, + "logits/chosen": -3.013028621673584, + "logits/rejected": -2.988231897354126, + "logps/chosen": -59.1893196105957, + "logps/rejected": -57.93567657470703, + "loss": 0.6878, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.036012202501297, + "rewards/margins": 0.011595193296670914, + "rewards/rejected": -0.04760739207267761, + "step": 10510 + }, + { + "epoch": 1.8125430737422468, + "grad_norm": 2.3233695030212402, + "learning_rate": 5.304772095853305e-10, + "logits/chosen": -3.082209348678589, + "logits/rejected": -3.0868873596191406, + "logps/chosen": -53.884185791015625, + "logps/rejected": -59.48823165893555, + "loss": 0.6858, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.033986646682024, + "rewards/margins": 0.01549739670008421, + "rewards/rejected": -0.049484044313430786, + "step": 10520 + }, + { + "epoch": 1.8142660234321157, + "grad_norm": 2.3757970333099365, + "learning_rate": 5.208557655806078e-10, + "logits/chosen": -2.98966646194458, + "logits/rejected": -2.9673266410827637, + "logps/chosen": -56.67155075073242, + "logps/rejected": -56.58771896362305, + "loss": 0.6879, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03330652788281441, + "rewards/margins": 0.011316435411572456, + "rewards/rejected": -0.04462296515703201, + "step": 10530 + }, + { + "epoch": 1.8159889731219847, + "grad_norm": 2.238402843475342, + "learning_rate": 5.113200422070763e-10, + "logits/chosen": -2.97110915184021, + "logits/rejected": -2.9450035095214844, + "logps/chosen": -55.549415588378906, + "logps/rejected": -54.656227111816406, + "loss": 0.6834, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.032187167555093765, + "rewards/margins": 0.0204298235476017, + "rewards/rejected": -0.052616991102695465, + "step": 10540 + }, + { + "epoch": 1.817711922811854, + "grad_norm": 2.7323215007781982, + "learning_rate": 5.018701256970127e-10, + "logits/chosen": -3.127934694290161, + "logits/rejected": -3.1271016597747803, + "logps/chosen": -57.295440673828125, + "logps/rejected": -60.423919677734375, + "loss": 0.6882, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.035445090383291245, + "rewards/margins": 0.010981904342770576, + "rewards/rejected": -0.04642698913812637, + "step": 10550 + }, + { + "epoch": 1.819434872501723, + "grad_norm": 2.640737771987915, + "learning_rate": 4.9250610150674e-10, + "logits/chosen": -2.9855422973632812, + "logits/rejected": -2.966156005859375, + "logps/chosen": -58.918701171875, + "logps/rejected": -60.0306282043457, + "loss": 0.6832, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0328650139272213, + "rewards/margins": 0.021228309720754623, + "rewards/rejected": -0.05409331992268562, + "step": 10560 + }, + { + "epoch": 1.821157822191592, + "grad_norm": 2.298959493637085, + "learning_rate": 4.832280543158507e-10, + "logits/chosen": -3.0651791095733643, + "logits/rejected": -3.0477185249328613, + "logps/chosen": -59.076416015625, + "logps/rejected": -61.52764129638672, + "loss": 0.6855, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.031295984983444214, + "rewards/margins": 0.016338912770152092, + "rewards/rejected": -0.047634903341531754, + "step": 10570 + }, + { + "epoch": 1.822880771881461, + "grad_norm": 2.188692331314087, + "learning_rate": 4.740360680264388e-10, + "logits/chosen": -3.0644102096557617, + "logits/rejected": -3.0470824241638184, + "logps/chosen": -56.48671340942383, + "logps/rejected": -57.2518196105957, + "loss": 0.687, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03124716505408287, + "rewards/margins": 0.0131332753226161, + "rewards/rejected": -0.044380445033311844, + "step": 10580 + }, + { + "epoch": 1.82460372157133, + "grad_norm": 2.293623924255371, + "learning_rate": 4.6493022576234556e-10, + "logits/chosen": -3.0044636726379395, + "logits/rejected": -2.962233066558838, + "logps/chosen": -59.2132568359375, + "logps/rejected": -55.89684295654297, + "loss": 0.6805, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.03308730944991112, + "rewards/margins": 0.026532579213380814, + "rewards/rejected": -0.05961988493800163, + "step": 10590 + }, + { + "epoch": 1.8263266712611992, + "grad_norm": 2.2917075157165527, + "learning_rate": 4.559106098684029e-10, + "logits/chosen": -3.11020565032959, + "logits/rejected": -3.099874496459961, + "logps/chosen": -56.04803466796875, + "logps/rejected": -59.04853439331055, + "loss": 0.6877, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.03165452927350998, + "rewards/margins": 0.011951565742492676, + "rewards/rejected": -0.043606095016002655, + "step": 10600 + }, + { + "epoch": 1.8263266712611992, + "eval_logits/chosen": -3.1160738468170166, + "eval_logits/rejected": -3.1104249954223633, + "eval_logps/chosen": -59.59670639038086, + "eval_logps/rejected": -64.7072982788086, + "eval_loss": 0.6901311278343201, + "eval_rewards/accuracies": 0.5920074582099915, + "eval_rewards/chosen": -0.00884803757071495, + "eval_rewards/margins": 0.006423789542168379, + "eval_rewards/rejected": -0.015271826647222042, + "eval_runtime": 384.1659, + "eval_samples_per_second": 11.203, + "eval_steps_per_second": 1.4, + "step": 10600 + }, + { + "epoch": 1.8280496209510684, + "grad_norm": 2.4631903171539307, + "learning_rate": 4.4697730190969235e-10, + "logits/chosen": -3.001817226409912, + "logits/rejected": -2.9916493892669678, + "logps/chosen": -54.29302978515625, + "logps/rejected": -56.63573455810547, + "loss": 0.6858, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0322897806763649, + "rewards/margins": 0.015405547805130482, + "rewards/rejected": -0.047695327550172806, + "step": 10610 + }, + { + "epoch": 1.8297725706409373, + "grad_norm": 2.605523109436035, + "learning_rate": 4.381303826708061e-10, + "logits/chosen": -2.9537954330444336, + "logits/rejected": -2.930615186691284, + "logps/chosen": -56.80295944213867, + "logps/rejected": -56.80694580078125, + "loss": 0.6862, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.029991215094923973, + "rewards/margins": 0.014791592955589294, + "rewards/rejected": -0.04478280618786812, + "step": 10620 + }, + { + "epoch": 1.8314955203308063, + "grad_norm": 2.5096333026885986, + "learning_rate": 4.2936993215511257e-10, + "logits/chosen": -3.041898250579834, + "logits/rejected": -3.023247718811035, + "logps/chosen": -57.2591438293457, + "logps/rejected": -58.80683517456055, + "loss": 0.6855, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.03000602126121521, + "rewards/margins": 0.01637157052755356, + "rewards/rejected": -0.04637759551405907, + "step": 10630 + }, + { + "epoch": 1.8332184700206753, + "grad_norm": 2.578603506088257, + "learning_rate": 4.206960295840456e-10, + "logits/chosen": -2.9995503425598145, + "logits/rejected": -2.975860595703125, + "logps/chosen": -58.27387237548828, + "logps/rejected": -57.60478591918945, + "loss": 0.6846, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.03207911550998688, + "rewards/margins": 0.018181387335062027, + "rewards/rejected": -0.050260502845048904, + "step": 10640 + }, + { + "epoch": 1.8349414197105445, + "grad_norm": 2.3200182914733887, + "learning_rate": 4.1210875339636854e-10, + "logits/chosen": -3.082303524017334, + "logits/rejected": -3.059866428375244, + "logps/chosen": -56.590576171875, + "logps/rejected": -59.08875274658203, + "loss": 0.6821, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.029690880328416824, + "rewards/margins": 0.02302057109773159, + "rewards/rejected": -0.052711449563503265, + "step": 10650 + }, + { + "epoch": 1.8366643694004137, + "grad_norm": 2.352198839187622, + "learning_rate": 4.0360818124748677e-10, + "logits/chosen": -2.9877002239227295, + "logits/rejected": -2.9751667976379395, + "logps/chosen": -58.22998809814453, + "logps/rejected": -59.70201873779297, + "loss": 0.6887, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03531861677765846, + "rewards/margins": 0.009900919161736965, + "rewards/rejected": -0.045219533145427704, + "step": 10660 + }, + { + "epoch": 1.8383873190902826, + "grad_norm": 2.7161898612976074, + "learning_rate": 3.9519439000872404e-10, + "logits/chosen": -3.0114142894744873, + "logits/rejected": -2.992724657058716, + "logps/chosen": -55.91082763671875, + "logps/rejected": -56.84636688232422, + "loss": 0.6861, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03772654011845589, + "rewards/margins": 0.015098011121153831, + "rewards/rejected": -0.052824556827545166, + "step": 10670 + }, + { + "epoch": 1.8401102687801516, + "grad_norm": 2.565969705581665, + "learning_rate": 3.8686745576664626e-10, + "logits/chosen": -3.039262294769287, + "logits/rejected": -2.9938862323760986, + "logps/chosen": -59.46996307373047, + "logps/rejected": -56.1522102355957, + "loss": 0.6827, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.03180866315960884, + "rewards/margins": 0.021904734894633293, + "rewards/rejected": -0.05371339991688728, + "step": 10680 + }, + { + "epoch": 1.8418332184700206, + "grad_norm": 2.3504178524017334, + "learning_rate": 3.7862745382235775e-10, + "logits/chosen": -3.025742292404175, + "logits/rejected": -3.019537925720215, + "logps/chosen": -56.9988899230957, + "logps/rejected": -60.743568420410156, + "loss": 0.688, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03248900920152664, + "rewards/margins": 0.011180490255355835, + "rewards/rejected": -0.043669503182172775, + "step": 10690 + }, + { + "epoch": 1.8435561681598898, + "grad_norm": 2.2883052825927734, + "learning_rate": 3.704744586908315e-10, + "logits/chosen": -3.083956003189087, + "logits/rejected": -3.0425620079040527, + "logps/chosen": -58.91267013549805, + "logps/rejected": -55.345741271972656, + "loss": 0.6824, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.027451183646917343, + "rewards/margins": 0.02261456288397312, + "rewards/rejected": -0.050065744668245316, + "step": 10700 + }, + { + "epoch": 1.8435561681598898, + "eval_logits/chosen": -3.115992784500122, + "eval_logits/rejected": -3.110304832458496, + "eval_logps/chosen": -59.599796295166016, + "eval_logps/rejected": -64.70921325683594, + "eval_loss": 0.6901361346244812, + "eval_rewards/accuracies": 0.586663544178009, + "eval_rewards/chosen": -0.008878974243998528, + "eval_rewards/margins": 0.00641192402690649, + "eval_rewards/rejected": -0.01529089454561472, + "eval_runtime": 384.0656, + "eval_samples_per_second": 11.206, + "eval_steps_per_second": 1.401, + "step": 10700 + }, + { + "epoch": 1.8452791178497587, + "grad_norm": 2.8073954582214355, + "learning_rate": 3.6240854410023116e-10, + "logits/chosen": -2.9952645301818848, + "logits/rejected": -2.9651360511779785, + "logps/chosen": -56.67476272583008, + "logps/rejected": -54.63886642456055, + "loss": 0.6838, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.032301269471645355, + "rewards/margins": 0.019710997119545937, + "rewards/rejected": -0.05201226472854614, + "step": 10710 + }, + { + "epoch": 1.847002067539628, + "grad_norm": 2.4006543159484863, + "learning_rate": 3.5442978299124126e-10, + "logits/chosen": -3.021350383758545, + "logits/rejected": -2.9896862506866455, + "logps/chosen": -56.3643913269043, + "logps/rejected": -57.85981369018555, + "loss": 0.6846, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.03777404502034187, + "rewards/margins": 0.018012024462223053, + "rewards/rejected": -0.055786073207855225, + "step": 10720 + }, + { + "epoch": 1.848725017229497, + "grad_norm": 2.676990509033203, + "learning_rate": 3.465382475164169e-10, + "logits/chosen": -3.1102206707000732, + "logits/rejected": -3.073194980621338, + "logps/chosen": -56.90290451049805, + "logps/rejected": -55.273040771484375, + "loss": 0.6815, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0294378399848938, + "rewards/margins": 0.024509413167834282, + "rewards/rejected": -0.05394725129008293, + "step": 10730 + }, + { + "epoch": 1.8504479669193659, + "grad_norm": 3.0840725898742676, + "learning_rate": 3.3873400903951634e-10, + "logits/chosen": -2.979149580001831, + "logits/rejected": -2.96920108795166, + "logps/chosen": -58.513206481933594, + "logps/rejected": -59.453834533691406, + "loss": 0.6912, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.03822885453701019, + "rewards/margins": 0.0048066796734929085, + "rewards/rejected": -0.04303553327918053, + "step": 10740 + }, + { + "epoch": 1.852170916609235, + "grad_norm": 2.448347330093384, + "learning_rate": 3.310171381348726e-10, + "logits/chosen": -2.982016086578369, + "logits/rejected": -2.9525201320648193, + "logps/chosen": -55.0046501159668, + "logps/rejected": -57.35972213745117, + "loss": 0.6848, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.029463868588209152, + "rewards/margins": 0.017485864460468292, + "rewards/rejected": -0.046949733048677444, + "step": 10750 + }, + { + "epoch": 1.853893866299104, + "grad_norm": 2.85001802444458, + "learning_rate": 3.233877045867417e-10, + "logits/chosen": -2.994340419769287, + "logits/rejected": -2.97330379486084, + "logps/chosen": -56.65723419189453, + "logps/rejected": -57.377845764160156, + "loss": 0.6855, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.029764290899038315, + "rewards/margins": 0.01615387573838234, + "rewards/rejected": -0.045918166637420654, + "step": 10760 + }, + { + "epoch": 1.8556168159889732, + "grad_norm": 2.89906907081604, + "learning_rate": 3.1584577738867804e-10, + "logits/chosen": -2.9776854515075684, + "logits/rejected": -2.9613919258117676, + "logps/chosen": -59.64055252075195, + "logps/rejected": -58.95122528076172, + "loss": 0.6846, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.025688624009490013, + "rewards/margins": 0.017932411283254623, + "rewards/rejected": -0.043621040880680084, + "step": 10770 + }, + { + "epoch": 1.8573397656788422, + "grad_norm": 2.3571207523345947, + "learning_rate": 3.0839142474291206e-10, + "logits/chosen": -3.1085402965545654, + "logits/rejected": -3.0907883644104004, + "logps/chosen": -56.31730270385742, + "logps/rejected": -59.65522003173828, + "loss": 0.6866, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.031476687639951706, + "rewards/margins": 0.014222445897758007, + "rewards/rejected": -0.04569913074374199, + "step": 10780 + }, + { + "epoch": 1.8590627153687111, + "grad_norm": 2.8656840324401855, + "learning_rate": 3.0102471405972666e-10, + "logits/chosen": -3.1140193939208984, + "logits/rejected": -3.082620143890381, + "logps/chosen": -55.114166259765625, + "logps/rejected": -58.22220993041992, + "loss": 0.6835, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.027730736881494522, + "rewards/margins": 0.020436272025108337, + "rewards/rejected": -0.04816700890660286, + "step": 10790 + }, + { + "epoch": 1.8607856650585803, + "grad_norm": 2.6412203311920166, + "learning_rate": 2.937457119568554e-10, + "logits/chosen": -3.110588312149048, + "logits/rejected": -3.088531255722046, + "logps/chosen": -55.08143997192383, + "logps/rejected": -58.99031448364258, + "loss": 0.6839, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.031220849603414536, + "rewards/margins": 0.019479000940918922, + "rewards/rejected": -0.05069985240697861, + "step": 10800 + }, + { + "epoch": 1.8607856650585803, + "eval_logits/chosen": -3.11584734916687, + "eval_logits/rejected": -3.1101913452148438, + "eval_logps/chosen": -59.59832763671875, + "eval_logps/rejected": -64.71131896972656, + "eval_loss": 0.6901180148124695, + "eval_rewards/accuracies": 0.5878252983093262, + "eval_rewards/chosen": -0.008864316157996655, + "eval_rewards/margins": 0.006447718013077974, + "eval_rewards/rejected": -0.015312033705413342, + "eval_runtime": 384.2453, + "eval_samples_per_second": 11.201, + "eval_steps_per_second": 1.4, + "step": 10800 + }, + { + "epoch": 1.8625086147484493, + "grad_norm": 2.6558892726898193, + "learning_rate": 2.8655448425887407e-10, + "logits/chosen": -3.0395102500915527, + "logits/rejected": -2.996107339859009, + "logps/chosen": -60.2518424987793, + "logps/rejected": -55.66035079956055, + "loss": 0.6804, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02872023545205593, + "rewards/margins": 0.026779908686876297, + "rewards/rejected": -0.05550014227628708, + "step": 10810 + }, + { + "epoch": 1.8642315644383185, + "grad_norm": 2.5476865768432617, + "learning_rate": 2.794510959966079e-10, + "logits/chosen": -2.981799364089966, + "logits/rejected": -2.957087755203247, + "logps/chosen": -55.7774772644043, + "logps/rejected": -55.99811935424805, + "loss": 0.684, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.026825163513422012, + "rewards/margins": 0.01911497488617897, + "rewards/rejected": -0.045940134674310684, + "step": 10820 + }, + { + "epoch": 1.8659545141281875, + "grad_norm": 2.5901870727539062, + "learning_rate": 2.724356114065452e-10, + "logits/chosen": -3.0013303756713867, + "logits/rejected": -2.974020481109619, + "logps/chosen": -55.19731521606445, + "logps/rejected": -58.499778747558594, + "loss": 0.6846, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0338115394115448, + "rewards/margins": 0.018128041177988052, + "rewards/rejected": -0.05193958431482315, + "step": 10830 + }, + { + "epoch": 1.8676774638180564, + "grad_norm": 2.523749351501465, + "learning_rate": 2.6550809393025233e-10, + "logits/chosen": -3.051769495010376, + "logits/rejected": -3.020822525024414, + "logps/chosen": -55.566810607910156, + "logps/rejected": -55.53137969970703, + "loss": 0.6834, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.025759857147932053, + "rewards/margins": 0.02048664726316929, + "rewards/rejected": -0.04624650999903679, + "step": 10840 + }, + { + "epoch": 1.8694004135079254, + "grad_norm": 2.379290819168091, + "learning_rate": 2.586686062138044e-10, + "logits/chosen": -2.9762279987335205, + "logits/rejected": -2.9606053829193115, + "logps/chosen": -52.855323791503906, + "logps/rejected": -59.426116943359375, + "loss": 0.6823, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02694542333483696, + "rewards/margins": 0.02276376634836197, + "rewards/rejected": -0.04970919340848923, + "step": 10850 + }, + { + "epoch": 1.8711233631977946, + "grad_norm": 2.553745746612549, + "learning_rate": 2.5191721010721204e-10, + "logits/chosen": -3.0979971885681152, + "logits/rejected": -3.0644125938415527, + "logps/chosen": -58.5058479309082, + "logps/rejected": -56.959007263183594, + "loss": 0.6871, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.03247032314538956, + "rewards/margins": 0.0129841985180974, + "rewards/rejected": -0.04545452445745468, + "step": 10860 + }, + { + "epoch": 1.8728463128876638, + "grad_norm": 2.415626287460327, + "learning_rate": 2.4525396666387534e-10, + "logits/chosen": -2.92629337310791, + "logits/rejected": -2.9215917587280273, + "logps/chosen": -54.1297492980957, + "logps/rejected": -61.116294860839844, + "loss": 0.6866, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.036440201103687286, + "rewards/margins": 0.014107112772762775, + "rewards/rejected": -0.05054731294512749, + "step": 10870 + }, + { + "epoch": 1.8745692625775328, + "grad_norm": 2.4143502712249756, + "learning_rate": 2.386789361400121e-10, + "logits/chosen": -3.021678924560547, + "logits/rejected": -3.000797748565674, + "logps/chosen": -57.00726318359375, + "logps/rejected": -57.3067626953125, + "loss": 0.6845, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.027981415390968323, + "rewards/margins": 0.018140435218811035, + "rewards/rejected": -0.04612184688448906, + "step": 10880 + }, + { + "epoch": 1.8762922122674017, + "grad_norm": 2.297905683517456, + "learning_rate": 2.3219217799413604e-10, + "logits/chosen": -3.074444532394409, + "logits/rejected": -3.0573339462280273, + "logps/chosen": -56.519920349121094, + "logps/rejected": -59.452972412109375, + "loss": 0.6873, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.033595021814107895, + "rewards/margins": 0.01268724538385868, + "rewards/rejected": -0.04628227278590202, + "step": 10890 + }, + { + "epoch": 1.8780151619572707, + "grad_norm": 2.3045783042907715, + "learning_rate": 2.257937508864949e-10, + "logits/chosen": -3.1231143474578857, + "logits/rejected": -3.0894017219543457, + "logps/chosen": -57.63786697387695, + "logps/rejected": -55.3487663269043, + "loss": 0.6831, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.026455294340848923, + "rewards/margins": 0.020993905141949654, + "rewards/rejected": -0.047449201345443726, + "step": 10900 + }, + { + "epoch": 1.8780151619572707, + "eval_logits/chosen": -3.116037607192993, + "eval_logits/rejected": -3.110398054122925, + "eval_logps/chosen": -59.60280227661133, + "eval_logps/rejected": -64.71473693847656, + "eval_loss": 0.6901231408119202, + "eval_rewards/accuracies": 0.5845724940299988, + "eval_rewards/chosen": -0.008909125812351704, + "eval_rewards/margins": 0.006437050178647041, + "eval_rewards/rejected": -0.015346175990998745, + "eval_runtime": 384.1875, + "eval_samples_per_second": 11.203, + "eval_steps_per_second": 1.4, + "step": 10900 + }, + { + "epoch": 1.8797381116471399, + "grad_norm": 2.752044200897217, + "learning_rate": 2.1948371267855983e-10, + "logits/chosen": -3.0186257362365723, + "logits/rejected": -2.984849214553833, + "logps/chosen": -58.02870559692383, + "logps/rejected": -59.342620849609375, + "loss": 0.6797, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.023622671142220497, + "rewards/margins": 0.028002118691802025, + "rewards/rejected": -0.051624786108732224, + "step": 10910 + }, + { + "epoch": 1.881461061337009, + "grad_norm": 2.5073671340942383, + "learning_rate": 2.132621204324925e-10, + "logits/chosen": -2.987391948699951, + "logits/rejected": -2.962049961090088, + "logps/chosen": -59.12227249145508, + "logps/rejected": -57.84357833862305, + "loss": 0.6854, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.03284627944231033, + "rewards/margins": 0.016383716836571693, + "rewards/rejected": -0.049229998141527176, + "step": 10920 + }, + { + "epoch": 1.883184011026878, + "grad_norm": 2.84432315826416, + "learning_rate": 2.0712903041063102e-10, + "logits/chosen": -2.965153455734253, + "logits/rejected": -2.934025287628174, + "logps/chosen": -57.401832580566406, + "logps/rejected": -56.0716667175293, + "loss": 0.6867, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.031898193061351776, + "rewards/margins": 0.013943254947662354, + "rewards/rejected": -0.04584144800901413, + "step": 10930 + }, + { + "epoch": 1.884906960716747, + "grad_norm": 2.7386891841888428, + "learning_rate": 2.010844980749793e-10, + "logits/chosen": -2.919647693634033, + "logits/rejected": -2.8994479179382324, + "logps/chosen": -59.5832405090332, + "logps/rejected": -60.03248977661133, + "loss": 0.6849, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.031240392476320267, + "rewards/margins": 0.017366571351885796, + "rewards/rejected": -0.04860696569085121, + "step": 10940 + }, + { + "epoch": 1.886629910406616, + "grad_norm": 2.3839938640594482, + "learning_rate": 1.951285780867096e-10, + "logits/chosen": -2.9704971313476562, + "logits/rejected": -2.952685832977295, + "logps/chosen": -57.843719482421875, + "logps/rejected": -59.376014709472656, + "loss": 0.6864, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.03237280622124672, + "rewards/margins": 0.014441991224884987, + "rewards/rejected": -0.046814799308776855, + "step": 10950 + }, + { + "epoch": 1.8883528600964852, + "grad_norm": 2.438133955001831, + "learning_rate": 1.8926132430566512e-10, + "logits/chosen": -3.043850898742676, + "logits/rejected": -3.0247528553009033, + "logps/chosen": -57.020118713378906, + "logps/rejected": -57.449073791503906, + "loss": 0.6853, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.03209669888019562, + "rewards/margins": 0.016656579449772835, + "rewards/rejected": -0.048753272742033005, + "step": 10960 + }, + { + "epoch": 1.8900758097863544, + "grad_norm": 2.7471344470977783, + "learning_rate": 1.8348278978987166e-10, + "logits/chosen": -3.0624217987060547, + "logits/rejected": -3.0322635173797607, + "logps/chosen": -59.22570037841797, + "logps/rejected": -58.56772994995117, + "loss": 0.6833, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.023627672344446182, + "rewards/margins": 0.020939035341143608, + "rewards/rejected": -0.04456670582294464, + "step": 10970 + }, + { + "epoch": 1.8917987594762233, + "grad_norm": 2.3950552940368652, + "learning_rate": 1.777930267950656e-10, + "logits/chosen": -3.0392160415649414, + "logits/rejected": -3.0243992805480957, + "logps/chosen": -57.84577178955078, + "logps/rejected": -61.35871124267578, + "loss": 0.6842, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.032726358622312546, + "rewards/margins": 0.01907447539269924, + "rewards/rejected": -0.05180083587765694, + "step": 10980 + }, + { + "epoch": 1.8935217091660923, + "grad_norm": 2.4089016914367676, + "learning_rate": 1.7219208677420882e-10, + "logits/chosen": -2.959484815597534, + "logits/rejected": -2.9480016231536865, + "logps/chosen": -55.77641677856445, + "logps/rejected": -57.395286560058594, + "loss": 0.6855, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.032888785004615784, + "rewards/margins": 0.016107559204101562, + "rewards/rejected": -0.048996344208717346, + "step": 10990 + }, + { + "epoch": 1.8952446588559613, + "grad_norm": 2.7076034545898438, + "learning_rate": 1.6668002037703244e-10, + "logits/chosen": -3.1547889709472656, + "logits/rejected": -3.1513662338256836, + "logps/chosen": -56.488006591796875, + "logps/rejected": -61.58770751953125, + "loss": 0.6886, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.03598905727267265, + "rewards/margins": 0.010044041089713573, + "rewards/rejected": -0.0460330992937088, + "step": 11000 + }, + { + "epoch": 1.8952446588559613, + "eval_logits/chosen": -3.1160166263580322, + "eval_logits/rejected": -3.1103060245513916, + "eval_logps/chosen": -59.60320281982422, + "eval_logps/rejected": -64.71551513671875, + "eval_loss": 0.6901227235794067, + "eval_rewards/accuracies": 0.5908457040786743, + "eval_rewards/chosen": -0.008913068100810051, + "eval_rewards/margins": 0.006440852303057909, + "eval_rewards/rejected": -0.015353920869529247, + "eval_runtime": 383.863, + "eval_samples_per_second": 11.212, + "eval_steps_per_second": 1.402, + "step": 11000 + }, + { + "epoch": 1.8969676085458305, + "grad_norm": 2.4201393127441406, + "learning_rate": 1.6125687744958039e-10, + "logits/chosen": -3.0817818641662598, + "logits/rejected": -3.049903154373169, + "logps/chosen": -56.0413932800293, + "logps/rejected": -56.055267333984375, + "loss": 0.6848, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.028350725769996643, + "rewards/margins": 0.017499305307865143, + "rewards/rejected": -0.045850031077861786, + "step": 11010 + }, + { + "epoch": 1.8986905582356997, + "grad_norm": 2.7356576919555664, + "learning_rate": 1.5592270703374988e-10, + "logits/chosen": -3.0850062370300293, + "logits/rejected": -3.0661559104919434, + "logps/chosen": -57.593597412109375, + "logps/rejected": -57.5640869140625, + "loss": 0.684, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.025895848870277405, + "rewards/margins": 0.019343126565217972, + "rewards/rejected": -0.045238979160785675, + "step": 11020 + }, + { + "epoch": 1.9004135079255686, + "grad_norm": 2.9743478298187256, + "learning_rate": 1.5067755736685395e-10, + "logits/chosen": -3.058427333831787, + "logits/rejected": -3.0497522354125977, + "logps/chosen": -58.04417037963867, + "logps/rejected": -59.50719451904297, + "loss": 0.6848, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.03255953639745712, + "rewards/margins": 0.0176172386854887, + "rewards/rejected": -0.05017677694559097, + "step": 11030 + }, + { + "epoch": 1.9021364576154376, + "grad_norm": 2.6739232540130615, + "learning_rate": 1.4552147588118735e-10, + "logits/chosen": -3.087679147720337, + "logits/rejected": -3.072469711303711, + "logps/chosen": -59.788185119628906, + "logps/rejected": -60.3827018737793, + "loss": 0.686, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.03321398049592972, + "rewards/margins": 0.015200495719909668, + "rewards/rejected": -0.048414479941129684, + "step": 11040 + }, + { + "epoch": 1.9038594073053066, + "grad_norm": 2.4157602787017822, + "learning_rate": 1.4045450920358916e-10, + "logits/chosen": -3.0092153549194336, + "logits/rejected": -2.989617109298706, + "logps/chosen": -54.784645080566406, + "logps/rejected": -59.533729553222656, + "loss": 0.6864, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.028205087408423424, + "rewards/margins": 0.014475090429186821, + "rewards/rejected": -0.04268018156290054, + "step": 11050 + }, + { + "epoch": 1.9055823569951758, + "grad_norm": 2.6217141151428223, + "learning_rate": 1.354767031550308e-10, + "logits/chosen": -3.0563225746154785, + "logits/rejected": -3.0126149654388428, + "logps/chosen": -57.26979446411133, + "logps/rejected": -57.185638427734375, + "loss": 0.6814, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.029949193820357323, + "rewards/margins": 0.024533966556191444, + "rewards/rejected": -0.054483164101839066, + "step": 11060 + }, + { + "epoch": 1.907305306685045, + "grad_norm": 2.354989528656006, + "learning_rate": 1.305881027501965e-10, + "logits/chosen": -3.0827879905700684, + "logits/rejected": -3.034569025039673, + "logps/chosen": -55.78126907348633, + "logps/rejected": -54.82404708862305, + "loss": 0.6804, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.026058072224259377, + "rewards/margins": 0.026529595255851746, + "rewards/rejected": -0.052587658166885376, + "step": 11070 + }, + { + "epoch": 1.909028256374914, + "grad_norm": 2.645761728286743, + "learning_rate": 1.2578875219707463e-10, + "logits/chosen": -3.06398344039917, + "logits/rejected": -3.037116765975952, + "logps/chosen": -58.64666748046875, + "logps/rejected": -57.29736328125, + "loss": 0.6853, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.031432170420885086, + "rewards/margins": 0.016599904745817184, + "rewards/rejected": -0.04803207889199257, + "step": 11080 + }, + { + "epoch": 1.9107512060647829, + "grad_norm": 2.631910562515259, + "learning_rate": 1.2107869489656141e-10, + "logits/chosen": -3.0288023948669434, + "logits/rejected": -3.009167432785034, + "logps/chosen": -60.91929244995117, + "logps/rejected": -57.264122009277344, + "loss": 0.6865, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.030957896262407303, + "rewards/margins": 0.014416252262890339, + "rewards/rejected": -0.04537414759397507, + "step": 11090 + }, + { + "epoch": 1.9124741557546519, + "grad_norm": 2.188575506210327, + "learning_rate": 1.16457973442069e-10, + "logits/chosen": -2.915553569793701, + "logits/rejected": -2.900320529937744, + "logps/chosen": -54.17964553833008, + "logps/rejected": -58.8646240234375, + "loss": 0.6859, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03301596641540527, + "rewards/margins": 0.015402073971927166, + "rewards/rejected": -0.048418037593364716, + "step": 11100 + }, + { + "epoch": 1.9124741557546519, + "eval_logits/chosen": -3.1159143447875977, + "eval_logits/rejected": -3.1102497577667236, + "eval_logps/chosen": -59.591861724853516, + "eval_logps/rejected": -64.70150756835938, + "eval_loss": 0.6901371479034424, + "eval_rewards/accuracies": 0.5845724940299988, + "eval_rewards/chosen": -0.008799640461802483, + "eval_rewards/margins": 0.006414216477423906, + "eval_rewards/rejected": -0.01521385833621025, + "eval_runtime": 383.9987, + "eval_samples_per_second": 11.208, + "eval_steps_per_second": 1.401, + "step": 11100 + }, + { + "epoch": 1.914197105444521, + "grad_norm": 2.3288257122039795, + "learning_rate": 1.119266296191368e-10, + "logits/chosen": -3.0594656467437744, + "logits/rejected": -3.034283399581909, + "logps/chosen": -57.703758239746094, + "logps/rejected": -59.37468719482422, + "loss": 0.6835, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03802116960287094, + "rewards/margins": 0.02051617205142975, + "rewards/rejected": -0.05853734165430069, + "step": 11110 + }, + { + "epoch": 1.9159200551343902, + "grad_norm": 2.249931812286377, + "learning_rate": 1.0748470440505532e-10, + "logits/chosen": -3.150852918624878, + "logits/rejected": -3.1203830242156982, + "logps/chosen": -62.930511474609375, + "logps/rejected": -58.691131591796875, + "loss": 0.6857, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.034155942499637604, + "rewards/margins": 0.016003701835870743, + "rewards/rejected": -0.05015964433550835, + "step": 11120 + }, + { + "epoch": 1.9176430048242592, + "grad_norm": 2.8102667331695557, + "learning_rate": 1.0313223796849735e-10, + "logits/chosen": -2.976644992828369, + "logits/rejected": -2.9540998935699463, + "logps/chosen": -59.52458572387695, + "logps/rejected": -57.91312789916992, + "loss": 0.6882, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03648515045642853, + "rewards/margins": 0.010790010914206505, + "rewards/rejected": -0.04727516323328018, + "step": 11130 + }, + { + "epoch": 1.9193659545141282, + "grad_norm": 2.495495319366455, + "learning_rate": 9.886926966915178e-11, + "logits/chosen": -2.9461255073547363, + "logits/rejected": -2.9252490997314453, + "logps/chosen": -56.85634231567383, + "logps/rejected": -56.508995056152344, + "loss": 0.6842, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.030429724603891373, + "rewards/margins": 0.018887072801589966, + "rewards/rejected": -0.04931679740548134, + "step": 11140 + }, + { + "epoch": 1.9210889042039971, + "grad_norm": 2.9053337574005127, + "learning_rate": 9.469583805736925e-11, + "logits/chosen": -3.0301146507263184, + "logits/rejected": -3.017643451690674, + "logps/chosen": -61.63774490356445, + "logps/rejected": -60.53892135620117, + "loss": 0.6913, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.03544190898537636, + "rewards/margins": 0.004841863643378019, + "rewards/rejected": -0.04028376564383507, + "step": 11150 + }, + { + "epoch": 1.9228118538938663, + "grad_norm": 2.2418413162231445, + "learning_rate": 9.06119808738115e-11, + "logits/chosen": -3.1213736534118652, + "logits/rejected": -3.1018104553222656, + "logps/chosen": -54.93082809448242, + "logps/rejected": -55.081199645996094, + "loss": 0.6874, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.028121694922447205, + "rewards/margins": 0.012537782080471516, + "rewards/rejected": -0.04065947234630585, + "step": 11160 + }, + { + "epoch": 1.9245348035837355, + "grad_norm": 2.396535634994507, + "learning_rate": 8.661773504911486e-11, + "logits/chosen": -2.9440383911132812, + "logits/rejected": -2.9148967266082764, + "logps/chosen": -59.909454345703125, + "logps/rejected": -53.46514892578125, + "loss": 0.6851, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.027473077178001404, + "rewards/margins": 0.016955533996224403, + "rewards/rejected": -0.04442860931158066, + "step": 11170 + }, + { + "epoch": 1.9262577532736045, + "grad_norm": 2.487661600112915, + "learning_rate": 8.271313670355163e-11, + "logits/chosen": -3.0768094062805176, + "logits/rejected": -3.053123950958252, + "logps/chosen": -57.1087760925293, + "logps/rejected": -55.69123077392578, + "loss": 0.685, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03142940253019333, + "rewards/margins": 0.01724282279610634, + "rewards/rejected": -0.04867222160100937, + "step": 11180 + }, + { + "epoch": 1.9279807029634735, + "grad_norm": 2.548030376434326, + "learning_rate": 7.889822114670708e-11, + "logits/chosen": -3.079629421234131, + "logits/rejected": -3.0492467880249023, + "logps/chosen": -58.1506233215332, + "logps/rejected": -57.200965881347656, + "loss": 0.6823, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.03268757462501526, + "rewards/margins": 0.022937973961234093, + "rewards/rejected": -0.0556255504488945, + "step": 11190 + }, + { + "epoch": 1.9297036526533424, + "grad_norm": 2.3734309673309326, + "learning_rate": 7.5173022877153e-11, + "logits/chosen": -3.077995777130127, + "logits/rejected": -3.056281566619873, + "logps/chosen": -55.8563346862793, + "logps/rejected": -58.005287170410156, + "loss": 0.685, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.025432473048567772, + "rewards/margins": 0.0171569362282753, + "rewards/rejected": -0.04258941113948822, + "step": 11200 + }, + { + "epoch": 1.9297036526533424, + "eval_logits/chosen": -3.115981101989746, + "eval_logits/rejected": -3.1102943420410156, + "eval_logps/chosen": -59.59299087524414, + "eval_logps/rejected": -64.6996841430664, + "eval_loss": 0.6901500225067139, + "eval_rewards/accuracies": 0.5845724940299988, + "eval_rewards/chosen": -0.00881095789372921, + "eval_rewards/margins": 0.006384588778018951, + "eval_rewards/rejected": -0.015195546671748161, + "eval_runtime": 384.1727, + "eval_samples_per_second": 11.203, + "eval_steps_per_second": 1.4, + "step": 11200 + }, + { + "epoch": 1.9314266023432116, + "grad_norm": 2.562915325164795, + "learning_rate": 7.15375755821468e-11, + "logits/chosen": -2.9258503913879395, + "logits/rejected": -2.909374237060547, + "logps/chosen": -53.963531494140625, + "logps/rejected": -59.4776496887207, + "loss": 0.6839, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.03552749380469322, + "rewards/margins": 0.019793927669525146, + "rewards/rejected": -0.05532141774892807, + "step": 11210 + }, + { + "epoch": 1.9331495520330806, + "grad_norm": 2.5470385551452637, + "learning_rate": 6.799191213731737e-11, + "logits/chosen": -3.0415053367614746, + "logits/rejected": -3.0274736881256104, + "logps/chosen": -55.11113739013672, + "logps/rejected": -60.2578239440918, + "loss": 0.6861, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.030614599585533142, + "rewards/margins": 0.015119021758437157, + "rewards/rejected": -0.04573361948132515, + "step": 11220 + }, + { + "epoch": 1.9348725017229498, + "grad_norm": 2.789475202560425, + "learning_rate": 6.453606460637195e-11, + "logits/chosen": -3.0025832653045654, + "logits/rejected": -2.9894137382507324, + "logps/chosen": -59.69022750854492, + "logps/rejected": -56.4006462097168, + "loss": 0.6889, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.039212532341480255, + "rewards/margins": 0.009541334584355354, + "rewards/rejected": -0.04875386878848076, + "step": 11230 + }, + { + "epoch": 1.9365954514128187, + "grad_norm": 2.2757625579833984, + "learning_rate": 6.11700642408064e-11, + "logits/chosen": -2.9875824451446533, + "logits/rejected": -2.971897602081299, + "logps/chosen": -57.854469299316406, + "logps/rejected": -58.3641471862793, + "loss": 0.6851, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.032043833285570145, + "rewards/margins": 0.017164278775453568, + "rewards/rejected": -0.04920811206102371, + "step": 11240 + }, + { + "epoch": 1.9383184011026877, + "grad_norm": 2.598924398422241, + "learning_rate": 5.7893941479620904e-11, + "logits/chosen": -3.018852472305298, + "logits/rejected": -2.991450786590576, + "logps/chosen": -57.508934020996094, + "logps/rejected": -58.6025390625, + "loss": 0.6848, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.032207388430833817, + "rewards/margins": 0.01755516789853573, + "rewards/rejected": -0.049762558192014694, + "step": 11250 + }, + { + "epoch": 1.940041350792557, + "grad_norm": 2.611050844192505, + "learning_rate": 5.4707725949045826e-11, + "logits/chosen": -2.967341661453247, + "logits/rejected": -2.9327094554901123, + "logps/chosen": -60.662452697753906, + "logps/rejected": -56.185386657714844, + "loss": 0.6846, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.028289467096328735, + "rewards/margins": 0.017964553087949753, + "rewards/rejected": -0.04625401645898819, + "step": 11260 + }, + { + "epoch": 1.9417643004824259, + "grad_norm": 2.6599137783050537, + "learning_rate": 5.1611446462274116e-11, + "logits/chosen": -3.0220043659210205, + "logits/rejected": -2.9908974170684814, + "logps/chosen": -58.7435417175293, + "logps/rejected": -55.7933235168457, + "loss": 0.6857, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0317700020968914, + "rewards/margins": 0.015762265771627426, + "rewards/rejected": -0.04753226786851883, + "step": 11270 + }, + { + "epoch": 1.943487250172295, + "grad_norm": 2.3055601119995117, + "learning_rate": 4.8605131019198165e-11, + "logits/chosen": -3.0796470642089844, + "logits/rejected": -3.049293041229248, + "logps/chosen": -59.201881408691406, + "logps/rejected": -57.14641189575195, + "loss": 0.6843, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.028897400945425034, + "rewards/margins": 0.01845616102218628, + "rewards/rejected": -0.04735356569290161, + "step": 11280 + }, + { + "epoch": 1.945210199862164, + "grad_norm": 2.618269681930542, + "learning_rate": 4.568880680616228e-11, + "logits/chosen": -3.031619071960449, + "logits/rejected": -3.0159995555877686, + "logps/chosen": -62.53656005859375, + "logps/rejected": -60.65777587890625, + "loss": 0.6841, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.025777708739042282, + "rewards/margins": 0.019383911043405533, + "rewards/rejected": -0.045161619782447815, + "step": 11290 + }, + { + "epoch": 1.946933149552033, + "grad_norm": 2.4982001781463623, + "learning_rate": 4.2862500195708364e-11, + "logits/chosen": -3.007124423980713, + "logits/rejected": -2.9809606075286865, + "logps/chosen": -61.85832977294922, + "logps/rejected": -58.00861740112305, + "loss": 0.6869, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.029342034831643105, + "rewards/margins": 0.01363422255963087, + "rewards/rejected": -0.0429762564599514, + "step": 11300 + }, + { + "epoch": 1.946933149552033, + "eval_logits/chosen": -3.116105318069458, + "eval_logits/rejected": -3.110396146774292, + "eval_logps/chosen": -59.598411560058594, + "eval_logps/rejected": -64.70812225341797, + "eval_loss": 0.6901355385780334, + "eval_rewards/accuracies": 0.5875929594039917, + "eval_rewards/chosen": -0.00886518508195877, + "eval_rewards/margins": 0.006414768751710653, + "eval_rewards/rejected": -0.015279954299330711, + "eval_runtime": 384.0506, + "eval_samples_per_second": 11.207, + "eval_steps_per_second": 1.401, + "step": 11300 + }, + { + "epoch": 1.948656099241902, + "grad_norm": 2.3274354934692383, + "learning_rate": 4.01262367463473e-11, + "logits/chosen": -3.119818687438965, + "logits/rejected": -3.113994598388672, + "logps/chosen": -53.55225372314453, + "logps/rejected": -56.986839294433594, + "loss": 0.6817, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.030891120433807373, + "rewards/margins": 0.02375052683055401, + "rewards/rejected": -0.05464165285229683, + "step": 11310 + }, + { + "epoch": 1.9503790489317712, + "grad_norm": 2.3976454734802246, + "learning_rate": 3.748004120231685e-11, + "logits/chosen": -3.033069133758545, + "logits/rejected": -3.0026774406433105, + "logps/chosen": -57.32920455932617, + "logps/rejected": -58.893646240234375, + "loss": 0.6808, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02407662384212017, + "rewards/margins": 0.025767523795366287, + "rewards/rejected": -0.04984414950013161, + "step": 11320 + }, + { + "epoch": 1.9521019986216404, + "grad_norm": 2.341428279876709, + "learning_rate": 3.492393749336964e-11, + "logits/chosen": -3.0544650554656982, + "logits/rejected": -3.017298698425293, + "logps/chosen": -55.70989990234375, + "logps/rejected": -53.46592330932617, + "loss": 0.6827, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.03151210397481918, + "rewards/margins": 0.021801788359880447, + "rewards/rejected": -0.05331388860940933, + "step": 11330 + }, + { + "epoch": 1.9538249483115093, + "grad_norm": 2.4387247562408447, + "learning_rate": 3.245794873454777e-11, + "logits/chosen": -2.930663585662842, + "logits/rejected": -2.9110989570617676, + "logps/chosen": -58.4086799621582, + "logps/rejected": -59.36699295043945, + "loss": 0.6857, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.030292293056845665, + "rewards/margins": 0.015786411240696907, + "rewards/rejected": -0.04607870429754257, + "step": 11340 + }, + { + "epoch": 1.9555478980013783, + "grad_norm": 2.442936420440674, + "learning_rate": 3.0082097225977436e-11, + "logits/chosen": -3.0895800590515137, + "logits/rejected": -3.0782761573791504, + "logps/chosen": -60.061737060546875, + "logps/rejected": -60.759864807128906, + "loss": 0.6859, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0315508171916008, + "rewards/margins": 0.015562380664050579, + "rewards/rejected": -0.04711320251226425, + "step": 11350 + }, + { + "epoch": 1.9572708476912473, + "grad_norm": 2.343500852584839, + "learning_rate": 2.7796404452666847e-11, + "logits/chosen": -3.0479581356048584, + "logits/rejected": -3.0179901123046875, + "logps/chosen": -55.6785774230957, + "logps/rejected": -54.041419982910156, + "loss": 0.6834, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.03423875570297241, + "rewards/margins": 0.020591724663972855, + "rewards/rejected": -0.05483048036694527, + "step": 11360 + }, + { + "epoch": 1.9589937973811165, + "grad_norm": 2.565880298614502, + "learning_rate": 2.5600891084311962e-11, + "logits/chosen": -3.0038440227508545, + "logits/rejected": -2.984208345413208, + "logps/chosen": -59.0855598449707, + "logps/rejected": -58.140045166015625, + "loss": 0.6851, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.028102219104766846, + "rewards/margins": 0.017020905390381813, + "rewards/rejected": -0.04512312263250351, + "step": 11370 + }, + { + "epoch": 1.9607167470709856, + "grad_norm": 2.500283718109131, + "learning_rate": 2.3495576975107737e-11, + "logits/chosen": -2.9798200130462646, + "logits/rejected": -2.9610161781311035, + "logps/chosen": -55.38152313232422, + "logps/rejected": -55.190673828125, + "loss": 0.6857, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.03548261523246765, + "rewards/margins": 0.015744756907224655, + "rewards/rejected": -0.051227372139692307, + "step": 11380 + }, + { + "epoch": 1.9624396967608546, + "grad_norm": 2.4395084381103516, + "learning_rate": 2.1480481163572707e-11, + "logits/chosen": -3.0970280170440674, + "logits/rejected": -3.0766406059265137, + "logps/chosen": -55.200523376464844, + "logps/rejected": -60.348594665527344, + "loss": 0.6829, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.028758252039551735, + "rewards/margins": 0.021567989140748978, + "rewards/rejected": -0.05032623931765556, + "step": 11390 + }, + { + "epoch": 1.9641626464507236, + "grad_norm": 2.49170184135437, + "learning_rate": 1.9555621872374695e-11, + "logits/chosen": -3.0487561225891113, + "logits/rejected": -3.0144057273864746, + "logps/chosen": -58.76116180419922, + "logps/rejected": -57.61921310424805, + "loss": 0.6864, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.028487438336014748, + "rewards/margins": 0.014596112072467804, + "rewards/rejected": -0.043083555996418, + "step": 11400 + }, + { + "epoch": 1.9641626464507236, + "eval_logits/chosen": -3.116114377975464, + "eval_logits/rejected": -3.1104519367218018, + "eval_logps/chosen": -59.5885124206543, + "eval_logps/rejected": -64.69520568847656, + "eval_loss": 0.6901496052742004, + "eval_rewards/accuracies": 0.5908457040786743, + "eval_rewards/chosen": -0.008766171522438526, + "eval_rewards/margins": 0.006384550128132105, + "eval_rewards/rejected": -0.015150722116231918, + "eval_runtime": 383.7144, + "eval_samples_per_second": 11.217, + "eval_steps_per_second": 1.402, + "step": 11400 + }, + { + "epoch": 1.9658855961405926, + "grad_norm": 2.6475255489349365, + "learning_rate": 1.7721016508163158e-11, + "logits/chosen": -3.0536608695983887, + "logits/rejected": -3.0251669883728027, + "logps/chosen": -58.230804443359375, + "logps/rejected": -59.3325309753418, + "loss": 0.685, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.025450700893998146, + "rewards/margins": 0.017182698473334312, + "rewards/rejected": -0.04263339936733246, + "step": 11410 + }, + { + "epoch": 1.9676085458304617, + "grad_norm": 2.4510061740875244, + "learning_rate": 1.597668166141486e-11, + "logits/chosen": -3.0368571281433105, + "logits/rejected": -3.004257917404175, + "logps/chosen": -56.6202278137207, + "logps/rejected": -55.45795822143555, + "loss": 0.6824, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.029352698475122452, + "rewards/margins": 0.022640466690063477, + "rewards/rejected": -0.05199316143989563, + "step": 11420 + }, + { + "epoch": 1.969331495520331, + "grad_norm": 2.4990296363830566, + "learning_rate": 1.4322633106286232e-11, + "logits/chosen": -2.9539122581481934, + "logits/rejected": -2.92718505859375, + "logps/chosen": -58.123313903808594, + "logps/rejected": -57.20905303955078, + "loss": 0.6866, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.03152982145547867, + "rewards/margins": 0.013926585204899311, + "rewards/rejected": -0.045456402003765106, + "step": 11430 + }, + { + "epoch": 1.9710544452102, + "grad_norm": 2.4815919399261475, + "learning_rate": 1.2758885800464581e-11, + "logits/chosen": -3.035094738006592, + "logits/rejected": -3.0234522819519043, + "logps/chosen": -56.13129806518555, + "logps/rejected": -56.96406936645508, + "loss": 0.6898, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.03581147640943527, + "rewards/margins": 0.007677781395614147, + "rewards/rejected": -0.043489255011081696, + "step": 11440 + }, + { + "epoch": 1.9727773949000689, + "grad_norm": 2.4564032554626465, + "learning_rate": 1.12854538850371e-11, + "logits/chosen": -2.955756664276123, + "logits/rejected": -2.9337503910064697, + "logps/chosen": -55.9810676574707, + "logps/rejected": -56.99578857421875, + "loss": 0.6835, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.032729096710681915, + "rewards/margins": 0.02022354118525982, + "rewards/rejected": -0.05295264720916748, + "step": 11450 + }, + { + "epoch": 1.9745003445899378, + "grad_norm": 2.609710931777954, + "learning_rate": 9.90235068436207e-12, + "logits/chosen": -2.999922275543213, + "logits/rejected": -2.9787392616271973, + "logps/chosen": -58.519287109375, + "logps/rejected": -59.29533767700195, + "loss": 0.6838, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.029814541339874268, + "rewards/margins": 0.019777704030275345, + "rewards/rejected": -0.049592241644859314, + "step": 11460 + }, + { + "epoch": 1.976223294279807, + "grad_norm": 2.5704898834228516, + "learning_rate": 8.609588705947857e-12, + "logits/chosen": -3.0523245334625244, + "logits/rejected": -3.0218346118927, + "logps/chosen": -60.27888870239258, + "logps/rejected": -56.3681526184082, + "loss": 0.684, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.03436389937996864, + "rewards/margins": 0.019532622769474983, + "rewards/rejected": -0.053896524012088776, + "step": 11470 + }, + { + "epoch": 1.9779462439696762, + "grad_norm": 2.4101979732513428, + "learning_rate": 7.407179640341877e-12, + "logits/chosen": -3.090115547180176, + "logits/rejected": -3.056811809539795, + "logps/chosen": -58.93994140625, + "logps/rejected": -55.08115768432617, + "loss": 0.6846, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0284547321498394, + "rewards/margins": 0.01805991493165493, + "rewards/rejected": -0.04651464894413948, + "step": 11480 + }, + { + "epoch": 1.9796691936595452, + "grad_norm": 2.46321177482605, + "learning_rate": 6.295134361020694e-12, + "logits/chosen": -3.010709047317505, + "logits/rejected": -2.9817841053009033, + "logps/chosen": -57.873619079589844, + "logps/rejected": -55.653099060058594, + "loss": 0.6868, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.03182325139641762, + "rewards/margins": 0.013528557494282722, + "rewards/rejected": -0.04535181075334549, + "step": 11490 + }, + { + "epoch": 1.9813921433494142, + "grad_norm": 2.106982469558716, + "learning_rate": 5.273462924296757e-12, + "logits/chosen": -3.058711528778076, + "logits/rejected": -3.036954402923584, + "logps/chosen": -53.891815185546875, + "logps/rejected": -55.3429069519043, + "loss": 0.689, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.03603522107005119, + "rewards/margins": 0.009504115208983421, + "rewards/rejected": -0.04553934186697006, + "step": 11500 + }, + { + "epoch": 1.9813921433494142, + "eval_logits/chosen": -3.1161394119262695, + "eval_logits/rejected": -3.1104989051818848, + "eval_logps/chosen": -59.606449127197266, + "eval_logps/rejected": -64.71166229248047, + "eval_loss": 0.6901578307151794, + "eval_rewards/accuracies": 0.5820167064666748, + "eval_rewards/chosen": -0.008945533074438572, + "eval_rewards/margins": 0.006369884591549635, + "eval_rewards/rejected": -0.01531541720032692, + "eval_runtime": 383.8703, + "eval_samples_per_second": 11.212, + "eval_steps_per_second": 1.402, + "step": 11500 + }, + { + "epoch": 1.9831150930392831, + "grad_norm": 2.3931286334991455, + "learning_rate": 4.342174569221813e-12, + "logits/chosen": -2.955284833908081, + "logits/rejected": -2.945343017578125, + "logps/chosen": -58.84697723388672, + "logps/rejected": -58.00005340576172, + "loss": 0.687, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.033665161579847336, + "rewards/margins": 0.013647237792611122, + "rewards/rejected": -0.04731239750981331, + "step": 11510 + }, + { + "epoch": 1.9848380427291523, + "grad_norm": 2.32000994682312, + "learning_rate": 3.501277717508078e-12, + "logits/chosen": -3.0707528591156006, + "logits/rejected": -3.0135512351989746, + "logps/chosen": -61.96906280517578, + "logps/rejected": -55.12891387939453, + "loss": 0.6788, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.02170426771044731, + "rewards/margins": 0.029822617769241333, + "rewards/rejected": -0.051526885479688644, + "step": 11520 + }, + { + "epoch": 1.9865609924190215, + "grad_norm": 2.522261142730713, + "learning_rate": 2.750779973452744e-12, + "logits/chosen": -3.03159761428833, + "logits/rejected": -2.9960219860076904, + "logps/chosen": -56.40028762817383, + "logps/rejected": -53.7711296081543, + "loss": 0.6845, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.029385218396782875, + "rewards/margins": 0.018221555277705193, + "rewards/rejected": -0.04760677367448807, + "step": 11530 + }, + { + "epoch": 1.9882839421088905, + "grad_norm": 2.307715892791748, + "learning_rate": 2.0906881238624833e-12, + "logits/chosen": -3.074688196182251, + "logits/rejected": -3.0627036094665527, + "logps/chosen": -53.6716194152832, + "logps/rejected": -59.98317337036133, + "loss": 0.6845, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029241954907774925, + "rewards/margins": 0.01832282915711403, + "rewards/rejected": -0.047564782202243805, + "step": 11540 + }, + { + "epoch": 1.9900068917987594, + "grad_norm": 2.5106430053710938, + "learning_rate": 1.5210081380001572e-12, + "logits/chosen": -3.000810146331787, + "logits/rejected": -2.9803338050842285, + "logps/chosen": -54.16124725341797, + "logps/rejected": -57.501487731933594, + "loss": 0.6844, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.028317982330918312, + "rewards/margins": 0.018404236063361168, + "rewards/rejected": -0.04672221839427948, + "step": 11550 + }, + { + "epoch": 1.9917298414886284, + "grad_norm": 2.4142873287200928, + "learning_rate": 1.0417451675248657e-12, + "logits/chosen": -3.031704902648926, + "logits/rejected": -3.0022201538085938, + "logps/chosen": -58.48185348510742, + "logps/rejected": -57.05162811279297, + "loss": 0.6848, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0349888876080513, + "rewards/margins": 0.01772366650402546, + "rewards/rejected": -0.05271255224943161, + "step": 11560 + }, + { + "epoch": 1.9934527911784976, + "grad_norm": 2.451390266418457, + "learning_rate": 6.529035464486466e-13, + "logits/chosen": -3.0356264114379883, + "logits/rejected": -3.015634536743164, + "logps/chosen": -57.8956413269043, + "logps/rejected": -60.697242736816406, + "loss": 0.6876, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.037846293300390244, + "rewards/margins": 0.012129221111536026, + "rewards/rejected": -0.04997551441192627, + "step": 11570 + }, + { + "epoch": 1.9951757408683668, + "grad_norm": 2.7996134757995605, + "learning_rate": 3.5448679109761907e-13, + "logits/chosen": -3.0607693195343018, + "logits/rejected": -3.0405261516571045, + "logps/chosen": -59.582923889160156, + "logps/rejected": -60.716270446777344, + "loss": 0.685, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.03324711322784424, + "rewards/margins": 0.017170798033475876, + "rewards/rejected": -0.050417911261320114, + "step": 11580 + }, + { + "epoch": 1.9968986905582358, + "grad_norm": 2.7875218391418457, + "learning_rate": 1.4649760007534597e-13, + "logits/chosen": -2.9645540714263916, + "logits/rejected": -2.924387216567993, + "logps/chosen": -61.875083923339844, + "logps/rejected": -55.68303680419922, + "loss": 0.6822, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.023736396804451942, + "rewards/margins": 0.02287469431757927, + "rewards/rejected": -0.04661108925938606, + "step": 11590 + }, + { + "epoch": 1.9986216402481047, + "grad_norm": 2.565707206726074, + "learning_rate": 2.8937854245070226e-14, + "logits/chosen": -3.003178119659424, + "logits/rejected": -2.991347551345825, + "logps/chosen": -53.818214416503906, + "logps/rejected": -58.562896728515625, + "loss": 0.6865, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.033688973635435104, + "rewards/margins": 0.014114851132035255, + "rewards/rejected": -0.04780382663011551, + "step": 11600 + }, + { + "epoch": 1.9986216402481047, + "eval_logits/chosen": -3.11618709564209, + "eval_logits/rejected": -3.1104962825775146, + "eval_logps/chosen": -59.58960723876953, + "eval_logps/rejected": -64.70088958740234, + "eval_loss": 0.6901275515556335, + "eval_rewards/accuracies": 0.589219331741333, + "eval_rewards/chosen": -0.00877712108194828, + "eval_rewards/margins": 0.006430591456592083, + "eval_rewards/rejected": -0.015207710675895214, + "eval_runtime": 385.2313, + "eval_samples_per_second": 11.173, + "eval_steps_per_second": 1.397, + "step": 11600 + }, + { + "epoch": 2.0, + "step": 11608, + "total_flos": 0.0, + "train_loss": 0.6883086910911629, + "train_runtime": 95005.2512, + "train_samples_per_second": 1.955, + "train_steps_per_second": 0.122 + } + ], + "logging_steps": 10, + "max_steps": 11608, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}