{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 104.5, "learning_rate": 1.282051282051282e-07, "logits/chosen": 88.18099975585938, "logits/rejected": 88.25153350830078, "logps/chosen": -29.073104858398438, "logps/rejected": -26.25731658935547, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 78.0, "learning_rate": 1.282051282051282e-06, "logits/chosen": 81.08700561523438, "logits/rejected": 80.79035186767578, "logps/chosen": -34.29237365722656, "logps/rejected": -33.04549026489258, "loss": 0.9866, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": -0.040518708527088165, "rewards/margins": 0.029149238020181656, "rewards/rejected": -0.06966794282197952, "step": 10 }, { "epoch": 0.05, "grad_norm": 85.5, "learning_rate": 2.564102564102564e-06, "logits/chosen": 80.68280029296875, "logits/rejected": 80.57057189941406, "logps/chosen": -33.53000259399414, "logps/rejected": -30.8519287109375, "loss": 0.8984, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.12792269885540009, "rewards/margins": 0.18747423589229584, "rewards/rejected": -0.05955154448747635, "step": 20 }, { "epoch": 0.08, "grad_norm": 73.0, "learning_rate": 3.846153846153847e-06, "logits/chosen": 82.52570343017578, "logits/rejected": 82.552734375, "logps/chosen": -33.96383285522461, "logps/rejected": -31.202930450439453, "loss": 1.1955, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.03918968886137009, "rewards/margins": -0.1375342458486557, "rewards/rejected": 0.1767239272594452, "step": 30 }, { "epoch": 0.1, "grad_norm": 74.5, "learning_rate": 4.999896948438434e-06, "logits/chosen": 80.97098541259766, "logits/rejected": 80.9694595336914, "logps/chosen": -32.8037109375, "logps/rejected": -33.232460021972656, "loss": 0.9309, "rewards/accuracies": 0.625, "rewards/chosen": 0.2962803840637207, "rewards/margins": 0.1874258816242218, "rewards/rejected": 0.10885453224182129, "step": 40 }, { "epoch": 0.13, "grad_norm": 70.0, "learning_rate": 4.987541037542187e-06, "logits/chosen": 78.56432342529297, "logits/rejected": 78.58203125, "logps/chosen": -30.68828773498535, "logps/rejected": -30.789108276367188, "loss": 0.9855, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4135704040527344, "rewards/margins": 0.18622872233390808, "rewards/rejected": 0.2273416966199875, "step": 50 }, { "epoch": 0.16, "grad_norm": 84.0, "learning_rate": 4.954691471941119e-06, "logits/chosen": 83.11712646484375, "logits/rejected": 83.17086791992188, "logps/chosen": -30.859710693359375, "logps/rejected": -29.577678680419922, "loss": 0.9236, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.25236082077026367, "rewards/margins": 0.24208447337150574, "rewards/rejected": 0.010276327840983868, "step": 60 }, { "epoch": 0.18, "grad_norm": 107.0, "learning_rate": 4.901618883413549e-06, "logits/chosen": 83.80155944824219, "logits/rejected": 83.8315658569336, "logps/chosen": -30.606204986572266, "logps/rejected": -32.91896057128906, "loss": 1.141, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.01825830340385437, "rewards/margins": -0.0767388790845871, "rewards/rejected": 0.09499720484018326, "step": 70 }, { "epoch": 0.21, "grad_norm": 84.5, "learning_rate": 4.828760511501322e-06, "logits/chosen": 81.45521545410156, "logits/rejected": 81.44440460205078, "logps/chosen": -31.48111343383789, "logps/rejected": -31.072092056274414, "loss": 0.8995, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.01906520128250122, "rewards/margins": 0.2706204056739807, "rewards/rejected": -0.2515551745891571, "step": 80 }, { "epoch": 0.23, "grad_norm": 102.0, "learning_rate": 4.7367166013034295e-06, "logits/chosen": 78.27713775634766, "logits/rejected": 78.24540710449219, "logps/chosen": -32.49984359741211, "logps/rejected": -31.142908096313477, "loss": 0.9467, "rewards/accuracies": 0.5625, "rewards/chosen": 0.11066800355911255, "rewards/margins": 0.2638159692287445, "rewards/rejected": -0.15314793586730957, "step": 90 }, { "epoch": 0.26, "grad_norm": 73.5, "learning_rate": 4.626245458345211e-06, "logits/chosen": 83.56034851074219, "logits/rejected": 83.58055877685547, "logps/chosen": -34.057029724121094, "logps/rejected": -31.78998374938965, "loss": 0.9567, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.19836829602718353, "rewards/margins": 0.22992074489593506, "rewards/rejected": -0.031552452594041824, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": 98.73486328125, "eval_logits/rejected": 98.72332763671875, "eval_logps/chosen": -32.45964431762695, "eval_logps/rejected": -35.88682556152344, "eval_loss": 1.1574316024780273, "eval_rewards/accuracies": 0.4630398452281952, "eval_rewards/chosen": -0.013175476342439651, "eval_rewards/margins": -0.07693858444690704, "eval_rewards/rejected": 0.0637631043791771, "eval_runtime": 104.0187, "eval_samples_per_second": 3.297, "eval_steps_per_second": 0.413, "step": 100 }, { "epoch": 0.29, "grad_norm": 104.0, "learning_rate": 4.498257201263691e-06, "logits/chosen": 83.91990661621094, "logits/rejected": 83.79493713378906, "logps/chosen": -32.23216247558594, "logps/rejected": -32.69822692871094, "loss": 0.7793, "rewards/accuracies": 0.625, "rewards/chosen": 0.6325147151947021, "rewards/margins": 0.6663830280303955, "rewards/rejected": -0.03386829048395157, "step": 110 }, { "epoch": 0.31, "grad_norm": 108.5, "learning_rate": 4.353806263777678e-06, "logits/chosen": 84.13631439208984, "logits/rejected": 84.24298858642578, "logps/chosen": -28.422576904296875, "logps/rejected": -35.379127502441406, "loss": 0.8738, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.40560561418533325, "rewards/margins": 0.34159213304519653, "rewards/rejected": 0.06401350349187851, "step": 120 }, { "epoch": 0.34, "grad_norm": 55.5, "learning_rate": 4.1940827077152755e-06, "logits/chosen": 81.52787017822266, "logits/rejected": 81.55335998535156, "logps/chosen": -30.235469818115234, "logps/rejected": -31.976581573486328, "loss": 0.8013, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.5375091433525085, "rewards/margins": 0.5734738707542419, "rewards/rejected": -0.03596482425928116, "step": 130 }, { "epoch": 0.36, "grad_norm": 57.75, "learning_rate": 4.0204024186666215e-06, "logits/chosen": 82.77738189697266, "logits/rejected": 82.78349304199219, "logps/chosen": -27.04022216796875, "logps/rejected": -32.86003875732422, "loss": 0.7387, "rewards/accuracies": 0.6875, "rewards/chosen": 0.32568174600601196, "rewards/margins": 0.6750079393386841, "rewards/rejected": -0.34932616353034973, "step": 140 }, { "epoch": 0.39, "grad_norm": 63.0, "learning_rate": 3.834196265035119e-06, "logits/chosen": 81.38068389892578, "logits/rejected": 81.35902404785156, "logps/chosen": -28.90557289123535, "logps/rejected": -32.89466094970703, "loss": 0.6488, "rewards/accuracies": 0.75, "rewards/chosen": 0.38682785630226135, "rewards/margins": 0.6493567228317261, "rewards/rejected": -0.26252883672714233, "step": 150 }, { "epoch": 0.42, "grad_norm": 66.0, "learning_rate": 3.636998309800573e-06, "logits/chosen": 83.2899169921875, "logits/rejected": 83.30531311035156, "logps/chosen": -33.436553955078125, "logps/rejected": -30.43096351623535, "loss": 0.6075, "rewards/accuracies": 0.75, "rewards/chosen": 0.5601544976234436, "rewards/margins": 0.9428972005844116, "rewards/rejected": -0.382742702960968, "step": 160 }, { "epoch": 0.44, "grad_norm": 70.5, "learning_rate": 3.4304331721118078e-06, "logits/chosen": 84.12855529785156, "logits/rejected": 84.0693359375, "logps/chosen": -30.63346290588379, "logps/rejected": -32.29146194458008, "loss": 0.7575, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.5006591081619263, "rewards/margins": 0.717779278755188, "rewards/rejected": -0.21712002158164978, "step": 170 }, { "epoch": 0.47, "grad_norm": 65.0, "learning_rate": 3.2162026428305436e-06, "logits/chosen": 81.81861114501953, "logits/rejected": 81.79876708984375, "logps/chosen": -30.488971710205078, "logps/rejected": -31.519611358642578, "loss": 0.6265, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5665422677993774, "rewards/margins": 0.8564668893814087, "rewards/rejected": -0.289924681186676, "step": 180 }, { "epoch": 0.49, "grad_norm": 34.0, "learning_rate": 2.996071664294641e-06, "logits/chosen": 83.52326965332031, "logits/rejected": 83.50785064697266, "logps/chosen": -30.285634994506836, "logps/rejected": -30.630422592163086, "loss": 0.8287, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.38462626934051514, "rewards/margins": 0.5145466327667236, "rewards/rejected": -0.12992039322853088, "step": 190 }, { "epoch": 0.52, "grad_norm": 50.25, "learning_rate": 2.7718537898066833e-06, "logits/chosen": 79.0837173461914, "logits/rejected": 79.03185272216797, "logps/chosen": -33.81468963623047, "logps/rejected": -32.4266357421875, "loss": 0.8098, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7504364252090454, "rewards/margins": 0.6970199346542358, "rewards/rejected": 0.053416453301906586, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": 98.86815643310547, "eval_logits/rejected": 98.84379577636719, "eval_logps/chosen": -32.561100006103516, "eval_logps/rejected": -36.16287612915039, "eval_loss": 1.0544954538345337, "eval_rewards/accuracies": 0.5278239250183105, "eval_rewards/chosen": -0.09434036910533905, "eval_rewards/margins": 0.06273789703845978, "eval_rewards/rejected": -0.15707828104496002, "eval_runtime": 103.8402, "eval_samples_per_second": 3.303, "eval_steps_per_second": 0.414, "step": 200 }, { "epoch": 0.55, "grad_norm": 104.0, "learning_rate": 2.5453962426402006e-06, "logits/chosen": 81.58848571777344, "logits/rejected": 81.49668884277344, "logps/chosen": -33.169471740722656, "logps/rejected": -34.991939544677734, "loss": 0.7585, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.620996356010437, "rewards/margins": 0.6256735324859619, "rewards/rejected": -0.004677181597799063, "step": 210 }, { "epoch": 0.57, "grad_norm": 35.0, "learning_rate": 2.3185646976551794e-06, "logits/chosen": 83.66175079345703, "logits/rejected": 83.74690246582031, "logps/chosen": -31.094005584716797, "logps/rejected": -31.061717987060547, "loss": 0.626, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.683959424495697, "rewards/margins": 0.9501722455024719, "rewards/rejected": -0.2662127912044525, "step": 220 }, { "epoch": 0.6, "grad_norm": 69.5, "learning_rate": 2.0932279108998323e-06, "logits/chosen": 80.7747802734375, "logits/rejected": 80.82144165039062, "logps/chosen": -32.322105407714844, "logps/rejected": -34.30824661254883, "loss": 0.7928, "rewards/accuracies": 0.625, "rewards/chosen": 0.38691413402557373, "rewards/margins": 0.6237919926643372, "rewards/rejected": -0.23687779903411865, "step": 230 }, { "epoch": 0.62, "grad_norm": 89.5, "learning_rate": 1.8712423238279358e-06, "logits/chosen": 83.18328094482422, "logits/rejected": 83.4581069946289, "logps/chosen": -30.6195125579834, "logps/rejected": -31.637020111083984, "loss": 0.5475, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.854840099811554, "rewards/margins": 0.9388192892074585, "rewards/rejected": -0.08397923409938812, "step": 240 }, { "epoch": 0.65, "grad_norm": 84.5, "learning_rate": 1.6544367689701824e-06, "logits/chosen": 81.9409408569336, "logits/rejected": 82.00200653076172, "logps/chosen": -26.908878326416016, "logps/rejected": -30.06099510192871, "loss": 0.8063, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5496150255203247, "rewards/margins": 0.6778196692466736, "rewards/rejected": -0.12820473313331604, "step": 250 }, { "epoch": 0.68, "grad_norm": 60.25, "learning_rate": 1.4445974030621963e-06, "logits/chosen": 79.24473571777344, "logits/rejected": 79.37559509277344, "logps/chosen": -30.283336639404297, "logps/rejected": -36.4005241394043, "loss": 0.4689, "rewards/accuracies": 0.8125, "rewards/chosen": 1.0221258401870728, "rewards/margins": 1.2612489461898804, "rewards/rejected": -0.23912319540977478, "step": 260 }, { "epoch": 0.7, "grad_norm": 31.875, "learning_rate": 1.243452991757889e-06, "logits/chosen": 78.6611328125, "logits/rejected": 78.69209289550781, "logps/chosen": -30.992252349853516, "logps/rejected": -31.770212173461914, "loss": 0.6471, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.6293355822563171, "rewards/margins": 0.8411690592765808, "rewards/rejected": -0.21183356642723083, "step": 270 }, { "epoch": 0.73, "grad_norm": 100.0, "learning_rate": 1.0526606671603523e-06, "logits/chosen": 81.2646255493164, "logits/rejected": 81.05351257324219, "logps/chosen": -31.10286521911621, "logps/rejected": -29.866031646728516, "loss": 0.7333, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5656233429908752, "rewards/margins": 0.7248517274856567, "rewards/rejected": -0.15922844409942627, "step": 280 }, { "epoch": 0.75, "grad_norm": 61.25, "learning_rate": 8.737922755071455e-07, "logits/chosen": 81.45980072021484, "logits/rejected": 81.37715148925781, "logps/chosen": -33.03575134277344, "logps/rejected": -32.42512512207031, "loss": 0.6231, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8547972440719604, "rewards/margins": 1.1814041137695312, "rewards/rejected": -0.32660672068595886, "step": 290 }, { "epoch": 0.78, "grad_norm": 78.0, "learning_rate": 7.08321427484816e-07, "logits/chosen": 77.12594604492188, "logits/rejected": 77.21607971191406, "logps/chosen": -32.209590911865234, "logps/rejected": -29.23952865600586, "loss": 0.6965, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.9568928480148315, "rewards/margins": 1.066897988319397, "rewards/rejected": -0.11000506579875946, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": 98.90528106689453, "eval_logits/rejected": 98.8800277709961, "eval_logps/chosen": -32.50230407714844, "eval_logps/rejected": -36.192344665527344, "eval_loss": 0.9868877530097961, "eval_rewards/accuracies": 0.5830564498901367, "eval_rewards/chosen": -0.047303199768066406, "eval_rewards/margins": 0.1333501785993576, "eval_rewards/rejected": -0.180653378367424, "eval_runtime": 103.7665, "eval_samples_per_second": 3.305, "eval_steps_per_second": 0.414, "step": 300 }, { "epoch": 0.81, "grad_norm": 59.25, "learning_rate": 5.576113578589035e-07, "logits/chosen": 84.09150695800781, "logits/rejected": 84.11959075927734, "logps/chosen": -30.127155303955078, "logps/rejected": -32.29438781738281, "loss": 0.7446, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5891044735908508, "rewards/margins": 0.6648039817810059, "rewards/rejected": -0.07569954544305801, "step": 310 }, { "epoch": 0.83, "grad_norm": 60.25, "learning_rate": 4.229036944380913e-07, "logits/chosen": 81.67195892333984, "logits/rejected": 81.67171478271484, "logps/chosen": -30.566574096679688, "logps/rejected": -29.08144187927246, "loss": 0.5518, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.8255898356437683, "rewards/margins": 0.9551518559455872, "rewards/rejected": -0.12956194579601288, "step": 320 }, { "epoch": 0.86, "grad_norm": 58.0, "learning_rate": 3.053082288996112e-07, "logits/chosen": 78.98292541503906, "logits/rejected": 79.03089904785156, "logps/chosen": -29.0233211517334, "logps/rejected": -32.939231872558594, "loss": 0.5283, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.063429832458496, "rewards/margins": 1.306926965713501, "rewards/rejected": -0.24349698424339294, "step": 330 }, { "epoch": 0.88, "grad_norm": 88.5, "learning_rate": 2.0579377374915805e-07, "logits/chosen": 83.09165954589844, "logits/rejected": 83.11895751953125, "logps/chosen": -32.228084564208984, "logps/rejected": -33.656490325927734, "loss": 0.6498, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7872547507286072, "rewards/margins": 1.0174915790557861, "rewards/rejected": -0.23023685812950134, "step": 340 }, { "epoch": 0.91, "grad_norm": 42.5, "learning_rate": 1.2518018074041684e-07, "logits/chosen": 82.23709869384766, "logits/rejected": 82.24420928955078, "logps/chosen": -32.544349670410156, "logps/rejected": -33.209228515625, "loss": 0.6454, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8795096278190613, "rewards/margins": 0.9734523892402649, "rewards/rejected": -0.09394274652004242, "step": 350 }, { "epoch": 0.94, "grad_norm": 66.0, "learning_rate": 6.41315865106129e-08, "logits/chosen": 83.5949935913086, "logits/rejected": 83.62043762207031, "logps/chosen": -28.472000122070312, "logps/rejected": -31.736831665039062, "loss": 0.6336, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8641688227653503, "rewards/margins": 0.9337539672851562, "rewards/rejected": -0.0695851519703865, "step": 360 }, { "epoch": 0.96, "grad_norm": 78.5, "learning_rate": 2.3150941078050325e-08, "logits/chosen": 83.08488464355469, "logits/rejected": 83.10858917236328, "logps/chosen": -31.81646156311035, "logps/rejected": -35.26220703125, "loss": 0.6364, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.847761332988739, "rewards/margins": 0.9700264930725098, "rewards/rejected": -0.1222650408744812, "step": 370 }, { "epoch": 0.99, "grad_norm": 82.5, "learning_rate": 2.575864278703266e-09, "logits/chosen": 77.07919311523438, "logits/rejected": 76.94573974609375, "logps/chosen": -29.726633071899414, "logps/rejected": -28.218725204467773, "loss": 0.7075, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6795950531959534, "rewards/margins": 0.7261168956756592, "rewards/rejected": -0.046521805226802826, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.7616742146479619, "train_runtime": 2551.1965, "train_samples_per_second": 1.207, "train_steps_per_second": 0.151 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }