{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.953125, "learning_rate": 1.282051282051282e-07, "logits/chosen": -2.7358343601226807, "logits/rejected": -2.7480404376983643, "logps/chosen": -27.35565757751465, "logps/rejected": -21.06114387512207, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 3.75, "learning_rate": 1.282051282051282e-06, "logits/chosen": -3.0089564323425293, "logits/rejected": -2.998065710067749, "logps/chosen": -33.19136428833008, "logps/rejected": -31.97561264038086, "loss": 0.4998, "rewards/accuracies": 0.4305555522441864, "rewards/chosen": -0.0020843851380050182, "rewards/margins": 0.0002575106918811798, "rewards/rejected": -0.002341895829886198, "step": 10 }, { "epoch": 0.05, "grad_norm": 3.359375, "learning_rate": 2.564102564102564e-06, "logits/chosen": -2.899214267730713, "logits/rejected": -2.8942105770111084, "logps/chosen": -32.48638916015625, "logps/rejected": -28.965356826782227, "loss": 0.5009, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0022583019454032183, "rewards/margins": -0.0055608078837394714, "rewards/rejected": 0.0033025064039975405, "step": 20 }, { "epoch": 0.08, "grad_norm": 4.0625, "learning_rate": 3.846153846153847e-06, "logits/chosen": -3.096067190170288, "logits/rejected": -3.1081583499908447, "logps/chosen": -32.83055877685547, "logps/rejected": -30.159122467041016, "loss": 0.4979, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0156254880130291, "rewards/margins": 0.008240064606070518, "rewards/rejected": 0.007385422941297293, "step": 30 }, { "epoch": 0.1, "grad_norm": 4.0625, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.8633229732513428, "logits/rejected": -2.8539462089538574, "logps/chosen": -31.576160430908203, "logps/rejected": -32.43328857421875, "loss": 0.4858, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.05083252862095833, "rewards/margins": 0.06599889695644379, "rewards/rejected": -0.01516636274755001, "step": 40 }, { "epoch": 0.13, "grad_norm": 3.8125, "learning_rate": 4.987541037542187e-06, "logits/chosen": -2.880373954772949, "logits/rejected": -2.87811541557312, "logps/chosen": -29.325586318969727, "logps/rejected": -30.077322006225586, "loss": 0.4808, "rewards/accuracies": 0.5, "rewards/chosen": 0.08291508257389069, "rewards/margins": 0.08059060573577881, "rewards/rejected": 0.002324472414329648, "step": 50 }, { "epoch": 0.16, "grad_norm": 3.59375, "learning_rate": 4.954691471941119e-06, "logits/chosen": -2.9070420265197754, "logits/rejected": -2.908172369003296, "logps/chosen": -29.754262924194336, "logps/rejected": -27.948543548583984, "loss": 0.4839, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.07510136812925339, "rewards/margins": 0.0736737847328186, "rewards/rejected": 0.0014275781577453017, "step": 60 }, { "epoch": 0.18, "grad_norm": 4.3125, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.9913430213928223, "logits/rejected": -2.9976165294647217, "logps/chosen": -29.0341796875, "logps/rejected": -30.7557430267334, "loss": 0.4938, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.04935770481824875, "rewards/margins": 0.022146493196487427, "rewards/rejected": 0.027211207896471024, "step": 70 }, { "epoch": 0.21, "grad_norm": 4.125, "learning_rate": 4.828760511501322e-06, "logits/chosen": -2.803948402404785, "logits/rejected": -2.8194332122802734, "logps/chosen": -29.147689819335938, "logps/rejected": -29.726749420166016, "loss": 0.4828, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.07860848307609558, "rewards/margins": 0.08910086750984192, "rewards/rejected": -0.010492382571101189, "step": 80 }, { "epoch": 0.23, "grad_norm": 4.4375, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -2.892725706100464, "logits/rejected": -2.874434232711792, "logps/chosen": -32.414127349853516, "logps/rejected": -30.058170318603516, "loss": 0.4789, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.08446021378040314, "rewards/margins": 0.09176470339298248, "rewards/rejected": -0.007304488681256771, "step": 90 }, { "epoch": 0.26, "grad_norm": 3.5, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.9959635734558105, "logits/rejected": -2.9973392486572266, "logps/chosen": -31.55927085876465, "logps/rejected": -30.724218368530273, "loss": 0.4829, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0995207279920578, "rewards/margins": 0.0811297670006752, "rewards/rejected": 0.0183909572660923, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.802985906600952, "eval_logits/rejected": -2.8001766204833984, "eval_logps/chosen": -31.0873966217041, "eval_logps/rejected": -34.63096618652344, "eval_loss": 0.49451369047164917, "eval_rewards/accuracies": 0.5635381937026978, "eval_rewards/chosen": 0.03901098668575287, "eval_rewards/margins": 0.02557329833507538, "eval_rewards/rejected": 0.013437685556709766, "eval_runtime": 113.153, "eval_samples_per_second": 3.031, "eval_steps_per_second": 0.38, "step": 100 }, { "epoch": 0.29, "grad_norm": 4.09375, "learning_rate": 4.498257201263691e-06, "logits/chosen": -2.949113130569458, "logits/rejected": -2.925306797027588, "logps/chosen": -31.45932388305664, "logps/rejected": -31.128564834594727, "loss": 0.4692, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1514938771724701, "rewards/margins": 0.14543716609477997, "rewards/rejected": 0.006056725047528744, "step": 110 }, { "epoch": 0.31, "grad_norm": 4.125, "learning_rate": 4.353806263777678e-06, "logits/chosen": -3.031067371368408, "logits/rejected": -3.060089588165283, "logps/chosen": -28.312602996826172, "logps/rejected": -34.01371765136719, "loss": 0.4667, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.17025941610336304, "rewards/margins": 0.14349231123924255, "rewards/rejected": 0.026767095550894737, "step": 120 }, { "epoch": 0.34, "grad_norm": 2.515625, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.733306884765625, "logits/rejected": -2.729750871658325, "logps/chosen": -28.17226219177246, "logps/rejected": -30.136898040771484, "loss": 0.4656, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1604831963777542, "rewards/margins": 0.16110166907310486, "rewards/rejected": -0.00061851367354393, "step": 130 }, { "epoch": 0.36, "grad_norm": 3.03125, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -3.0078999996185303, "logits/rejected": -3.005507230758667, "logps/chosen": -26.812694549560547, "logps/rejected": -31.561386108398438, "loss": 0.4732, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.13470572233200073, "rewards/margins": 0.11562897264957428, "rewards/rejected": 0.01907675340771675, "step": 140 }, { "epoch": 0.39, "grad_norm": 3.671875, "learning_rate": 3.834196265035119e-06, "logits/chosen": -2.8041739463806152, "logits/rejected": -2.7990527153015137, "logps/chosen": -26.89498519897461, "logps/rejected": -31.248632431030273, "loss": 0.4572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.18610434234142303, "rewards/margins": 0.1911022961139679, "rewards/rejected": -0.004997962154448032, "step": 150 }, { "epoch": 0.42, "grad_norm": 3.828125, "learning_rate": 3.636998309800573e-06, "logits/chosen": -3.122868299484253, "logits/rejected": -3.1052639484405518, "logps/chosen": -31.118968963623047, "logps/rejected": -29.14169692993164, "loss": 0.4349, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2745501399040222, "rewards/margins": 0.30693456530570984, "rewards/rejected": -0.0323844812810421, "step": 160 }, { "epoch": 0.44, "grad_norm": 4.0, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.9357101917266846, "logits/rejected": -2.9437410831451416, "logps/chosen": -28.84354591369629, "logps/rejected": -31.365503311157227, "loss": 0.4446, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21021375060081482, "rewards/margins": 0.26341748237609863, "rewards/rejected": -0.05320371314883232, "step": 170 }, { "epoch": 0.47, "grad_norm": 2.9375, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.7853760719299316, "logits/rejected": -2.782726287841797, "logps/chosen": -28.669147491455078, "logps/rejected": -29.730758666992188, "loss": 0.4565, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2001400738954544, "rewards/margins": 0.19265684485435486, "rewards/rejected": 0.00748323742300272, "step": 180 }, { "epoch": 0.49, "grad_norm": 2.390625, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.905327320098877, "logits/rejected": -2.9015116691589355, "logps/chosen": -29.04599380493164, "logps/rejected": -28.085041046142578, "loss": 0.4628, "rewards/accuracies": 0.625, "rewards/chosen": 0.2281440794467926, "rewards/margins": 0.17364613711833954, "rewards/rejected": 0.05449794605374336, "step": 190 }, { "epoch": 0.52, "grad_norm": 2.84375, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -2.9772303104400635, "logits/rejected": -2.9655442237854004, "logps/chosen": -31.91948890686035, "logps/rejected": -30.082332611083984, "loss": 0.4376, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3559049963951111, "rewards/margins": 0.28555411100387573, "rewards/rejected": 0.07035084813833237, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.8163414001464844, "eval_logits/rejected": -2.8137245178222656, "eval_logps/chosen": -31.0458927154541, "eval_logps/rejected": -34.6682014465332, "eval_loss": 0.49114447832107544, "eval_rewards/accuracies": 0.5577242374420166, "eval_rewards/chosen": 0.04731180891394615, "eval_rewards/margins": 0.041320886462926865, "eval_rewards/rejected": 0.005990919191390276, "eval_runtime": 113.006, "eval_samples_per_second": 3.035, "eval_steps_per_second": 0.381, "step": 200 }, { "epoch": 0.55, "grad_norm": 4.28125, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.914222240447998, "logits/rejected": -2.916249990463257, "logps/chosen": -31.570764541625977, "logps/rejected": -33.81729507446289, "loss": 0.4391, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2925291359424591, "rewards/margins": 0.2836949825286865, "rewards/rejected": 0.008834179490804672, "step": 210 }, { "epoch": 0.57, "grad_norm": 3.1875, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -2.8961727619171143, "logits/rejected": -2.9133689403533936, "logps/chosen": -28.757190704345703, "logps/rejected": -28.49643325805664, "loss": 0.4382, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.28417009115219116, "rewards/margins": 0.2778027653694153, "rewards/rejected": 0.006367350462824106, "step": 220 }, { "epoch": 0.6, "grad_norm": 3.75, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -2.948033571243286, "logits/rejected": -2.952341079711914, "logps/chosen": -30.005146026611328, "logps/rejected": -31.54683494567871, "loss": 0.4579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22934475541114807, "rewards/margins": 0.18708109855651855, "rewards/rejected": 0.042263638228178024, "step": 230 }, { "epoch": 0.62, "grad_norm": 3.84375, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -3.0030248165130615, "logits/rejected": -3.009697675704956, "logps/chosen": -29.56500816345215, "logps/rejected": -30.203866958618164, "loss": 0.4421, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2679831385612488, "rewards/margins": 0.2617082893848419, "rewards/rejected": 0.00627488736063242, "step": 240 }, { "epoch": 0.65, "grad_norm": 3.796875, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.8312201499938965, "logits/rejected": -2.821024179458618, "logps/chosen": -25.73629379272461, "logps/rejected": -29.133464813232422, "loss": 0.4609, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.24358424544334412, "rewards/margins": 0.1639489382505417, "rewards/rejected": 0.07963528484106064, "step": 250 }, { "epoch": 0.68, "grad_norm": 3.625, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -2.8129467964172363, "logits/rejected": -2.8341376781463623, "logps/chosen": -28.19793701171875, "logps/rejected": -33.99031448364258, "loss": 0.436, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3264053463935852, "rewards/margins": 0.2896498441696167, "rewards/rejected": 0.036755502223968506, "step": 260 }, { "epoch": 0.7, "grad_norm": 3.484375, "learning_rate": 1.243452991757889e-06, "logits/chosen": -2.9577534198760986, "logits/rejected": -2.9646975994110107, "logps/chosen": -29.623525619506836, "logps/rejected": -29.91342544555664, "loss": 0.4449, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.23156669735908508, "rewards/margins": 0.2411995679140091, "rewards/rejected": -0.009632894769310951, "step": 270 }, { "epoch": 0.73, "grad_norm": 3.53125, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -2.973743200302124, "logits/rejected": -2.960118532180786, "logps/chosen": -29.57967185974121, "logps/rejected": -28.558263778686523, "loss": 0.4498, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.22840175032615662, "rewards/margins": 0.2137848436832428, "rewards/rejected": 0.014616942033171654, "step": 280 }, { "epoch": 0.75, "grad_norm": 2.9375, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.901043653488159, "logits/rejected": -2.883784294128418, "logps/chosen": -30.675678253173828, "logps/rejected": -30.71207618713379, "loss": 0.4063, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3921253979206085, "rewards/margins": 0.4298785626888275, "rewards/rejected": -0.03775321692228317, "step": 290 }, { "epoch": 0.78, "grad_norm": 3.703125, "learning_rate": 7.08321427484816e-07, "logits/chosen": -2.905961513519287, "logits/rejected": -2.9014759063720703, "logps/chosen": -30.50136947631836, "logps/rejected": -27.470678329467773, "loss": 0.4326, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3442566692829132, "rewards/margins": 0.3176761269569397, "rewards/rejected": 0.02658059261739254, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.820904493331909, "eval_logits/rejected": -2.8182966709136963, "eval_logps/chosen": -31.072538375854492, "eval_logps/rejected": -34.71522521972656, "eval_loss": 0.49085888266563416, "eval_rewards/accuracies": 0.6013289093971252, "eval_rewards/chosen": 0.04198317229747772, "eval_rewards/margins": 0.045396942645311356, "eval_rewards/rejected": -0.0034137717448174953, "eval_runtime": 112.9358, "eval_samples_per_second": 3.037, "eval_steps_per_second": 0.381, "step": 300 }, { "epoch": 0.81, "grad_norm": 3.234375, "learning_rate": 5.576113578589035e-07, "logits/chosen": -2.781768321990967, "logits/rejected": -2.799870014190674, "logps/chosen": -28.066579818725586, "logps/rejected": -30.81374740600586, "loss": 0.4473, "rewards/accuracies": 0.75, "rewards/chosen": 0.23256821930408478, "rewards/margins": 0.24891658127307892, "rewards/rejected": -0.01634838618338108, "step": 310 }, { "epoch": 0.83, "grad_norm": 3.5625, "learning_rate": 4.229036944380913e-07, "logits/chosen": -3.0310263633728027, "logits/rejected": -3.016596794128418, "logps/chosen": -28.514297485351562, "logps/rejected": -28.385051727294922, "loss": 0.424, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.3394576907157898, "rewards/margins": 0.35271310806274414, "rewards/rejected": -0.013255435042083263, "step": 320 }, { "epoch": 0.86, "grad_norm": 3.234375, "learning_rate": 3.053082288996112e-07, "logits/chosen": -2.94514799118042, "logits/rejected": -2.927401065826416, "logps/chosen": -26.721960067749023, "logps/rejected": -30.596094131469727, "loss": 0.4178, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.3280225396156311, "rewards/margins": 0.3689119517803192, "rewards/rejected": -0.0408894307911396, "step": 330 }, { "epoch": 0.88, "grad_norm": 2.71875, "learning_rate": 2.0579377374915805e-07, "logits/chosen": -3.162492275238037, "logits/rejected": -3.1681036949157715, "logps/chosen": -30.14105796813965, "logps/rejected": -32.993080139160156, "loss": 0.4252, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2996302545070648, "rewards/margins": 0.3590452969074249, "rewards/rejected": -0.05941504240036011, "step": 340 }, { "epoch": 0.91, "grad_norm": 3.3125, "learning_rate": 1.2518018074041684e-07, "logits/chosen": -3.0381181240081787, "logits/rejected": -3.0421128273010254, "logps/chosen": -29.018634796142578, "logps/rejected": -31.29150390625, "loss": 0.432, "rewards/accuracies": 0.75, "rewards/chosen": 0.3576488494873047, "rewards/margins": 0.3132873773574829, "rewards/rejected": 0.044361427426338196, "step": 350 }, { "epoch": 0.94, "grad_norm": 4.09375, "learning_rate": 6.41315865106129e-08, "logits/chosen": -2.8777647018432617, "logits/rejected": -2.879361152648926, "logps/chosen": -26.779287338256836, "logps/rejected": -29.69747543334961, "loss": 0.4224, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.374088317155838, "rewards/margins": 0.34739503264427185, "rewards/rejected": 0.026693273335695267, "step": 360 }, { "epoch": 0.96, "grad_norm": 5.34375, "learning_rate": 2.3150941078050325e-08, "logits/chosen": -2.953718900680542, "logits/rejected": -2.9520201683044434, "logps/chosen": -29.355998992919922, "logps/rejected": -32.019020080566406, "loss": 0.4352, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.28613001108169556, "rewards/margins": 0.30432528257369995, "rewards/rejected": -0.01819526217877865, "step": 370 }, { "epoch": 0.99, "grad_norm": 3.390625, "learning_rate": 2.575864278703266e-09, "logits/chosen": -2.9095873832702637, "logits/rejected": -2.892209053039551, "logps/chosen": -27.843591690063477, "logps/rejected": -28.02511215209961, "loss": 0.4472, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.22028858959674835, "rewards/margins": 0.24738721549510956, "rewards/rejected": -0.02709861472249031, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.4552104888024268, "train_runtime": 2721.944, "train_samples_per_second": 1.131, "train_steps_per_second": 0.141 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }