{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 99.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -2.7358343601226807, "logits/rejected": -2.7480404376983643, "logps/chosen": -27.35565757751465, "logps/rejected": -21.06114387512207, "loss": 0.3906, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 161.0, "learning_rate": 1.282051282051282e-06, "logits/chosen": -3.0094523429870605, "logits/rejected": -2.998090982437134, "logps/chosen": -33.19883728027344, "logps/rejected": -31.973581314086914, "loss": 0.4457, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": -0.014314251020550728, "rewards/margins": -0.006575056351721287, "rewards/rejected": -0.007739194668829441, "step": 10 }, { "epoch": 0.05, "grad_norm": 128.0, "learning_rate": 2.564102564102564e-06, "logits/chosen": -2.8992228507995605, "logits/rejected": -2.8941798210144043, "logps/chosen": -32.46299362182617, "logps/rejected": -28.942882537841797, "loss": 0.4894, "rewards/accuracies": 0.4375, "rewards/chosen": 0.009680476039648056, "rewards/margins": -0.021505311131477356, "rewards/rejected": 0.031185787171125412, "step": 20 }, { "epoch": 0.08, "grad_norm": 129.0, "learning_rate": 3.846153846153847e-06, "logits/chosen": -3.097111225128174, "logits/rejected": -3.109170436859131, "logps/chosen": -32.810890197753906, "logps/rejected": -30.153121948242188, "loss": 0.4421, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.07823446393013, "rewards/margins": 0.04388953000307083, "rewards/rejected": 0.03434494137763977, "step": 30 }, { "epoch": 0.1, "grad_norm": 200.0, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.86364483833313, "logits/rejected": -2.853883981704712, "logps/chosen": -31.620784759521484, "logps/rejected": -32.36823654174805, "loss": 0.7104, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1676289290189743, "rewards/margins": 0.17625470459461212, "rewards/rejected": -0.008625739254057407, "step": 40 }, { "epoch": 0.13, "grad_norm": 229.0, "learning_rate": 4.987541037542187e-06, "logits/chosen": -2.886505603790283, "logits/rejected": -2.884448528289795, "logps/chosen": -29.601898193359375, "logps/rejected": -30.147045135498047, "loss": 0.8433, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.11060991138219833, "rewards/margins": 0.15709055960178375, "rewards/rejected": -0.04648064821958542, "step": 50 }, { "epoch": 0.16, "grad_norm": 137.0, "learning_rate": 4.954691471941119e-06, "logits/chosen": -2.9212472438812256, "logits/rejected": -2.921980381011963, "logps/chosen": -30.149127960205078, "logps/rejected": -28.14251136779785, "loss": 0.5196, "rewards/accuracies": 0.625, "rewards/chosen": -0.01548430323600769, "rewards/margins": 0.13397768139839172, "rewards/rejected": -0.1494619995355606, "step": 60 }, { "epoch": 0.18, "grad_norm": 1104.0, "learning_rate": 4.901618883413549e-06, "logits/chosen": -3.0032145977020264, "logits/rejected": -3.010529041290283, "logps/chosen": -29.345632553100586, "logps/rejected": -31.078609466552734, "loss": 1.3076, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0517343208193779, "rewards/margins": 0.09771490097045898, "rewards/rejected": -0.1494491994380951, "step": 70 }, { "epoch": 0.21, "grad_norm": 186.0, "learning_rate": 4.828760511501322e-06, "logits/chosen": -2.816901922225952, "logits/rejected": -2.832296848297119, "logps/chosen": -29.427043914794922, "logps/rejected": -29.738506317138672, "loss": 0.6608, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0909457728266716, "rewards/margins": 0.14232560992240906, "rewards/rejected": -0.051379840821027756, "step": 80 }, { "epoch": 0.23, "grad_norm": 216.0, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -2.902531623840332, "logits/rejected": -2.884403705596924, "logps/chosen": -32.63992691040039, "logps/rejected": -30.051746368408203, "loss": 3.8065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15720424056053162, "rewards/margins": 0.18128542602062225, "rewards/rejected": -0.024081196635961533, "step": 90 }, { "epoch": 0.26, "grad_norm": 162.0, "learning_rate": 4.626245458345211e-06, "logits/chosen": -3.0053141117095947, "logits/rejected": -3.0066592693328857, "logps/chosen": -31.79007911682129, "logps/rejected": -30.816768646240234, "loss": 1.0237, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.2134353667497635, "rewards/margins": 0.21390971541404724, "rewards/rejected": -0.000474362081149593, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.8161001205444336, "eval_logits/rejected": -2.8135948181152344, "eval_logps/chosen": -31.253108978271484, "eval_logps/rejected": -34.717830657958984, "eval_loss": 0.4814997613430023, "eval_rewards/accuracies": 0.5485880374908447, "eval_rewards/chosen": 0.023475930094718933, "eval_rewards/margins": 0.039213381707668304, "eval_rewards/rejected": -0.015737449750304222, "eval_runtime": 113.131, "eval_samples_per_second": 3.032, "eval_steps_per_second": 0.38, "step": 100 }, { "epoch": 0.29, "grad_norm": 156.0, "learning_rate": 4.498257201263691e-06, "logits/chosen": -2.962667942047119, "logits/rejected": -2.9380180835723877, "logps/chosen": -31.729461669921875, "logps/rejected": -31.084197998046875, "loss": 1.6614, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.38986626267433167, "rewards/margins": 0.33014416694641113, "rewards/rejected": 0.05972205847501755, "step": 110 }, { "epoch": 0.31, "grad_norm": 100.5, "learning_rate": 4.353806263777678e-06, "logits/chosen": -3.0431807041168213, "logits/rejected": -3.072758197784424, "logps/chosen": -28.905542373657227, "logps/rejected": -34.141502380371094, "loss": 0.822, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.20668914914131165, "rewards/margins": 0.20185013115406036, "rewards/rejected": 0.004838997032493353, "step": 120 }, { "epoch": 0.34, "grad_norm": 73.5, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.743009090423584, "logits/rejected": -2.7388014793395996, "logps/chosen": -28.7690486907959, "logps/rejected": -30.156269073486328, "loss": 0.7399, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.16450510919094086, "rewards/margins": 0.1824747771024704, "rewards/rejected": -0.01796967163681984, "step": 130 }, { "epoch": 0.36, "grad_norm": 197.0, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -3.015079975128174, "logits/rejected": -3.0121235847473145, "logps/chosen": -27.263751983642578, "logps/rejected": -31.766677856445312, "loss": 0.8956, "rewards/accuracies": 0.625, "rewards/chosen": 0.17797937989234924, "rewards/margins": 0.2659076154232025, "rewards/rejected": -0.08792825043201447, "step": 140 }, { "epoch": 0.39, "grad_norm": 126.0, "learning_rate": 3.834196265035119e-06, "logits/chosen": -2.807605028152466, "logits/rejected": -2.8025310039520264, "logps/chosen": -27.548370361328125, "logps/rejected": -31.246013641357422, "loss": 0.6864, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22170917689800262, "rewards/margins": 0.23960641026496887, "rewards/rejected": -0.01789720356464386, "step": 150 }, { "epoch": 0.42, "grad_norm": 97.0, "learning_rate": 3.636998309800573e-06, "logits/chosen": -3.123861312866211, "logits/rejected": -3.10616397857666, "logps/chosen": -32.041046142578125, "logps/rejected": -28.996301651000977, "loss": 1.2758, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.36053839325904846, "rewards/margins": 0.3737615942955017, "rewards/rejected": -0.013223287649452686, "step": 160 }, { "epoch": 0.44, "grad_norm": 150.0, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.9405550956726074, "logits/rejected": -2.947495222091675, "logps/chosen": -29.35744857788086, "logps/rejected": -31.04239845275879, "loss": 1.0844, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.42973223328590393, "rewards/margins": 0.3840641677379608, "rewards/rejected": 0.045668020844459534, "step": 170 }, { "epoch": 0.47, "grad_norm": 87.5, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.7875921726226807, "logits/rejected": -2.784825086593628, "logps/chosen": -29.229150772094727, "logps/rejected": -29.77825927734375, "loss": 0.7375, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.35255879163742065, "rewards/margins": 0.36062949895858765, "rewards/rejected": -0.008070724084973335, "step": 180 }, { "epoch": 0.49, "grad_norm": 314.0, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.905453681945801, "logits/rejected": -2.901573896408081, "logps/chosen": -29.90896224975586, "logps/rejected": -28.3870906829834, "loss": 1.6507, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.22220328450202942, "rewards/margins": 0.24585184454917908, "rewards/rejected": -0.023648545145988464, "step": 190 }, { "epoch": 0.52, "grad_norm": 320.0, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -2.97483491897583, "logits/rejected": -2.9631428718566895, "logps/chosen": -33.094482421875, "logps/rejected": -30.27695655822754, "loss": 2.9692, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.48363131284713745, "rewards/margins": 0.35792380571365356, "rewards/rejected": 0.12570755183696747, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.812093496322632, "eval_logits/rejected": -2.809893846511841, "eval_logps/chosen": -31.243083953857422, "eval_logps/rejected": -34.712913513183594, "eval_loss": 0.4973182678222656, "eval_rewards/accuracies": 0.5631229281425476, "eval_rewards/chosen": 0.03149374574422836, "eval_rewards/margins": 0.04329964146018028, "eval_rewards/rejected": -0.011805894784629345, "eval_runtime": 113.1074, "eval_samples_per_second": 3.033, "eval_steps_per_second": 0.38, "step": 200 }, { "epoch": 0.55, "grad_norm": 112.5, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.9090988636016846, "logits/rejected": -2.9100005626678467, "logps/chosen": -32.562644958496094, "logps/rejected": -33.86018371582031, "loss": 1.264, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.37660837173461914, "rewards/margins": 0.375582754611969, "rewards/rejected": 0.0010256364475935698, "step": 210 }, { "epoch": 0.57, "grad_norm": 204.0, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -2.8871023654937744, "logits/rejected": -2.90246844291687, "logps/chosen": -29.662246704101562, "logps/rejected": -28.47257423400879, "loss": 1.4961, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4126364588737488, "rewards/margins": 0.3680770695209503, "rewards/rejected": 0.04455941915512085, "step": 220 }, { "epoch": 0.6, "grad_norm": 87.5, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -2.9373838901519775, "logits/rejected": -2.941819667816162, "logps/chosen": -30.841754913330078, "logps/rejected": -31.728525161743164, "loss": 0.7905, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.2480934113264084, "rewards/margins": 0.2243928462266922, "rewards/rejected": 0.02370055951178074, "step": 230 }, { "epoch": 0.62, "grad_norm": 73.0, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -2.987827777862549, "logits/rejected": -2.9960155487060547, "logps/chosen": -30.307302474975586, "logps/rejected": -30.214122772216797, "loss": 0.9428, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.4780976176261902, "rewards/margins": 0.46120333671569824, "rewards/rejected": 0.016894282773137093, "step": 240 }, { "epoch": 0.65, "grad_norm": 296.0, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.818535566329956, "logits/rejected": -2.8091442584991455, "logps/chosen": -26.581512451171875, "logps/rejected": -29.30862808227539, "loss": 1.3618, "rewards/accuracies": 0.625, "rewards/chosen": 0.2981645464897156, "rewards/margins": 0.11975376307964325, "rewards/rejected": 0.17841079831123352, "step": 250 }, { "epoch": 0.68, "grad_norm": 127.5, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -2.802452802658081, "logits/rejected": -2.8227944374084473, "logps/chosen": -29.27554702758789, "logps/rejected": -34.11919403076172, "loss": 1.2344, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4435352385044098, "rewards/margins": 0.3996209502220154, "rewards/rejected": 0.043914251029491425, "step": 260 }, { "epoch": 0.7, "grad_norm": 64.5, "learning_rate": 1.243452991757889e-06, "logits/chosen": -2.944246768951416, "logits/rejected": -2.9497275352478027, "logps/chosen": -30.331872940063477, "logps/rejected": -29.763620376586914, "loss": 0.9017, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3595869243144989, "rewards/margins": 0.27827444672584534, "rewards/rejected": 0.08131249994039536, "step": 270 }, { "epoch": 0.73, "grad_norm": 87.0, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -2.9589145183563232, "logits/rejected": -2.9460854530334473, "logps/chosen": -30.4505615234375, "logps/rejected": -28.466442108154297, "loss": 1.0356, "rewards/accuracies": 0.625, "rewards/chosen": 0.2169012576341629, "rewards/margins": 0.08497779071331024, "rewards/rejected": 0.13192343711853027, "step": 280 }, { "epoch": 0.75, "grad_norm": 137.0, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.8869829177856445, "logits/rejected": -2.868814706802368, "logps/chosen": -32.20147705078125, "logps/rejected": -30.42227554321289, "loss": 1.5568, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.3478601574897766, "rewards/margins": 0.2670307755470276, "rewards/rejected": 0.08082933723926544, "step": 290 }, { "epoch": 0.78, "grad_norm": 362.0, "learning_rate": 7.08321427484816e-07, "logits/chosen": -2.8908376693725586, "logits/rejected": -2.887605905532837, "logps/chosen": -31.88857078552246, "logps/rejected": -27.47490882873535, "loss": 0.9535, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.2672681212425232, "rewards/margins": 0.16432540118694305, "rewards/rejected": 0.10294272005558014, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.8098394870758057, "eval_logits/rejected": -2.807095527648926, "eval_logps/chosen": -31.205495834350586, "eval_logps/rejected": -34.68474197387695, "eval_loss": 0.47054800391197205, "eval_rewards/accuracies": 0.5369601845741272, "eval_rewards/chosen": 0.0615648552775383, "eval_rewards/margins": 0.050835300236940384, "eval_rewards/rejected": 0.010729559697210789, "eval_runtime": 112.9246, "eval_samples_per_second": 3.037, "eval_steps_per_second": 0.381, "step": 300 }, { "epoch": 0.81, "grad_norm": 139.0, "learning_rate": 5.576113578589035e-07, "logits/chosen": -2.769951343536377, "logits/rejected": -2.7882049083709717, "logps/chosen": -28.826038360595703, "logps/rejected": -30.570903778076172, "loss": 1.2033, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.322704941034317, "rewards/margins": 0.19382169842720032, "rewards/rejected": 0.1288832128047943, "step": 310 }, { "epoch": 0.83, "grad_norm": 142.0, "learning_rate": 4.229036944380913e-07, "logits/chosen": -3.0174190998077393, "logits/rejected": -3.002908706665039, "logps/chosen": -29.645282745361328, "logps/rejected": -28.338443756103516, "loss": 1.9393, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4530436396598816, "rewards/margins": 0.4687787890434265, "rewards/rejected": -0.015735208988189697, "step": 320 }, { "epoch": 0.86, "grad_norm": 75.0, "learning_rate": 3.053082288996112e-07, "logits/chosen": -2.931588649749756, "logits/rejected": -2.9138498306274414, "logps/chosen": -27.903301239013672, "logps/rejected": -30.379735946655273, "loss": 0.7854, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.36701759696006775, "rewards/margins": 0.35749000310897827, "rewards/rejected": 0.009527605026960373, "step": 330 }, { "epoch": 0.88, "grad_norm": 101.5, "learning_rate": 2.0579377374915805e-07, "logits/chosen": -3.1477503776550293, "logits/rejected": -3.153796672821045, "logps/chosen": -31.147018432617188, "logps/rejected": -32.76397705078125, "loss": 0.8634, "rewards/accuracies": 0.8125, "rewards/chosen": 0.39374879002571106, "rewards/margins": 0.44812655448913574, "rewards/rejected": -0.05437783524394035, "step": 340 }, { "epoch": 0.91, "grad_norm": 176.0, "learning_rate": 1.2518018074041684e-07, "logits/chosen": -3.023674249649048, "logits/rejected": -3.0265414714813232, "logps/chosen": -30.397380828857422, "logps/rejected": -31.44447898864746, "loss": 0.5848, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.3276014029979706, "rewards/margins": 0.27253925800323486, "rewards/rejected": 0.05506212264299393, "step": 350 }, { "epoch": 0.94, "grad_norm": 262.0, "learning_rate": 6.41315865106129e-08, "logits/chosen": -2.8648974895477295, "logits/rejected": -2.866847515106201, "logps/chosen": -28.0252628326416, "logps/rejected": -29.79150390625, "loss": 1.1652, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.49957290291786194, "rewards/margins": 0.4680249094963074, "rewards/rejected": 0.03154800459742546, "step": 360 }, { "epoch": 0.96, "grad_norm": 161.0, "learning_rate": 2.3150941078050325e-08, "logits/chosen": -2.937594175338745, "logits/rejected": -2.936284065246582, "logps/chosen": -30.407318115234375, "logps/rejected": -32.029598236083984, "loss": 0.8323, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.30346399545669556, "rewards/margins": 0.38470739126205444, "rewards/rejected": -0.0812433734536171, "step": 370 }, { "epoch": 0.99, "grad_norm": 520.0, "learning_rate": 2.575864278703266e-09, "logits/chosen": -2.8965048789978027, "logits/rejected": -2.879058837890625, "logps/chosen": -28.52047348022461, "logps/rejected": -27.761425018310547, "loss": 0.7634, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.33964917063713074, "rewards/margins": 0.23709365725517273, "rewards/rejected": 0.10255551338195801, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 1.1209261448352368, "train_runtime": 2723.1709, "train_samples_per_second": 1.131, "train_steps_per_second": 0.141 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }