{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 5.241135829355759, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.06866662204265594, "logits/rejected": 0.1413690149784088, "logps/chosen": -1.7159583568572998, "logps/rejected": -1.889492392539978, "loss": 0.8423, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.7159583568572998, "rewards/margins": 0.1735340654850006, "rewards/rejected": -1.889492392539978, "sft_loss": 1.4683737754821777, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 10.047119389873286, "learning_rate": 1.7825311942959e-08, "logits/chosen": -0.00279655447229743, "logits/rejected": 0.1199721097946167, "logps/chosen": -1.8025665283203125, "logps/rejected": -1.845226526260376, "loss": 0.9285, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8025665283203125, "rewards/margins": 0.04265982285141945, "rewards/rejected": -1.845226526260376, "sft_loss": 1.508405327796936, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 11.407144769368456, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.035205043852329254, "logits/rejected": 0.06473912298679352, "logps/chosen": -1.6344362497329712, "logps/rejected": -1.7645175457000732, "loss": 0.9018, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6344362497329712, "rewards/margins": 0.1300811469554901, "rewards/rejected": -1.7645175457000732, "sft_loss": 1.500314474105835, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 5.238633350936662, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.05097222328186035, "logits/rejected": 0.03746742755174637, "logps/chosen": -1.7247400283813477, "logps/rejected": -1.8054897785186768, "loss": 0.9259, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.7247400283813477, "rewards/margins": 0.08074973523616791, "rewards/rejected": -1.8054897785186768, "sft_loss": 1.5001486539840698, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 16.415507483010636, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.05095674842596054, "logits/rejected": 0.035283464938402176, "logps/chosen": -1.8678537607192993, "logps/rejected": -1.7785379886627197, "loss": 1.0364, "rewards/accuracies": 0.375, "rewards/chosen": -1.8678537607192993, "rewards/margins": -0.08931580185890198, "rewards/rejected": -1.7785379886627197, "sft_loss": 1.5451219081878662, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 9.818125741103396, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.09462754428386688, "logits/rejected": -0.0008164912578649819, "logps/chosen": -1.9101909399032593, "logps/rejected": -1.833242416381836, "loss": 1.0004, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.9101909399032593, "rewards/margins": -0.0769486278295517, "rewards/rejected": -1.833242416381836, "sft_loss": 1.6473201513290405, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 10.446823122941232, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.05367986112833023, "logits/rejected": 0.10855366289615631, "logps/chosen": -1.8467719554901123, "logps/rejected": -1.9976612329483032, "loss": 0.9563, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.8467719554901123, "rewards/margins": 0.15088915824890137, "rewards/rejected": -1.9976612329483032, "sft_loss": 1.5615323781967163, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 9.52226659459355, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.04535774141550064, "logits/rejected": 0.22551977634429932, "logps/chosen": -1.880853295326233, "logps/rejected": -1.7423698902130127, "loss": 1.0065, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.880853295326233, "rewards/margins": -0.13848325610160828, "rewards/rejected": -1.7423698902130127, "sft_loss": 1.5186402797698975, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 14.993392393029739, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.016608258709311485, "logits/rejected": 0.2148985117673874, "logps/chosen": -1.8355581760406494, "logps/rejected": -1.8709468841552734, "loss": 0.9724, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8355581760406494, "rewards/margins": 0.03538880869746208, "rewards/rejected": -1.8709468841552734, "sft_loss": 1.5353034734725952, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 11.938709031585379, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.0528443343937397, "logits/rejected": 0.09901972860097885, "logps/chosen": -1.8960918188095093, "logps/rejected": -1.7771284580230713, "loss": 1.0264, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.8960918188095093, "rewards/margins": -0.11896347999572754, "rewards/rejected": -1.7771284580230713, "sft_loss": 1.5825130939483643, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 7.751225723464381, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.12037549912929535, "logits/rejected": 0.10033036768436432, "logps/chosen": -1.8300701379776, "logps/rejected": -1.8636726140975952, "loss": 0.9966, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8300701379776, "rewards/margins": 0.03360243886709213, "rewards/rejected": -1.8636726140975952, "sft_loss": 1.5821956396102905, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 7.434298921935121, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.08439075946807861, "logits/rejected": 0.10855086147785187, "logps/chosen": -1.7829084396362305, "logps/rejected": -1.8879938125610352, "loss": 0.9111, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7829084396362305, "rewards/margins": 0.10508525371551514, "rewards/rejected": -1.8879938125610352, "sft_loss": 1.5424165725708008, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 6.15690178972907, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.017289573326706886, "logits/rejected": 0.13417969644069672, "logps/chosen": -1.632793664932251, "logps/rejected": -1.7621288299560547, "loss": 0.8661, "rewards/accuracies": 0.53125, "rewards/chosen": -1.632793664932251, "rewards/margins": 0.12933534383773804, "rewards/rejected": -1.7621288299560547, "sft_loss": 1.4724994897842407, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 12.609544982423918, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.08426558971405029, "logits/rejected": 0.06824670732021332, "logps/chosen": -1.7627747058868408, "logps/rejected": -1.8078590631484985, "loss": 0.9862, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.7627747058868408, "rewards/margins": 0.045084238052368164, "rewards/rejected": -1.8078590631484985, "sft_loss": 1.627946138381958, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 13.029966401791775, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.050957489758729935, "logits/rejected": 0.13617627322673798, "logps/chosen": -1.770573377609253, "logps/rejected": -2.030594825744629, "loss": 0.8675, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.770573377609253, "rewards/margins": 0.2600213289260864, "rewards/rejected": -2.030594825744629, "sft_loss": 1.5631544589996338, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 8.664668189055082, "learning_rate": 1.42602495543672e-07, "logits/chosen": -0.007962247356772423, "logits/rejected": 0.09837473928928375, "logps/chosen": -1.7023359537124634, "logps/rejected": -1.7354528903961182, "loss": 0.93, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7023359537124634, "rewards/margins": 0.03311706706881523, "rewards/rejected": -1.7354528903961182, "sft_loss": 1.517807960510254, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 5.303208481084279, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.1491144895553589, "logits/rejected": 0.10071063041687012, "logps/chosen": -1.7676842212677002, "logps/rejected": -1.9403903484344482, "loss": 0.8904, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.7676842212677002, "rewards/margins": 0.17270630598068237, "rewards/rejected": -1.9403903484344482, "sft_loss": 1.4872312545776367, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 15.42950023182983, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.07358263432979584, "logits/rejected": 0.0378994457423687, "logps/chosen": -1.719226598739624, "logps/rejected": -1.753637671470642, "loss": 0.9499, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.719226598739624, "rewards/margins": 0.03441108763217926, "rewards/rejected": -1.753637671470642, "sft_loss": 1.4496166706085205, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 6.451613645941182, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.05917614698410034, "logits/rejected": 0.0948031097650528, "logps/chosen": -1.7646898031234741, "logps/rejected": -1.8822062015533447, "loss": 0.917, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.7646898031234741, "rewards/margins": 0.11751645803451538, "rewards/rejected": -1.8822062015533447, "sft_loss": 1.5111513137817383, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 4.75200385177772, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.045394934713840485, "logits/rejected": 0.02147727832198143, "logps/chosen": -1.666123628616333, "logps/rejected": -1.7723052501678467, "loss": 0.8867, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.666123628616333, "rewards/margins": 0.10618197917938232, "rewards/rejected": -1.7723052501678467, "sft_loss": 1.4811543226242065, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 10.130900796519882, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.029360543936491013, "logits/rejected": 0.056736599653959274, "logps/chosen": -1.6169408559799194, "logps/rejected": -1.7849327325820923, "loss": 0.8598, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.6169408559799194, "rewards/margins": 0.16799196600914001, "rewards/rejected": -1.7849327325820923, "sft_loss": 1.4254792928695679, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 6.4997537279144675, "learning_rate": 1.96078431372549e-07, "logits/chosen": -0.0004239112022332847, "logits/rejected": 0.09700234234333038, "logps/chosen": -1.6261718273162842, "logps/rejected": -1.6823606491088867, "loss": 0.9083, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.6261718273162842, "rewards/margins": 0.05618885159492493, "rewards/rejected": -1.6823606491088867, "sft_loss": 1.4438135623931885, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 10.559932077745579, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.03688237443566322, "logits/rejected": 0.24905824661254883, "logps/chosen": -1.5997196435928345, "logps/rejected": -1.865505576133728, "loss": 0.8245, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5997196435928345, "rewards/margins": 0.2657860219478607, "rewards/rejected": -1.865505576133728, "sft_loss": 1.5311452150344849, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 6.438650066794796, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.07155241072177887, "logits/rejected": 0.10692272335290909, "logps/chosen": -1.653988242149353, "logps/rejected": -1.7673509120941162, "loss": 0.8821, "rewards/accuracies": 0.5, "rewards/chosen": -1.653988242149353, "rewards/margins": 0.11336270719766617, "rewards/rejected": -1.7673509120941162, "sft_loss": 1.51596999168396, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 4.6992093179083385, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.07477526366710663, "logits/rejected": 0.05816906690597534, "logps/chosen": -1.5865226984024048, "logps/rejected": -1.5511195659637451, "loss": 0.9302, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.5865226984024048, "rewards/margins": -0.035402946174144745, "rewards/rejected": -1.5511195659637451, "sft_loss": 1.492251992225647, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 8.397259668497695, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.017252258956432343, "logits/rejected": 0.15352050960063934, "logps/chosen": -1.6266523599624634, "logps/rejected": -1.7452976703643799, "loss": 0.8503, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.6266523599624634, "rewards/margins": 0.11864523589611053, "rewards/rejected": -1.7452976703643799, "sft_loss": 1.5504190921783447, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 16.01478351745101, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.06855495274066925, "logits/rejected": 0.049786873161792755, "logps/chosen": -1.6754124164581299, "logps/rejected": -1.7043033838272095, "loss": 0.9217, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6754124164581299, "rewards/margins": 0.028891151770949364, "rewards/rejected": -1.7043033838272095, "sft_loss": 1.4868216514587402, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 8.453110694029059, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.05912736803293228, "logits/rejected": 0.10661590099334717, "logps/chosen": -1.6367515325546265, "logps/rejected": -1.7601619958877563, "loss": 0.8706, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.6367515325546265, "rewards/margins": 0.12341021001338959, "rewards/rejected": -1.7601619958877563, "sft_loss": 1.5336635112762451, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 9.505352370065037, "learning_rate": 2.5846702317290554e-07, "logits/chosen": -0.01801946759223938, "logits/rejected": 0.140397310256958, "logps/chosen": -1.5470463037490845, "logps/rejected": -1.6572315692901611, "loss": 0.8627, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.5470463037490845, "rewards/margins": 0.11018538475036621, "rewards/rejected": -1.6572315692901611, "sft_loss": 1.4840823411941528, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 11.97067128612125, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.08302738517522812, "logits/rejected": 0.07648370414972305, "logps/chosen": -1.4947319030761719, "logps/rejected": -1.4955166578292847, "loss": 0.8976, "rewards/accuracies": 0.5, "rewards/chosen": -1.4947319030761719, "rewards/margins": 0.0007847875240258873, "rewards/rejected": -1.4955166578292847, "sft_loss": 1.3429903984069824, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 8.716169567546759, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.08068571984767914, "logits/rejected": -0.03303000330924988, "logps/chosen": -1.5031237602233887, "logps/rejected": -1.6031303405761719, "loss": 0.8538, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5031237602233887, "rewards/margins": 0.10000662505626678, "rewards/rejected": -1.6031303405761719, "sft_loss": 1.4229665994644165, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 8.642596197070496, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.15794476866722107, "logits/rejected": -0.014252248220145702, "logps/chosen": -1.6020597219467163, "logps/rejected": -1.5782639980316162, "loss": 0.9337, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.6020597219467163, "rewards/margins": -0.02379578724503517, "rewards/rejected": -1.5782639980316162, "sft_loss": 1.4675323963165283, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 7.453803840860197, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.06954143941402435, "logits/rejected": 0.10241218656301498, "logps/chosen": -1.439340591430664, "logps/rejected": -1.5472867488861084, "loss": 0.854, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.439340591430664, "rewards/margins": 0.10794617980718613, "rewards/rejected": -1.5472867488861084, "sft_loss": 1.3554737567901611, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 15.24949261718954, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.09827111661434174, "logits/rejected": -0.04374364763498306, "logps/chosen": -1.5440620183944702, "logps/rejected": -1.6072267293930054, "loss": 0.8898, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.5440620183944702, "rewards/margins": 0.06316471099853516, "rewards/rejected": -1.6072267293930054, "sft_loss": 1.4672365188598633, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 7.804849677307928, "learning_rate": 3.1194295900178254e-07, "logits/chosen": 0.0392121747136116, "logits/rejected": 0.03854722902178764, "logps/chosen": -1.40225350856781, "logps/rejected": -1.5015074014663696, "loss": 0.8517, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.40225350856781, "rewards/margins": 0.099253810942173, "rewards/rejected": -1.5015074014663696, "sft_loss": 1.3962509632110596, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 6.591071787166793, "learning_rate": 3.2085561497326203e-07, "logits/chosen": -0.044824711978435516, "logits/rejected": -0.044315505772829056, "logps/chosen": -1.399688720703125, "logps/rejected": -1.5939632654190063, "loss": 0.8475, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.399688720703125, "rewards/margins": 0.19427452981472015, "rewards/rejected": -1.5939632654190063, "sft_loss": 1.3958854675292969, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 7.093373654847805, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.17792734503746033, "logits/rejected": -0.0892893522977829, "logps/chosen": -1.3721511363983154, "logps/rejected": -1.4268399477005005, "loss": 0.8713, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3721511363983154, "rewards/margins": 0.054688893258571625, "rewards/rejected": -1.4268399477005005, "sft_loss": 1.3671307563781738, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 7.048667420811032, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.09333489090204239, "logits/rejected": 0.024537701159715652, "logps/chosen": -1.3106496334075928, "logps/rejected": -1.4411996603012085, "loss": 0.8197, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3106496334075928, "rewards/margins": 0.13054995238780975, "rewards/rejected": -1.4411996603012085, "sft_loss": 1.3126055002212524, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 4.788119564540609, "learning_rate": 3.475935828877005e-07, "logits/chosen": -0.03602975606918335, "logits/rejected": 0.11371143162250519, "logps/chosen": -1.2650938034057617, "logps/rejected": -1.4330915212631226, "loss": 0.7971, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2650938034057617, "rewards/margins": 0.16799767315387726, "rewards/rejected": -1.4330915212631226, "sft_loss": 1.3003733158111572, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 15.527585794821164, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.13083848357200623, "logits/rejected": 0.0006558209424838424, "logps/chosen": -1.3923680782318115, "logps/rejected": -1.4303077459335327, "loss": 0.8719, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3923680782318115, "rewards/margins": 0.037939682602882385, "rewards/rejected": -1.4303077459335327, "sft_loss": 1.3999507427215576, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 10.469382701987184, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.11185383796691895, "logits/rejected": 0.0243713166564703, "logps/chosen": -1.3079421520233154, "logps/rejected": -1.375205397605896, "loss": 0.8508, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3079421520233154, "rewards/margins": 0.06726332008838654, "rewards/rejected": -1.375205397605896, "sft_loss": 1.2929632663726807, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 7.515191474547397, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.17593644559383392, "logits/rejected": 0.008980167098343372, "logps/chosen": -1.3793913125991821, "logps/rejected": -1.504292368888855, "loss": 0.843, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3793913125991821, "rewards/margins": 0.1249011754989624, "rewards/rejected": -1.504292368888855, "sft_loss": 1.3484762907028198, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 5.64262431029933, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.20726744830608368, "logits/rejected": 0.03332756459712982, "logps/chosen": -1.3965818881988525, "logps/rejected": -1.458545207977295, "loss": 0.8465, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3965818881988525, "rewards/margins": 0.06196323037147522, "rewards/rejected": -1.458545207977295, "sft_loss": 1.372166633605957, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 12.55043610541021, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.012690830044448376, "logits/rejected": 0.10736958682537079, "logps/chosen": -1.3292357921600342, "logps/rejected": -1.4855844974517822, "loss": 0.823, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3292357921600342, "rewards/margins": 0.15634877979755402, "rewards/rejected": -1.4855844974517822, "sft_loss": 1.3458410501480103, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 4.891428813068394, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.13622412085533142, "logits/rejected": 0.029086655005812645, "logps/chosen": -1.3300727605819702, "logps/rejected": -1.4654982089996338, "loss": 0.8111, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3300727605819702, "rewards/margins": 0.1354253739118576, "rewards/rejected": -1.4654982089996338, "sft_loss": 1.3241392374038696, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 5.2992537852975685, "learning_rate": 4.09982174688057e-07, "logits/chosen": -0.06062249466776848, "logits/rejected": 0.013975190930068493, "logps/chosen": -1.3466517925262451, "logps/rejected": -1.5110262632369995, "loss": 0.8128, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3466517925262451, "rewards/margins": 0.16437435150146484, "rewards/rejected": -1.5110262632369995, "sft_loss": 1.290457010269165, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 7.803984118958217, "learning_rate": 4.188948306595365e-07, "logits/chosen": -0.023377910256385803, "logits/rejected": 0.10894010961055756, "logps/chosen": -1.3126580715179443, "logps/rejected": -1.481186866760254, "loss": 0.8007, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3126580715179443, "rewards/margins": 0.16852891445159912, "rewards/rejected": -1.481186866760254, "sft_loss": 1.3011443614959717, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 4.304924249922597, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.05965115502476692, "logits/rejected": 0.06149532273411751, "logps/chosen": -1.3237906694412231, "logps/rejected": -1.514859914779663, "loss": 0.8095, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3237906694412231, "rewards/margins": 0.1910690814256668, "rewards/rejected": -1.514859914779663, "sft_loss": 1.3616396188735962, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 6.75147439718235, "learning_rate": 4.3672014260249554e-07, "logits/chosen": 0.009547481313347816, "logits/rejected": 0.12560246884822845, "logps/chosen": -1.4427531957626343, "logps/rejected": -1.4753752946853638, "loss": 0.8827, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4427531957626343, "rewards/margins": 0.032621920108795166, "rewards/rejected": -1.4753752946853638, "sft_loss": 1.446410059928894, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 10.280389134667836, "learning_rate": 4.4563279857397503e-07, "logits/chosen": -0.08433694392442703, "logits/rejected": 0.07879729568958282, "logps/chosen": -1.3304082155227661, "logps/rejected": -1.3946244716644287, "loss": 0.8676, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3304082155227661, "rewards/margins": 0.06421622633934021, "rewards/rejected": -1.3946244716644287, "sft_loss": 1.3293696641921997, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 7.159546743126018, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.04339202120900154, "logits/rejected": 0.09749060124158859, "logps/chosen": -1.2906709909439087, "logps/rejected": -1.4070241451263428, "loss": 0.8122, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2906709909439087, "rewards/margins": 0.11635303497314453, "rewards/rejected": -1.4070241451263428, "sft_loss": 1.258568525314331, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 5.200353388008942, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.23168528079986572, "logits/rejected": -0.12337970733642578, "logps/chosen": -1.3821641206741333, "logps/rejected": -1.5426661968231201, "loss": 0.8082, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3821641206741333, "rewards/margins": 0.16050215065479279, "rewards/rejected": -1.5426661968231201, "sft_loss": 1.3882310390472412, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 6.650562217709036, "learning_rate": 4.723707664884135e-07, "logits/chosen": -0.09911463409662247, "logits/rejected": -0.014409830793738365, "logps/chosen": -1.3743125200271606, "logps/rejected": -1.5459463596343994, "loss": 0.8252, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3743125200271606, "rewards/margins": 0.17163386940956116, "rewards/rejected": -1.5459463596343994, "sft_loss": 1.4178268909454346, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 4.3611213733361325, "learning_rate": 4.81283422459893e-07, "logits/chosen": -0.1033635139465332, "logits/rejected": 0.02849961258471012, "logps/chosen": -1.341848611831665, "logps/rejected": -1.4445269107818604, "loss": 0.8271, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.341848611831665, "rewards/margins": 0.10267815738916397, "rewards/rejected": -1.4445269107818604, "sft_loss": 1.3495066165924072, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 5.345885266775294, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.055797822773456573, "logits/rejected": 0.03937429562211037, "logps/chosen": -1.292399287223816, "logps/rejected": -1.4664561748504639, "loss": 0.7982, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.292399287223816, "rewards/margins": 0.17405693233013153, "rewards/rejected": -1.4664561748504639, "sft_loss": 1.2662118673324585, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 6.109800744046622, "learning_rate": 4.99108734402852e-07, "logits/chosen": -0.1288217455148697, "logits/rejected": 0.02363762818276882, "logps/chosen": -1.3550705909729004, "logps/rejected": -1.455700159072876, "loss": 0.8334, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3550705909729004, "rewards/margins": 0.10062961280345917, "rewards/rejected": -1.455700159072876, "sft_loss": 1.3332594633102417, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 6.3313567104978095, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.08563394844532013, "logits/rejected": 0.05302998423576355, "logps/chosen": -1.366868257522583, "logps/rejected": -1.458636999130249, "loss": 0.856, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.366868257522583, "rewards/margins": 0.09176872670650482, "rewards/rejected": -1.458636999130249, "sft_loss": 1.4071624279022217, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 6.634233629914983, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.15972232818603516, "logits/rejected": 0.12704238295555115, "logps/chosen": -1.3881292343139648, "logps/rejected": -1.5179569721221924, "loss": 0.8229, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3881292343139648, "rewards/margins": 0.12982788681983948, "rewards/rejected": -1.5179569721221924, "sft_loss": 1.3762595653533936, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 6.397481599644503, "learning_rate": 5.258467023172905e-07, "logits/chosen": -0.0834231749176979, "logits/rejected": -0.02731296978890896, "logps/chosen": -1.2913421392440796, "logps/rejected": -1.430359125137329, "loss": 0.8065, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2913421392440796, "rewards/margins": 0.1390170156955719, "rewards/rejected": -1.430359125137329, "sft_loss": 1.2901066541671753, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 6.024311027521206, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.11013118922710419, "logits/rejected": 0.05048912763595581, "logps/chosen": -1.332236647605896, "logps/rejected": -1.4139000177383423, "loss": 0.8426, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.332236647605896, "rewards/margins": 0.08166352659463882, "rewards/rejected": -1.4139000177383423, "sft_loss": 1.3830831050872803, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 4.583555140700801, "learning_rate": 5.436720142602496e-07, "logits/chosen": -0.05276135727763176, "logits/rejected": 0.01633385941386223, "logps/chosen": -1.4365103244781494, "logps/rejected": -1.4422558546066284, "loss": 0.8985, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.4365103244781494, "rewards/margins": 0.005745700094848871, "rewards/rejected": -1.4422558546066284, "sft_loss": 1.4286465644836426, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 7.075648478372616, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.25344425439834595, "logits/rejected": -0.1665697544813156, "logps/chosen": -1.3985621929168701, "logps/rejected": -1.4898195266723633, "loss": 0.8658, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3985621929168701, "rewards/margins": 0.09125734865665436, "rewards/rejected": -1.4898195266723633, "sft_loss": 1.3890941143035889, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 7.026664667321404, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.058100342750549316, "logits/rejected": 0.09575776010751724, "logps/chosen": -1.3847591876983643, "logps/rejected": -1.5425527095794678, "loss": 0.8455, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3847591876983643, "rewards/margins": 0.15779361128807068, "rewards/rejected": -1.5425527095794678, "sft_loss": 1.3969918489456177, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 4.500585439228387, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.11416475474834442, "logits/rejected": 0.011766972951591015, "logps/chosen": -1.33979070186615, "logps/rejected": -1.3991636037826538, "loss": 0.8496, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.33979070186615, "rewards/margins": 0.05937303230166435, "rewards/rejected": -1.3991636037826538, "sft_loss": 1.35588538646698, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 6.626633027882969, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.15494349598884583, "logits/rejected": -0.042571187019348145, "logps/chosen": -1.347100019454956, "logps/rejected": -1.61239492893219, "loss": 0.7985, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.347100019454956, "rewards/margins": 0.2652948796749115, "rewards/rejected": -1.61239492893219, "sft_loss": 1.4237372875213623, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 10.03247230319143, "learning_rate": 5.88235294117647e-07, "logits/chosen": -0.07504062354564667, "logits/rejected": 0.0682159811258316, "logps/chosen": -1.361914038658142, "logps/rejected": -1.559640645980835, "loss": 0.8053, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.361914038658142, "rewards/margins": 0.19772668182849884, "rewards/rejected": -1.559640645980835, "sft_loss": 1.3729658126831055, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 10.12597270301308, "learning_rate": 5.971479500891266e-07, "logits/chosen": -0.0037459968589246273, "logits/rejected": 0.09566168487071991, "logps/chosen": -1.3829057216644287, "logps/rejected": -1.4261614084243774, "loss": 0.8519, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3829057216644287, "rewards/margins": 0.043255515396595, "rewards/rejected": -1.4261614084243774, "sft_loss": 1.3691765069961548, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 9.49071368773292, "learning_rate": 6.060606060606061e-07, "logits/chosen": -0.056303657591342926, "logits/rejected": 0.0899147093296051, "logps/chosen": -1.4569019079208374, "logps/rejected": -1.5441877841949463, "loss": 0.8708, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4569019079208374, "rewards/margins": 0.08728573471307755, "rewards/rejected": -1.5441877841949463, "sft_loss": 1.4234411716461182, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 9.42495544878123, "learning_rate": 6.149732620320855e-07, "logits/chosen": 0.03502111881971359, "logits/rejected": 0.06589053571224213, "logps/chosen": -1.3692471981048584, "logps/rejected": -1.5191218852996826, "loss": 0.8223, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3692471981048584, "rewards/margins": 0.1498747318983078, "rewards/rejected": -1.5191218852996826, "sft_loss": 1.3897430896759033, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 8.834518363433794, "learning_rate": 6.238859180035651e-07, "logits/chosen": -0.011538553051650524, "logits/rejected": 0.08675823360681534, "logps/chosen": -1.327157735824585, "logps/rejected": -1.449873685836792, "loss": 0.8415, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.327157735824585, "rewards/margins": 0.12271575629711151, "rewards/rejected": -1.449873685836792, "sft_loss": 1.3710789680480957, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 6.21195804143653, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.12199006974697113, "logits/rejected": 0.09644833952188492, "logps/chosen": -1.4260783195495605, "logps/rejected": -1.4695813655853271, "loss": 0.8778, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.4260783195495605, "rewards/margins": 0.04350309818983078, "rewards/rejected": -1.4695813655853271, "sft_loss": 1.429587483406067, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 5.989269381813312, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.10130627453327179, "logits/rejected": -0.02199324034154415, "logps/chosen": -1.3627521991729736, "logps/rejected": -1.4971396923065186, "loss": 0.8322, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3627521991729736, "rewards/margins": 0.13438746333122253, "rewards/rejected": -1.4971396923065186, "sft_loss": 1.3304550647735596, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 8.762475691829835, "learning_rate": 6.506238859180035e-07, "logits/chosen": -0.031241711229085922, "logits/rejected": 0.048352546989917755, "logps/chosen": -1.3312468528747559, "logps/rejected": -1.434434413909912, "loss": 0.8377, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3312468528747559, "rewards/margins": 0.10318763554096222, "rewards/rejected": -1.434434413909912, "sft_loss": 1.2982232570648193, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 6.7662993588598335, "learning_rate": 6.59536541889483e-07, "logits/chosen": -0.057675063610076904, "logits/rejected": 0.03325915336608887, "logps/chosen": -1.3215782642364502, "logps/rejected": -1.3781378269195557, "loss": 0.8473, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3215782642364502, "rewards/margins": 0.05655960366129875, "rewards/rejected": -1.3781378269195557, "sft_loss": 1.2895236015319824, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 6.853147172932886, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.09217499196529388, "logits/rejected": 0.06767531484365463, "logps/chosen": -1.3009799718856812, "logps/rejected": -1.4565882682800293, "loss": 0.8175, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3009799718856812, "rewards/margins": 0.15560829639434814, "rewards/rejected": -1.4565882682800293, "sft_loss": 1.3482820987701416, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 5.79678829460423, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.0677647739648819, "logits/rejected": 0.015960121527314186, "logps/chosen": -1.317563772201538, "logps/rejected": -1.4949743747711182, "loss": 0.8006, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.317563772201538, "rewards/margins": 0.17741069197654724, "rewards/rejected": -1.4949743747711182, "sft_loss": 1.3230907917022705, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 4.205897690267764, "learning_rate": 6.862745098039216e-07, "logits/chosen": -0.025275785475969315, "logits/rejected": 0.05514359474182129, "logps/chosen": -1.4152151346206665, "logps/rejected": -1.4196131229400635, "loss": 0.9014, "rewards/accuracies": 0.53125, "rewards/chosen": -1.4152151346206665, "rewards/margins": 0.004398071672767401, "rewards/rejected": -1.4196131229400635, "sft_loss": 1.4195269346237183, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 8.235064148440024, "learning_rate": 6.95187165775401e-07, "logits/chosen": 0.05691586807370186, "logits/rejected": 0.22803792357444763, "logps/chosen": -1.4144960641860962, "logps/rejected": -1.4881465435028076, "loss": 0.8708, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.4144960641860962, "rewards/margins": 0.07365051656961441, "rewards/rejected": -1.4881465435028076, "sft_loss": 1.412201166152954, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 6.485802234883967, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.07718853652477264, "logits/rejected": 0.08208423107862473, "logps/chosen": -1.3642868995666504, "logps/rejected": -1.3951537609100342, "loss": 0.8544, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3642868995666504, "rewards/margins": 0.030866652727127075, "rewards/rejected": -1.3951537609100342, "sft_loss": 1.3641996383666992, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 9.6476369842328, "learning_rate": 7.1301247771836e-07, "logits/chosen": 0.045478858053684235, "logits/rejected": 0.13731339573860168, "logps/chosen": -1.3522393703460693, "logps/rejected": -1.467068076133728, "loss": 0.8248, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3522393703460693, "rewards/margins": 0.11482896655797958, "rewards/rejected": -1.467068076133728, "sft_loss": 1.3359606266021729, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.2209644317626953, "eval_logits/rejected": 0.3069458603858948, "eval_logps/chosen": -1.3850491046905518, "eval_logps/rejected": -1.5360316038131714, "eval_loss": 0.8254688382148743, "eval_rewards/accuracies": 0.5645400881767273, "eval_rewards/chosen": -1.3850491046905518, "eval_rewards/margins": 0.15098246932029724, "eval_rewards/rejected": -1.5360316038131714, "eval_runtime": 48.631, "eval_samples_per_second": 27.657, "eval_sft_loss": 1.390453815460205, "eval_steps_per_second": 6.93, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 6.947088991355177, "learning_rate": 7.219251336898395e-07, "logits/chosen": -0.03424640744924545, "logits/rejected": 0.05961717292666435, "logps/chosen": -1.3753900527954102, "logps/rejected": -1.4629487991333008, "loss": 0.8445, "rewards/accuracies": 0.5, "rewards/chosen": -1.3753900527954102, "rewards/margins": 0.08755877614021301, "rewards/rejected": -1.4629487991333008, "sft_loss": 1.3514071702957153, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 9.031130816963694, "learning_rate": 7.30837789661319e-07, "logits/chosen": -0.018977805972099304, "logits/rejected": 0.11137328296899796, "logps/chosen": -1.3488214015960693, "logps/rejected": -1.4602519273757935, "loss": 0.8337, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3488214015960693, "rewards/margins": 0.11143036186695099, "rewards/rejected": -1.4602519273757935, "sft_loss": 1.3629045486450195, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 5.460981977261194, "learning_rate": 7.397504456327985e-07, "logits/chosen": -0.008375492878258228, "logits/rejected": 0.03483561426401138, "logps/chosen": -1.3426172733306885, "logps/rejected": -1.515251874923706, "loss": 0.8124, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3426172733306885, "rewards/margins": 0.1726348102092743, "rewards/rejected": -1.515251874923706, "sft_loss": 1.3418259620666504, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 5.830405504498053, "learning_rate": 7.486631016042781e-07, "logits/chosen": -0.037222422659397125, "logits/rejected": 0.15673691034317017, "logps/chosen": -1.3136218786239624, "logps/rejected": -1.4364408254623413, "loss": 0.8295, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3136218786239624, "rewards/margins": 0.12281904369592667, "rewards/rejected": -1.4364408254623413, "sft_loss": 1.351162314414978, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 6.749974545526729, "learning_rate": 7.575757575757575e-07, "logits/chosen": -0.08175744116306305, "logits/rejected": 0.11871360242366791, "logps/chosen": -1.3690340518951416, "logps/rejected": -1.5547032356262207, "loss": 0.8055, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3690340518951416, "rewards/margins": 0.18566930294036865, "rewards/rejected": -1.5547032356262207, "sft_loss": 1.421238660812378, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 9.055552858972396, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.1090446263551712, "logits/rejected": 0.08761148154735565, "logps/chosen": -1.3930914402008057, "logps/rejected": -1.5755016803741455, "loss": 0.8091, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3930914402008057, "rewards/margins": 0.18241021037101746, "rewards/rejected": -1.5755016803741455, "sft_loss": 1.4188745021820068, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 7.401098274643508, "learning_rate": 7.754010695187165e-07, "logits/chosen": -0.0450705885887146, "logits/rejected": 0.0441429577767849, "logps/chosen": -1.2809169292449951, "logps/rejected": -1.4211491346359253, "loss": 0.8091, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2809169292449951, "rewards/margins": 0.14023222029209137, "rewards/rejected": -1.4211491346359253, "sft_loss": 1.3247771263122559, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 5.730607642317396, "learning_rate": 7.84313725490196e-07, "logits/chosen": -0.03399479389190674, "logits/rejected": 0.05846908688545227, "logps/chosen": -1.3478953838348389, "logps/rejected": -1.482497215270996, "loss": 0.8123, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3478953838348389, "rewards/margins": 0.13460186123847961, "rewards/rejected": -1.482497215270996, "sft_loss": 1.3558604717254639, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 6.077000807250297, "learning_rate": 7.932263814616755e-07, "logits/chosen": -0.08569110929965973, "logits/rejected": 0.024291569367051125, "logps/chosen": -1.3827598094940186, "logps/rejected": -1.5550761222839355, "loss": 0.8288, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3827598094940186, "rewards/margins": 0.17231638729572296, "rewards/rejected": -1.5550761222839355, "sft_loss": 1.3997600078582764, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 10.457164427371827, "learning_rate": 8.02139037433155e-07, "logits/chosen": -0.016309332102537155, "logits/rejected": 0.11412891000509262, "logps/chosen": -1.4031344652175903, "logps/rejected": -1.5545867681503296, "loss": 0.7956, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4031344652175903, "rewards/margins": 0.1514524519443512, "rewards/rejected": -1.5545867681503296, "sft_loss": 1.350414514541626, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 8.11384192787803, "learning_rate": 8.110516934046346e-07, "logits/chosen": -0.01686396822333336, "logits/rejected": 0.07750639319419861, "logps/chosen": -1.332587718963623, "logps/rejected": -1.570004940032959, "loss": 0.787, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.332587718963623, "rewards/margins": 0.2374173402786255, "rewards/rejected": -1.570004940032959, "sft_loss": 1.324858546257019, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 7.4695084721433025, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.138697549700737, "logits/rejected": -0.011783541180193424, "logps/chosen": -1.4591374397277832, "logps/rejected": -1.5521273612976074, "loss": 0.8561, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.4591374397277832, "rewards/margins": 0.09298999607563019, "rewards/rejected": -1.5521273612976074, "sft_loss": 1.4650377035140991, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 6.755943558335849, "learning_rate": 8.288770053475936e-07, "logits/chosen": 0.1174217090010643, "logits/rejected": 0.14014877378940582, "logps/chosen": -1.397404432296753, "logps/rejected": -1.5828224420547485, "loss": 0.8211, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.397404432296753, "rewards/margins": 0.18541797995567322, "rewards/rejected": -1.5828224420547485, "sft_loss": 1.370445728302002, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 6.58948444310401, "learning_rate": 8.37789661319073e-07, "logits/chosen": 0.13835200667381287, "logits/rejected": 0.08895708620548248, "logps/chosen": -1.3167743682861328, "logps/rejected": -1.533786654472351, "loss": 0.7914, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3167743682861328, "rewards/margins": 0.21701231598854065, "rewards/rejected": -1.533786654472351, "sft_loss": 1.3352279663085938, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 5.614237041453402, "learning_rate": 8.467023172905525e-07, "logits/chosen": -0.08576063066720963, "logits/rejected": 0.053143393248319626, "logps/chosen": -1.3589133024215698, "logps/rejected": -1.649198293685913, "loss": 0.7756, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3589133024215698, "rewards/margins": 0.2902849316596985, "rewards/rejected": -1.649198293685913, "sft_loss": 1.3821589946746826, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 6.393412432516514, "learning_rate": 8.55614973262032e-07, "logits/chosen": -0.06489423662424088, "logits/rejected": 0.13662515580654144, "logps/chosen": -1.3457838296890259, "logps/rejected": -1.4462194442749023, "loss": 0.8271, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3457838296890259, "rewards/margins": 0.10043561458587646, "rewards/rejected": -1.4462194442749023, "sft_loss": 1.3525927066802979, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 7.335230505121668, "learning_rate": 8.645276292335115e-07, "logits/chosen": -0.02245071530342102, "logits/rejected": 0.01754281111061573, "logps/chosen": -1.4477413892745972, "logps/rejected": -1.552997350692749, "loss": 0.849, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4477413892745972, "rewards/margins": 0.1052558571100235, "rewards/rejected": -1.552997350692749, "sft_loss": 1.4250624179840088, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 6.5223760116254885, "learning_rate": 8.734402852049911e-07, "logits/chosen": -0.013107108883559704, "logits/rejected": 0.05477488040924072, "logps/chosen": -1.382375955581665, "logps/rejected": -1.495545506477356, "loss": 0.8455, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.382375955581665, "rewards/margins": 0.11316970735788345, "rewards/rejected": -1.495545506477356, "sft_loss": 1.3612608909606934, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 7.548457429922757, "learning_rate": 8.823529411764705e-07, "logits/chosen": -0.045364078134298325, "logits/rejected": -0.02277747355401516, "logps/chosen": -1.3938887119293213, "logps/rejected": -1.5227181911468506, "loss": 0.8392, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3938887119293213, "rewards/margins": 0.12882940471172333, "rewards/rejected": -1.5227181911468506, "sft_loss": 1.4395800828933716, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 5.617002521076255, "learning_rate": 8.912655971479501e-07, "logits/chosen": -0.06262558698654175, "logits/rejected": 0.038189757615327835, "logps/chosen": -1.3054161071777344, "logps/rejected": -1.480383276939392, "loss": 0.8152, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3054161071777344, "rewards/margins": 0.17496728897094727, "rewards/rejected": -1.480383276939392, "sft_loss": 1.3259804248809814, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 7.596897551019339, "learning_rate": 9.001782531194295e-07, "logits/chosen": -0.07416924089193344, "logits/rejected": 0.0727233737707138, "logps/chosen": -1.419053554534912, "logps/rejected": -1.4824250936508179, "loss": 0.8566, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.419053554534912, "rewards/margins": 0.06337149441242218, "rewards/rejected": -1.4824250936508179, "sft_loss": 1.4202762842178345, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 6.9362611456040835, "learning_rate": 9.09090909090909e-07, "logits/chosen": 0.07286637276411057, "logits/rejected": 0.136042058467865, "logps/chosen": -1.3592536449432373, "logps/rejected": -1.568825125694275, "loss": 0.791, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3592536449432373, "rewards/margins": 0.20957139134407043, "rewards/rejected": -1.568825125694275, "sft_loss": 1.3293838500976562, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 4.938681435492669, "learning_rate": 9.180035650623885e-07, "logits/chosen": 0.009478648193180561, "logits/rejected": 0.10604576766490936, "logps/chosen": -1.3049455881118774, "logps/rejected": -1.4866827726364136, "loss": 0.8066, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3049455881118774, "rewards/margins": 0.18173733353614807, "rewards/rejected": -1.4866827726364136, "sft_loss": 1.327091932296753, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 4.749814960530297, "learning_rate": 9.26916221033868e-07, "logits/chosen": -0.11864975839853287, "logits/rejected": 0.016519224271178246, "logps/chosen": -1.353294014930725, "logps/rejected": -1.5012271404266357, "loss": 0.834, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.353294014930725, "rewards/margins": 0.1479329615831375, "rewards/rejected": -1.5012271404266357, "sft_loss": 1.445577621459961, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 11.371383046002663, "learning_rate": 9.358288770053476e-07, "logits/chosen": 0.0786885991692543, "logits/rejected": 0.15137773752212524, "logps/chosen": -1.3430989980697632, "logps/rejected": -1.571850061416626, "loss": 0.8068, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3430989980697632, "rewards/margins": 0.22875118255615234, "rewards/rejected": -1.571850061416626, "sft_loss": 1.4153742790222168, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 5.072141904022947, "learning_rate": 9.44741532976827e-07, "logits/chosen": 0.05348137021064758, "logits/rejected": 0.1374559849500656, "logps/chosen": -1.3070955276489258, "logps/rejected": -1.4544134140014648, "loss": 0.8124, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3070955276489258, "rewards/margins": 0.1473180055618286, "rewards/rejected": -1.4544134140014648, "sft_loss": 1.2672505378723145, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 5.244100816299926, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.14164254069328308, "logits/rejected": 0.11081121861934662, "logps/chosen": -1.3168877363204956, "logps/rejected": -1.4304202795028687, "loss": 0.8153, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3168877363204956, "rewards/margins": 0.11353246867656708, "rewards/rejected": -1.4304202795028687, "sft_loss": 1.2874393463134766, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 5.089804995973252, "learning_rate": 9.62566844919786e-07, "logits/chosen": -0.02128760516643524, "logits/rejected": 0.04924682155251503, "logps/chosen": -1.4541218280792236, "logps/rejected": -1.5730615854263306, "loss": 0.8458, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4541218280792236, "rewards/margins": 0.1189398542046547, "rewards/rejected": -1.5730615854263306, "sft_loss": 1.4805867671966553, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 5.988739163417053, "learning_rate": 9.714795008912655e-07, "logits/chosen": -0.11403359472751617, "logits/rejected": 0.08642077445983887, "logps/chosen": -1.3758718967437744, "logps/rejected": -1.5480414628982544, "loss": 0.8067, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3758718967437744, "rewards/margins": 0.17216962575912476, "rewards/rejected": -1.5480414628982544, "sft_loss": 1.3711098432540894, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 5.700855217177051, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.0030816257931292057, "logits/rejected": 0.06977695226669312, "logps/chosen": -1.3891031742095947, "logps/rejected": -1.560868263244629, "loss": 0.8001, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3891031742095947, "rewards/margins": 0.17176511883735657, "rewards/rejected": -1.560868263244629, "sft_loss": 1.3536107540130615, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 9.20009268289356, "learning_rate": 9.893048128342244e-07, "logits/chosen": -0.08153178542852402, "logits/rejected": 0.04767586663365364, "logps/chosen": -1.4646544456481934, "logps/rejected": -1.5674375295639038, "loss": 0.8624, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4646544456481934, "rewards/margins": 0.10278304666280746, "rewards/rejected": -1.5674375295639038, "sft_loss": 1.45162034034729, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 6.905524608877352, "learning_rate": 9.98217468805704e-07, "logits/chosen": 0.02582872472703457, "logits/rejected": 0.041776977479457855, "logps/chosen": -1.3279228210449219, "logps/rejected": -1.507521152496338, "loss": 0.8075, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3279228210449219, "rewards/margins": 0.17959830164909363, "rewards/rejected": -1.507521152496338, "sft_loss": 1.437023639678955, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 6.554422969163889, "learning_rate": 9.999984476788462e-07, "logits/chosen": 0.022335387766361237, "logits/rejected": 0.08181963860988617, "logps/chosen": -1.4174307584762573, "logps/rejected": -1.5860637426376343, "loss": 0.8208, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4174307584762573, "rewards/margins": 0.16863301396369934, "rewards/rejected": -1.5860637426376343, "sft_loss": 1.4302308559417725, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 7.6639169993243454, "learning_rate": 9.999921413906797e-07, "logits/chosen": -0.05036438629031181, "logits/rejected": 0.1768157184123993, "logps/chosen": -1.3770478963851929, "logps/rejected": -1.5245137214660645, "loss": 0.8363, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3770478963851929, "rewards/margins": 0.14746567606925964, "rewards/rejected": -1.5245137214660645, "sft_loss": 1.446306824684143, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 6.913892253081485, "learning_rate": 9.999809841765644e-07, "logits/chosen": -0.01328777801245451, "logits/rejected": 0.0488358698785305, "logps/chosen": -1.3167216777801514, "logps/rejected": -1.4920165538787842, "loss": 0.8055, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3167216777801514, "rewards/margins": 0.17529483139514923, "rewards/rejected": -1.4920165538787842, "sft_loss": 1.3323217630386353, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 5.398370826970335, "learning_rate": 9.999649761447477e-07, "logits/chosen": -0.019995415583252907, "logits/rejected": 0.1481052190065384, "logps/chosen": -1.3342006206512451, "logps/rejected": -1.5779651403427124, "loss": 0.7727, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3342006206512451, "rewards/margins": 0.24376468360424042, "rewards/rejected": -1.5779651403427124, "sft_loss": 1.3413031101226807, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 6.596705010903569, "learning_rate": 9.999441174505398e-07, "logits/chosen": -0.06026517227292061, "logits/rejected": 0.05025007575750351, "logps/chosen": -1.5123201608657837, "logps/rejected": -1.620935082435608, "loss": 0.8712, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5123201608657837, "rewards/margins": 0.10861505568027496, "rewards/rejected": -1.620935082435608, "sft_loss": 1.4992682933807373, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 9.726914897320233, "learning_rate": 9.999184082963116e-07, "logits/chosen": -0.03563661500811577, "logits/rejected": 0.09400226175785065, "logps/chosen": -1.4427645206451416, "logps/rejected": -1.5188099145889282, "loss": 0.86, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4427645206451416, "rewards/margins": 0.07604547590017319, "rewards/rejected": -1.5188099145889282, "sft_loss": 1.4608362913131714, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 6.88657845548431, "learning_rate": 9.998878489314937e-07, "logits/chosen": 0.0094614177942276, "logits/rejected": 0.14305809140205383, "logps/chosen": -1.3531228303909302, "logps/rejected": -1.504281759262085, "loss": 0.8167, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3531228303909302, "rewards/margins": 0.15115907788276672, "rewards/rejected": -1.504281759262085, "sft_loss": 1.3632830381393433, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 5.314876905137107, "learning_rate": 9.99852439652573e-07, "logits/chosen": -0.05322295427322388, "logits/rejected": 0.09657259285449982, "logps/chosen": -1.3616068363189697, "logps/rejected": -1.4519062042236328, "loss": 0.8375, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3616068363189697, "rewards/margins": 0.09029947221279144, "rewards/rejected": -1.4519062042236328, "sft_loss": 1.384135365486145, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 6.729297181381879, "learning_rate": 9.998121808030904e-07, "logits/chosen": -0.07589106261730194, "logits/rejected": 0.013483390212059021, "logps/chosen": -1.4494796991348267, "logps/rejected": -1.6531600952148438, "loss": 0.8215, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4494796991348267, "rewards/margins": 0.2036806344985962, "rewards/rejected": -1.6531600952148438, "sft_loss": 1.4627764225006104, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 15.305179267577746, "learning_rate": 9.997670727736379e-07, "logits/chosen": 0.052164338529109955, "logits/rejected": 0.21540775895118713, "logps/chosen": -1.4092166423797607, "logps/rejected": -1.5794641971588135, "loss": 0.8293, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4092166423797607, "rewards/margins": 0.17024731636047363, "rewards/rejected": -1.5794641971588135, "sft_loss": 1.4149467945098877, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 8.754168981723573, "learning_rate": 9.99717116001853e-07, "logits/chosen": -0.04741921275854111, "logits/rejected": 0.062200047075748444, "logps/chosen": -1.4097732305526733, "logps/rejected": -1.6510112285614014, "loss": 0.8057, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4097732305526733, "rewards/margins": 0.24123787879943848, "rewards/rejected": -1.6510112285614014, "sft_loss": 1.4197732210159302, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 5.279109991539173, "learning_rate": 9.996623109724173e-07, "logits/chosen": 0.04084223881363869, "logits/rejected": 0.11059192568063736, "logps/chosen": -1.4887102842330933, "logps/rejected": -1.6538307666778564, "loss": 0.8345, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4887102842330933, "rewards/margins": 0.16512033343315125, "rewards/rejected": -1.6538307666778564, "sft_loss": 1.466871976852417, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 8.456355044674595, "learning_rate": 9.996026582170488e-07, "logits/chosen": 0.07184690982103348, "logits/rejected": 0.19549378752708435, "logps/chosen": -1.3768669366836548, "logps/rejected": -1.6150470972061157, "loss": 0.7847, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3768669366836548, "rewards/margins": 0.23818016052246094, "rewards/rejected": -1.6150470972061157, "sft_loss": 1.3980249166488647, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 8.517314779995383, "learning_rate": 9.995381583144996e-07, "logits/chosen": 0.01540251076221466, "logits/rejected": 0.13189511001110077, "logps/chosen": -1.4255297183990479, "logps/rejected": -1.6558986902236938, "loss": 0.7822, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4255297183990479, "rewards/margins": 0.23036888241767883, "rewards/rejected": -1.6558986902236938, "sft_loss": 1.38680100440979, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 4.94324530072853, "learning_rate": 9.994688118905471e-07, "logits/chosen": -0.01976579800248146, "logits/rejected": 0.22383160889148712, "logps/chosen": -1.500919222831726, "logps/rejected": -1.6664314270019531, "loss": 0.8493, "rewards/accuracies": 0.5625, "rewards/chosen": -1.500919222831726, "rewards/margins": 0.1655120551586151, "rewards/rejected": -1.6664314270019531, "sft_loss": 1.4955133199691772, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 19.917573265301105, "learning_rate": 9.993946196179912e-07, "logits/chosen": -0.09189249575138092, "logits/rejected": 0.10940637439489365, "logps/chosen": -1.4420835971832275, "logps/rejected": -1.6289657354354858, "loss": 0.8267, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.4420835971832275, "rewards/margins": 0.1868821680545807, "rewards/rejected": -1.6289657354354858, "sft_loss": 1.4969195127487183, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 6.2713690228178045, "learning_rate": 9.993155822166455e-07, "logits/chosen": -0.09385538101196289, "logits/rejected": -0.000730663537979126, "logps/chosen": -1.3631962537765503, "logps/rejected": -1.63164484500885, "loss": 0.7634, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3631962537765503, "rewards/margins": 0.26844847202301025, "rewards/rejected": -1.63164484500885, "sft_loss": 1.3461940288543701, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 8.577436477801887, "learning_rate": 9.992317004533313e-07, "logits/chosen": -0.020334230735898018, "logits/rejected": 0.12423195689916611, "logps/chosen": -1.5157902240753174, "logps/rejected": -1.7500642538070679, "loss": 0.8075, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5157902240753174, "rewards/margins": 0.23427405953407288, "rewards/rejected": -1.7500642538070679, "sft_loss": 1.5177749395370483, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 7.556912330641032, "learning_rate": 9.991429751418696e-07, "logits/chosen": 0.032945163547992706, "logits/rejected": 0.04700014740228653, "logps/chosen": -1.4554027318954468, "logps/rejected": -1.7037118673324585, "loss": 0.8225, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4554027318954468, "rewards/margins": 0.2483091652393341, "rewards/rejected": -1.7037118673324585, "sft_loss": 1.458221673965454, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 6.899620491436677, "learning_rate": 9.99049407143074e-07, "logits/chosen": 0.027759453281760216, "logits/rejected": 0.1641981303691864, "logps/chosen": -1.4130445718765259, "logps/rejected": -1.5385392904281616, "loss": 0.8405, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4130445718765259, "rewards/margins": 0.12549491226673126, "rewards/rejected": -1.5385392904281616, "sft_loss": 1.436645269393921, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 7.247434774624164, "learning_rate": 9.989509973647416e-07, "logits/chosen": 0.010338058695197105, "logits/rejected": 0.15490129590034485, "logps/chosen": -1.3537068367004395, "logps/rejected": -1.5573979616165161, "loss": 0.8041, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3537068367004395, "rewards/margins": 0.20369116961956024, "rewards/rejected": -1.5573979616165161, "sft_loss": 1.394716739654541, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 6.466865978837853, "learning_rate": 9.988477467616445e-07, "logits/chosen": -0.027265384793281555, "logits/rejected": 0.19246003031730652, "logps/chosen": -1.3908880949020386, "logps/rejected": -1.533629059791565, "loss": 0.8189, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3908880949020386, "rewards/margins": 0.14274093508720398, "rewards/rejected": -1.533629059791565, "sft_loss": 1.4787267446517944, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 6.964670242435892, "learning_rate": 9.987396563355205e-07, "logits/chosen": -0.044592663645744324, "logits/rejected": 0.03786772862076759, "logps/chosen": -1.390737771987915, "logps/rejected": -1.6763662099838257, "loss": 0.7778, "rewards/accuracies": 0.625, "rewards/chosen": -1.390737771987915, "rewards/margins": 0.2856284976005554, "rewards/rejected": -1.6763662099838257, "sft_loss": 1.4360474348068237, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 6.430263476547815, "learning_rate": 9.986267271350631e-07, "logits/chosen": 0.08023092895746231, "logits/rejected": 0.25273817777633667, "logps/chosen": -1.4434854984283447, "logps/rejected": -1.5930297374725342, "loss": 0.8658, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4434854984283447, "rewards/margins": 0.14954404532909393, "rewards/rejected": -1.5930297374725342, "sft_loss": 1.41591477394104, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 9.605751859573651, "learning_rate": 9.985089602559123e-07, "logits/chosen": 0.05132424831390381, "logits/rejected": 0.22000408172607422, "logps/chosen": -1.4061557054519653, "logps/rejected": -1.5719212293624878, "loss": 0.8262, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4061557054519653, "rewards/margins": 0.16576561331748962, "rewards/rejected": -1.5719212293624878, "sft_loss": 1.4110631942749023, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 7.568831910208547, "learning_rate": 9.983863568406428e-07, "logits/chosen": 0.06606674194335938, "logits/rejected": 0.11064629256725311, "logps/chosen": -1.4183576107025146, "logps/rejected": -1.610594391822815, "loss": 0.8312, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4183576107025146, "rewards/margins": 0.19223684072494507, "rewards/rejected": -1.610594391822815, "sft_loss": 1.4632813930511475, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 5.785029878046707, "learning_rate": 9.982589180787532e-07, "logits/chosen": 0.058470167219638824, "logits/rejected": 0.1552259624004364, "logps/chosen": -1.3224313259124756, "logps/rejected": -1.556247591972351, "loss": 0.7871, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3224313259124756, "rewards/margins": 0.23381614685058594, "rewards/rejected": -1.556247591972351, "sft_loss": 1.360646367073059, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 8.552315630638782, "learning_rate": 9.981266452066553e-07, "logits/chosen": -0.07230981439352036, "logits/rejected": 0.08344583213329315, "logps/chosen": -1.489461898803711, "logps/rejected": -1.6290485858917236, "loss": 0.8322, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.489461898803711, "rewards/margins": 0.13958671689033508, "rewards/rejected": -1.6290485858917236, "sft_loss": 1.457029938697815, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 7.858520268041617, "learning_rate": 9.979895395076608e-07, "logits/chosen": -0.03659159690141678, "logits/rejected": 0.15742424130439758, "logps/chosen": -1.4309229850769043, "logps/rejected": -1.7274789810180664, "loss": 0.7778, "rewards/accuracies": 0.625, "rewards/chosen": -1.4309229850769043, "rewards/margins": 0.2965560555458069, "rewards/rejected": -1.7274789810180664, "sft_loss": 1.452720284461975, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 7.630780130914619, "learning_rate": 9.9784760231197e-07, "logits/chosen": 0.07541829347610474, "logits/rejected": 0.1804257035255432, "logps/chosen": -1.4164173603057861, "logps/rejected": -1.6312143802642822, "loss": 0.7882, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4164173603057861, "rewards/margins": 0.21479690074920654, "rewards/rejected": -1.6312143802642822, "sft_loss": 1.3946534395217896, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 10.03469205631459, "learning_rate": 9.97700834996658e-07, "logits/chosen": -3.202706648153253e-05, "logits/rejected": 0.1807897835969925, "logps/chosen": -1.506064534187317, "logps/rejected": -1.6994918584823608, "loss": 0.8079, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.506064534187317, "rewards/margins": 0.19342732429504395, "rewards/rejected": -1.6994918584823608, "sft_loss": 1.4442682266235352, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 6.831412234684436, "learning_rate": 9.97549238985662e-07, "logits/chosen": 0.0943126231431961, "logits/rejected": 0.3053697645664215, "logps/chosen": -1.5625550746917725, "logps/rejected": -1.7868919372558594, "loss": 0.8404, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5625550746917725, "rewards/margins": 0.22433657944202423, "rewards/rejected": -1.7868919372558594, "sft_loss": 1.5496907234191895, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 7.128366176422961, "learning_rate": 9.973928157497674e-07, "logits/chosen": -0.06101804971694946, "logits/rejected": 0.08986925333738327, "logps/chosen": -1.3635046482086182, "logps/rejected": -1.7173395156860352, "loss": 0.7427, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3635046482086182, "rewards/margins": 0.35383477807044983, "rewards/rejected": -1.7173395156860352, "sft_loss": 1.4110605716705322, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 7.505984504486462, "learning_rate": 9.972315668065927e-07, "logits/chosen": -0.0960775688290596, "logits/rejected": 0.07468603551387787, "logps/chosen": -1.4832559823989868, "logps/rejected": -1.7148786783218384, "loss": 0.8098, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4832559823989868, "rewards/margins": 0.2316228151321411, "rewards/rejected": -1.7148786783218384, "sft_loss": 1.4715131521224976, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 5.956396203551287, "learning_rate": 9.97065493720576e-07, "logits/chosen": -0.0648040622472763, "logits/rejected": 0.04636877775192261, "logps/chosen": -1.496577262878418, "logps/rejected": -1.6820507049560547, "loss": 0.82, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.496577262878418, "rewards/margins": 0.18547356128692627, "rewards/rejected": -1.6820507049560547, "sft_loss": 1.5222786664962769, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 13.346157170364691, "learning_rate": 9.968945981029594e-07, "logits/chosen": -0.04654891416430473, "logits/rejected": 0.14179016649723053, "logps/chosen": -1.5719588994979858, "logps/rejected": -1.7497106790542603, "loss": 0.8483, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.5719588994979858, "rewards/margins": 0.17775161564350128, "rewards/rejected": -1.7497106790542603, "sft_loss": 1.5560394525527954, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 6.305412433191499, "learning_rate": 9.967188816117726e-07, "logits/chosen": 0.08343605697154999, "logits/rejected": 0.17710405588150024, "logps/chosen": -1.5416231155395508, "logps/rejected": -1.8573392629623413, "loss": 0.8273, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.5416231155395508, "rewards/margins": 0.31571608781814575, "rewards/rejected": -1.8573392629623413, "sft_loss": 1.503692388534546, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 8.196870077157955, "learning_rate": 9.965383459518179e-07, "logits/chosen": -0.0007702164584770799, "logits/rejected": 0.1767648458480835, "logps/chosen": -1.4756300449371338, "logps/rejected": -1.7681652307510376, "loss": 0.7865, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4756300449371338, "rewards/margins": 0.29253506660461426, "rewards/rejected": -1.7681652307510376, "sft_loss": 1.4575726985931396, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 6.696973846857812, "learning_rate": 9.963529928746533e-07, "logits/chosen": 0.04241034761071205, "logits/rejected": 0.191898912191391, "logps/chosen": -1.4594138860702515, "logps/rejected": -1.6694276332855225, "loss": 0.8296, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4594138860702515, "rewards/margins": 0.21001389622688293, "rewards/rejected": -1.6694276332855225, "sft_loss": 1.4784830808639526, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 4.9330077662880525, "learning_rate": 9.961628241785746e-07, "logits/chosen": -0.05131806805729866, "logits/rejected": 0.027555961161851883, "logps/chosen": -1.4626655578613281, "logps/rejected": -1.7275264263153076, "loss": 0.7949, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4626655578613281, "rewards/margins": 0.26486068964004517, "rewards/rejected": -1.7275264263153076, "sft_loss": 1.487533450126648, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 5.970242355976017, "learning_rate": 9.959678417085998e-07, "logits/chosen": -0.00278986687771976, "logits/rejected": 0.09611242264509201, "logps/chosen": -1.4592771530151367, "logps/rejected": -1.644202470779419, "loss": 0.8096, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4592771530151367, "rewards/margins": 0.18492527306079865, "rewards/rejected": -1.644202470779419, "sft_loss": 1.4266273975372314, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 8.221146972645718, "learning_rate": 9.957680473564493e-07, "logits/chosen": 0.10509107261896133, "logits/rejected": 0.24740242958068848, "logps/chosen": -1.4015429019927979, "logps/rejected": -1.789790391921997, "loss": 0.7594, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4015429019927979, "rewards/margins": 0.38824737071990967, "rewards/rejected": -1.789790391921997, "sft_loss": 1.4088695049285889, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 5.996746772412752, "learning_rate": 9.95563443060529e-07, "logits/chosen": -0.05945770815014839, "logits/rejected": 0.12533241510391235, "logps/chosen": -1.4721711874008179, "logps/rejected": -1.7401840686798096, "loss": 0.8064, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4721711874008179, "rewards/margins": 0.2680128514766693, "rewards/rejected": -1.7401840686798096, "sft_loss": 1.411442518234253, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 5.2693711992511, "learning_rate": 9.95354030805911e-07, "logits/chosen": -0.10139896720647812, "logits/rejected": 0.059671349823474884, "logps/chosen": -1.4136241674423218, "logps/rejected": -1.7005494832992554, "loss": 0.7767, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4136241674423218, "rewards/margins": 0.2869250774383545, "rewards/rejected": -1.7005494832992554, "sft_loss": 1.4422767162322998, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 5.940862800195787, "learning_rate": 9.951398126243133e-07, "logits/chosen": 0.049787215888500214, "logits/rejected": 0.18889674544334412, "logps/chosen": -1.415466070175171, "logps/rejected": -1.756277322769165, "loss": 0.7662, "rewards/accuracies": 0.59375, "rewards/chosen": -1.415466070175171, "rewards/margins": 0.3408113121986389, "rewards/rejected": -1.756277322769165, "sft_loss": 1.4119466543197632, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 8.763617032563491, "learning_rate": 9.94920790594082e-07, "logits/chosen": -0.011156201362609863, "logits/rejected": 0.13287320733070374, "logps/chosen": -1.4517066478729248, "logps/rejected": -1.7545764446258545, "loss": 0.7743, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4517066478729248, "rewards/margins": 0.3028695583343506, "rewards/rejected": -1.7545764446258545, "sft_loss": 1.428072452545166, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 9.432237364572199, "learning_rate": 9.946969668401696e-07, "logits/chosen": -0.011267140507698059, "logits/rejected": 0.20100240409374237, "logps/chosen": -1.4378983974456787, "logps/rejected": -1.831396460533142, "loss": 0.7767, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4378983974456787, "rewards/margins": 0.3934980034828186, "rewards/rejected": -1.831396460533142, "sft_loss": 1.4487905502319336, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 6.7524390474487666, "learning_rate": 9.944683435341155e-07, "logits/chosen": 0.038916267454624176, "logits/rejected": 0.1338595598936081, "logps/chosen": -1.4667879343032837, "logps/rejected": -1.7416179180145264, "loss": 0.7884, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4667879343032837, "rewards/margins": 0.274830162525177, "rewards/rejected": -1.7416179180145264, "sft_loss": 1.4271196126937866, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.38953980803489685, "eval_logits/rejected": 0.4913637042045593, "eval_logps/chosen": -1.519872784614563, "eval_logps/rejected": -1.8624564409255981, "eval_loss": 0.781060516834259, "eval_rewards/accuracies": 0.6112759709358215, "eval_rewards/chosen": -1.519872784614563, "eval_rewards/margins": 0.3425837755203247, "eval_rewards/rejected": -1.8624564409255981, "eval_runtime": 48.1988, "eval_samples_per_second": 27.905, "eval_sft_loss": 1.4856693744659424, "eval_steps_per_second": 6.992, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 9.478412651669851, "learning_rate": 9.942349228940236e-07, "logits/chosen": -0.024741273373365402, "logits/rejected": 0.16353586316108704, "logps/chosen": -1.5089105367660522, "logps/rejected": -1.930978536605835, "loss": 0.7698, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5089105367660522, "rewards/margins": 0.42206794023513794, "rewards/rejected": -1.930978536605835, "sft_loss": 1.4987614154815674, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 8.559969563956894, "learning_rate": 9.939967071845424e-07, "logits/chosen": 0.0923948884010315, "logits/rejected": 0.17605474591255188, "logps/chosen": -1.4771305322647095, "logps/rejected": -1.8206688165664673, "loss": 0.7745, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4771305322647095, "rewards/margins": 0.3435381352901459, "rewards/rejected": -1.8206688165664673, "sft_loss": 1.4665313959121704, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 13.44467394433346, "learning_rate": 9.937536987168413e-07, "logits/chosen": 0.08743222802877426, "logits/rejected": 0.2398766726255417, "logps/chosen": -1.4706168174743652, "logps/rejected": -1.9225200414657593, "loss": 0.7657, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4706168174743652, "rewards/margins": 0.45190319418907166, "rewards/rejected": -1.9225200414657593, "sft_loss": 1.4846904277801514, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 9.09437305489372, "learning_rate": 9.935058998485896e-07, "logits/chosen": 0.11175093799829483, "logits/rejected": 0.170477032661438, "logps/chosen": -1.5116984844207764, "logps/rejected": -1.8542693853378296, "loss": 0.7963, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5116984844207764, "rewards/margins": 0.3425709307193756, "rewards/rejected": -1.8542693853378296, "sft_loss": 1.4979822635650635, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 13.903122243576494, "learning_rate": 9.932533129839333e-07, "logits/chosen": 0.03402037173509598, "logits/rejected": 0.17264944314956665, "logps/chosen": -1.4104028940200806, "logps/rejected": -1.7492183446884155, "loss": 0.7925, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4104028940200806, "rewards/margins": 0.3388154208660126, "rewards/rejected": -1.7492183446884155, "sft_loss": 1.4937907457351685, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 8.181214260342125, "learning_rate": 9.929959405734711e-07, "logits/chosen": 0.13250409066677094, "logits/rejected": 0.3093946874141693, "logps/chosen": -1.4798091650009155, "logps/rejected": -1.7190351486206055, "loss": 0.8047, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4798091650009155, "rewards/margins": 0.23922595381736755, "rewards/rejected": -1.7190351486206055, "sft_loss": 1.4476814270019531, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 7.021128237002189, "learning_rate": 9.927337851142314e-07, "logits/chosen": 0.07647381722927094, "logits/rejected": 0.22388648986816406, "logps/chosen": -1.423357605934143, "logps/rejected": -1.6930935382843018, "loss": 0.7975, "rewards/accuracies": 0.59375, "rewards/chosen": -1.423357605934143, "rewards/margins": 0.26973602175712585, "rewards/rejected": -1.6930935382843018, "sft_loss": 1.483536720275879, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 5.444269820874599, "learning_rate": 9.924668491496474e-07, "logits/chosen": 0.07038027048110962, "logits/rejected": 0.2524788975715637, "logps/chosen": -1.4763027429580688, "logps/rejected": -1.8203551769256592, "loss": 0.7946, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4763027429580688, "rewards/margins": 0.3440525233745575, "rewards/rejected": -1.8203551769256592, "sft_loss": 1.515074372291565, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 3.9298350587989703, "learning_rate": 9.92195135269533e-07, "logits/chosen": 0.13534356653690338, "logits/rejected": 0.21075978875160217, "logps/chosen": -1.4686431884765625, "logps/rejected": -1.662706732749939, "loss": 0.8342, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4686431884765625, "rewards/margins": 0.19406333565711975, "rewards/rejected": -1.662706732749939, "sft_loss": 1.5189584493637085, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 8.127018398181935, "learning_rate": 9.919186461100574e-07, "logits/chosen": 0.07503533363342285, "logits/rejected": 0.15978340804576874, "logps/chosen": -1.4593805074691772, "logps/rejected": -1.7287170886993408, "loss": 0.7744, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4593805074691772, "rewards/margins": 0.2693364918231964, "rewards/rejected": -1.7287170886993408, "sft_loss": 1.4590668678283691, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 14.204530229897067, "learning_rate": 9.9163738435372e-07, "logits/chosen": 0.04205578193068504, "logits/rejected": 0.20363526046276093, "logps/chosen": -1.527470350265503, "logps/rejected": -1.9072644710540771, "loss": 0.8077, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.527470350265503, "rewards/margins": 0.3797941505908966, "rewards/rejected": -1.9072644710540771, "sft_loss": 1.5054857730865479, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 7.676732687613248, "learning_rate": 9.913513527293234e-07, "logits/chosen": 0.02001282386481762, "logits/rejected": 0.20995073020458221, "logps/chosen": -1.5619772672653198, "logps/rejected": -1.9812109470367432, "loss": 0.7717, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5619772672653198, "rewards/margins": 0.4192333221435547, "rewards/rejected": -1.9812109470367432, "sft_loss": 1.5510200262069702, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 14.994661540489169, "learning_rate": 9.910605540119474e-07, "logits/chosen": 0.06933962553739548, "logits/rejected": 0.17799408733844757, "logps/chosen": -1.4994163513183594, "logps/rejected": -1.940812349319458, "loss": 0.7896, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4994163513183594, "rewards/margins": 0.44139593839645386, "rewards/rejected": -1.940812349319458, "sft_loss": 1.4676927328109741, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 7.1208645969992554, "learning_rate": 9.907649910229227e-07, "logits/chosen": -0.011937955394387245, "logits/rejected": 0.2829660475254059, "logps/chosen": -1.5142967700958252, "logps/rejected": -1.8380763530731201, "loss": 0.8081, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5142967700958252, "rewards/margins": 0.32377952337265015, "rewards/rejected": -1.8380763530731201, "sft_loss": 1.5563030242919922, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 10.57266265331966, "learning_rate": 9.90464666629803e-07, "logits/chosen": 0.0970253124833107, "logits/rejected": 0.18807905912399292, "logps/chosen": -1.5507986545562744, "logps/rejected": -1.802983283996582, "loss": 0.8544, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5507986545562744, "rewards/margins": 0.2521846294403076, "rewards/rejected": -1.802983283996582, "sft_loss": 1.5012518167495728, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 6.615247648347357, "learning_rate": 9.901595837463363e-07, "logits/chosen": 0.08528032153844833, "logits/rejected": 0.27221986651420593, "logps/chosen": -1.5537059307098389, "logps/rejected": -1.8551679849624634, "loss": 0.7964, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5537059307098389, "rewards/margins": 0.30146196484565735, "rewards/rejected": -1.8551679849624634, "sft_loss": 1.4632527828216553, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 7.637773191791828, "learning_rate": 9.898497453324384e-07, "logits/chosen": -0.00021566599025391042, "logits/rejected": 0.0904955193400383, "logps/chosen": -1.461515188217163, "logps/rejected": -1.8142931461334229, "loss": 0.7501, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.461515188217163, "rewards/margins": 0.3527778089046478, "rewards/rejected": -1.8142931461334229, "sft_loss": 1.4819213151931763, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 5.505098445964599, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.08802780508995056, "logits/rejected": 0.05928264930844307, "logps/chosen": -1.5004725456237793, "logps/rejected": -1.7867472171783447, "loss": 0.7969, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5004725456237793, "rewards/margins": 0.28627461194992065, "rewards/rejected": -1.7867472171783447, "sft_loss": 1.5398263931274414, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 6.86392578183846, "learning_rate": 9.892158139836724e-07, "logits/chosen": 0.12297271192073822, "logits/rejected": 0.249002143740654, "logps/chosen": -1.383638620376587, "logps/rejected": -1.6328461170196533, "loss": 0.7916, "rewards/accuracies": 0.5625, "rewards/chosen": -1.383638620376587, "rewards/margins": 0.24920740723609924, "rewards/rejected": -1.6328461170196533, "sft_loss": 1.4265363216400146, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 5.938887538526364, "learning_rate": 9.88891727199209e-07, "logits/chosen": -0.008537910878658295, "logits/rejected": 0.07684127986431122, "logps/chosen": -1.390125036239624, "logps/rejected": -1.750501036643982, "loss": 0.7693, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.390125036239624, "rewards/margins": 0.3603759706020355, "rewards/rejected": -1.750501036643982, "sft_loss": 1.421464204788208, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 6.247500651591439, "learning_rate": 9.885628971850641e-07, "logits/chosen": 0.04593368247151375, "logits/rejected": 0.2520795166492462, "logps/chosen": -1.4899015426635742, "logps/rejected": -1.8351194858551025, "loss": 0.8013, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4899015426635742, "rewards/margins": 0.34521812200546265, "rewards/rejected": -1.8351194858551025, "sft_loss": 1.5467928647994995, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 4.2472737939219325, "learning_rate": 9.882293271315481e-07, "logits/chosen": 0.03426792845129967, "logits/rejected": 0.15044865012168884, "logps/chosen": -1.4836585521697998, "logps/rejected": -1.7281948328018188, "loss": 0.8258, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4836585521697998, "rewards/margins": 0.2445363998413086, "rewards/rejected": -1.7281948328018188, "sft_loss": 1.4716042280197144, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 6.324915561479719, "learning_rate": 9.878910202749589e-07, "logits/chosen": 0.03401128202676773, "logits/rejected": 0.23937173187732697, "logps/chosen": -1.4460633993148804, "logps/rejected": -1.727929711341858, "loss": 0.7879, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4460633993148804, "rewards/margins": 0.2818659543991089, "rewards/rejected": -1.727929711341858, "sft_loss": 1.4450603723526, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 7.220558590592656, "learning_rate": 9.875479798975512e-07, "logits/chosen": 0.1617671549320221, "logits/rejected": 0.321737676858902, "logps/chosen": -1.3826147317886353, "logps/rejected": -1.7328180074691772, "loss": 0.7897, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.3826147317886353, "rewards/margins": 0.3502032160758972, "rewards/rejected": -1.7328180074691772, "sft_loss": 1.430105209350586, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 8.854366214455323, "learning_rate": 9.87200209327504e-07, "logits/chosen": 0.028535524383187294, "logits/rejected": 0.22215552628040314, "logps/chosen": -1.5255110263824463, "logps/rejected": -1.7730461359024048, "loss": 0.8028, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5255110263824463, "rewards/margins": 0.24753530323505402, "rewards/rejected": -1.7730461359024048, "sft_loss": 1.4887754917144775, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 9.512077178518032, "learning_rate": 9.868477119388894e-07, "logits/chosen": 0.006587311625480652, "logits/rejected": 0.13302569091320038, "logps/chosen": -1.434971570968628, "logps/rejected": -1.86062753200531, "loss": 0.7715, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.434971570968628, "rewards/margins": 0.42565593123435974, "rewards/rejected": -1.86062753200531, "sft_loss": 1.450035810470581, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 6.2106996188588495, "learning_rate": 9.864904911516383e-07, "logits/chosen": 0.06405138224363327, "logits/rejected": 0.11938437074422836, "logps/chosen": -1.441922903060913, "logps/rejected": -1.753854513168335, "loss": 0.7925, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.441922903060913, "rewards/margins": 0.3119317591190338, "rewards/rejected": -1.753854513168335, "sft_loss": 1.4742841720581055, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 7.739110467682054, "learning_rate": 9.861285504315084e-07, "logits/chosen": 0.022775262594223022, "logits/rejected": 0.14432260394096375, "logps/chosen": -1.4950230121612549, "logps/rejected": -1.7906938791275024, "loss": 0.7887, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4950230121612549, "rewards/margins": 0.2956710457801819, "rewards/rejected": -1.7906938791275024, "sft_loss": 1.5051108598709106, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 10.213549728033058, "learning_rate": 9.857618932900502e-07, "logits/chosen": -0.0027862130664288998, "logits/rejected": 0.1521952897310257, "logps/chosen": -1.5002058744430542, "logps/rejected": -1.925718069076538, "loss": 0.7401, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5002058744430542, "rewards/margins": 0.42551201581954956, "rewards/rejected": -1.925718069076538, "sft_loss": 1.4755966663360596, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 6.708504004912308, "learning_rate": 9.853905232845727e-07, "logits/chosen": 0.018649207428097725, "logits/rejected": 0.21121075749397278, "logps/chosen": -1.6166229248046875, "logps/rejected": -1.8786675930023193, "loss": 0.8553, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.6166229248046875, "rewards/margins": 0.2620445787906647, "rewards/rejected": -1.8786675930023193, "sft_loss": 1.5805094242095947, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 6.495162494021766, "learning_rate": 9.850144440181095e-07, "logits/chosen": 0.04336419329047203, "logits/rejected": 0.2884041666984558, "logps/chosen": -1.5941362380981445, "logps/rejected": -1.9034837484359741, "loss": 0.8213, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5941362380981445, "rewards/margins": 0.30934733152389526, "rewards/rejected": -1.9034837484359741, "sft_loss": 1.6056299209594727, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 6.973508344540136, "learning_rate": 9.846336591393832e-07, "logits/chosen": 0.023673977702856064, "logits/rejected": 0.1818438023328781, "logps/chosen": -1.5351133346557617, "logps/rejected": -1.7883703708648682, "loss": 0.8179, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5351133346557617, "rewards/margins": 0.2532569169998169, "rewards/rejected": -1.7883703708648682, "sft_loss": 1.5288115739822388, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 7.181538030480668, "learning_rate": 9.842481723427704e-07, "logits/chosen": 0.10764428228139877, "logits/rejected": 0.11544273793697357, "logps/chosen": -1.562271237373352, "logps/rejected": -1.9329475164413452, "loss": 0.7984, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.562271237373352, "rewards/margins": 0.3706762194633484, "rewards/rejected": -1.9329475164413452, "sft_loss": 1.5829569101333618, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 5.0757706157031315, "learning_rate": 9.838579873682658e-07, "logits/chosen": 0.12130825221538544, "logits/rejected": 0.13535746932029724, "logps/chosen": -1.4355661869049072, "logps/rejected": -1.7103980779647827, "loss": 0.797, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4355661869049072, "rewards/margins": 0.2748319208621979, "rewards/rejected": -1.7103980779647827, "sft_loss": 1.4296951293945312, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 5.9663442231467245, "learning_rate": 9.834631080014457e-07, "logits/chosen": -0.039339829236269, "logits/rejected": 0.16926564276218414, "logps/chosen": -1.5010632276535034, "logps/rejected": -1.7757505178451538, "loss": 0.7814, "rewards/accuracies": 0.625, "rewards/chosen": -1.5010632276535034, "rewards/margins": 0.2746872901916504, "rewards/rejected": -1.7757505178451538, "sft_loss": 1.5402605533599854, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 11.110364691132972, "learning_rate": 9.830635380734312e-07, "logits/chosen": -0.0302075557410717, "logits/rejected": 0.18410873413085938, "logps/chosen": -1.5870397090911865, "logps/rejected": -1.863877296447754, "loss": 0.8204, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5870397090911865, "rewards/margins": 0.2768373489379883, "rewards/rejected": -1.863877296447754, "sft_loss": 1.5685373544692993, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 8.583992948150364, "learning_rate": 9.826592814608517e-07, "logits/chosen": 0.07231486588716507, "logits/rejected": 0.2873210310935974, "logps/chosen": -1.5195852518081665, "logps/rejected": -1.7979605197906494, "loss": 0.8113, "rewards/accuracies": 0.625, "rewards/chosen": -1.5195852518081665, "rewards/margins": 0.2783753275871277, "rewards/rejected": -1.7979605197906494, "sft_loss": 1.5771687030792236, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 5.738576213025098, "learning_rate": 9.822503420858067e-07, "logits/chosen": 0.1567464917898178, "logits/rejected": 0.1936349868774414, "logps/chosen": -1.360508680343628, "logps/rejected": -1.7414668798446655, "loss": 0.7424, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.360508680343628, "rewards/margins": 0.38095822930336, "rewards/rejected": -1.7414668798446655, "sft_loss": 1.4464884996414185, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 6.522349933524974, "learning_rate": 9.818367239158277e-07, "logits/chosen": 0.14568684995174408, "logits/rejected": 0.23906786739826202, "logps/chosen": -1.4772862195968628, "logps/rejected": -1.7023448944091797, "loss": 0.8421, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4772862195968628, "rewards/margins": 0.22505874931812286, "rewards/rejected": -1.7023448944091797, "sft_loss": 1.538696050643921, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 8.42651091806081, "learning_rate": 9.8141843096384e-07, "logits/chosen": 0.1448066532611847, "logits/rejected": 0.2907395362854004, "logps/chosen": -1.4993362426757812, "logps/rejected": -1.8288503885269165, "loss": 0.8031, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.4993362426757812, "rewards/margins": 0.3295140266418457, "rewards/rejected": -1.8288503885269165, "sft_loss": 1.5193639993667603, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 8.251388861856864, "learning_rate": 9.809954672881237e-07, "logits/chosen": 0.10285203158855438, "logits/rejected": 0.2888593375682831, "logps/chosen": -1.5528720617294312, "logps/rejected": -1.8929901123046875, "loss": 0.8063, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5528720617294312, "rewards/margins": 0.34011831879615784, "rewards/rejected": -1.8929901123046875, "sft_loss": 1.5962483882904053, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 7.765685613578621, "learning_rate": 9.80567836992274e-07, "logits/chosen": 0.06226274371147156, "logits/rejected": 0.2799316942691803, "logps/chosen": -1.3958429098129272, "logps/rejected": -1.8346401453018188, "loss": 0.7577, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3958429098129272, "rewards/margins": 0.4387971758842468, "rewards/rejected": -1.8346401453018188, "sft_loss": 1.4307596683502197, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 6.217554327386058, "learning_rate": 9.801355442251625e-07, "logits/chosen": 0.0106730405241251, "logits/rejected": 0.20386436581611633, "logps/chosen": -1.448302149772644, "logps/rejected": -1.7997252941131592, "loss": 0.778, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.448302149772644, "rewards/margins": 0.35142338275909424, "rewards/rejected": -1.7997252941131592, "sft_loss": 1.4954019784927368, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 8.751420134729127, "learning_rate": 9.796985931808949e-07, "logits/chosen": 0.028878509998321533, "logits/rejected": 0.1896253526210785, "logps/chosen": -1.4995536804199219, "logps/rejected": -1.8639593124389648, "loss": 0.7641, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4995536804199219, "rewards/margins": 0.3644057810306549, "rewards/rejected": -1.8639593124389648, "sft_loss": 1.5418555736541748, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 7.24280728417126, "learning_rate": 9.792569880987724e-07, "logits/chosen": -0.011818932369351387, "logits/rejected": 0.1281585395336151, "logps/chosen": -1.4514957666397095, "logps/rejected": -1.899003028869629, "loss": 0.7518, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4514957666397095, "rewards/margins": 0.4475073218345642, "rewards/rejected": -1.899003028869629, "sft_loss": 1.4739365577697754, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 8.534210457317407, "learning_rate": 9.788107332632493e-07, "logits/chosen": 0.020249877125024796, "logits/rejected": 0.1163882464170456, "logps/chosen": -1.5264770984649658, "logps/rejected": -1.8450472354888916, "loss": 0.8067, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5264770984649658, "rewards/margins": 0.31856995820999146, "rewards/rejected": -1.8450472354888916, "sft_loss": 1.5603917837142944, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 9.88569530259178, "learning_rate": 9.783598330038924e-07, "logits/chosen": 0.008886132389307022, "logits/rejected": 0.14406760036945343, "logps/chosen": -1.5904018878936768, "logps/rejected": -1.8277761936187744, "loss": 0.8297, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5904018878936768, "rewards/margins": 0.2373741865158081, "rewards/rejected": -1.8277761936187744, "sft_loss": 1.5749119520187378, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 14.706570237742655, "learning_rate": 9.779042916953376e-07, "logits/chosen": 0.04168939217925072, "logits/rejected": 0.22393640875816345, "logps/chosen": -1.4788261651992798, "logps/rejected": -1.9062135219573975, "loss": 0.7817, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4788261651992798, "rewards/margins": 0.427387535572052, "rewards/rejected": -1.9062135219573975, "sft_loss": 1.5156362056732178, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 5.183033182056639, "learning_rate": 9.774441137572487e-07, "logits/chosen": -0.0063124462030828, "logits/rejected": 0.16308115422725677, "logps/chosen": -1.467664361000061, "logps/rejected": -1.8871591091156006, "loss": 0.7698, "rewards/accuracies": 0.5625, "rewards/chosen": -1.467664361000061, "rewards/margins": 0.41949495673179626, "rewards/rejected": -1.8871591091156006, "sft_loss": 1.5343271493911743, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 8.307017281327427, "learning_rate": 9.76979303654274e-07, "logits/chosen": -0.0377768948674202, "logits/rejected": 0.08416786044836044, "logps/chosen": -1.5451844930648804, "logps/rejected": -1.9892276525497437, "loss": 0.77, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5451844930648804, "rewards/margins": 0.44404298067092896, "rewards/rejected": -1.9892276525497437, "sft_loss": 1.5667245388031006, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 8.717574423207155, "learning_rate": 9.765098658960035e-07, "logits/chosen": 0.025077302008867264, "logits/rejected": 0.11167348921298981, "logps/chosen": -1.5285561084747314, "logps/rejected": -1.8976386785507202, "loss": 0.7661, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5285561084747314, "rewards/margins": 0.36908265948295593, "rewards/rejected": -1.8976386785507202, "sft_loss": 1.5427119731903076, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 9.179937605104982, "learning_rate": 9.76035805036924e-07, "logits/chosen": 0.07678677141666412, "logits/rejected": 0.27645179629325867, "logps/chosen": -1.6057662963867188, "logps/rejected": -1.9026782512664795, "loss": 0.8004, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6057662963867188, "rewards/margins": 0.2969121038913727, "rewards/rejected": -1.9026782512664795, "sft_loss": 1.5740219354629517, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 6.004659165744023, "learning_rate": 9.755571256763764e-07, "logits/chosen": 0.06088536977767944, "logits/rejected": 0.20597343146800995, "logps/chosen": -1.4853112697601318, "logps/rejected": -1.9441382884979248, "loss": 0.7476, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4853112697601318, "rewards/margins": 0.45882707834243774, "rewards/rejected": -1.9441382884979248, "sft_loss": 1.5710132122039795, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 6.526110209400112, "learning_rate": 9.750738324585097e-07, "logits/chosen": -0.06001533940434456, "logits/rejected": 0.1981702446937561, "logps/chosen": -1.5417296886444092, "logps/rejected": -1.9125770330429077, "loss": 0.7695, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5417296886444092, "rewards/margins": 0.37084728479385376, "rewards/rejected": -1.9125770330429077, "sft_loss": 1.5457255840301514, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 6.010466805128653, "learning_rate": 9.74585930072237e-07, "logits/chosen": 0.014503148384392262, "logits/rejected": 0.15595757961273193, "logps/chosen": -1.4644759893417358, "logps/rejected": -1.9023147821426392, "loss": 0.7587, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4644759893417358, "rewards/margins": 0.4378388822078705, "rewards/rejected": -1.9023147821426392, "sft_loss": 1.5187257528305054, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 6.789905308775565, "learning_rate": 9.740934232511892e-07, "logits/chosen": -0.05950881168246269, "logits/rejected": 0.05490509793162346, "logps/chosen": -1.6019874811172485, "logps/rejected": -1.9460639953613281, "loss": 0.796, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6019874811172485, "rewards/margins": 0.34407639503479004, "rewards/rejected": -1.9460639953613281, "sft_loss": 1.6386131048202515, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 11.801786985690509, "learning_rate": 9.735963167736698e-07, "logits/chosen": 0.0316731221973896, "logits/rejected": 0.21804389357566833, "logps/chosen": -1.5540869235992432, "logps/rejected": -1.723379373550415, "loss": 0.8586, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.5540869235992432, "rewards/margins": 0.16929247975349426, "rewards/rejected": -1.723379373550415, "sft_loss": 1.5509915351867676, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 6.6261454171114975, "learning_rate": 9.730946154626078e-07, "logits/chosen": 0.04232883080840111, "logits/rejected": 0.1627885401248932, "logps/chosen": -1.5111720561981201, "logps/rejected": -1.7020124197006226, "loss": 0.8327, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5111720561981201, "rewards/margins": 0.19084025919437408, "rewards/rejected": -1.7020124197006226, "sft_loss": 1.4962514638900757, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 7.301377453990225, "learning_rate": 9.725883241855117e-07, "logits/chosen": -0.108387291431427, "logits/rejected": 0.041251521557569504, "logps/chosen": -1.470492959022522, "logps/rejected": -1.832126259803772, "loss": 0.7628, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.470492959022522, "rewards/margins": 0.3616332411766052, "rewards/rejected": -1.832126259803772, "sft_loss": 1.5087473392486572, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 9.085439611453728, "learning_rate": 9.720774478544218e-07, "logits/chosen": 0.05425272136926651, "logits/rejected": 0.17461523413658142, "logps/chosen": -1.3911322355270386, "logps/rejected": -1.9362506866455078, "loss": 0.7306, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3911322355270386, "rewards/margins": 0.5451183319091797, "rewards/rejected": -1.9362506866455078, "sft_loss": 1.4278723001480103, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 8.896857067983582, "learning_rate": 9.715619914258624e-07, "logits/chosen": -0.02503260411322117, "logits/rejected": 0.07017181068658829, "logps/chosen": -1.5041018724441528, "logps/rejected": -1.7995065450668335, "loss": 0.803, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.5041018724441528, "rewards/margins": 0.2954048216342926, "rewards/rejected": -1.7995065450668335, "sft_loss": 1.4718215465545654, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 9.601817301808932, "learning_rate": 9.710419599007937e-07, "logits/chosen": 0.0020412392914295197, "logits/rejected": 0.14347967505455017, "logps/chosen": -1.4851233959197998, "logps/rejected": -1.7414737939834595, "loss": 0.796, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4851233959197998, "rewards/margins": 0.2563503682613373, "rewards/rejected": -1.7414737939834595, "sft_loss": 1.4898221492767334, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 7.276160024054407, "learning_rate": 9.705173583245643e-07, "logits/chosen": 0.0766424760222435, "logits/rejected": 0.22784237563610077, "logps/chosen": -1.4226289987564087, "logps/rejected": -1.865517258644104, "loss": 0.7333, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4226289987564087, "rewards/margins": 0.4428882598876953, "rewards/rejected": -1.865517258644104, "sft_loss": 1.3889240026474, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 7.250742494775409, "learning_rate": 9.699881917868609e-07, "logits/chosen": -0.11839134991168976, "logits/rejected": 0.0015437155961990356, "logps/chosen": -1.4555751085281372, "logps/rejected": -1.8659054040908813, "loss": 0.7601, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4555751085281372, "rewards/margins": 0.410330206155777, "rewards/rejected": -1.8659054040908813, "sft_loss": 1.4994200468063354, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 9.145403948775996, "learning_rate": 9.694544654216594e-07, "logits/chosen": -0.1077490821480751, "logits/rejected": 0.09827809035778046, "logps/chosen": -1.5551248788833618, "logps/rejected": -2.0315284729003906, "loss": 0.7476, "rewards/accuracies": 0.625, "rewards/chosen": -1.5551248788833618, "rewards/margins": 0.47640371322631836, "rewards/rejected": -2.0315284729003906, "sft_loss": 1.5302419662475586, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 9.307111640049607, "learning_rate": 9.689161844071755e-07, "logits/chosen": 0.05722697824239731, "logits/rejected": 0.1269833743572235, "logps/chosen": -1.546934723854065, "logps/rejected": -1.9177815914154053, "loss": 0.7789, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.546934723854065, "rewards/margins": 0.37084686756134033, "rewards/rejected": -1.9177815914154053, "sft_loss": 1.4802888631820679, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 9.49623861916779, "learning_rate": 9.683733539658138e-07, "logits/chosen": -0.009790308773517609, "logits/rejected": 0.17401006817817688, "logps/chosen": -1.6562626361846924, "logps/rejected": -2.053626537322998, "loss": 0.793, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6562626361846924, "rewards/margins": 0.39736396074295044, "rewards/rejected": -2.053626537322998, "sft_loss": 1.5068317651748657, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 7.173243904307943, "learning_rate": 9.678259793641178e-07, "logits/chosen": -0.02818264439702034, "logits/rejected": 0.022147994488477707, "logps/chosen": -1.5916063785552979, "logps/rejected": -1.823362112045288, "loss": 0.8387, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5916063785552979, "rewards/margins": 0.23175561428070068, "rewards/rejected": -1.823362112045288, "sft_loss": 1.6197869777679443, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 6.698830022359408, "learning_rate": 9.672740659127183e-07, "logits/chosen": -0.1802448332309723, "logits/rejected": -0.04093443602323532, "logps/chosen": -1.548505187034607, "logps/rejected": -1.9775440692901611, "loss": 0.7957, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.548505187034607, "rewards/margins": 0.4290388226509094, "rewards/rejected": -1.9775440692901611, "sft_loss": 1.5944669246673584, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 6.654751618828755, "learning_rate": 9.667176189662818e-07, "logits/chosen": -0.1586047261953354, "logits/rejected": -0.012375918217003345, "logps/chosen": -1.4494497776031494, "logps/rejected": -1.8526294231414795, "loss": 0.7522, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4494497776031494, "rewards/margins": 0.40317949652671814, "rewards/rejected": -1.8526294231414795, "sft_loss": 1.4216344356536865, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 6.721788441279907, "learning_rate": 9.661566439234592e-07, "logits/chosen": -0.02532816492021084, "logits/rejected": 0.08195491135120392, "logps/chosen": -1.4947737455368042, "logps/rejected": -1.754565954208374, "loss": 0.8058, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4947737455368042, "rewards/margins": 0.2597922682762146, "rewards/rejected": -1.754565954208374, "sft_loss": 1.5300415754318237, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 6.670747159616163, "learning_rate": 9.655911462268327e-07, "logits/chosen": 0.03451241925358772, "logits/rejected": 0.14234474301338196, "logps/chosen": -1.4421255588531494, "logps/rejected": -1.8174455165863037, "loss": 0.7505, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4421255588531494, "rewards/margins": 0.37532028555870056, "rewards/rejected": -1.8174455165863037, "sft_loss": 1.502881646156311, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 5.2604955150453, "learning_rate": 9.650211313628636e-07, "logits/chosen": -0.04966401308774948, "logits/rejected": 0.039190806448459625, "logps/chosen": -1.4304440021514893, "logps/rejected": -1.7264881134033203, "loss": 0.786, "rewards/accuracies": 0.625, "rewards/chosen": -1.4304440021514893, "rewards/margins": 0.29604414105415344, "rewards/rejected": -1.7264881134033203, "sft_loss": 1.4472519159317017, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 5.214496200779329, "learning_rate": 9.644466048618386e-07, "logits/chosen": -0.05986039713025093, "logits/rejected": 0.11513737589120865, "logps/chosen": -1.6795326471328735, "logps/rejected": -1.9661028385162354, "loss": 0.8471, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6795326471328735, "rewards/margins": 0.28657007217407227, "rewards/rejected": -1.9661028385162354, "sft_loss": 1.6201368570327759, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 4.935649319375468, "learning_rate": 9.63867572297816e-07, "logits/chosen": -0.041377611458301544, "logits/rejected": 0.1624259650707245, "logps/chosen": -1.4566946029663086, "logps/rejected": -1.7858794927597046, "loss": 0.8026, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4566946029663086, "rewards/margins": 0.3291848599910736, "rewards/rejected": -1.7858794927597046, "sft_loss": 1.497525930404663, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 6.2841215561364, "learning_rate": 9.632840392885727e-07, "logits/chosen": -0.030157005414366722, "logits/rejected": 0.14071953296661377, "logps/chosen": -1.5804452896118164, "logps/rejected": -1.9509687423706055, "loss": 0.7711, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5804452896118164, "rewards/margins": 0.3705234229564667, "rewards/rejected": -1.9509687423706055, "sft_loss": 1.555837869644165, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 7.9112424912874575, "learning_rate": 9.626960114955483e-07, "logits/chosen": 0.027452567592263222, "logits/rejected": 0.18740546703338623, "logps/chosen": -1.6113693714141846, "logps/rejected": -1.9770864248275757, "loss": 0.7945, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6113693714141846, "rewards/margins": 0.36571723222732544, "rewards/rejected": -1.9770864248275757, "sft_loss": 1.5532480478286743, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 7.947983418769137, "learning_rate": 9.621034946237909e-07, "logits/chosen": -0.05953580141067505, "logits/rejected": 0.10688817501068115, "logps/chosen": -1.5710599422454834, "logps/rejected": -2.035400152206421, "loss": 0.7421, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5710599422454834, "rewards/margins": 0.4643400311470032, "rewards/rejected": -2.035400152206421, "sft_loss": 1.5787485837936401, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 5.934324630585954, "learning_rate": 9.615064944219021e-07, "logits/chosen": 0.01701376587152481, "logits/rejected": 0.15736515820026398, "logps/chosen": -1.4584860801696777, "logps/rejected": -1.9364503622055054, "loss": 0.7377, "rewards/accuracies": 0.625, "rewards/chosen": -1.4584860801696777, "rewards/margins": 0.4779641628265381, "rewards/rejected": -1.9364503622055054, "sft_loss": 1.523564338684082, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 12.638943275034942, "learning_rate": 9.609050166819803e-07, "logits/chosen": -0.06725503504276276, "logits/rejected": 0.00759457191452384, "logps/chosen": -1.5744128227233887, "logps/rejected": -1.9083141088485718, "loss": 0.8073, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5744128227233887, "rewards/margins": 0.3339013159275055, "rewards/rejected": -1.9083141088485718, "sft_loss": 1.5534216165542603, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.4110593795776367, "eval_logits/rejected": 0.5229104161262512, "eval_logps/chosen": -1.5530545711517334, "eval_logps/rejected": -1.9756243228912354, "eval_loss": 0.7653185725212097, "eval_rewards/accuracies": 0.6298219561576843, "eval_rewards/chosen": -1.5530545711517334, "eval_rewards/margins": 0.42256975173950195, "eval_rewards/rejected": -1.9756243228912354, "eval_runtime": 43.1574, "eval_samples_per_second": 31.165, "eval_sft_loss": 1.5451533794403076, "eval_steps_per_second": 7.809, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 12.357461063254581, "learning_rate": 9.602990672395653e-07, "logits/chosen": -0.1391552984714508, "logits/rejected": 0.07347585260868073, "logps/chosen": -1.5341812372207642, "logps/rejected": -1.8918930292129517, "loss": 0.7769, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5341812372207642, "rewards/margins": 0.3577118515968323, "rewards/rejected": -1.8918930292129517, "sft_loss": 1.5553354024887085, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 8.0467987038882, "learning_rate": 9.59688651973581e-07, "logits/chosen": -0.054170556366443634, "logits/rejected": 0.1753295660018921, "logps/chosen": -1.500733733177185, "logps/rejected": -1.8652946949005127, "loss": 0.7662, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.500733733177185, "rewards/margins": 0.3645608723163605, "rewards/rejected": -1.8652946949005127, "sft_loss": 1.494295358657837, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 6.023609679077354, "learning_rate": 9.590737768062792e-07, "logits/chosen": -0.07709038257598877, "logits/rejected": 0.06649589538574219, "logps/chosen": -1.5462301969528198, "logps/rejected": -1.8417469263076782, "loss": 0.8088, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5462301969528198, "rewards/margins": 0.2955167889595032, "rewards/rejected": -1.8417469263076782, "sft_loss": 1.5518180131912231, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 6.939481462937099, "learning_rate": 9.584544477031816e-07, "logits/chosen": 0.09668295085430145, "logits/rejected": 0.22819213569164276, "logps/chosen": -1.4557883739471436, "logps/rejected": -1.7673383951187134, "loss": 0.7959, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4557883739471436, "rewards/margins": 0.31154999136924744, "rewards/rejected": -1.7673383951187134, "sft_loss": 1.4440884590148926, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 6.184149831904977, "learning_rate": 9.578306706730215e-07, "logits/chosen": -0.14620277285575867, "logits/rejected": 0.09027117490768433, "logps/chosen": -1.5523357391357422, "logps/rejected": -1.8385976552963257, "loss": 0.8107, "rewards/accuracies": 0.625, "rewards/chosen": -1.5523357391357422, "rewards/margins": 0.2862620949745178, "rewards/rejected": -1.8385976552963257, "sft_loss": 1.5590242147445679, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 7.395581806122241, "learning_rate": 9.572024517676865e-07, "logits/chosen": -0.0037692238111048937, "logits/rejected": 0.11023984849452972, "logps/chosen": -1.5106511116027832, "logps/rejected": -1.8116480112075806, "loss": 0.7982, "rewards/accuracies": 0.625, "rewards/chosen": -1.5106511116027832, "rewards/margins": 0.3009968400001526, "rewards/rejected": -1.8116480112075806, "sft_loss": 1.479651927947998, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 5.974228258491238, "learning_rate": 9.565697970821593e-07, "logits/chosen": -0.002071636961773038, "logits/rejected": 0.15372321009635925, "logps/chosen": -1.5231326818466187, "logps/rejected": -1.804103136062622, "loss": 0.7921, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5231326818466187, "rewards/margins": 0.2809702754020691, "rewards/rejected": -1.804103136062622, "sft_loss": 1.588775396347046, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 7.149789338518796, "learning_rate": 9.559327127544585e-07, "logits/chosen": -0.1252686232328415, "logits/rejected": 0.03118918463587761, "logps/chosen": -1.5223870277404785, "logps/rejected": -1.8628833293914795, "loss": 0.7708, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5223870277404785, "rewards/margins": 0.3404964506626129, "rewards/rejected": -1.8628833293914795, "sft_loss": 1.5761277675628662, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 7.49925271181474, "learning_rate": 9.552912049655789e-07, "logits/chosen": -0.03452850505709648, "logits/rejected": 0.1749984472990036, "logps/chosen": -1.5457907915115356, "logps/rejected": -1.8539975881576538, "loss": 0.8086, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5457907915115356, "rewards/margins": 0.3082069754600525, "rewards/rejected": -1.8539975881576538, "sft_loss": 1.5775070190429688, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 9.989572148925806, "learning_rate": 9.546452799394315e-07, "logits/chosen": -0.03555355966091156, "logits/rejected": 0.19438514113426208, "logps/chosen": -1.6394716501235962, "logps/rejected": -1.924386739730835, "loss": 0.8226, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.6394716501235962, "rewards/margins": 0.28491532802581787, "rewards/rejected": -1.924386739730835, "sft_loss": 1.619627594947815, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 8.328792722668657, "learning_rate": 9.539949439427846e-07, "logits/chosen": -0.037635933607816696, "logits/rejected": 0.10790684074163437, "logps/chosen": -1.4985637664794922, "logps/rejected": -1.8529409170150757, "loss": 0.7796, "rewards/accuracies": 0.625, "rewards/chosen": -1.4985637664794922, "rewards/margins": 0.3543771803379059, "rewards/rejected": -1.8529409170150757, "sft_loss": 1.5785887241363525, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 6.475285222580039, "learning_rate": 9.533402032852002e-07, "logits/chosen": -0.05961136892437935, "logits/rejected": 0.093226358294487, "logps/chosen": -1.5092459917068481, "logps/rejected": -1.9873511791229248, "loss": 0.7607, "rewards/accuracies": 0.625, "rewards/chosen": -1.5092459917068481, "rewards/margins": 0.4781051576137543, "rewards/rejected": -1.9873511791229248, "sft_loss": 1.5577691793441772, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 7.71044140969529, "learning_rate": 9.526810643189754e-07, "logits/chosen": 0.030216574668884277, "logits/rejected": 0.21367864310741425, "logps/chosen": -1.518887996673584, "logps/rejected": -1.928295373916626, "loss": 0.7609, "rewards/accuracies": 0.6875, "rewards/chosen": -1.518887996673584, "rewards/margins": 0.40940746665000916, "rewards/rejected": -1.928295373916626, "sft_loss": 1.5540010929107666, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 7.090406948813718, "learning_rate": 9.52017533439079e-07, "logits/chosen": -0.04940169304609299, "logits/rejected": 0.06458373367786407, "logps/chosen": -1.517562747001648, "logps/rejected": -1.9599453210830688, "loss": 0.7689, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.517562747001648, "rewards/margins": 0.4423826336860657, "rewards/rejected": -1.9599453210830688, "sft_loss": 1.5648082494735718, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 7.436609733139317, "learning_rate": 9.513496170830909e-07, "logits/chosen": -0.015586158260703087, "logits/rejected": 0.1091979593038559, "logps/chosen": -1.5926053524017334, "logps/rejected": -1.8916200399398804, "loss": 0.8124, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.5926053524017334, "rewards/margins": 0.29901453852653503, "rewards/rejected": -1.8916200399398804, "sft_loss": 1.5531997680664062, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 8.244268108810685, "learning_rate": 9.506773217311382e-07, "logits/chosen": -0.01770116575062275, "logits/rejected": 0.17539562284946442, "logps/chosen": -1.611375093460083, "logps/rejected": -1.9028886556625366, "loss": 0.8263, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.611375093460083, "rewards/margins": 0.29151374101638794, "rewards/rejected": -1.9028886556625366, "sft_loss": 1.6357406377792358, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 8.266442338359836, "learning_rate": 9.500006539058334e-07, "logits/chosen": -0.007032863795757294, "logits/rejected": 0.15912309288978577, "logps/chosen": -1.460541844367981, "logps/rejected": -1.761348009109497, "loss": 0.7738, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.460541844367981, "rewards/margins": 0.3008061349391937, "rewards/rejected": -1.761348009109497, "sft_loss": 1.4669584035873413, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 9.187576052202576, "learning_rate": 9.493196201722109e-07, "logits/chosen": -0.14198832213878632, "logits/rejected": 0.03478335589170456, "logps/chosen": -1.573289155960083, "logps/rejected": -1.8103530406951904, "loss": 0.8285, "rewards/accuracies": 0.5625, "rewards/chosen": -1.573289155960083, "rewards/margins": 0.23706409335136414, "rewards/rejected": -1.8103530406951904, "sft_loss": 1.5576437711715698, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 8.29582130813989, "learning_rate": 9.486342271376628e-07, "logits/chosen": -0.06643088161945343, "logits/rejected": -0.05218280479311943, "logps/chosen": -1.5680351257324219, "logps/rejected": -1.988735556602478, "loss": 0.7693, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5680351257324219, "rewards/margins": 0.4207003116607666, "rewards/rejected": -1.988735556602478, "sft_loss": 1.5478942394256592, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 7.225450850919134, "learning_rate": 9.479444814518755e-07, "logits/chosen": -0.06510347127914429, "logits/rejected": 0.23026804625988007, "logps/chosen": -1.4947283267974854, "logps/rejected": -1.968867301940918, "loss": 0.7512, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4947283267974854, "rewards/margins": 0.47413891553878784, "rewards/rejected": -1.968867301940918, "sft_loss": 1.53347647190094, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 6.993263806805278, "learning_rate": 9.472503898067645e-07, "logits/chosen": 0.07897701859474182, "logits/rejected": 0.13383238017559052, "logps/chosen": -1.5177541971206665, "logps/rejected": -1.8642276525497437, "loss": 0.7849, "rewards/accuracies": 0.625, "rewards/chosen": -1.5177541971206665, "rewards/margins": 0.346473753452301, "rewards/rejected": -1.8642276525497437, "sft_loss": 1.5034488439559937, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 5.0903941870232225, "learning_rate": 9.465519589364099e-07, "logits/chosen": 0.04826375097036362, "logits/rejected": 0.1482517570257187, "logps/chosen": -1.4772021770477295, "logps/rejected": -1.8546661138534546, "loss": 0.7747, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4772021770477295, "rewards/margins": 0.37746408581733704, "rewards/rejected": -1.8546661138534546, "sft_loss": 1.489646553993225, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 7.280139129340317, "learning_rate": 9.458491956169914e-07, "logits/chosen": -0.061371732503175735, "logits/rejected": 0.1393098533153534, "logps/chosen": -1.4762002229690552, "logps/rejected": -1.8850879669189453, "loss": 0.7513, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4762002229690552, "rewards/margins": 0.4088878035545349, "rewards/rejected": -1.8850879669189453, "sft_loss": 1.4545695781707764, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 5.683714911708735, "learning_rate": 9.451421066667215e-07, "logits/chosen": -0.16505800187587738, "logits/rejected": 0.0649692565202713, "logps/chosen": -1.4719158411026, "logps/rejected": -1.8847873210906982, "loss": 0.7522, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4719158411026, "rewards/margins": 0.41287142038345337, "rewards/rejected": -1.8847873210906982, "sft_loss": 1.4930028915405273, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 8.055430003273564, "learning_rate": 9.444306989457805e-07, "logits/chosen": 0.019701320677995682, "logits/rejected": 0.15035542845726013, "logps/chosen": -1.5742028951644897, "logps/rejected": -1.9284107685089111, "loss": 0.848, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.5742028951644897, "rewards/margins": 0.3542078137397766, "rewards/rejected": -1.9284107685089111, "sft_loss": 1.5179173946380615, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 6.804685432863646, "learning_rate": 9.437149793562489e-07, "logits/chosen": -0.015834815800189972, "logits/rejected": 0.12051324546337128, "logps/chosen": -1.4864460229873657, "logps/rejected": -1.7922194004058838, "loss": 0.7871, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4864460229873657, "rewards/margins": 0.3057732880115509, "rewards/rejected": -1.7922194004058838, "sft_loss": 1.4855268001556396, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 8.642305382394888, "learning_rate": 9.429949548420417e-07, "logits/chosen": 0.021522030234336853, "logits/rejected": 0.12598273158073425, "logps/chosen": -1.5665611028671265, "logps/rejected": -1.8502845764160156, "loss": 0.7906, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5665611028671265, "rewards/margins": 0.2837234437465668, "rewards/rejected": -1.8502845764160156, "sft_loss": 1.5793159008026123, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 9.634679261159716, "learning_rate": 9.422706323888396e-07, "logits/chosen": 0.007305607199668884, "logits/rejected": 0.05420448258519173, "logps/chosen": -1.4939991235733032, "logps/rejected": -1.7675464153289795, "loss": 0.7981, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4939991235733032, "rewards/margins": 0.27354729175567627, "rewards/rejected": -1.7675464153289795, "sft_loss": 1.4924724102020264, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 5.101473763652376, "learning_rate": 9.415420190240225e-07, "logits/chosen": 0.06430179625749588, "logits/rejected": 0.29094842076301575, "logps/chosen": -1.4747602939605713, "logps/rejected": -1.8495433330535889, "loss": 0.7533, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4747602939605713, "rewards/margins": 0.37478309869766235, "rewards/rejected": -1.8495433330535889, "sft_loss": 1.508946418762207, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 7.846887252192513, "learning_rate": 9.408091218166002e-07, "logits/chosen": 0.06767670810222626, "logits/rejected": 0.1431683599948883, "logps/chosen": -1.497562050819397, "logps/rejected": -1.6580489873886108, "loss": 0.8355, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.497562050819397, "rewards/margins": 0.16048704087734222, "rewards/rejected": -1.6580489873886108, "sft_loss": 1.5262967348098755, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 6.105275658779984, "learning_rate": 9.400719478771449e-07, "logits/chosen": 0.04067292809486389, "logits/rejected": 0.3882962167263031, "logps/chosen": -1.567137360572815, "logps/rejected": -1.8524439334869385, "loss": 0.7995, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.567137360572815, "rewards/margins": 0.28530648350715637, "rewards/rejected": -1.8524439334869385, "sft_loss": 1.572643756866455, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 6.874178003863533, "learning_rate": 9.393305043577209e-07, "logits/chosen": -0.07719989120960236, "logits/rejected": 0.08962713927030563, "logps/chosen": -1.6100492477416992, "logps/rejected": -2.03241229057312, "loss": 0.7714, "rewards/accuracies": 0.625, "rewards/chosen": -1.6100492477416992, "rewards/margins": 0.42236295342445374, "rewards/rejected": -2.03241229057312, "sft_loss": 1.6464271545410156, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 4.847321081069581, "learning_rate": 9.38584798451817e-07, "logits/chosen": 0.025656018406152725, "logits/rejected": 0.1984395831823349, "logps/chosen": -1.5380942821502686, "logps/rejected": -1.916996717453003, "loss": 0.7604, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5380942821502686, "rewards/margins": 0.3789026439189911, "rewards/rejected": -1.916996717453003, "sft_loss": 1.5578184127807617, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 35.62402709175856, "learning_rate": 9.37834837394275e-07, "logits/chosen": 0.04204457253217697, "logits/rejected": 0.1908070296049118, "logps/chosen": -1.658442497253418, "logps/rejected": -2.2107603549957275, "loss": 0.7533, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.658442497253418, "rewards/margins": 0.5523178577423096, "rewards/rejected": -2.2107603549957275, "sft_loss": 1.6409889459609985, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 4.555540968760231, "learning_rate": 9.370806284612203e-07, "logits/chosen": -0.016873013228178024, "logits/rejected": 0.16399024426937103, "logps/chosen": -1.5526940822601318, "logps/rejected": -2.080778121948242, "loss": 0.7375, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5526940822601318, "rewards/margins": 0.5280841588973999, "rewards/rejected": -2.080778121948242, "sft_loss": 1.5966856479644775, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 5.817135219008114, "learning_rate": 9.363221789699912e-07, "logits/chosen": -0.0778733491897583, "logits/rejected": 0.08052931725978851, "logps/chosen": -1.6007308959960938, "logps/rejected": -1.8602443933486938, "loss": 0.8371, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6007308959960938, "rewards/margins": 0.25951334834098816, "rewards/rejected": -1.8602443933486938, "sft_loss": 1.556165337562561, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 13.903676260989972, "learning_rate": 9.355594962790682e-07, "logits/chosen": -0.04237335920333862, "logits/rejected": 0.1167731061577797, "logps/chosen": -1.4700403213500977, "logps/rejected": -1.8338407278060913, "loss": 0.7779, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4700403213500977, "rewards/margins": 0.3638002574443817, "rewards/rejected": -1.8338407278060913, "sft_loss": 1.5081623792648315, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 8.675665084396295, "learning_rate": 9.34792587788002e-07, "logits/chosen": 0.03711455315351486, "logits/rejected": 0.17952170968055725, "logps/chosen": -1.5565040111541748, "logps/rejected": -1.891310453414917, "loss": 0.7827, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5565040111541748, "rewards/margins": 0.33480656147003174, "rewards/rejected": -1.891310453414917, "sft_loss": 1.5822795629501343, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 5.372665261834803, "learning_rate": 9.34021460937342e-07, "logits/chosen": 0.0711214691400528, "logits/rejected": 0.16815319657325745, "logps/chosen": -1.5087722539901733, "logps/rejected": -1.739585518836975, "loss": 0.7959, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5087722539901733, "rewards/margins": 0.23081330955028534, "rewards/rejected": -1.739585518836975, "sft_loss": 1.5105878114700317, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 6.260615858416727, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.1397058665752411, "logits/rejected": 0.027968278154730797, "logps/chosen": -1.6092488765716553, "logps/rejected": -1.9042243957519531, "loss": 0.8032, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.6092488765716553, "rewards/margins": 0.29497548937797546, "rewards/rejected": -1.9042243957519531, "sft_loss": 1.6180202960968018, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 5.1032675038204225, "learning_rate": 9.324665821239998e-07, "logits/chosen": -0.017549514770507812, "logits/rejected": 0.19922587275505066, "logps/chosen": -1.411353588104248, "logps/rejected": -1.8633699417114258, "loss": 0.7606, "rewards/accuracies": 0.625, "rewards/chosen": -1.411353588104248, "rewards/margins": 0.45201629400253296, "rewards/rejected": -1.8633699417114258, "sft_loss": 1.477474570274353, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 8.976946685471301, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.04781641438603401, "logits/rejected": 0.16527536511421204, "logps/chosen": -1.5514352321624756, "logps/rejected": -1.929112195968628, "loss": 0.7721, "rewards/accuracies": 0.625, "rewards/chosen": -1.5514352321624756, "rewards/margins": 0.37767690420150757, "rewards/rejected": -1.929112195968628, "sft_loss": 1.6095211505889893, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 9.167216900710967, "learning_rate": 9.30894920180659e-07, "logits/chosen": 0.05302456021308899, "logits/rejected": 0.22592797875404358, "logps/chosen": -1.5522041320800781, "logps/rejected": -1.6967365741729736, "loss": 0.8449, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5522041320800781, "rewards/margins": 0.14453235268592834, "rewards/rejected": -1.6967365741729736, "sft_loss": 1.5160868167877197, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 5.038368623715311, "learning_rate": 9.301028145701543e-07, "logits/chosen": 0.04643978923559189, "logits/rejected": 0.2078028917312622, "logps/chosen": -1.4708811044692993, "logps/rejected": -2.004676342010498, "loss": 0.7507, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4708811044692993, "rewards/margins": 0.533795177936554, "rewards/rejected": -2.004676342010498, "sft_loss": 1.5242481231689453, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 6.728106290375101, "learning_rate": 9.293065361002563e-07, "logits/chosen": 0.06993760168552399, "logits/rejected": 0.18598724901676178, "logps/chosen": -1.5088984966278076, "logps/rejected": -1.963747262954712, "loss": 0.7522, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5088984966278076, "rewards/margins": 0.4548489451408386, "rewards/rejected": -1.963747262954712, "sft_loss": 1.5313358306884766, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 7.7788562484580615, "learning_rate": 9.285060924964622e-07, "logits/chosen": -0.050735920667648315, "logits/rejected": 0.10791660845279694, "logps/chosen": -1.5653870105743408, "logps/rejected": -1.9160648584365845, "loss": 0.766, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5653870105743408, "rewards/margins": 0.3506779968738556, "rewards/rejected": -1.9160648584365845, "sft_loss": 1.560789942741394, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 4.690654616837126, "learning_rate": 9.277014915246792e-07, "logits/chosen": 0.11185245215892792, "logits/rejected": 0.17951732873916626, "logps/chosen": -1.488734483718872, "logps/rejected": -2.001232147216797, "loss": 0.7397, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.488734483718872, "rewards/margins": 0.5124975442886353, "rewards/rejected": -2.001232147216797, "sft_loss": 1.5031285285949707, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 5.200187149300906, "learning_rate": 9.268927409911498e-07, "logits/chosen": -0.03065555915236473, "logits/rejected": 0.09611991047859192, "logps/chosen": -1.5341428518295288, "logps/rejected": -1.7978395223617554, "loss": 0.8089, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5341428518295288, "rewards/margins": 0.263696551322937, "rewards/rejected": -1.7978395223617554, "sft_loss": 1.5808682441711426, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 9.14791169437769, "learning_rate": 9.260798487423749e-07, "logits/chosen": -0.10061429440975189, "logits/rejected": 0.14751431345939636, "logps/chosen": -1.588069200515747, "logps/rejected": -1.880556344985962, "loss": 0.8187, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.588069200515747, "rewards/margins": 0.2924869656562805, "rewards/rejected": -1.880556344985962, "sft_loss": 1.6440925598144531, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 10.79698235896366, "learning_rate": 9.252628226650389e-07, "logits/chosen": 0.03961476683616638, "logits/rejected": 0.1479351818561554, "logps/chosen": -1.5507192611694336, "logps/rejected": -1.8307104110717773, "loss": 0.824, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5507192611694336, "rewards/margins": 0.2799910604953766, "rewards/rejected": -1.8307104110717773, "sft_loss": 1.5601301193237305, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 8.577913177392546, "learning_rate": 9.244416706859321e-07, "logits/chosen": -0.043893106281757355, "logits/rejected": 0.15120232105255127, "logps/chosen": -1.5105851888656616, "logps/rejected": -1.9094839096069336, "loss": 0.7752, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5105851888656616, "rewards/margins": 0.3988986611366272, "rewards/rejected": -1.9094839096069336, "sft_loss": 1.5464986562728882, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 7.7958491125235145, "learning_rate": 9.23616400771875e-07, "logits/chosen": -0.009665383026003838, "logits/rejected": 0.193177729845047, "logps/chosen": -1.4877359867095947, "logps/rejected": -1.9297077655792236, "loss": 0.7478, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4877359867095947, "rewards/margins": 0.44197168946266174, "rewards/rejected": -1.9297077655792236, "sft_loss": 1.4791226387023926, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 5.6820709234832645, "learning_rate": 9.227870209296395e-07, "logits/chosen": 0.03165459632873535, "logits/rejected": 0.17374761402606964, "logps/chosen": -1.595961332321167, "logps/rejected": -1.918928861618042, "loss": 0.8026, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.595961332321167, "rewards/margins": 0.32296767830848694, "rewards/rejected": -1.918928861618042, "sft_loss": 1.630262017250061, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 6.885487096966203, "learning_rate": 9.219535392058728e-07, "logits/chosen": -0.05570758506655693, "logits/rejected": -0.022068342193961143, "logps/chosen": -1.5589110851287842, "logps/rejected": -1.9327443838119507, "loss": 0.7927, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5589110851287842, "rewards/margins": 0.3738333582878113, "rewards/rejected": -1.9327443838119507, "sft_loss": 1.5968363285064697, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 7.4717123676127635, "learning_rate": 9.211159636870181e-07, "logits/chosen": -0.06700778752565384, "logits/rejected": 0.1450539529323578, "logps/chosen": -1.5368725061416626, "logps/rejected": -1.9300647974014282, "loss": 0.7715, "rewards/accuracies": 0.625, "rewards/chosen": -1.5368725061416626, "rewards/margins": 0.393192321062088, "rewards/rejected": -1.9300647974014282, "sft_loss": 1.5752685070037842, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 7.157016855688211, "learning_rate": 9.202743024992367e-07, "logits/chosen": 0.0331614688038826, "logits/rejected": 0.16338004171848297, "logps/chosen": -1.4922975301742554, "logps/rejected": -1.9882208108901978, "loss": 0.7534, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4922975301742554, "rewards/margins": 0.49592337012290955, "rewards/rejected": -1.9882208108901978, "sft_loss": 1.5234344005584717, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 6.786258855213881, "learning_rate": 9.194285638083293e-07, "logits/chosen": 0.046152498573064804, "logits/rejected": 0.23527351021766663, "logps/chosen": -1.5785131454467773, "logps/rejected": -2.0503058433532715, "loss": 0.7667, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5785131454467773, "rewards/margins": 0.4717925190925598, "rewards/rejected": -2.0503058433532715, "sft_loss": 1.5579280853271484, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 7.831630267523082, "learning_rate": 9.185787558196562e-07, "logits/chosen": -0.030686121433973312, "logits/rejected": 0.11260805279016495, "logps/chosen": -1.5385570526123047, "logps/rejected": -1.8790044784545898, "loss": 0.8, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5385570526123047, "rewards/margins": 0.34044739603996277, "rewards/rejected": -1.8790044784545898, "sft_loss": 1.5312492847442627, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 7.736431341921282, "learning_rate": 9.177248867780583e-07, "logits/chosen": -0.008087193593382835, "logits/rejected": 0.11138725280761719, "logps/chosen": -1.7046077251434326, "logps/rejected": -1.9126007556915283, "loss": 0.8542, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.7046077251434326, "rewards/margins": 0.20799300074577332, "rewards/rejected": -1.9126007556915283, "sft_loss": 1.7330427169799805, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 11.665343241759041, "learning_rate": 9.168669649677769e-07, "logits/chosen": -0.06726381927728653, "logits/rejected": 0.05159614235162735, "logps/chosen": -1.5922222137451172, "logps/rejected": -2.0203213691711426, "loss": 0.7955, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5922222137451172, "rewards/margins": 0.42809900641441345, "rewards/rejected": -2.0203213691711426, "sft_loss": 1.6424007415771484, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 8.702502030234344, "learning_rate": 9.16004998712373e-07, "logits/chosen": 0.02822733484208584, "logits/rejected": 0.1210421547293663, "logps/chosen": -1.5506961345672607, "logps/rejected": -2.0492966175079346, "loss": 0.7303, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5506961345672607, "rewards/margins": 0.498600572347641, "rewards/rejected": -2.0492966175079346, "sft_loss": 1.5302189588546753, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 3.875924921292042, "learning_rate": 9.151389963746472e-07, "logits/chosen": -0.06352396309375763, "logits/rejected": 0.2692248225212097, "logps/chosen": -1.5775508880615234, "logps/rejected": -2.1155786514282227, "loss": 0.737, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5775508880615234, "rewards/margins": 0.538027822971344, "rewards/rejected": -2.1155786514282227, "sft_loss": 1.5825284719467163, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 6.586599789388135, "learning_rate": 9.142689663565577e-07, "logits/chosen": 0.032250575721263885, "logits/rejected": 0.11554038524627686, "logps/chosen": -1.5206407308578491, "logps/rejected": -1.9471498727798462, "loss": 0.7591, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5206407308578491, "rewards/margins": 0.42650899291038513, "rewards/rejected": -1.9471498727798462, "sft_loss": 1.5285907983779907, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 8.89223756756473, "learning_rate": 9.133949170991397e-07, "logits/chosen": 0.04207143187522888, "logits/rejected": 0.15103182196617126, "logps/chosen": -1.5528504848480225, "logps/rejected": -1.9232463836669922, "loss": 0.778, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5528504848480225, "rewards/margins": 0.37039583921432495, "rewards/rejected": -1.9232463836669922, "sft_loss": 1.6473729610443115, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 6.184798678174358, "learning_rate": 9.125168570824231e-07, "logits/chosen": -0.033777594566345215, "logits/rejected": 0.18705672025680542, "logps/chosen": -1.5442079305648804, "logps/rejected": -1.8608815670013428, "loss": 0.8139, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5442079305648804, "rewards/margins": 0.3166736960411072, "rewards/rejected": -1.8608815670013428, "sft_loss": 1.5317318439483643, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 11.353408197353502, "learning_rate": 9.116347948253496e-07, "logits/chosen": -0.029499346390366554, "logits/rejected": 0.11033545434474945, "logps/chosen": -1.5677837133407593, "logps/rejected": -1.8793418407440186, "loss": 0.7965, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.5677837133407593, "rewards/margins": 0.31155824661254883, "rewards/rejected": -1.8793418407440186, "sft_loss": 1.5364774465560913, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 9.202796514689775, "learning_rate": 9.107487388856916e-07, "logits/chosen": -0.04973259195685387, "logits/rejected": 0.16444841027259827, "logps/chosen": -1.4995152950286865, "logps/rejected": -1.84906804561615, "loss": 0.7596, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4995152950286865, "rewards/margins": 0.3495527505874634, "rewards/rejected": -1.84906804561615, "sft_loss": 1.5238702297210693, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 9.824044254315865, "learning_rate": 9.098586978599673e-07, "logits/chosen": 0.008280429989099503, "logits/rejected": 0.20027735829353333, "logps/chosen": -1.522702932357788, "logps/rejected": -2.0823118686676025, "loss": 0.7144, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.522702932357788, "rewards/margins": 0.559609055519104, "rewards/rejected": -2.0823118686676025, "sft_loss": 1.5239137411117554, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 7.308043704720405, "learning_rate": 9.089646803833588e-07, "logits/chosen": 0.032457828521728516, "logits/rejected": 0.22786331176757812, "logps/chosen": -1.5191773176193237, "logps/rejected": -1.8827060461044312, "loss": 0.7864, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5191773176193237, "rewards/margins": 0.36352863907814026, "rewards/rejected": -1.8827060461044312, "sft_loss": 1.557527780532837, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 7.122326111325528, "learning_rate": 9.080666951296276e-07, "logits/chosen": -0.13108573853969574, "logits/rejected": 0.18660180270671844, "logps/chosen": -1.5911967754364014, "logps/rejected": -2.06133770942688, "loss": 0.7599, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5911967754364014, "rewards/margins": 0.4701407551765442, "rewards/rejected": -2.06133770942688, "sft_loss": 1.6169294118881226, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 6.054936480652723, "learning_rate": 9.071647508110305e-07, "logits/chosen": -0.09555993229150772, "logits/rejected": 0.22705230116844177, "logps/chosen": -1.6089187860488892, "logps/rejected": -2.1285042762756348, "loss": 0.7702, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6089187860488892, "rewards/margins": 0.5195856094360352, "rewards/rejected": -2.1285042762756348, "sft_loss": 1.5718333721160889, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 8.863111314757784, "learning_rate": 9.062588561782354e-07, "logits/chosen": 0.006359955761581659, "logits/rejected": 0.0931992456316948, "logps/chosen": -1.6885942220687866, "logps/rejected": -2.040802478790283, "loss": 0.8224, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6885942220687866, "rewards/margins": 0.35220804810523987, "rewards/rejected": -2.040802478790283, "sft_loss": 1.7221790552139282, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 5.956927097230177, "learning_rate": 9.053490200202358e-07, "logits/chosen": 0.03864731639623642, "logits/rejected": 0.13886341452598572, "logps/chosen": -1.6609443426132202, "logps/rejected": -1.9945148229599, "loss": 0.8091, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6609443426132202, "rewards/margins": 0.3335704505443573, "rewards/rejected": -1.9945148229599, "sft_loss": 1.6638319492340088, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 17.435806124348037, "learning_rate": 9.044352511642661e-07, "logits/chosen": 0.07239361107349396, "logits/rejected": 0.0997651219367981, "logps/chosen": -1.5358011722564697, "logps/rejected": -1.8270368576049805, "loss": 0.8213, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5358011722564697, "rewards/margins": 0.2912355065345764, "rewards/rejected": -1.8270368576049805, "sft_loss": 1.5815781354904175, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 6.986174292372821, "learning_rate": 9.03517558475716e-07, "logits/chosen": 0.008755502291023731, "logits/rejected": 0.13093584775924683, "logps/chosen": -1.50785231590271, "logps/rejected": -1.7878860235214233, "loss": 0.7931, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.50785231590271, "rewards/margins": 0.28003376722335815, "rewards/rejected": -1.7878860235214233, "sft_loss": 1.518880844116211, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 11.642299733682131, "learning_rate": 9.025959508580436e-07, "logits/chosen": 0.06780470162630081, "logits/rejected": 0.3653286397457123, "logps/chosen": -1.5423269271850586, "logps/rejected": -1.928520917892456, "loss": 0.7705, "rewards/accuracies": 0.625, "rewards/chosen": -1.5423269271850586, "rewards/margins": 0.38619405031204224, "rewards/rejected": -1.928520917892456, "sft_loss": 1.559385061264038, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 5.009699081136519, "learning_rate": 9.016704372526905e-07, "logits/chosen": 0.00031385422335006297, "logits/rejected": 0.19508454203605652, "logps/chosen": -1.4494677782058716, "logps/rejected": -1.9591785669326782, "loss": 0.7216, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4494677782058716, "rewards/margins": 0.5097106695175171, "rewards/rejected": -1.9591785669326782, "sft_loss": 1.4588356018066406, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 11.257261765072354, "learning_rate": 9.007410266389934e-07, "logits/chosen": -0.04426584765315056, "logits/rejected": 0.0597103014588356, "logps/chosen": -1.5061947107315063, "logps/rejected": -1.7958621978759766, "loss": 0.7956, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5061947107315063, "rewards/margins": 0.28966760635375977, "rewards/rejected": -1.7958621978759766, "sft_loss": 1.5797650814056396, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 8.973501041716743, "learning_rate": 8.998077280340981e-07, "logits/chosen": 0.053844332695007324, "logits/rejected": 0.15547195076942444, "logps/chosen": -1.6568008661270142, "logps/rejected": -1.8795292377471924, "loss": 0.843, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6568008661270142, "rewards/margins": 0.22272829711437225, "rewards/rejected": -1.8795292377471924, "sft_loss": 1.6281216144561768, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 4.661622772248416, "learning_rate": 8.988705504928722e-07, "logits/chosen": -0.06552759557962418, "logits/rejected": 0.18121913075447083, "logps/chosen": -1.5737403631210327, "logps/rejected": -2.069803237915039, "loss": 0.7417, "rewards/accuracies": 0.625, "rewards/chosen": -1.5737403631210327, "rewards/margins": 0.4960629343986511, "rewards/rejected": -2.069803237915039, "sft_loss": 1.5900707244873047, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.39235806465148926, "eval_logits/rejected": 0.5071808099746704, "eval_logps/chosen": -1.5631626844406128, "eval_logps/rejected": -1.9861836433410645, "eval_loss": 0.7599208950996399, "eval_rewards/accuracies": 0.6483679413795471, "eval_rewards/chosen": -1.5631626844406128, "eval_rewards/margins": 0.42302072048187256, "eval_rewards/rejected": -1.9861836433410645, "eval_runtime": 43.2862, "eval_samples_per_second": 31.072, "eval_sft_loss": 1.565151333808899, "eval_steps_per_second": 7.785, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 6.950721650290741, "learning_rate": 8.979295031078157e-07, "logits/chosen": -0.08305919915437698, "logits/rejected": 0.1882287561893463, "logps/chosen": -1.5531580448150635, "logps/rejected": -1.9959478378295898, "loss": 0.7492, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5531580448150635, "rewards/margins": 0.44278964400291443, "rewards/rejected": -1.9959478378295898, "sft_loss": 1.567907691001892, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 6.429214843911583, "learning_rate": 8.969845950089751e-07, "logits/chosen": -0.09699267148971558, "logits/rejected": 0.11282362043857574, "logps/chosen": -1.525758147239685, "logps/rejected": -2.0899970531463623, "loss": 0.7277, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.525758147239685, "rewards/margins": 0.5642391443252563, "rewards/rejected": -2.0899970531463623, "sft_loss": 1.5505108833312988, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 10.079542623972314, "learning_rate": 8.960358353638526e-07, "logits/chosen": -0.03095575049519539, "logits/rejected": 0.10849988460540771, "logps/chosen": -1.6641359329223633, "logps/rejected": -2.1700820922851562, "loss": 0.7771, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6641359329223633, "rewards/margins": 0.5059463977813721, "rewards/rejected": -2.1700820922851562, "sft_loss": 1.6728131771087646, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 6.71468909442755, "learning_rate": 8.950832333773184e-07, "logits/chosen": 0.01351380068808794, "logits/rejected": 0.19513963162899017, "logps/chosen": -1.5082354545593262, "logps/rejected": -1.9922988414764404, "loss": 0.7509, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5082354545593262, "rewards/margins": 0.48406338691711426, "rewards/rejected": -1.9922988414764404, "sft_loss": 1.5296316146850586, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 8.10149196580218, "learning_rate": 8.941267982915213e-07, "logits/chosen": 0.07851864397525787, "logits/rejected": 0.12994399666786194, "logps/chosen": -1.6703354120254517, "logps/rejected": -1.9375226497650146, "loss": 0.8383, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6703354120254517, "rewards/margins": 0.2671871483325958, "rewards/rejected": -1.9375226497650146, "sft_loss": 1.6011865139007568, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 7.310737250490014, "learning_rate": 8.931665393857983e-07, "logits/chosen": 0.006381544284522533, "logits/rejected": 0.18845947086811066, "logps/chosen": -1.5674880743026733, "logps/rejected": -1.9646985530853271, "loss": 0.7809, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5674880743026733, "rewards/margins": 0.3972106873989105, "rewards/rejected": -1.9646985530853271, "sft_loss": 1.550384759902954, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 6.856678915765684, "learning_rate": 8.922024659765861e-07, "logits/chosen": -0.12018264830112457, "logits/rejected": 0.023956522345542908, "logps/chosen": -1.4536283016204834, "logps/rejected": -1.9159862995147705, "loss": 0.7444, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4536283016204834, "rewards/margins": 0.46235793828964233, "rewards/rejected": -1.9159862995147705, "sft_loss": 1.496638536453247, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 6.808888226300914, "learning_rate": 8.912345874173288e-07, "logits/chosen": -0.0866696760058403, "logits/rejected": 0.0618831105530262, "logps/chosen": -1.4781012535095215, "logps/rejected": -1.9741615056991577, "loss": 0.7481, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4781012535095215, "rewards/margins": 0.4960601329803467, "rewards/rejected": -1.9741615056991577, "sft_loss": 1.5076395273208618, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 7.369035043754625, "learning_rate": 8.902629130983885e-07, "logits/chosen": -0.014099538326263428, "logits/rejected": 0.05403543636202812, "logps/chosen": -1.4911154508590698, "logps/rejected": -1.7865339517593384, "loss": 0.7804, "rewards/accuracies": 0.625, "rewards/chosen": -1.4911154508590698, "rewards/margins": 0.2954184412956238, "rewards/rejected": -1.7865339517593384, "sft_loss": 1.51164710521698, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 9.298785451247301, "learning_rate": 8.892874524469537e-07, "logits/chosen": 0.08125178515911102, "logits/rejected": 0.15900209546089172, "logps/chosen": -1.497410535812378, "logps/rejected": -1.8727238178253174, "loss": 0.7479, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.497410535812378, "rewards/margins": 0.3753132224082947, "rewards/rejected": -1.8727238178253174, "sft_loss": 1.4813406467437744, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 7.038254347618017, "learning_rate": 8.883082149269478e-07, "logits/chosen": -0.06126236915588379, "logits/rejected": 0.07454331964254379, "logps/chosen": -1.5392359495162964, "logps/rejected": -1.891977071762085, "loss": 0.7797, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5392359495162964, "rewards/margins": 0.3527410924434662, "rewards/rejected": -1.891977071762085, "sft_loss": 1.5223100185394287, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 7.158954963371034, "learning_rate": 8.873252100389377e-07, "logits/chosen": 0.008266371674835682, "logits/rejected": 0.04366803914308548, "logps/chosen": -1.4343124628067017, "logps/rejected": -1.8190491199493408, "loss": 0.7481, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4343124628067017, "rewards/margins": 0.38473668694496155, "rewards/rejected": -1.8190491199493408, "sft_loss": 1.3876932859420776, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 5.37680682155124, "learning_rate": 8.863384473200411e-07, "logits/chosen": 0.009859999641776085, "logits/rejected": 0.09268442541360855, "logps/chosen": -1.599784016609192, "logps/rejected": -1.8659346103668213, "loss": 0.8231, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.599784016609192, "rewards/margins": 0.2661505937576294, "rewards/rejected": -1.8659346103668213, "sft_loss": 1.5741994380950928, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 5.8214732523649335, "learning_rate": 8.853479363438342e-07, "logits/chosen": 0.013542311266064644, "logits/rejected": 0.20994336903095245, "logps/chosen": -1.6315854787826538, "logps/rejected": -1.903311014175415, "loss": 0.8503, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.6315854787826538, "rewards/margins": 0.27172523736953735, "rewards/rejected": -1.903311014175415, "sft_loss": 1.5560357570648193, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 6.6522105459660565, "learning_rate": 8.843536867202588e-07, "logits/chosen": 0.016666624695062637, "logits/rejected": 0.2642180323600769, "logps/chosen": -1.5791969299316406, "logps/rejected": -2.080298900604248, "loss": 0.7648, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5791969299316406, "rewards/margins": 0.501102089881897, "rewards/rejected": -2.080298900604248, "sft_loss": 1.6128031015396118, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 7.612359407180258, "learning_rate": 8.833557080955292e-07, "logits/chosen": -0.075277179479599, "logits/rejected": 0.06636609137058258, "logps/chosen": -1.6033694744110107, "logps/rejected": -1.960039734840393, "loss": 0.7862, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.6033694744110107, "rewards/margins": 0.3566703200340271, "rewards/rejected": -1.960039734840393, "sft_loss": 1.605123519897461, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 10.894153140601103, "learning_rate": 8.823540101520381e-07, "logits/chosen": -0.028671735897660255, "logits/rejected": 0.254193514585495, "logps/chosen": -1.569838285446167, "logps/rejected": -1.9749723672866821, "loss": 0.8034, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.569838285446167, "rewards/margins": 0.4051341414451599, "rewards/rejected": -1.9749723672866821, "sft_loss": 1.5742875337600708, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 6.157352124965408, "learning_rate": 8.813486026082637e-07, "logits/chosen": -0.0440065935254097, "logits/rejected": 0.18358276784420013, "logps/chosen": -1.4764903783798218, "logps/rejected": -1.9215567111968994, "loss": 0.7354, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4764903783798218, "rewards/margins": 0.4450664520263672, "rewards/rejected": -1.9215567111968994, "sft_loss": 1.54736328125, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 13.585905028563731, "learning_rate": 8.803394952186742e-07, "logits/chosen": -0.172541543841362, "logits/rejected": 0.012338097207248211, "logps/chosen": -1.5869982242584229, "logps/rejected": -1.941210150718689, "loss": 0.7928, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5869982242584229, "rewards/margins": 0.3542119860649109, "rewards/rejected": -1.941210150718689, "sft_loss": 1.6414573192596436, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 7.80329210733547, "learning_rate": 8.793266977736342e-07, "logits/chosen": 0.037085093557834625, "logits/rejected": -0.020756395533680916, "logps/chosen": -1.590086817741394, "logps/rejected": -1.78534734249115, "loss": 0.8289, "rewards/accuracies": 0.5625, "rewards/chosen": -1.590086817741394, "rewards/margins": 0.1952604502439499, "rewards/rejected": -1.78534734249115, "sft_loss": 1.6044000387191772, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 14.757562209155708, "learning_rate": 8.783102200993085e-07, "logits/chosen": 0.013259832747280598, "logits/rejected": 0.18860404193401337, "logps/chosen": -1.5393917560577393, "logps/rejected": -1.8625844717025757, "loss": 0.7849, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5393917560577393, "rewards/margins": 0.32319265604019165, "rewards/rejected": -1.8625844717025757, "sft_loss": 1.55495023727417, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 6.292367029878871, "learning_rate": 8.772900720575683e-07, "logits/chosen": -0.017006704583764076, "logits/rejected": 0.08735963702201843, "logps/chosen": -1.5250099897384644, "logps/rejected": -1.8657004833221436, "loss": 0.7748, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.5250099897384644, "rewards/margins": 0.3406905233860016, "rewards/rejected": -1.8657004833221436, "sft_loss": 1.567348837852478, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 8.87494086129755, "learning_rate": 8.762662635458944e-07, "logits/chosen": -0.03462691232562065, "logits/rejected": 0.2086246907711029, "logps/chosen": -1.636423110961914, "logps/rejected": -1.9210647344589233, "loss": 0.8358, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.636423110961914, "rewards/margins": 0.284641832113266, "rewards/rejected": -1.9210647344589233, "sft_loss": 1.6136753559112549, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 8.215143687540499, "learning_rate": 8.752388044972811e-07, "logits/chosen": 0.007067875470966101, "logits/rejected": 0.09803800284862518, "logps/chosen": -1.427431344985962, "logps/rejected": -1.9149444103240967, "loss": 0.7381, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.427431344985962, "rewards/margins": 0.48751306533813477, "rewards/rejected": -1.9149444103240967, "sft_loss": 1.4537254571914673, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 5.651986706391311, "learning_rate": 8.74207704880141e-07, "logits/chosen": 0.016439497470855713, "logits/rejected": 0.14485010504722595, "logps/chosen": -1.5648473501205444, "logps/rejected": -2.0637013912200928, "loss": 0.7464, "rewards/accuracies": 0.625, "rewards/chosen": -1.5648473501205444, "rewards/margins": 0.4988541007041931, "rewards/rejected": -2.0637013912200928, "sft_loss": 1.6262567043304443, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 6.955691452811158, "learning_rate": 8.731729746982068e-07, "logits/chosen": 0.061527181416749954, "logits/rejected": 0.13255621492862701, "logps/chosen": -1.521337628364563, "logps/rejected": -1.8079208135604858, "loss": 0.7999, "rewards/accuracies": 0.59375, "rewards/chosen": -1.521337628364563, "rewards/margins": 0.28658327460289, "rewards/rejected": -1.8079208135604858, "sft_loss": 1.5436383485794067, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 8.781374099205129, "learning_rate": 8.721346239904355e-07, "logits/chosen": -0.11914999783039093, "logits/rejected": 0.09365762025117874, "logps/chosen": -1.501193881034851, "logps/rejected": -2.1662983894348145, "loss": 0.7068, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.501193881034851, "rewards/margins": 0.6651046872138977, "rewards/rejected": -2.1662983894348145, "sft_loss": 1.4717814922332764, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 7.722501189567642, "learning_rate": 8.710926628309101e-07, "logits/chosen": -0.05384911969304085, "logits/rejected": 0.13911385834217072, "logps/chosen": -1.549055814743042, "logps/rejected": -1.9711103439331055, "loss": 0.7493, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.549055814743042, "rewards/margins": 0.42205458879470825, "rewards/rejected": -1.9711103439331055, "sft_loss": 1.5022753477096558, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 5.185225159706669, "learning_rate": 8.700471013287424e-07, "logits/chosen": 0.04960157722234726, "logits/rejected": 0.09407901763916016, "logps/chosen": -1.5644044876098633, "logps/rejected": -1.9328142404556274, "loss": 0.7877, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5644044876098633, "rewards/margins": 0.3684097230434418, "rewards/rejected": -1.9328142404556274, "sft_loss": 1.6086210012435913, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 12.470923210647872, "learning_rate": 8.689979496279746e-07, "logits/chosen": -0.028592389076948166, "logits/rejected": 0.04402661696076393, "logps/chosen": -1.6438688039779663, "logps/rejected": -2.0395870208740234, "loss": 0.7996, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6438688039779663, "rewards/margins": 0.3957185447216034, "rewards/rejected": -2.0395870208740234, "sft_loss": 1.621983528137207, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 6.43459386412708, "learning_rate": 8.679452179074811e-07, "logits/chosen": -0.04266032949090004, "logits/rejected": 0.0896659567952156, "logps/chosen": -1.5139906406402588, "logps/rejected": -1.8963301181793213, "loss": 0.767, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5139906406402588, "rewards/margins": 0.38233935832977295, "rewards/rejected": -1.8963301181793213, "sft_loss": 1.5390372276306152, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 7.071943235618348, "learning_rate": 8.668889163808698e-07, "logits/chosen": -0.03328155726194382, "logits/rejected": 0.12428422272205353, "logps/chosen": -1.4985084533691406, "logps/rejected": -1.8861863613128662, "loss": 0.7683, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4985084533691406, "rewards/margins": 0.3876778185367584, "rewards/rejected": -1.8861863613128662, "sft_loss": 1.5723161697387695, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 6.776845222799249, "learning_rate": 8.658290552963827e-07, "logits/chosen": 0.06044235825538635, "logits/rejected": 0.09354174137115479, "logps/chosen": -1.5422546863555908, "logps/rejected": -1.9732897281646729, "loss": 0.7745, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5422546863555908, "rewards/margins": 0.4310351014137268, "rewards/rejected": -1.9732897281646729, "sft_loss": 1.5475618839263916, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 6.627823165527759, "learning_rate": 8.647656449367966e-07, "logits/chosen": 0.05167815834283829, "logits/rejected": 0.23066608607769012, "logps/chosen": -1.5691864490509033, "logps/rejected": -1.8257277011871338, "loss": 0.8033, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5691864490509033, "rewards/margins": 0.25654134154319763, "rewards/rejected": -1.8257277011871338, "sft_loss": 1.633164405822754, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 7.094457483029642, "learning_rate": 8.636986956193235e-07, "logits/chosen": 0.02749965712428093, "logits/rejected": 0.15694104135036469, "logps/chosen": -1.4772082567214966, "logps/rejected": -1.840349555015564, "loss": 0.763, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4772082567214966, "rewards/margins": 0.3631410598754883, "rewards/rejected": -1.840349555015564, "sft_loss": 1.5266755819320679, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 5.824711372725206, "learning_rate": 8.626282176955104e-07, "logits/chosen": -0.019988730549812317, "logits/rejected": 0.1297445446252823, "logps/chosen": -1.4760496616363525, "logps/rejected": -1.868734359741211, "loss": 0.7575, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4760496616363525, "rewards/margins": 0.39268460869789124, "rewards/rejected": -1.868734359741211, "sft_loss": 1.460947036743164, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 7.0292217875058505, "learning_rate": 8.615542215511389e-07, "logits/chosen": 0.08174098283052444, "logits/rejected": 0.16942276060581207, "logps/chosen": -1.4269675016403198, "logps/rejected": -1.6700265407562256, "loss": 0.7816, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4269675016403198, "rewards/margins": 0.24305900931358337, "rewards/rejected": -1.6700265407562256, "sft_loss": 1.4381697177886963, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 7.026437399051398, "learning_rate": 8.604767176061241e-07, "logits/chosen": 0.10523724555969238, "logits/rejected": 0.1786731481552124, "logps/chosen": -1.5353397130966187, "logps/rejected": -1.8051735162734985, "loss": 0.7923, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5353397130966187, "rewards/margins": 0.26983359456062317, "rewards/rejected": -1.8051735162734985, "sft_loss": 1.5529329776763916, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 5.300536603164092, "learning_rate": 8.593957163144141e-07, "logits/chosen": -0.039864469319581985, "logits/rejected": 0.12786540389060974, "logps/chosen": -1.4416911602020264, "logps/rejected": -1.856658935546875, "loss": 0.7518, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4416911602020264, "rewards/margins": 0.41496768593788147, "rewards/rejected": -1.856658935546875, "sft_loss": 1.5043244361877441, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 5.769861279523792, "learning_rate": 8.58311228163888e-07, "logits/chosen": 0.0461508110165596, "logits/rejected": 0.14603093266487122, "logps/chosen": -1.4991919994354248, "logps/rejected": -1.7921984195709229, "loss": 0.7867, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4991919994354248, "rewards/margins": 0.2930064797401428, "rewards/rejected": -1.7921984195709229, "sft_loss": 1.575452208518982, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 6.347038542448722, "learning_rate": 8.57223263676255e-07, "logits/chosen": -0.11934056133031845, "logits/rejected": 0.04466427490115166, "logps/chosen": -1.4615641832351685, "logps/rejected": -2.040121555328369, "loss": 0.7071, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4615641832351685, "rewards/margins": 0.5785572528839111, "rewards/rejected": -2.040121555328369, "sft_loss": 1.516384243965149, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 5.7573932122480915, "learning_rate": 8.561318334069511e-07, "logits/chosen": 0.04206203669309616, "logits/rejected": 0.21555328369140625, "logps/chosen": -1.4878902435302734, "logps/rejected": -1.8362696170806885, "loss": 0.7746, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4878902435302734, "rewards/margins": 0.3483791947364807, "rewards/rejected": -1.8362696170806885, "sft_loss": 1.5137768983840942, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 6.38880414039442, "learning_rate": 8.550369479450375e-07, "logits/chosen": -0.016018684953451157, "logits/rejected": 0.16926954686641693, "logps/chosen": -1.5211659669876099, "logps/rejected": -1.8932441473007202, "loss": 0.768, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5211659669876099, "rewards/margins": 0.3720782995223999, "rewards/rejected": -1.8932441473007202, "sft_loss": 1.5668119192123413, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 9.065425551499487, "learning_rate": 8.539386179130977e-07, "logits/chosen": 0.022740861400961876, "logits/rejected": 0.11544118821620941, "logps/chosen": -1.5713417530059814, "logps/rejected": -1.9112104177474976, "loss": 0.7697, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5713417530059814, "rewards/margins": 0.3398687243461609, "rewards/rejected": -1.9112104177474976, "sft_loss": 1.5196139812469482, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 8.349458785483943, "learning_rate": 8.528368539671347e-07, "logits/chosen": -0.05050767585635185, "logits/rejected": 0.15125596523284912, "logps/chosen": -1.4881436824798584, "logps/rejected": -2.0509819984436035, "loss": 0.7561, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4881436824798584, "rewards/margins": 0.5628384351730347, "rewards/rejected": -2.0509819984436035, "sft_loss": 1.5483183860778809, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 4.832202775402732, "learning_rate": 8.51731666796467e-07, "logits/chosen": 0.12971444427967072, "logits/rejected": 0.18138661980628967, "logps/chosen": -1.5749452114105225, "logps/rejected": -1.843732237815857, "loss": 0.7976, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5749452114105225, "rewards/margins": 0.2687872648239136, "rewards/rejected": -1.843732237815857, "sft_loss": 1.5671472549438477, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 6.777056574304845, "learning_rate": 8.506230671236254e-07, "logits/chosen": -0.030573034659028053, "logits/rejected": 0.05934322997927666, "logps/chosen": -1.5336406230926514, "logps/rejected": -1.7632644176483154, "loss": 0.807, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5336406230926514, "rewards/margins": 0.22962398827075958, "rewards/rejected": -1.7632644176483154, "sft_loss": 1.581390619277954, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 6.352995372813855, "learning_rate": 8.495110657042488e-07, "logits/chosen": 0.03974000737071037, "logits/rejected": 0.19081595540046692, "logps/chosen": -1.5683252811431885, "logps/rejected": -2.034064531326294, "loss": 0.7385, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5683252811431885, "rewards/margins": 0.46573933959007263, "rewards/rejected": -2.034064531326294, "sft_loss": 1.5989660024642944, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 8.834761360627454, "learning_rate": 8.483956733269799e-07, "logits/chosen": 0.0018174918368458748, "logits/rejected": 0.11887848377227783, "logps/chosen": -1.59364914894104, "logps/rejected": -2.0107507705688477, "loss": 0.8162, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.59364914894104, "rewards/margins": 0.4171017110347748, "rewards/rejected": -2.0107507705688477, "sft_loss": 1.6244335174560547, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 13.742653789295929, "learning_rate": 8.472769008133602e-07, "logits/chosen": -0.15432365238666534, "logits/rejected": 0.0022293240763247013, "logps/chosen": -1.6489982604980469, "logps/rejected": -1.913196325302124, "loss": 0.8256, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.6489982604980469, "rewards/margins": 0.2641982138156891, "rewards/rejected": -1.913196325302124, "sft_loss": 1.6017036437988281, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 10.516225674236507, "learning_rate": 8.461547590177259e-07, "logits/chosen": -0.031183335930109024, "logits/rejected": 0.12132366746664047, "logps/chosen": -1.545526146888733, "logps/rejected": -1.9726976156234741, "loss": 0.7701, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.545526146888733, "rewards/margins": 0.42717185616493225, "rewards/rejected": -1.9726976156234741, "sft_loss": 1.5706778764724731, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 10.663044718505018, "learning_rate": 8.450292588271014e-07, "logits/chosen": 0.02532017230987549, "logits/rejected": 0.14811861515045166, "logps/chosen": -1.700233817100525, "logps/rejected": -2.023632526397705, "loss": 0.8157, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.700233817100525, "rewards/margins": 0.32339876890182495, "rewards/rejected": -2.023632526397705, "sft_loss": 1.6142208576202393, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 6.122243361334991, "learning_rate": 8.439004111610945e-07, "logits/chosen": -0.0030615911819040775, "logits/rejected": 0.09635351598262787, "logps/chosen": -1.5432813167572021, "logps/rejected": -1.9290262460708618, "loss": 0.7876, "rewards/accuracies": 0.625, "rewards/chosen": -1.5432813167572021, "rewards/margins": 0.38574501872062683, "rewards/rejected": -1.9290262460708618, "sft_loss": 1.5193164348602295, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 9.26611719755536, "learning_rate": 8.427682269717901e-07, "logits/chosen": -0.0876588523387909, "logits/rejected": 0.09467057138681412, "logps/chosen": -1.586876630783081, "logps/rejected": -1.9555718898773193, "loss": 0.7772, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.586876630783081, "rewards/margins": 0.36869534850120544, "rewards/rejected": -1.9555718898773193, "sft_loss": 1.6100679636001587, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 10.517127741740126, "learning_rate": 8.416327172436446e-07, "logits/chosen": -0.1266523003578186, "logits/rejected": 0.03176708519458771, "logps/chosen": -1.5891125202178955, "logps/rejected": -1.8949331045150757, "loss": 0.7878, "rewards/accuracies": 0.625, "rewards/chosen": -1.5891125202178955, "rewards/margins": 0.3058207035064697, "rewards/rejected": -1.8949331045150757, "sft_loss": 1.5256074666976929, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 6.224246438676703, "learning_rate": 8.404938929933778e-07, "logits/chosen": -0.006023736204952002, "logits/rejected": 0.18766924738883972, "logps/chosen": -1.505459189414978, "logps/rejected": -2.0570313930511475, "loss": 0.7306, "rewards/accuracies": 0.71875, "rewards/chosen": -1.505459189414978, "rewards/margins": 0.5515719652175903, "rewards/rejected": -2.0570313930511475, "sft_loss": 1.5429878234863281, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 8.291228486959811, "learning_rate": 8.39351765269868e-07, "logits/chosen": -0.04823196679353714, "logits/rejected": 0.03718255087733269, "logps/chosen": -1.4801435470581055, "logps/rejected": -1.918800950050354, "loss": 0.7626, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4801435470581055, "rewards/margins": 0.4386575222015381, "rewards/rejected": -1.918800950050354, "sft_loss": 1.4620674848556519, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 8.240408965783027, "learning_rate": 8.382063451540431e-07, "logits/chosen": -0.049523863941431046, "logits/rejected": 0.18543100357055664, "logps/chosen": -1.5200674533843994, "logps/rejected": -1.9326683282852173, "loss": 0.7451, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5200674533843994, "rewards/margins": 0.4126008152961731, "rewards/rejected": -1.9326683282852173, "sft_loss": 1.609426498413086, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 6.89322791423342, "learning_rate": 8.370576437587742e-07, "logits/chosen": 0.02002127841114998, "logits/rejected": 0.08521411567926407, "logps/chosen": -1.556764006614685, "logps/rejected": -1.8716866970062256, "loss": 0.7656, "rewards/accuracies": 0.65625, "rewards/chosen": -1.556764006614685, "rewards/margins": 0.31492260098457336, "rewards/rejected": -1.8716866970062256, "sft_loss": 1.5431832075119019, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 5.947005797172455, "learning_rate": 8.359056722287674e-07, "logits/chosen": -0.1342497169971466, "logits/rejected": 0.18603236973285675, "logps/chosen": -1.5675729513168335, "logps/rejected": -1.9816787242889404, "loss": 0.7561, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5675729513168335, "rewards/margins": 0.41410571336746216, "rewards/rejected": -1.9816787242889404, "sft_loss": 1.6156734228134155, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 4.679167833965981, "learning_rate": 8.347504417404553e-07, "logits/chosen": -0.011838539503514767, "logits/rejected": 0.17623288929462433, "logps/chosen": -1.5792244672775269, "logps/rejected": -1.8999712467193604, "loss": 0.7917, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5792244672775269, "rewards/margins": 0.32074683904647827, "rewards/rejected": -1.8999712467193604, "sft_loss": 1.568099021911621, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 6.445874483655265, "learning_rate": 8.335919635018893e-07, "logits/chosen": -0.12268956005573273, "logits/rejected": 0.0336979515850544, "logps/chosen": -1.5386877059936523, "logps/rejected": -1.8510468006134033, "loss": 0.7781, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5386877059936523, "rewards/margins": 0.3123589754104614, "rewards/rejected": -1.8510468006134033, "sft_loss": 1.5555994510650635, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 5.60248241765723, "learning_rate": 8.324302487526303e-07, "logits/chosen": -0.07647466659545898, "logits/rejected": 0.030324572697281837, "logps/chosen": -1.5298904180526733, "logps/rejected": -1.8408399820327759, "loss": 0.7912, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5298904180526733, "rewards/margins": 0.31094953417778015, "rewards/rejected": -1.8408399820327759, "sft_loss": 1.5781582593917847, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 6.872062796203317, "learning_rate": 8.312653087636398e-07, "logits/chosen": -0.0939202755689621, "logits/rejected": 0.0038464218378067017, "logps/chosen": -1.417473554611206, "logps/rejected": -1.8827733993530273, "loss": 0.7193, "rewards/accuracies": 0.71875, "rewards/chosen": -1.417473554611206, "rewards/margins": 0.46529990434646606, "rewards/rejected": -1.8827733993530273, "sft_loss": 1.4600465297698975, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 8.606184103445024, "learning_rate": 8.300971548371711e-07, "logits/chosen": -0.23066473007202148, "logits/rejected": 0.0176524817943573, "logps/chosen": -1.57735276222229, "logps/rejected": -1.9532798528671265, "loss": 0.7679, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.57735276222229, "rewards/margins": 0.3759271502494812, "rewards/rejected": -1.9532798528671265, "sft_loss": 1.6187171936035156, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 9.883368947526794, "learning_rate": 8.289257983066582e-07, "logits/chosen": -0.1374153196811676, "logits/rejected": 0.029422109946608543, "logps/chosen": -1.4726645946502686, "logps/rejected": -1.95355224609375, "loss": 0.7126, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4726645946502686, "rewards/margins": 0.4808877110481262, "rewards/rejected": -1.95355224609375, "sft_loss": 1.534286379814148, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 10.490262214467762, "learning_rate": 8.277512505366077e-07, "logits/chosen": -0.1872721016407013, "logits/rejected": 0.03306983783841133, "logps/chosen": -1.5944609642028809, "logps/rejected": -2.119265079498291, "loss": 0.7213, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5944609642028809, "rewards/margins": 0.5248039960861206, "rewards/rejected": -2.119265079498291, "sft_loss": 1.6122589111328125, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 6.302364668930056, "learning_rate": 8.265735229224868e-07, "logits/chosen": -0.09055320918560028, "logits/rejected": 0.04974263533949852, "logps/chosen": -1.5618771314620972, "logps/rejected": -2.0841903686523438, "loss": 0.7254, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5618771314620972, "rewards/margins": 0.5223131775856018, "rewards/rejected": -2.0841903686523438, "sft_loss": 1.5617009401321411, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 5.105878610572517, "learning_rate": 8.253926268906144e-07, "logits/chosen": -0.19473005831241608, "logits/rejected": 0.002607308328151703, "logps/chosen": -1.5916911363601685, "logps/rejected": -2.239445924758911, "loss": 0.7145, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5916911363601685, "rewards/margins": 0.6477545499801636, "rewards/rejected": -2.239445924758911, "sft_loss": 1.6394460201263428, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 8.825394306033553, "learning_rate": 8.242085738980487e-07, "logits/chosen": -0.10703225433826447, "logits/rejected": 0.14174774289131165, "logps/chosen": -1.6393458843231201, "logps/rejected": -2.130890369415283, "loss": 0.7543, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6393458843231201, "rewards/margins": 0.49154457449913025, "rewards/rejected": -2.130890369415283, "sft_loss": 1.696245789527893, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 6.03901388870023, "learning_rate": 8.230213754324772e-07, "logits/chosen": -0.10481816530227661, "logits/rejected": -0.01591634750366211, "logps/chosen": -1.497739315032959, "logps/rejected": -1.9503190517425537, "loss": 0.728, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.497739315032959, "rewards/margins": 0.4525798261165619, "rewards/rejected": -1.9503190517425537, "sft_loss": 1.525494933128357, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 7.652054071559482, "learning_rate": 8.218310430121045e-07, "logits/chosen": -0.12107083946466446, "logits/rejected": -0.07771385461091995, "logps/chosen": -1.5867562294006348, "logps/rejected": -1.9755197763442993, "loss": 0.7759, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5867562294006348, "rewards/margins": 0.38876357674598694, "rewards/rejected": -1.9755197763442993, "sft_loss": 1.6299279928207397, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 6.276052055732979, "learning_rate": 8.20637588185541e-07, "logits/chosen": -0.051191676408052444, "logits/rejected": 0.035134200006723404, "logps/chosen": -1.4719064235687256, "logps/rejected": -2.1249301433563232, "loss": 0.6928, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4719064235687256, "rewards/margins": 0.6530237197875977, "rewards/rejected": -2.1249301433563232, "sft_loss": 1.525087594985962, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 5.5824053342719635, "learning_rate": 8.194410225316906e-07, "logits/chosen": -0.1287408024072647, "logits/rejected": 0.04822666198015213, "logps/chosen": -1.4870167970657349, "logps/rejected": -1.914961576461792, "loss": 0.7473, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4870167970657349, "rewards/margins": 0.42794495820999146, "rewards/rejected": -1.914961576461792, "sft_loss": 1.521883249282837, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 7.327638398794357, "learning_rate": 8.182413576596385e-07, "logits/chosen": -0.0012739896774291992, "logits/rejected": 0.10261593759059906, "logps/chosen": -1.4147354364395142, "logps/rejected": -1.8603957891464233, "loss": 0.7199, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4147354364395142, "rewards/margins": 0.4456602931022644, "rewards/rejected": -1.8603957891464233, "sft_loss": 1.462917447090149, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 11.043031827539602, "learning_rate": 8.170386052085389e-07, "logits/chosen": 0.015211904421448708, "logits/rejected": 0.15176545083522797, "logps/chosen": -1.564788818359375, "logps/rejected": -2.012122631072998, "loss": 0.7649, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.564788818359375, "rewards/margins": 0.4473339021205902, "rewards/rejected": -2.012122631072998, "sft_loss": 1.5762453079223633, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 8.459188558761161, "learning_rate": 8.158327768475008e-07, "logits/chosen": -0.04446374252438545, "logits/rejected": 0.14775268733501434, "logps/chosen": -1.5686583518981934, "logps/rejected": -1.9097172021865845, "loss": 0.7869, "rewards/accuracies": 0.625, "rewards/chosen": -1.5686583518981934, "rewards/margins": 0.3410589396953583, "rewards/rejected": -1.9097172021865845, "sft_loss": 1.5495915412902832, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 9.330051888896845, "learning_rate": 8.146238842754767e-07, "logits/chosen": -0.1098170280456543, "logits/rejected": 0.010443707928061485, "logps/chosen": -1.5835919380187988, "logps/rejected": -1.9340200424194336, "loss": 0.7724, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5835919380187988, "rewards/margins": 0.3504280149936676, "rewards/rejected": -1.9340200424194336, "sft_loss": 1.579437017440796, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 7.961376866292324, "learning_rate": 8.134119392211476e-07, "logits/chosen": -0.01160441618412733, "logits/rejected": 0.1690918505191803, "logps/chosen": -1.484935998916626, "logps/rejected": -2.096376895904541, "loss": 0.7013, "rewards/accuracies": 0.71875, "rewards/chosen": -1.484935998916626, "rewards/margins": 0.611440896987915, "rewards/rejected": -2.096376895904541, "sft_loss": 1.5153307914733887, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 14.520178603275648, "learning_rate": 8.121969534428094e-07, "logits/chosen": -0.11988396942615509, "logits/rejected": 0.057445477694272995, "logps/chosen": -1.6442123651504517, "logps/rejected": -2.076686382293701, "loss": 0.8212, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6442123651504517, "rewards/margins": 0.4324740767478943, "rewards/rejected": -2.076686382293701, "sft_loss": 1.6265455484390259, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.3208009600639343, "eval_logits/rejected": 0.4351368248462677, "eval_logps/chosen": -1.5506218671798706, "eval_logps/rejected": -2.030233144760132, "eval_loss": 0.7517605423927307, "eval_rewards/accuracies": 0.6543026566505432, "eval_rewards/chosen": -1.5506218671798706, "eval_rewards/margins": 0.4796109199523926, "eval_rewards/rejected": -2.030233144760132, "eval_runtime": 43.1071, "eval_samples_per_second": 31.201, "eval_sft_loss": 1.556130290031433, "eval_steps_per_second": 7.818, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 10.173748547209174, "learning_rate": 8.109789387282599e-07, "logits/chosen": -0.05995626375079155, "logits/rejected": 0.03771807625889778, "logps/chosen": -1.5748722553253174, "logps/rejected": -1.940760850906372, "loss": 0.8069, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5748722553253174, "rewards/margins": 0.36588844656944275, "rewards/rejected": -1.940760850906372, "sft_loss": 1.5942885875701904, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 10.218445802624762, "learning_rate": 8.097579068946827e-07, "logits/chosen": -0.022941067814826965, "logits/rejected": 0.10576201975345612, "logps/chosen": -1.4721992015838623, "logps/rejected": -1.882412314414978, "loss": 0.756, "rewards/accuracies": 0.625, "rewards/chosen": -1.4721992015838623, "rewards/margins": 0.4102131426334381, "rewards/rejected": -1.882412314414978, "sft_loss": 1.5250442028045654, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 6.229362546840197, "learning_rate": 8.085338697885344e-07, "logits/chosen": -0.04725777357816696, "logits/rejected": 0.1172998696565628, "logps/chosen": -1.4901845455169678, "logps/rejected": -1.9726600646972656, "loss": 0.7338, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4901845455169678, "rewards/margins": 0.48247528076171875, "rewards/rejected": -1.9726600646972656, "sft_loss": 1.4658830165863037, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 7.439354392035033, "learning_rate": 8.073068392854282e-07, "logits/chosen": -0.1446385681629181, "logits/rejected": 0.09384318441152573, "logps/chosen": -1.5812222957611084, "logps/rejected": -2.0778231620788574, "loss": 0.7333, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5812222957611084, "rewards/margins": 0.49660077691078186, "rewards/rejected": -2.0778231620788574, "sft_loss": 1.5531771183013916, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 6.9687121656640345, "learning_rate": 8.060768272900193e-07, "logits/chosen": -0.0020522386766970158, "logits/rejected": 0.165645033121109, "logps/chosen": -1.5605833530426025, "logps/rejected": -2.1044960021972656, "loss": 0.7513, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5605833530426025, "rewards/margins": 0.5439127087593079, "rewards/rejected": -2.1044960021972656, "sft_loss": 1.6236639022827148, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 6.027964332918541, "learning_rate": 8.0484384573589e-07, "logits/chosen": -0.08813516795635223, "logits/rejected": -0.056090403348207474, "logps/chosen": -1.5097250938415527, "logps/rejected": -1.8779700994491577, "loss": 0.7729, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.5097250938415527, "rewards/margins": 0.3682449460029602, "rewards/rejected": -1.8779700994491577, "sft_loss": 1.557384729385376, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 9.311901890327995, "learning_rate": 8.03607906585432e-07, "logits/chosen": -0.10653889179229736, "logits/rejected": 0.09843524545431137, "logps/chosen": -1.5623483657836914, "logps/rejected": -2.0298380851745605, "loss": 0.7674, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5623483657836914, "rewards/margins": 0.4674898087978363, "rewards/rejected": -2.0298380851745605, "sft_loss": 1.5933910608291626, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 30.402990975973324, "learning_rate": 8.023690218297329e-07, "logits/chosen": -0.1777440905570984, "logits/rejected": -0.09381499886512756, "logps/chosen": -1.6253573894500732, "logps/rejected": -1.994631052017212, "loss": 0.8231, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6253573894500732, "rewards/margins": 0.36927351355552673, "rewards/rejected": -1.994631052017212, "sft_loss": 1.6185871362686157, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 7.389790275843495, "learning_rate": 8.01127203488458e-07, "logits/chosen": 0.0011692598927766085, "logits/rejected": 0.05350467562675476, "logps/chosen": -1.5495258569717407, "logps/rejected": -2.001826524734497, "loss": 0.7341, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5495258569717407, "rewards/margins": 0.45230036973953247, "rewards/rejected": -2.001826524734497, "sft_loss": 1.5058408975601196, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 8.709186910145224, "learning_rate": 7.998824636097339e-07, "logits/chosen": -0.15906846523284912, "logits/rejected": 0.0037447966169565916, "logps/chosen": -1.5510900020599365, "logps/rejected": -1.9378044605255127, "loss": 0.7899, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5510900020599365, "rewards/margins": 0.3867144286632538, "rewards/rejected": -1.9378044605255127, "sft_loss": 1.6270862817764282, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 7.773885609667464, "learning_rate": 7.986348142700328e-07, "logits/chosen": -0.04642852395772934, "logits/rejected": 0.11252977699041367, "logps/chosen": -1.5034029483795166, "logps/rejected": -1.9795360565185547, "loss": 0.7482, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5034029483795166, "rewards/margins": 0.4761331081390381, "rewards/rejected": -1.9795360565185547, "sft_loss": 1.5702406167984009, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 10.043024517154269, "learning_rate": 7.973842675740539e-07, "logits/chosen": -0.0061192139983177185, "logits/rejected": 0.06203888729214668, "logps/chosen": -1.5721771717071533, "logps/rejected": -2.114776372909546, "loss": 0.7306, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5721771717071533, "rewards/margins": 0.5425990223884583, "rewards/rejected": -2.114776372909546, "sft_loss": 1.6257705688476562, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 8.40062323844928, "learning_rate": 7.961308356546066e-07, "logits/chosen": -0.053060900419950485, "logits/rejected": 0.11354587972164154, "logps/chosen": -1.5385710000991821, "logps/rejected": -2.0182838439941406, "loss": 0.7515, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5385710000991821, "rewards/margins": 0.47971296310424805, "rewards/rejected": -2.0182838439941406, "sft_loss": 1.5186246633529663, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 8.124171343349465, "learning_rate": 7.948745306724931e-07, "logits/chosen": -0.04503004625439644, "logits/rejected": 0.1332114040851593, "logps/chosen": -1.460111379623413, "logps/rejected": -2.0412354469299316, "loss": 0.6911, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.460111379623413, "rewards/margins": 0.5811241269111633, "rewards/rejected": -2.0412354469299316, "sft_loss": 1.4853391647338867, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 10.293968415502416, "learning_rate": 7.936153648163897e-07, "logits/chosen": -0.06999489665031433, "logits/rejected": 0.058819591999053955, "logps/chosen": -1.5956321954727173, "logps/rejected": -2.0661768913269043, "loss": 0.7658, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5956321954727173, "rewards/margins": 0.4705447256565094, "rewards/rejected": -2.0661768913269043, "sft_loss": 1.6618207693099976, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 7.542204256175726, "learning_rate": 7.92353350302729e-07, "logits/chosen": -0.1348196566104889, "logits/rejected": 0.06260451674461365, "logps/chosen": -1.4696956872940063, "logps/rejected": -1.954728126525879, "loss": 0.7134, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4696956872940063, "rewards/margins": 0.4850325584411621, "rewards/rejected": -1.954728126525879, "sft_loss": 1.5264543294906616, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 15.772911994217747, "learning_rate": 7.910884993755816e-07, "logits/chosen": -0.10263931751251221, "logits/rejected": 0.03259637579321861, "logps/chosen": -1.5333244800567627, "logps/rejected": -2.0411601066589355, "loss": 0.7412, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5333244800567627, "rewards/margins": 0.5078356862068176, "rewards/rejected": -2.0411601066589355, "sft_loss": 1.5663384199142456, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 7.748107178627658, "learning_rate": 7.898208243065367e-07, "logits/chosen": -0.15963055193424225, "logits/rejected": -0.15033797919750214, "logps/chosen": -1.5429985523223877, "logps/rejected": -1.884341835975647, "loss": 0.7973, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5429985523223877, "rewards/margins": 0.3413431942462921, "rewards/rejected": -1.884341835975647, "sft_loss": 1.6676254272460938, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 7.845536338154415, "learning_rate": 7.88550337394583e-07, "logits/chosen": -0.14314241707324982, "logits/rejected": 0.03491301089525223, "logps/chosen": -1.6934551000595093, "logps/rejected": -2.0634000301361084, "loss": 0.7966, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6934551000595093, "rewards/margins": 0.36994481086730957, "rewards/rejected": -2.0634000301361084, "sft_loss": 1.7066478729248047, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 9.466314948702411, "learning_rate": 7.872770509659905e-07, "logits/chosen": -0.037253573536872864, "logits/rejected": 0.030082762241363525, "logps/chosen": -1.6872516870498657, "logps/rejected": -2.073812246322632, "loss": 0.7789, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6872516870498657, "rewards/margins": 0.3865607678890228, "rewards/rejected": -2.073812246322632, "sft_loss": 1.6730873584747314, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 9.17513602595643, "learning_rate": 7.860009773741896e-07, "logits/chosen": -0.030986011028289795, "logits/rejected": 0.11619605123996735, "logps/chosen": -1.6120907068252563, "logps/rejected": -2.146749973297119, "loss": 0.7352, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6120907068252563, "rewards/margins": 0.5346594452857971, "rewards/rejected": -2.146749973297119, "sft_loss": 1.5929841995239258, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 9.274219625284895, "learning_rate": 7.84722128999652e-07, "logits/chosen": -0.1234712153673172, "logits/rejected": 0.05881828814744949, "logps/chosen": -1.5510352849960327, "logps/rejected": -2.2057430744171143, "loss": 0.7389, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5510352849960327, "rewards/margins": 0.6547077894210815, "rewards/rejected": -2.2057430744171143, "sft_loss": 1.5953401327133179, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 7.515236126774249, "learning_rate": 7.834405182497699e-07, "logits/chosen": -0.014821672812104225, "logits/rejected": 0.05407888814806938, "logps/chosen": -1.5893491506576538, "logps/rejected": -2.0642919540405273, "loss": 0.7761, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5893491506576538, "rewards/margins": 0.47494274377822876, "rewards/rejected": -2.0642919540405273, "sft_loss": 1.6317846775054932, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 10.313518147628233, "learning_rate": 7.821561575587368e-07, "logits/chosen": -0.10846789926290512, "logits/rejected": -0.05322038009762764, "logps/chosen": -1.5747969150543213, "logps/rejected": -1.9697825908660889, "loss": 0.7619, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5747969150543213, "rewards/margins": 0.3949858248233795, "rewards/rejected": -1.9697825908660889, "sft_loss": 1.6365448236465454, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 6.124317665981363, "learning_rate": 7.808690593874254e-07, "logits/chosen": -0.09734370559453964, "logits/rejected": -0.015683285892009735, "logps/chosen": -1.5244369506835938, "logps/rejected": -2.0510950088500977, "loss": 0.7147, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5244369506835938, "rewards/margins": 0.5266581773757935, "rewards/rejected": -2.0510950088500977, "sft_loss": 1.5464966297149658, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 11.992802184306996, "learning_rate": 7.79579236223268e-07, "logits/chosen": -0.05712106078863144, "logits/rejected": 0.2383485734462738, "logps/chosen": -1.57157301902771, "logps/rejected": -2.1517395973205566, "loss": 0.718, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.57157301902771, "rewards/margins": 0.5801664590835571, "rewards/rejected": -2.1517395973205566, "sft_loss": 1.6144263744354248, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 7.6770638959720925, "learning_rate": 7.782867005801346e-07, "logits/chosen": -0.0391792431473732, "logits/rejected": 0.16381685435771942, "logps/chosen": -1.5745493173599243, "logps/rejected": -2.129084825515747, "loss": 0.7275, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5745493173599243, "rewards/margins": 0.5545353293418884, "rewards/rejected": -2.129084825515747, "sft_loss": 1.5876792669296265, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 12.407542192109783, "learning_rate": 7.769914649982117e-07, "logits/chosen": -0.06183594465255737, "logits/rejected": 0.10918141901493073, "logps/chosen": -1.5881478786468506, "logps/rejected": -2.0792908668518066, "loss": 0.7569, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5881478786468506, "rewards/margins": 0.49114298820495605, "rewards/rejected": -2.0792908668518066, "sft_loss": 1.5978810787200928, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 11.266448501825845, "learning_rate": 7.756935420438803e-07, "logits/chosen": -0.04740814119577408, "logits/rejected": 0.06394124031066895, "logps/chosen": -1.4710605144500732, "logps/rejected": -2.075150966644287, "loss": 0.7122, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4710605144500732, "rewards/margins": 0.60409015417099, "rewards/rejected": -2.075150966644287, "sft_loss": 1.5502369403839111, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 6.3598502887288495, "learning_rate": 7.743929443095951e-07, "logits/chosen": -0.08250895887613297, "logits/rejected": -0.00883649755269289, "logps/chosen": -1.6141011714935303, "logps/rejected": -2.086073160171509, "loss": 0.7469, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6141011714935303, "rewards/margins": 0.4719718098640442, "rewards/rejected": -2.086073160171509, "sft_loss": 1.638538122177124, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 8.184105771942406, "learning_rate": 7.730896844137609e-07, "logits/chosen": -0.001839971519075334, "logits/rejected": 0.08463334292173386, "logps/chosen": -1.6715660095214844, "logps/rejected": -2.067680835723877, "loss": 0.7798, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6715660095214844, "rewards/margins": 0.39611494541168213, "rewards/rejected": -2.067680835723877, "sft_loss": 1.7208125591278076, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 13.082849403503461, "learning_rate": 7.717837750006106e-07, "logits/chosen": -0.12100370228290558, "logits/rejected": 0.0018446326721459627, "logps/chosen": -1.5938975811004639, "logps/rejected": -2.149057388305664, "loss": 0.7475, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5938975811004639, "rewards/margins": 0.5551599264144897, "rewards/rejected": -2.149057388305664, "sft_loss": 1.62832510471344, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 7.031102804155232, "learning_rate": 7.704752287400832e-07, "logits/chosen": -0.056700628250837326, "logits/rejected": 0.15962740778923035, "logps/chosen": -1.5927625894546509, "logps/rejected": -2.215181589126587, "loss": 0.7246, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5927625894546509, "rewards/margins": 0.6224191188812256, "rewards/rejected": -2.215181589126587, "sft_loss": 1.6102997064590454, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 5.170176400761721, "learning_rate": 7.691640583277004e-07, "logits/chosen": -0.03960901498794556, "logits/rejected": 0.17432275414466858, "logps/chosen": -1.5560939311981201, "logps/rejected": -2.1399526596069336, "loss": 0.7365, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5560939311981201, "rewards/margins": 0.583858847618103, "rewards/rejected": -2.1399526596069336, "sft_loss": 1.5941691398620605, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 5.973256499847812, "learning_rate": 7.678502764844433e-07, "logits/chosen": -0.08819224685430527, "logits/rejected": 0.12655667960643768, "logps/chosen": -1.596260905265808, "logps/rejected": -2.049752712249756, "loss": 0.7411, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.596260905265808, "rewards/margins": 0.453491747379303, "rewards/rejected": -2.049752712249756, "sft_loss": 1.6357721090316772, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 5.552972862726242, "learning_rate": 7.665338959566288e-07, "logits/chosen": -0.08877646923065186, "logits/rejected": 0.017213106155395508, "logps/chosen": -1.5630794763565063, "logps/rejected": -2.1341676712036133, "loss": 0.7235, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5630794763565063, "rewards/margins": 0.5710882544517517, "rewards/rejected": -2.1341676712036133, "sft_loss": 1.634744644165039, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 15.403941224331126, "learning_rate": 7.652149295157868e-07, "logits/chosen": 0.03003104403614998, "logits/rejected": 0.20993654429912567, "logps/chosen": -1.6422191858291626, "logps/rejected": -2.0472705364227295, "loss": 0.7629, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6422191858291626, "rewards/margins": 0.40505123138427734, "rewards/rejected": -2.0472705364227295, "sft_loss": 1.6169044971466064, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 10.54276739205797, "learning_rate": 7.638933899585354e-07, "logits/chosen": 0.11677481979131699, "logits/rejected": 0.18966534733772278, "logps/chosen": -1.5932385921478271, "logps/rejected": -2.098334789276123, "loss": 0.7532, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5932385921478271, "rewards/margins": 0.5050963163375854, "rewards/rejected": -2.098334789276123, "sft_loss": 1.6820847988128662, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 9.844849482330293, "learning_rate": 7.625692901064573e-07, "logits/chosen": 0.033648911863565445, "logits/rejected": 0.14953699707984924, "logps/chosen": -1.5926415920257568, "logps/rejected": -2.184016227722168, "loss": 0.742, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5926415920257568, "rewards/margins": 0.5913747549057007, "rewards/rejected": -2.184016227722168, "sft_loss": 1.659144401550293, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 8.033187049359737, "learning_rate": 7.61242642805975e-07, "logits/chosen": -0.047178782522678375, "logits/rejected": -0.054433345794677734, "logps/chosen": -1.5915645360946655, "logps/rejected": -2.07096529006958, "loss": 0.7669, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5915645360946655, "rewards/margins": 0.47940054535865784, "rewards/rejected": -2.07096529006958, "sft_loss": 1.663214087486267, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 5.821876711785263, "learning_rate": 7.599134609282266e-07, "logits/chosen": -0.09211932867765427, "logits/rejected": 0.137475848197937, "logps/chosen": -1.4992023706436157, "logps/rejected": -1.9810651540756226, "loss": 0.7215, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4992023706436157, "rewards/margins": 0.4818628430366516, "rewards/rejected": -1.9810651540756226, "sft_loss": 1.503271222114563, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 8.29166943330852, "learning_rate": 7.585817573689402e-07, "logits/chosen": -0.13009805977344513, "logits/rejected": 0.018473362550139427, "logps/chosen": -1.4088270664215088, "logps/rejected": -2.037583112716675, "loss": 0.6937, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4088270664215088, "rewards/margins": 0.6287561655044556, "rewards/rejected": -2.037583112716675, "sft_loss": 1.4674748182296753, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 12.71671991848412, "learning_rate": 7.572475450483098e-07, "logits/chosen": -0.08145278692245483, "logits/rejected": 0.029804622754454613, "logps/chosen": -1.6682164669036865, "logps/rejected": -2.2155566215515137, "loss": 0.7423, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6682164669036865, "rewards/margins": 0.5473401546478271, "rewards/rejected": -2.2155566215515137, "sft_loss": 1.6055316925048828, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 10.507466652353193, "learning_rate": 7.559108369108689e-07, "logits/chosen": -0.15689218044281006, "logits/rejected": -0.0009407728794030845, "logps/chosen": -1.48911714553833, "logps/rejected": -1.9215772151947021, "loss": 0.7739, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.48911714553833, "rewards/margins": 0.43245992064476013, "rewards/rejected": -1.9215772151947021, "sft_loss": 1.5419213771820068, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 7.635644847013896, "learning_rate": 7.54571645925366e-07, "logits/chosen": -0.15236765146255493, "logits/rejected": 0.11053421348333359, "logps/chosen": -1.4636036157608032, "logps/rejected": -2.061983585357666, "loss": 0.7064, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4636036157608032, "rewards/margins": 0.5983799695968628, "rewards/rejected": -2.061983585357666, "sft_loss": 1.5207092761993408, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 10.675849410884368, "learning_rate": 7.532299850846378e-07, "logits/chosen": -0.15605592727661133, "logits/rejected": 0.015971161425113678, "logps/chosen": -1.5459859371185303, "logps/rejected": -2.1895651817321777, "loss": 0.7278, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5459859371185303, "rewards/margins": 0.6435791254043579, "rewards/rejected": -2.1895651817321777, "sft_loss": 1.5461068153381348, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 10.584034897403765, "learning_rate": 7.518858674054838e-07, "logits/chosen": -0.13053454458713531, "logits/rejected": 0.09670326858758926, "logps/chosen": -1.5013011693954468, "logps/rejected": -2.0812840461730957, "loss": 0.7236, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5013011693954468, "rewards/margins": 0.5799828767776489, "rewards/rejected": -2.0812840461730957, "sft_loss": 1.5167697668075562, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 6.399045718409167, "learning_rate": 7.505393059285394e-07, "logits/chosen": -0.1140945702791214, "logits/rejected": 0.08292241394519806, "logps/chosen": -1.5222761631011963, "logps/rejected": -2.046513795852661, "loss": 0.7149, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5222761631011963, "rewards/margins": 0.5242374539375305, "rewards/rejected": -2.046513795852661, "sft_loss": 1.5688246488571167, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 8.523350789273504, "learning_rate": 7.491903137181501e-07, "logits/chosen": -0.06899989396333694, "logits/rejected": -0.007209339644759893, "logps/chosen": -1.4910833835601807, "logps/rejected": -1.9324243068695068, "loss": 0.7505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4910833835601807, "rewards/margins": 0.44134092330932617, "rewards/rejected": -1.9324243068695068, "sft_loss": 1.5616381168365479, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 9.41513514622019, "learning_rate": 7.478389038622441e-07, "logits/chosen": 0.021819135174155235, "logits/rejected": 0.05284147337079048, "logps/chosen": -1.4935173988342285, "logps/rejected": -2.049025058746338, "loss": 0.6988, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4935173988342285, "rewards/margins": 0.5555076003074646, "rewards/rejected": -2.049025058746338, "sft_loss": 1.5246081352233887, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 9.98319988392465, "learning_rate": 7.46485089472206e-07, "logits/chosen": -0.10194502770900726, "logits/rejected": 0.021598730236291885, "logps/chosen": -1.6092774868011475, "logps/rejected": -1.9808412790298462, "loss": 0.809, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6092774868011475, "rewards/margins": 0.3715636134147644, "rewards/rejected": -1.9808412790298462, "sft_loss": 1.605093002319336, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 7.035300026713442, "learning_rate": 7.451288836827487e-07, "logits/chosen": -0.024056220427155495, "logits/rejected": -0.0342448428273201, "logps/chosen": -1.5393335819244385, "logps/rejected": -1.8783483505249023, "loss": 0.7758, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5393335819244385, "rewards/margins": 0.3390146791934967, "rewards/rejected": -1.8783483505249023, "sft_loss": 1.566124677658081, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 7.280154971422248, "learning_rate": 7.437702996517869e-07, "logits/chosen": -0.14503630995750427, "logits/rejected": -0.03414043039083481, "logps/chosen": -1.58201003074646, "logps/rejected": -2.051666021347046, "loss": 0.7539, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.58201003074646, "rewards/margins": 0.46965599060058594, "rewards/rejected": -2.051666021347046, "sft_loss": 1.652116060256958, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 10.593971685054335, "learning_rate": 7.424093505603087e-07, "logits/chosen": -0.2341700792312622, "logits/rejected": -0.026684647426009178, "logps/chosen": -1.5408596992492676, "logps/rejected": -2.1194355487823486, "loss": 0.7173, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5408596992492676, "rewards/margins": 0.5785756707191467, "rewards/rejected": -2.1194355487823486, "sft_loss": 1.5281615257263184, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 9.283410871585753, "learning_rate": 7.410460496122482e-07, "logits/chosen": -0.11060525476932526, "logits/rejected": 0.0447152815759182, "logps/chosen": -1.5042294263839722, "logps/rejected": -2.1289517879486084, "loss": 0.6967, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5042294263839722, "rewards/margins": 0.6247223615646362, "rewards/rejected": -2.1289517879486084, "sft_loss": 1.5233356952667236, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 10.964334832287847, "learning_rate": 7.396804100343572e-07, "logits/chosen": -0.18535827100276947, "logits/rejected": 0.016271600499749184, "logps/chosen": -1.4418184757232666, "logps/rejected": -1.9585402011871338, "loss": 0.7232, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4418184757232666, "rewards/margins": 0.5167215466499329, "rewards/rejected": -1.9585402011871338, "sft_loss": 1.492578387260437, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 6.226159036453706, "learning_rate": 7.383124450760768e-07, "logits/chosen": -0.10150859504938126, "logits/rejected": 0.1336476057767868, "logps/chosen": -1.6237376928329468, "logps/rejected": -2.2351582050323486, "loss": 0.7147, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6237376928329468, "rewards/margins": 0.6114204525947571, "rewards/rejected": -2.2351582050323486, "sft_loss": 1.6174319982528687, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 5.994835467251361, "learning_rate": 7.369421680094091e-07, "logits/chosen": -0.21705873310565948, "logits/rejected": -0.0303493682295084, "logps/chosen": -1.467805027961731, "logps/rejected": -2.013909339904785, "loss": 0.741, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.467805027961731, "rewards/margins": 0.5461043119430542, "rewards/rejected": -2.013909339904785, "sft_loss": 1.5122298002243042, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 8.50591853387839, "learning_rate": 7.355695921287881e-07, "logits/chosen": -0.1560746729373932, "logits/rejected": -0.05373350903391838, "logps/chosen": -1.5913516283035278, "logps/rejected": -2.1020658016204834, "loss": 0.7746, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5913516283035278, "rewards/margins": 0.5107141733169556, "rewards/rejected": -2.1020658016204834, "sft_loss": 1.6778093576431274, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 13.835632867691519, "learning_rate": 7.341947307509513e-07, "logits/chosen": -0.09413419663906097, "logits/rejected": 0.06103119999170303, "logps/chosen": -1.567608118057251, "logps/rejected": -2.026780605316162, "loss": 0.7866, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.567608118057251, "rewards/margins": 0.45917272567749023, "rewards/rejected": -2.026780605316162, "sft_loss": 1.6191011667251587, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 8.952985147670837, "learning_rate": 7.328175972148094e-07, "logits/chosen": -0.11925704777240753, "logits/rejected": 0.04713190719485283, "logps/chosen": -1.7047021389007568, "logps/rejected": -2.2933902740478516, "loss": 0.7382, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7047021389007568, "rewards/margins": 0.5886882543563843, "rewards/rejected": -2.2933902740478516, "sft_loss": 1.6612449884414673, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 9.154665834484751, "learning_rate": 7.314382048813185e-07, "logits/chosen": -0.06198301166296005, "logits/rejected": 0.26731908321380615, "logps/chosen": -1.57393217086792, "logps/rejected": -2.24753737449646, "loss": 0.7066, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.57393217086792, "rewards/margins": 0.6736055612564087, "rewards/rejected": -2.24753737449646, "sft_loss": 1.562909483909607, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 8.84351263967697, "learning_rate": 7.300565671333486e-07, "logits/chosen": -0.0669042244553566, "logits/rejected": 0.16192308068275452, "logps/chosen": -1.6145012378692627, "logps/rejected": -2.2150444984436035, "loss": 0.7349, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6145012378692627, "rewards/margins": 0.600543200969696, "rewards/rejected": -2.2150444984436035, "sft_loss": 1.6317462921142578, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 5.8391924472115715, "learning_rate": 7.286726973755554e-07, "logits/chosen": 0.03708028048276901, "logits/rejected": 0.08343568444252014, "logps/chosen": -1.5504786968231201, "logps/rejected": -2.064244508743286, "loss": 0.738, "rewards/accuracies": 0.625, "rewards/chosen": -1.5504786968231201, "rewards/margins": 0.513765811920166, "rewards/rejected": -2.064244508743286, "sft_loss": 1.5739312171936035, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 7.698629854288435, "learning_rate": 7.272866090342493e-07, "logits/chosen": 0.11247192323207855, "logits/rejected": 0.22476942837238312, "logps/chosen": -1.6056830883026123, "logps/rejected": -2.1932504177093506, "loss": 0.7134, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6056830883026123, "rewards/margins": 0.5875673890113831, "rewards/rejected": -2.1932504177093506, "sft_loss": 1.568518877029419, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 10.48787629436489, "learning_rate": 7.258983155572656e-07, "logits/chosen": -0.12229554355144501, "logits/rejected": 0.02094579115509987, "logps/chosen": -1.5846467018127441, "logps/rejected": -2.1185622215270996, "loss": 0.7641, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5846467018127441, "rewards/margins": 0.5339155793190002, "rewards/rejected": -2.1185622215270996, "sft_loss": 1.6333281993865967, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 10.111955744672095, "learning_rate": 7.245078304138335e-07, "logits/chosen": 0.04601864144206047, "logits/rejected": 0.13743598759174347, "logps/chosen": -1.6071914434432983, "logps/rejected": -2.1458280086517334, "loss": 0.7298, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6071914434432983, "rewards/margins": 0.5386365652084351, "rewards/rejected": -2.1458280086517334, "sft_loss": 1.6183099746704102, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 6.664383588607223, "learning_rate": 7.231151670944462e-07, "logits/chosen": -0.14049389958381653, "logits/rejected": 0.09535631537437439, "logps/chosen": -1.6266456842422485, "logps/rejected": -2.1694750785827637, "loss": 0.752, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6266456842422485, "rewards/margins": 0.5428295731544495, "rewards/rejected": -2.1694750785827637, "sft_loss": 1.6184921264648438, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 10.209690962213324, "learning_rate": 7.217203391107291e-07, "logits/chosen": -0.07098677009344101, "logits/rejected": 0.1403525173664093, "logps/chosen": -1.590184211730957, "logps/rejected": -2.224980592727661, "loss": 0.7297, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.590184211730957, "rewards/margins": 0.6347963213920593, "rewards/rejected": -2.224980592727661, "sft_loss": 1.613470435142517, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 8.83490195604573, "learning_rate": 7.203233599953096e-07, "logits/chosen": -0.05324209854006767, "logits/rejected": 0.1348373293876648, "logps/chosen": -1.6206588745117188, "logps/rejected": -2.135542392730713, "loss": 0.7551, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6206588745117188, "rewards/margins": 0.5148836970329285, "rewards/rejected": -2.135542392730713, "sft_loss": 1.6155281066894531, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 11.105122071094831, "learning_rate": 7.189242433016852e-07, "logits/chosen": 0.032990649342536926, "logits/rejected": 0.2036905586719513, "logps/chosen": -1.508832573890686, "logps/rejected": -2.2878057956695557, "loss": 0.7128, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.508832573890686, "rewards/margins": 0.7789733409881592, "rewards/rejected": -2.2878057956695557, "sft_loss": 1.5569217205047607, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 11.435945670048056, "learning_rate": 7.17523002604092e-07, "logits/chosen": -0.0018535584677010775, "logits/rejected": 0.19345472753047943, "logps/chosen": -1.5967261791229248, "logps/rejected": -2.323276996612549, "loss": 0.7152, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5967261791229248, "rewards/margins": 0.7265505790710449, "rewards/rejected": -2.323276996612549, "sft_loss": 1.6429847478866577, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 7.42497726244175, "learning_rate": 7.161196514973734e-07, "logits/chosen": 0.027911877259612083, "logits/rejected": 0.22461143136024475, "logps/chosen": -1.5947569608688354, "logps/rejected": -2.248321771621704, "loss": 0.7254, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5947569608688354, "rewards/margins": 0.6535648107528687, "rewards/rejected": -2.248321771621704, "sft_loss": 1.6426048278808594, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 13.607065252786148, "learning_rate": 7.147142035968483e-07, "logits/chosen": 0.05428176000714302, "logits/rejected": 0.24193844199180603, "logps/chosen": -1.6152242422103882, "logps/rejected": -2.1521873474121094, "loss": 0.7319, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6152242422103882, "rewards/margins": 0.5369630455970764, "rewards/rejected": -2.1521873474121094, "sft_loss": 1.6707464456558228, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 8.944244122072076, "learning_rate": 7.133066725381781e-07, "logits/chosen": -0.11234885454177856, "logits/rejected": 0.12381456047296524, "logps/chosen": -1.4773558378219604, "logps/rejected": -1.9726444482803345, "loss": 0.736, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4773558378219604, "rewards/margins": 0.49528855085372925, "rewards/rejected": -1.9726444482803345, "sft_loss": 1.5102120637893677, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 10.3586111064844, "learning_rate": 7.118970719772354e-07, "logits/chosen": -0.06915297359228134, "logits/rejected": 0.18449541926383972, "logps/chosen": -1.632387399673462, "logps/rejected": -2.2874670028686523, "loss": 0.738, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.632387399673462, "rewards/margins": 0.65507972240448, "rewards/rejected": -2.2874670028686523, "sft_loss": 1.6928699016571045, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 7.9949368148422675, "learning_rate": 7.104854155899711e-07, "logits/chosen": 0.03378201648592949, "logits/rejected": 0.1725183129310608, "logps/chosen": -1.59267258644104, "logps/rejected": -2.1078784465789795, "loss": 0.7498, "rewards/accuracies": 0.625, "rewards/chosen": -1.59267258644104, "rewards/margins": 0.515205979347229, "rewards/rejected": -2.1078784465789795, "sft_loss": 1.5630031824111938, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 7.283411659911242, "learning_rate": 7.090717170722817e-07, "logits/chosen": 0.05955095216631889, "logits/rejected": 0.1517009288072586, "logps/chosen": -1.5180206298828125, "logps/rejected": -2.203601121902466, "loss": 0.6876, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5180206298828125, "rewards/margins": 0.6855801939964294, "rewards/rejected": -2.203601121902466, "sft_loss": 1.5829445123672485, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 7.898140544309821, "learning_rate": 7.076559901398762e-07, "logits/chosen": -0.16072717308998108, "logits/rejected": 0.016164880245923996, "logps/chosen": -1.4808943271636963, "logps/rejected": -2.01924467086792, "loss": 0.7099, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4808943271636963, "rewards/margins": 0.5383505821228027, "rewards/rejected": -2.01924467086792, "sft_loss": 1.5266382694244385, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 9.906403385651851, "learning_rate": 7.062382485281436e-07, "logits/chosen": -0.013724331744015217, "logits/rejected": 0.15427502989768982, "logps/chosen": -1.5086654424667358, "logps/rejected": -1.9887739419937134, "loss": 0.7326, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5086654424667358, "rewards/margins": 0.48010849952697754, "rewards/rejected": -1.9887739419937134, "sft_loss": 1.5377105474472046, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.379881352186203, "eval_logits/rejected": 0.49927571415901184, "eval_logps/chosen": -1.6077141761779785, "eval_logps/rejected": -2.1582469940185547, "eval_loss": 0.7455233335494995, "eval_rewards/accuracies": 0.6632047295570374, "eval_rewards/chosen": -1.6077141761779785, "eval_rewards/margins": 0.5505325198173523, "eval_rewards/rejected": -2.1582469940185547, "eval_runtime": 43.1701, "eval_samples_per_second": 31.156, "eval_sft_loss": 1.6026520729064941, "eval_steps_per_second": 7.806, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 6.134250664247765, "learning_rate": 7.048185059920193e-07, "logits/chosen": 0.015119487419724464, "logits/rejected": 0.20360179245471954, "logps/chosen": -1.5773468017578125, "logps/rejected": -2.247434616088867, "loss": 0.7286, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5773468017578125, "rewards/margins": 0.6700879335403442, "rewards/rejected": -2.247434616088867, "sft_loss": 1.5948069095611572, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 9.079221564141005, "learning_rate": 7.033967763058516e-07, "logits/chosen": -0.1293601542711258, "logits/rejected": 0.12338282912969589, "logps/chosen": -1.555763840675354, "logps/rejected": -1.8928635120391846, "loss": 0.7777, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.555763840675354, "rewards/margins": 0.33709952235221863, "rewards/rejected": -1.8928635120391846, "sft_loss": 1.5949723720550537, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 8.650216372430528, "learning_rate": 7.019730732632681e-07, "logits/chosen": 0.07996892929077148, "logits/rejected": 0.19383428990840912, "logps/chosen": -1.48301362991333, "logps/rejected": -2.181229591369629, "loss": 0.7097, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.48301362991333, "rewards/margins": 0.6982158422470093, "rewards/rejected": -2.181229591369629, "sft_loss": 1.4842723608016968, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 7.342967806758975, "learning_rate": 7.005474106770418e-07, "logits/chosen": -0.09568249434232712, "logits/rejected": 0.06348307430744171, "logps/chosen": -1.5990853309631348, "logps/rejected": -2.171008348464966, "loss": 0.7025, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5990853309631348, "rewards/margins": 0.571923017501831, "rewards/rejected": -2.171008348464966, "sft_loss": 1.6577285528182983, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 7.374027514614783, "learning_rate": 6.991198023789577e-07, "logits/chosen": 0.04409540072083473, "logits/rejected": 0.143757626414299, "logps/chosen": -1.4786756038665771, "logps/rejected": -1.925246238708496, "loss": 0.7329, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4786756038665771, "rewards/margins": 0.44657057523727417, "rewards/rejected": -1.925246238708496, "sft_loss": 1.569298505783081, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 8.992988796205905, "learning_rate": 6.976902622196776e-07, "logits/chosen": 0.06224370002746582, "logits/rejected": 0.15910187363624573, "logps/chosen": -1.637851357460022, "logps/rejected": -2.1420624256134033, "loss": 0.7469, "rewards/accuracies": 0.65625, "rewards/chosen": -1.637851357460022, "rewards/margins": 0.5042110681533813, "rewards/rejected": -2.1420624256134033, "sft_loss": 1.6566555500030518, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 7.59552432689376, "learning_rate": 6.962588040686064e-07, "logits/chosen": 0.06839130818843842, "logits/rejected": 0.2438676357269287, "logps/chosen": -1.5475409030914307, "logps/rejected": -1.9469690322875977, "loss": 0.7758, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5475409030914307, "rewards/margins": 0.3994281589984894, "rewards/rejected": -1.9469690322875977, "sft_loss": 1.571683645248413, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 9.598063201919448, "learning_rate": 6.948254418137573e-07, "logits/chosen": -0.047854699194431305, "logits/rejected": 0.11139366775751114, "logps/chosen": -1.5512007474899292, "logps/rejected": -2.186490535736084, "loss": 0.7229, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5512007474899292, "rewards/margins": 0.6352895498275757, "rewards/rejected": -2.186490535736084, "sft_loss": 1.538778305053711, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 17.085205151314383, "learning_rate": 6.933901893616174e-07, "logits/chosen": -0.029192287474870682, "logits/rejected": 0.15908031165599823, "logps/chosen": -1.623412847518921, "logps/rejected": -2.1014180183410645, "loss": 0.773, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.623412847518921, "rewards/margins": 0.47800517082214355, "rewards/rejected": -2.1014180183410645, "sft_loss": 1.6222498416900635, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 9.498722987874377, "learning_rate": 6.919530606370121e-07, "logits/chosen": -0.006516993045806885, "logits/rejected": 0.197300523519516, "logps/chosen": -1.5290013551712036, "logps/rejected": -2.188312292098999, "loss": 0.7188, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5290013551712036, "rewards/margins": 0.6593109369277954, "rewards/rejected": -2.188312292098999, "sft_loss": 1.5248825550079346, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 5.895566234626802, "learning_rate": 6.905140695829706e-07, "logits/chosen": -0.07133042812347412, "logits/rejected": 0.23768667876720428, "logps/chosen": -1.5920215845108032, "logps/rejected": -2.138711452484131, "loss": 0.7329, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5920215845108032, "rewards/margins": 0.5466898679733276, "rewards/rejected": -2.138711452484131, "sft_loss": 1.626725435256958, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 13.092560603083175, "learning_rate": 6.890732301605904e-07, "logits/chosen": 0.02958945371210575, "logits/rejected": 0.1718485802412033, "logps/chosen": -1.5982673168182373, "logps/rejected": -1.993364930152893, "loss": 0.7908, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5982673168182373, "rewards/margins": 0.39509764313697815, "rewards/rejected": -1.993364930152893, "sft_loss": 1.5853713750839233, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 11.285942644884665, "learning_rate": 6.876305563489021e-07, "logits/chosen": 0.008666718378663063, "logits/rejected": 0.12356449663639069, "logps/chosen": -1.5056078433990479, "logps/rejected": -2.138023853302002, "loss": 0.713, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5056078433990479, "rewards/margins": 0.6324161291122437, "rewards/rejected": -2.138023853302002, "sft_loss": 1.497593641281128, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 7.218964319947808, "learning_rate": 6.861860621447331e-07, "logits/chosen": -0.1583685576915741, "logits/rejected": 0.007396022789180279, "logps/chosen": -1.5102524757385254, "logps/rejected": -1.8816665410995483, "loss": 0.7632, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5102524757385254, "rewards/margins": 0.3714141845703125, "rewards/rejected": -1.8816665410995483, "sft_loss": 1.5781974792480469, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 8.176604919091433, "learning_rate": 6.847397615625725e-07, "logits/chosen": 0.0185202918946743, "logits/rejected": 0.10688277333974838, "logps/chosen": -1.5296119451522827, "logps/rejected": -2.0134944915771484, "loss": 0.7249, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5296119451522827, "rewards/margins": 0.48388272523880005, "rewards/rejected": -2.0134944915771484, "sft_loss": 1.5616164207458496, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 5.836861077468094, "learning_rate": 6.83291668634435e-07, "logits/chosen": -0.17154854536056519, "logits/rejected": 0.055903516709804535, "logps/chosen": -1.584151029586792, "logps/rejected": -2.2295730113983154, "loss": 0.7026, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.584151029586792, "rewards/margins": 0.6454219818115234, "rewards/rejected": -2.2295730113983154, "sft_loss": 1.6839739084243774, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 7.0519459260989965, "learning_rate": 6.818417974097246e-07, "logits/chosen": 0.058473795652389526, "logits/rejected": 0.2768692374229431, "logps/chosen": -1.5622868537902832, "logps/rejected": -2.2439277172088623, "loss": 0.7453, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5622868537902832, "rewards/margins": 0.6816409230232239, "rewards/rejected": -2.2439277172088623, "sft_loss": 1.6638433933258057, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 8.132511371722622, "learning_rate": 6.803901619550981e-07, "logits/chosen": -0.09446346759796143, "logits/rejected": -0.019774939864873886, "logps/chosen": -1.6264663934707642, "logps/rejected": -2.1780388355255127, "loss": 0.7305, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6264663934707642, "rewards/margins": 0.5515724420547485, "rewards/rejected": -2.1780388355255127, "sft_loss": 1.663002371788025, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 9.230278476559855, "learning_rate": 6.789367763543292e-07, "logits/chosen": 0.028789719566702843, "logits/rejected": 0.05060397461056709, "logps/chosen": -1.5779023170471191, "logps/rejected": -2.081831455230713, "loss": 0.7595, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5779023170471191, "rewards/margins": 0.5039289593696594, "rewards/rejected": -2.081831455230713, "sft_loss": 1.6103708744049072, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 11.214605077444785, "learning_rate": 6.774816547081714e-07, "logits/chosen": 0.04705638065934181, "logits/rejected": 0.2561543583869934, "logps/chosen": -1.5337834358215332, "logps/rejected": -2.009347438812256, "loss": 0.7357, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5337834358215332, "rewards/margins": 0.4755638539791107, "rewards/rejected": -2.009347438812256, "sft_loss": 1.5893291234970093, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 7.488009579786737, "learning_rate": 6.760248111342211e-07, "logits/chosen": 0.0024774298071861267, "logits/rejected": 0.21770718693733215, "logps/chosen": -1.4978857040405273, "logps/rejected": -2.041604518890381, "loss": 0.7336, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4978857040405273, "rewards/margins": 0.5437186360359192, "rewards/rejected": -2.041604518890381, "sft_loss": 1.5103671550750732, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 11.227384599577906, "learning_rate": 6.745662597667813e-07, "logits/chosen": -0.09081384539604187, "logits/rejected": 0.0858566015958786, "logps/chosen": -1.5253804922103882, "logps/rejected": -2.1070313453674316, "loss": 0.7059, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5253804922103882, "rewards/margins": 0.5816511511802673, "rewards/rejected": -2.1070313453674316, "sft_loss": 1.5771745443344116, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 6.691083484470365, "learning_rate": 6.731060147567236e-07, "logits/chosen": 0.009510062634944916, "logits/rejected": 0.1557776927947998, "logps/chosen": -1.5624955892562866, "logps/rejected": -2.147212266921997, "loss": 0.7281, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5624955892562866, "rewards/margins": 0.5847165584564209, "rewards/rejected": -2.147212266921997, "sft_loss": 1.6406068801879883, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 7.098284699993729, "learning_rate": 6.716440902713515e-07, "logits/chosen": -0.0812225341796875, "logits/rejected": 0.030553704127669334, "logps/chosen": -1.616703987121582, "logps/rejected": -2.094724178314209, "loss": 0.7242, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.616703987121582, "rewards/margins": 0.47802019119262695, "rewards/rejected": -2.094724178314209, "sft_loss": 1.5426809787750244, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 11.199999678303959, "learning_rate": 6.701805004942627e-07, "logits/chosen": -0.07323163747787476, "logits/rejected": 0.03639540821313858, "logps/chosen": -1.641036033630371, "logps/rejected": -2.2511465549468994, "loss": 0.7375, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.641036033630371, "rewards/margins": 0.6101104021072388, "rewards/rejected": -2.2511465549468994, "sft_loss": 1.740053415298462, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 12.49581898212095, "learning_rate": 6.687152596252119e-07, "logits/chosen": -0.08187204599380493, "logits/rejected": -0.007060323841869831, "logps/chosen": -1.643933892250061, "logps/rejected": -2.0629167556762695, "loss": 0.808, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.643933892250061, "rewards/margins": 0.41898274421691895, "rewards/rejected": -2.0629167556762695, "sft_loss": 1.663609266281128, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 6.76769436815532, "learning_rate": 6.672483818799722e-07, "logits/chosen": -0.14193886518478394, "logits/rejected": 0.046471722424030304, "logps/chosen": -1.5734301805496216, "logps/rejected": -2.1796512603759766, "loss": 0.7077, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5734301805496216, "rewards/margins": 0.6062213778495789, "rewards/rejected": -2.1796512603759766, "sft_loss": 1.5730676651000977, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 11.196751105946237, "learning_rate": 6.657798814901978e-07, "logits/chosen": -0.025406192988157272, "logits/rejected": 0.2185060679912567, "logps/chosen": -1.6591308116912842, "logps/rejected": -2.091714382171631, "loss": 0.7887, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6591308116912842, "rewards/margins": 0.4325834810733795, "rewards/rejected": -2.091714382171631, "sft_loss": 1.6982816457748413, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 6.784538794916478, "learning_rate": 6.643097727032863e-07, "logits/chosen": -0.03418979048728943, "logits/rejected": 0.20457592606544495, "logps/chosen": -1.5542190074920654, "logps/rejected": -2.17234206199646, "loss": 0.7292, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5542190074920654, "rewards/margins": 0.618122935295105, "rewards/rejected": -2.17234206199646, "sft_loss": 1.588695764541626, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 9.09739401638349, "learning_rate": 6.628380697822392e-07, "logits/chosen": -0.04393025487661362, "logits/rejected": 0.17335300147533417, "logps/chosen": -1.573068380355835, "logps/rejected": -1.9681116342544556, "loss": 0.7752, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.573068380355835, "rewards/margins": 0.3950433135032654, "rewards/rejected": -1.9681116342544556, "sft_loss": 1.5954360961914062, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 13.318180426440863, "learning_rate": 6.61364787005525e-07, "logits/chosen": 0.03480113670229912, "logits/rejected": 0.16942448914051056, "logps/chosen": -1.4854832887649536, "logps/rejected": -2.1415598392486572, "loss": 0.725, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4854832887649536, "rewards/margins": 0.6560766100883484, "rewards/rejected": -2.1415598392486572, "sft_loss": 1.5388848781585693, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 11.225467596310303, "learning_rate": 6.598899386669395e-07, "logits/chosen": 0.016088951379060745, "logits/rejected": 0.17537520825862885, "logps/chosen": -1.5572017431259155, "logps/rejected": -2.107085704803467, "loss": 0.7287, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5572017431259155, "rewards/margins": 0.5498839020729065, "rewards/rejected": -2.107085704803467, "sft_loss": 1.5643457174301147, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 11.861747804163215, "learning_rate": 6.584135390754679e-07, "logits/chosen": 0.009006911888718605, "logits/rejected": 0.18510481715202332, "logps/chosen": -1.5127650499343872, "logps/rejected": -2.1335902214050293, "loss": 0.7214, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5127650499343872, "rewards/margins": 0.620825469493866, "rewards/rejected": -2.1335902214050293, "sft_loss": 1.5672950744628906, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 6.785956195215942, "learning_rate": 6.569356025551454e-07, "logits/chosen": 0.08204902708530426, "logits/rejected": 0.17699483036994934, "logps/chosen": -1.5565465688705444, "logps/rejected": -2.078390598297119, "loss": 0.7421, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5565465688705444, "rewards/margins": 0.5218440294265747, "rewards/rejected": -2.078390598297119, "sft_loss": 1.5584813356399536, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 9.932105579404821, "learning_rate": 6.554561434449186e-07, "logits/chosen": -0.08810216188430786, "logits/rejected": 0.10158564150333405, "logps/chosen": -1.5230224132537842, "logps/rejected": -2.0544304847717285, "loss": 0.7296, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5230224132537842, "rewards/margins": 0.5314079523086548, "rewards/rejected": -2.0544304847717285, "sft_loss": 1.5627092123031616, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 11.130280333108473, "learning_rate": 6.539751760985063e-07, "logits/chosen": -0.014438660815358162, "logits/rejected": 0.09829623252153397, "logps/chosen": -1.594249963760376, "logps/rejected": -1.967444658279419, "loss": 0.7689, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.594249963760376, "rewards/margins": 0.37319469451904297, "rewards/rejected": -1.967444658279419, "sft_loss": 1.6483619213104248, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 9.318112503704617, "learning_rate": 6.524927148842602e-07, "logits/chosen": 0.08159098774194717, "logits/rejected": 0.28421950340270996, "logps/chosen": -1.4942444562911987, "logps/rejected": -2.145620822906494, "loss": 0.6733, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.4942444562911987, "rewards/margins": 0.6513766050338745, "rewards/rejected": -2.145620822906494, "sft_loss": 1.5016446113586426, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 10.309342938360665, "learning_rate": 6.510087741850254e-07, "logits/chosen": -0.021885087713599205, "logits/rejected": 0.15587705373764038, "logps/chosen": -1.5317487716674805, "logps/rejected": -2.0334672927856445, "loss": 0.7581, "rewards/accuracies": 0.625, "rewards/chosen": -1.5317487716674805, "rewards/margins": 0.5017184615135193, "rewards/rejected": -2.0334672927856445, "sft_loss": 1.643341064453125, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 10.737516686561118, "learning_rate": 6.495233683980012e-07, "logits/chosen": 0.014032554812729359, "logits/rejected": 0.08581315726041794, "logps/chosen": -1.5315479040145874, "logps/rejected": -1.9563289880752563, "loss": 0.7607, "rewards/accuracies": 0.625, "rewards/chosen": -1.5315479040145874, "rewards/margins": 0.4247809946537018, "rewards/rejected": -1.9563289880752563, "sft_loss": 1.5282328128814697, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 10.645922084021096, "learning_rate": 6.480365119346011e-07, "logits/chosen": 0.07969672977924347, "logits/rejected": 0.24790850281715393, "logps/chosen": -1.5347075462341309, "logps/rejected": -1.9226144552230835, "loss": 0.7658, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5347075462341309, "rewards/margins": 0.3879067301750183, "rewards/rejected": -1.9226144552230835, "sft_loss": 1.5482189655303955, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 15.19947357785722, "learning_rate": 6.465482192203129e-07, "logits/chosen": 0.1110302060842514, "logits/rejected": 0.19423599541187286, "logps/chosen": -1.5484099388122559, "logps/rejected": -2.007843017578125, "loss": 0.7362, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5484099388122559, "rewards/margins": 0.4594331383705139, "rewards/rejected": -2.007843017578125, "sft_loss": 1.603734016418457, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 15.93671184980935, "learning_rate": 6.45058504694559e-07, "logits/chosen": 0.12104474008083344, "logits/rejected": 0.21550802886486053, "logps/chosen": -1.586740255355835, "logps/rejected": -2.12553071975708, "loss": 0.7681, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.586740255355835, "rewards/margins": 0.5387903451919556, "rewards/rejected": -2.12553071975708, "sft_loss": 1.6360355615615845, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 19.839214986590598, "learning_rate": 6.435673828105564e-07, "logits/chosen": 0.0025325207971036434, "logits/rejected": 0.19006134569644928, "logps/chosen": -1.5142167806625366, "logps/rejected": -2.1375746726989746, "loss": 0.7336, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5142167806625366, "rewards/margins": 0.6233576536178589, "rewards/rejected": -2.1375746726989746, "sft_loss": 1.5970064401626587, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 11.106034654414612, "learning_rate": 6.420748680351763e-07, "logits/chosen": 0.0049167824909091, "logits/rejected": 0.012611147947609425, "logps/chosen": -1.6260411739349365, "logps/rejected": -1.971514105796814, "loss": 0.8197, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6260411739349365, "rewards/margins": 0.3454729914665222, "rewards/rejected": -1.971514105796814, "sft_loss": 1.6711645126342773, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 15.462237485988142, "learning_rate": 6.405809748488032e-07, "logits/chosen": 0.021909568458795547, "logits/rejected": 0.21467368304729462, "logps/chosen": -1.5951663255691528, "logps/rejected": -2.162919521331787, "loss": 0.7434, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5951663255691528, "rewards/margins": 0.567753255367279, "rewards/rejected": -2.162919521331787, "sft_loss": 1.572172999382019, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 9.496711366538626, "learning_rate": 6.390857177451956e-07, "logits/chosen": -0.12060613930225372, "logits/rejected": 0.12300988286733627, "logps/chosen": -1.5986261367797852, "logps/rejected": -2.1122477054595947, "loss": 0.7534, "rewards/accuracies": 0.625, "rewards/chosen": -1.5986261367797852, "rewards/margins": 0.5136216282844543, "rewards/rejected": -2.1122477054595947, "sft_loss": 1.6099659204483032, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 14.182245845144008, "learning_rate": 6.375891112313445e-07, "logits/chosen": -0.03867544233798981, "logits/rejected": 0.08904945850372314, "logps/chosen": -1.5808160305023193, "logps/rejected": -2.118974208831787, "loss": 0.7429, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5808160305023193, "rewards/margins": 0.5381582379341125, "rewards/rejected": -2.118974208831787, "sft_loss": 1.6240956783294678, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 7.254310350857439, "learning_rate": 6.360911698273326e-07, "logits/chosen": 0.013441148214042187, "logits/rejected": 0.13574287295341492, "logps/chosen": -1.6644134521484375, "logps/rejected": -2.14290714263916, "loss": 0.7704, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6644134521484375, "rewards/margins": 0.47849392890930176, "rewards/rejected": -2.14290714263916, "sft_loss": 1.6617565155029297, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 13.040623547237937, "learning_rate": 6.345919080661944e-07, "logits/chosen": -0.035405468195676804, "logits/rejected": 0.07356153428554535, "logps/chosen": -1.5818744897842407, "logps/rejected": -2.1695423126220703, "loss": 0.7261, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5818744897842407, "rewards/margins": 0.5876680612564087, "rewards/rejected": -2.1695423126220703, "sft_loss": 1.571797490119934, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 12.797180551249683, "learning_rate": 6.330913404937737e-07, "logits/chosen": -0.07132842391729355, "logits/rejected": 0.12927840650081635, "logps/chosen": -1.6040897369384766, "logps/rejected": -2.4128212928771973, "loss": 0.7087, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6040897369384766, "rewards/margins": 0.808731734752655, "rewards/rejected": -2.4128212928771973, "sft_loss": 1.6210588216781616, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 18.47812390837442, "learning_rate": 6.315894816685838e-07, "logits/chosen": 0.012104770168662071, "logits/rejected": 0.20364436507225037, "logps/chosen": -1.5872005224227905, "logps/rejected": -2.139583110809326, "loss": 0.7125, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5872005224227905, "rewards/margins": 0.5523822903633118, "rewards/rejected": -2.139583110809326, "sft_loss": 1.5623565912246704, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 11.584960570443927, "learning_rate": 6.300863461616657e-07, "logits/chosen": 0.0984925776720047, "logits/rejected": 0.1792813092470169, "logps/chosen": -1.5403988361358643, "logps/rejected": -2.1503360271453857, "loss": 0.721, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5403988361358643, "rewards/margins": 0.6099372506141663, "rewards/rejected": -2.1503360271453857, "sft_loss": 1.5541141033172607, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 8.466099794521575, "learning_rate": 6.285819485564465e-07, "logits/chosen": -0.09084759652614594, "logits/rejected": 0.08720861375331879, "logps/chosen": -1.5717127323150635, "logps/rejected": -2.1415293216705322, "loss": 0.7184, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5717127323150635, "rewards/margins": 0.5698164105415344, "rewards/rejected": -2.1415293216705322, "sft_loss": 1.6834430694580078, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 13.575628071316117, "learning_rate": 6.270763034485986e-07, "logits/chosen": 0.06083375960588455, "logits/rejected": 0.19356988370418549, "logps/chosen": -1.6508433818817139, "logps/rejected": -2.0878913402557373, "loss": 0.76, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6508433818817139, "rewards/margins": 0.43704789876937866, "rewards/rejected": -2.0878913402557373, "sft_loss": 1.6485340595245361, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 11.650480435088047, "learning_rate": 6.255694254458972e-07, "logits/chosen": -0.004039292223751545, "logits/rejected": 0.20745310187339783, "logps/chosen": -1.6140371561050415, "logps/rejected": -2.0637290477752686, "loss": 0.7755, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6140371561050415, "rewards/margins": 0.4496920704841614, "rewards/rejected": -2.0637290477752686, "sft_loss": 1.5235536098480225, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 11.00281522989197, "learning_rate": 6.240613291680795e-07, "logits/chosen": -0.019755516201257706, "logits/rejected": 0.2021590769290924, "logps/chosen": -1.5822246074676514, "logps/rejected": -2.054351329803467, "loss": 0.7624, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5822246074676514, "rewards/margins": 0.47212672233581543, "rewards/rejected": -2.054351329803467, "sft_loss": 1.6455892324447632, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 8.011311227251987, "learning_rate": 6.225520292467021e-07, "logits/chosen": -0.028271233662962914, "logits/rejected": 0.2493860423564911, "logps/chosen": -1.5316722393035889, "logps/rejected": -2.009626865386963, "loss": 0.7186, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5316722393035889, "rewards/margins": 0.4779546856880188, "rewards/rejected": -2.009626865386963, "sft_loss": 1.5618966817855835, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 22.223876727884587, "learning_rate": 6.210415403249993e-07, "logits/chosen": -0.17216753959655762, "logits/rejected": 0.15086044371128082, "logps/chosen": -1.5619791746139526, "logps/rejected": -2.1863982677459717, "loss": 0.7188, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5619791746139526, "rewards/margins": 0.6244191527366638, "rewards/rejected": -2.1863982677459717, "sft_loss": 1.580539345741272, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 10.387235352905673, "learning_rate": 6.195298770577415e-07, "logits/chosen": 0.08286045491695404, "logits/rejected": 0.12139495462179184, "logps/chosen": -1.5578416585922241, "logps/rejected": -2.1099398136138916, "loss": 0.7377, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5578416585922241, "rewards/margins": 0.5520982146263123, "rewards/rejected": -2.1099398136138916, "sft_loss": 1.5624536275863647, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 8.748424921624208, "learning_rate": 6.180170541110923e-07, "logits/chosen": -0.04530134052038193, "logits/rejected": 0.1910441368818283, "logps/chosen": -1.5966148376464844, "logps/rejected": -2.108455181121826, "loss": 0.7562, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5966148376464844, "rewards/margins": 0.511840283870697, "rewards/rejected": -2.108455181121826, "sft_loss": 1.6653258800506592, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 6.864538210703868, "learning_rate": 6.165030861624663e-07, "logits/chosen": -0.09009388089179993, "logits/rejected": 0.19870546460151672, "logps/chosen": -1.44761061668396, "logps/rejected": -2.1615242958068848, "loss": 0.6559, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.44761061668396, "rewards/margins": 0.7139135003089905, "rewards/rejected": -2.1615242958068848, "sft_loss": 1.4398186206817627, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 11.586561481990234, "learning_rate": 6.149879879003876e-07, "logits/chosen": 0.06874538958072662, "logits/rejected": 0.10356751829385757, "logps/chosen": -1.5623998641967773, "logps/rejected": -2.115769147872925, "loss": 0.7219, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5623998641967773, "rewards/margins": 0.5533693432807922, "rewards/rejected": -2.115769147872925, "sft_loss": 1.568113923072815, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 5.8636581408617605, "learning_rate": 6.13471774024346e-07, "logits/chosen": -0.1313377320766449, "logits/rejected": 0.010066904127597809, "logps/chosen": -1.4795358180999756, "logps/rejected": -2.0026803016662598, "loss": 0.7328, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.4795358180999756, "rewards/margins": 0.5231444239616394, "rewards/rejected": -2.0026803016662598, "sft_loss": 1.5832579135894775, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 8.275470557294518, "learning_rate": 6.119544592446551e-07, "logits/chosen": -0.04487771540880203, "logits/rejected": 0.107400082051754, "logps/chosen": -1.5288636684417725, "logps/rejected": -1.9477142095565796, "loss": 0.7412, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5288636684417725, "rewards/margins": 0.41885048151016235, "rewards/rejected": -1.9477142095565796, "sft_loss": 1.5226595401763916, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 8.195631451899128, "learning_rate": 6.104360582823096e-07, "logits/chosen": -0.015558170154690742, "logits/rejected": 0.13231968879699707, "logps/chosen": -1.529274582862854, "logps/rejected": -2.0861575603485107, "loss": 0.7165, "rewards/accuracies": 0.6875, "rewards/chosen": -1.529274582862854, "rewards/margins": 0.5568830966949463, "rewards/rejected": -2.0861575603485107, "sft_loss": 1.550046682357788, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 10.647633227654756, "learning_rate": 6.089165858688423e-07, "logits/chosen": -0.06200261786580086, "logits/rejected": 0.17863117158412933, "logps/chosen": -1.553851842880249, "logps/rejected": -2.1897096633911133, "loss": 0.7164, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.553851842880249, "rewards/margins": 0.6358579397201538, "rewards/rejected": -2.1897096633911133, "sft_loss": 1.5742568969726562, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 6.12910835782927, "learning_rate": 6.073960567461811e-07, "logits/chosen": -0.04491172358393669, "logits/rejected": 0.22047391533851624, "logps/chosen": -1.4495924711227417, "logps/rejected": -2.138632297515869, "loss": 0.6766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4495924711227417, "rewards/margins": 0.6890400052070618, "rewards/rejected": -2.138632297515869, "sft_loss": 1.5353037118911743, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 13.536049062681036, "learning_rate": 6.058744856665065e-07, "logits/chosen": -0.053219765424728394, "logits/rejected": 0.09907916933298111, "logps/chosen": -1.5724198818206787, "logps/rejected": -2.3262829780578613, "loss": 0.6927, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5724198818206787, "rewards/margins": 0.7538628578186035, "rewards/rejected": -2.3262829780578613, "sft_loss": 1.6013047695159912, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 8.641780317783356, "learning_rate": 6.043518873921074e-07, "logits/chosen": -0.09193633496761322, "logits/rejected": 0.10880827903747559, "logps/chosen": -1.5175427198410034, "logps/rejected": -2.0336031913757324, "loss": 0.7264, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5175427198410034, "rewards/margins": 0.516060471534729, "rewards/rejected": -2.0336031913757324, "sft_loss": 1.5405975580215454, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 15.49928306012021, "learning_rate": 6.028282766952393e-07, "logits/chosen": -0.0046337745152413845, "logits/rejected": 0.14370040595531464, "logps/chosen": -1.6496000289916992, "logps/rejected": -2.304265022277832, "loss": 0.6891, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6496000289916992, "rewards/margins": 0.6546646356582642, "rewards/rejected": -2.304265022277832, "sft_loss": 1.6527996063232422, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 13.45370181698858, "learning_rate": 6.013036683579798e-07, "logits/chosen": 0.018690815195441246, "logits/rejected": 0.19791176915168762, "logps/chosen": -1.54180908203125, "logps/rejected": -2.130697250366211, "loss": 0.7066, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.54180908203125, "rewards/margins": 0.5888879895210266, "rewards/rejected": -2.130697250366211, "sft_loss": 1.6045894622802734, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 10.988118746606212, "learning_rate": 5.997780771720854e-07, "logits/chosen": -0.11253751814365387, "logits/rejected": 0.13003113865852356, "logps/chosen": -1.6249583959579468, "logps/rejected": -2.369246244430542, "loss": 0.6692, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6249583959579468, "rewards/margins": 0.7442880272865295, "rewards/rejected": -2.369246244430542, "sft_loss": 1.6231462955474854, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 12.862568019878607, "learning_rate": 5.982515179388486e-07, "logits/chosen": -0.013584159314632416, "logits/rejected": 0.1644848883152008, "logps/chosen": -1.5914555788040161, "logps/rejected": -2.14353609085083, "loss": 0.7132, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5914555788040161, "rewards/margins": 0.5520803332328796, "rewards/rejected": -2.14353609085083, "sft_loss": 1.6580852270126343, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 8.679092459009466, "learning_rate": 5.967240054689541e-07, "logits/chosen": -0.12652315199375153, "logits/rejected": -0.029349103569984436, "logps/chosen": -1.5451549291610718, "logps/rejected": -1.9567607641220093, "loss": 0.7663, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5451549291610718, "rewards/margins": 0.41160592436790466, "rewards/rejected": -1.9567607641220093, "sft_loss": 1.6039302349090576, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 10.188552167339967, "learning_rate": 5.951955545823342e-07, "logits/chosen": -0.06204778701066971, "logits/rejected": 0.019216526299715042, "logps/chosen": -1.5972654819488525, "logps/rejected": -2.2767837047576904, "loss": 0.7168, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5972654819488525, "rewards/margins": 0.6795179843902588, "rewards/rejected": -2.2767837047576904, "sft_loss": 1.6243031024932861, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 7.512678617406759, "learning_rate": 5.936661801080263e-07, "logits/chosen": -0.02684381604194641, "logits/rejected": 0.11252357810735703, "logps/chosen": -1.7422711849212646, "logps/rejected": -2.2198100090026855, "loss": 0.7896, "rewards/accuracies": 0.59375, "rewards/chosen": -1.7422711849212646, "rewards/margins": 0.47753873467445374, "rewards/rejected": -2.2198100090026855, "sft_loss": 1.7010536193847656, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 22.120187183398098, "learning_rate": 5.92135896884028e-07, "logits/chosen": -0.08323322981595993, "logits/rejected": 0.10172925889492035, "logps/chosen": -1.7256314754486084, "logps/rejected": -2.309457778930664, "loss": 0.7671, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7256314754486084, "rewards/margins": 0.5838262438774109, "rewards/rejected": -2.309457778930664, "sft_loss": 1.7234764099121094, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 11.656102912428466, "learning_rate": 5.906047197571541e-07, "logits/chosen": -0.014649082906544209, "logits/rejected": -0.03606802597641945, "logps/chosen": -1.6121692657470703, "logps/rejected": -2.176577091217041, "loss": 0.7682, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6121692657470703, "rewards/margins": 0.5644077062606812, "rewards/rejected": -2.176577091217041, "sft_loss": 1.7494573593139648, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 8.251782143184926, "learning_rate": 5.890726635828919e-07, "logits/chosen": 0.08955221623182297, "logits/rejected": 0.10675595700740814, "logps/chosen": -1.492752194404602, "logps/rejected": -2.0434365272521973, "loss": 0.7328, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.492752194404602, "rewards/margins": 0.5506845712661743, "rewards/rejected": -2.0434365272521973, "sft_loss": 1.5318242311477661, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 13.714096628273396, "learning_rate": 5.875397432252569e-07, "logits/chosen": -0.08133091032505035, "logits/rejected": 0.022093068808317184, "logps/chosen": -1.6452796459197998, "logps/rejected": -2.1473469734191895, "loss": 0.7742, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6452796459197998, "rewards/margins": 0.5020670890808105, "rewards/rejected": -2.1473469734191895, "sft_loss": 1.7196013927459717, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.34323760867118835, "eval_logits/rejected": 0.4611225426197052, "eval_logps/chosen": -1.6147866249084473, "eval_logps/rejected": -2.159003257751465, "eval_loss": 0.7443860769271851, "eval_rewards/accuracies": 0.6632047295570374, "eval_rewards/chosen": -1.6147866249084473, "eval_rewards/margins": 0.5442166328430176, "eval_rewards/rejected": -2.159003257751465, "eval_runtime": 43.6338, "eval_samples_per_second": 30.825, "eval_sft_loss": 1.619554877281189, "eval_steps_per_second": 7.723, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 7.069418701261009, "learning_rate": 5.860059735566491e-07, "logits/chosen": -0.22436638176441193, "logits/rejected": -0.02038896642625332, "logps/chosen": -1.4126793146133423, "logps/rejected": -2.0157692432403564, "loss": 0.6892, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4126793146133423, "rewards/margins": 0.6030899286270142, "rewards/rejected": -2.0157692432403564, "sft_loss": 1.4921300411224365, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 11.681747845165468, "learning_rate": 5.844713694577087e-07, "logits/chosen": -0.06591981649398804, "logits/rejected": 0.016422418877482414, "logps/chosen": -1.5853317975997925, "logps/rejected": -2.133549690246582, "loss": 0.7366, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5853317975997925, "rewards/margins": 0.5482179522514343, "rewards/rejected": -2.133549690246582, "sft_loss": 1.6722828149795532, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 8.076137711084458, "learning_rate": 5.829359458171714e-07, "logits/chosen": 0.018374616280198097, "logits/rejected": 0.16947081685066223, "logps/chosen": -1.5850551128387451, "logps/rejected": -2.2069504261016846, "loss": 0.7005, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5850551128387451, "rewards/margins": 0.6218953132629395, "rewards/rejected": -2.2069504261016846, "sft_loss": 1.599350929260254, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 6.820846632973787, "learning_rate": 5.81399717531724e-07, "logits/chosen": -0.048033684492111206, "logits/rejected": 0.196162149310112, "logps/chosen": -1.5583544969558716, "logps/rejected": -2.0931642055511475, "loss": 0.7341, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5583544969558716, "rewards/margins": 0.5348098874092102, "rewards/rejected": -2.0931642055511475, "sft_loss": 1.5960338115692139, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 10.907081197916176, "learning_rate": 5.798626995058602e-07, "logits/chosen": -0.13935433328151703, "logits/rejected": 0.10037778317928314, "logps/chosen": -1.6392818689346313, "logps/rejected": -2.3126654624938965, "loss": 0.7053, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6392818689346313, "rewards/margins": 0.6733838319778442, "rewards/rejected": -2.3126654624938965, "sft_loss": 1.6326940059661865, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 12.274204183141263, "learning_rate": 5.783249066517354e-07, "logits/chosen": -0.05011101812124252, "logits/rejected": 0.13744275271892548, "logps/chosen": -1.632354736328125, "logps/rejected": -2.112496852874756, "loss": 0.7597, "rewards/accuracies": 0.6875, "rewards/chosen": -1.632354736328125, "rewards/margins": 0.4801420569419861, "rewards/rejected": -2.112496852874756, "sft_loss": 1.680962324142456, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 9.414851497910854, "learning_rate": 5.767863538890228e-07, "logits/chosen": -0.08316750079393387, "logits/rejected": 0.1129358559846878, "logps/chosen": -1.5908558368682861, "logps/rejected": -2.215712547302246, "loss": 0.7091, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5908558368682861, "rewards/margins": 0.6248565316200256, "rewards/rejected": -2.215712547302246, "sft_loss": 1.6170666217803955, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 9.343892927091627, "learning_rate": 5.75247056144768e-07, "logits/chosen": -0.07021068781614304, "logits/rejected": 0.051569510251283646, "logps/chosen": -1.6252899169921875, "logps/rejected": -2.0436363220214844, "loss": 0.7936, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6252899169921875, "rewards/margins": 0.4183463156223297, "rewards/rejected": -2.0436363220214844, "sft_loss": 1.6690508127212524, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 9.484036772348361, "learning_rate": 5.737070283532444e-07, "logits/chosen": -0.05625772476196289, "logits/rejected": 0.05892757698893547, "logps/chosen": -1.6051113605499268, "logps/rejected": -2.1852452754974365, "loss": 0.7461, "rewards/accuracies": 0.625, "rewards/chosen": -1.6051113605499268, "rewards/margins": 0.5801340341567993, "rewards/rejected": -2.1852452754974365, "sft_loss": 1.5738210678100586, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 9.330283020653878, "learning_rate": 5.721662854558084e-07, "logits/chosen": -0.11531674861907959, "logits/rejected": -0.008421322330832481, "logps/chosen": -1.5739657878875732, "logps/rejected": -2.1691317558288574, "loss": 0.7272, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5739657878875732, "rewards/margins": 0.595166027545929, "rewards/rejected": -2.1691317558288574, "sft_loss": 1.5956964492797852, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 9.059434400229454, "learning_rate": 5.706248424007545e-07, "logits/chosen": -0.13385756313800812, "logits/rejected": 0.07013119012117386, "logps/chosen": -1.6829839944839478, "logps/rejected": -2.2230608463287354, "loss": 0.7333, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6829839944839478, "rewards/margins": 0.5400767922401428, "rewards/rejected": -2.2230608463287354, "sft_loss": 1.7057762145996094, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 9.954776950876496, "learning_rate": 5.690827141431699e-07, "logits/chosen": -0.18614499270915985, "logits/rejected": 0.04743589088320732, "logps/chosen": -1.5779237747192383, "logps/rejected": -2.0241525173187256, "loss": 0.7397, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5779237747192383, "rewards/margins": 0.44622907042503357, "rewards/rejected": -2.0241525173187256, "sft_loss": 1.601406455039978, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 9.311618437409733, "learning_rate": 5.675399156447897e-07, "logits/chosen": -0.2462371289730072, "logits/rejected": -0.08286289125680923, "logps/chosen": -1.6105082035064697, "logps/rejected": -2.101499080657959, "loss": 0.7416, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6105082035064697, "rewards/margins": 0.4909909665584564, "rewards/rejected": -2.101499080657959, "sft_loss": 1.6846234798431396, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 10.99633162132085, "learning_rate": 5.659964618738515e-07, "logits/chosen": -0.12304872274398804, "logits/rejected": 0.02933737076818943, "logps/chosen": -1.6165269613265991, "logps/rejected": -1.9879896640777588, "loss": 0.7752, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6165269613265991, "rewards/margins": 0.3714628219604492, "rewards/rejected": -1.9879896640777588, "sft_loss": 1.5911413431167603, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 9.757369090813622, "learning_rate": 5.644523678049509e-07, "logits/chosen": -0.11283618211746216, "logits/rejected": 0.014583133161067963, "logps/chosen": -1.6313034296035767, "logps/rejected": -2.1418285369873047, "loss": 0.7081, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6313034296035767, "rewards/margins": 0.5105254054069519, "rewards/rejected": -2.1418285369873047, "sft_loss": 1.5995954275131226, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 11.764424411667864, "learning_rate": 5.629076484188952e-07, "logits/chosen": 0.03585640341043472, "logits/rejected": 0.1789165735244751, "logps/chosen": -1.5594947338104248, "logps/rejected": -2.15980863571167, "loss": 0.7455, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5594947338104248, "rewards/margins": 0.600314199924469, "rewards/rejected": -2.15980863571167, "sft_loss": 1.5975472927093506, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 11.692170403216272, "learning_rate": 5.613623187025587e-07, "logits/chosen": -0.10905871540307999, "logits/rejected": 0.04495047777891159, "logps/chosen": -1.658898949623108, "logps/rejected": -2.226306915283203, "loss": 0.7474, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.658898949623108, "rewards/margins": 0.5674082040786743, "rewards/rejected": -2.226306915283203, "sft_loss": 1.6935369968414307, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 9.449116245833352, "learning_rate": 5.598163936487369e-07, "logits/chosen": -0.15788525342941284, "logits/rejected": 0.07475180178880692, "logps/chosen": -1.604871392250061, "logps/rejected": -2.2114899158477783, "loss": 0.7077, "rewards/accuracies": 0.75, "rewards/chosen": -1.604871392250061, "rewards/margins": 0.6066186428070068, "rewards/rejected": -2.2114899158477783, "sft_loss": 1.5997017621994019, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 9.958974028692868, "learning_rate": 5.582698882560017e-07, "logits/chosen": -0.13391032814979553, "logits/rejected": 0.05076754838228226, "logps/chosen": -1.5524380207061768, "logps/rejected": -2.141404867172241, "loss": 0.7152, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5524380207061768, "rewards/margins": 0.5889667868614197, "rewards/rejected": -2.141404867172241, "sft_loss": 1.5533422231674194, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 8.285629965811736, "learning_rate": 5.567228175285549e-07, "logits/chosen": -0.033826012164354324, "logits/rejected": 0.09941424429416656, "logps/chosen": -1.5625638961791992, "logps/rejected": -2.155369281768799, "loss": 0.7072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5625638961791992, "rewards/margins": 0.5928055047988892, "rewards/rejected": -2.155369281768799, "sft_loss": 1.5898663997650146, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 10.314885700401105, "learning_rate": 5.551751964760838e-07, "logits/chosen": 0.03655567765235901, "logits/rejected": 0.061975110322237015, "logps/chosen": -1.5969996452331543, "logps/rejected": -2.1453871726989746, "loss": 0.7431, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5969996452331543, "rewards/margins": 0.548387348651886, "rewards/rejected": -2.1453871726989746, "sft_loss": 1.6407146453857422, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 9.383660278309033, "learning_rate": 5.536270401136145e-07, "logits/chosen": -0.0655091255903244, "logits/rejected": 0.07182935625314713, "logps/chosen": -1.5500905513763428, "logps/rejected": -1.9859189987182617, "loss": 0.7451, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5500905513763428, "rewards/margins": 0.4358287751674652, "rewards/rejected": -1.9859189987182617, "sft_loss": 1.6303850412368774, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 16.516386839683022, "learning_rate": 5.520783634613667e-07, "logits/chosen": -0.005285367369651794, "logits/rejected": 0.20937585830688477, "logps/chosen": -1.6288344860076904, "logps/rejected": -2.242150068283081, "loss": 0.7439, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6288344860076904, "rewards/margins": 0.6133158802986145, "rewards/rejected": -2.242150068283081, "sft_loss": 1.6870357990264893, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 10.539335740482224, "learning_rate": 5.505291815446082e-07, "logits/chosen": 0.00523983221501112, "logits/rejected": 0.1609693169593811, "logps/chosen": -1.6254608631134033, "logps/rejected": -2.193239688873291, "loss": 0.7471, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6254608631134033, "rewards/margins": 0.5677791237831116, "rewards/rejected": -2.193239688873291, "sft_loss": 1.6637426614761353, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 10.694420652334653, "learning_rate": 5.489795093935089e-07, "logits/chosen": 0.0060553462244570255, "logits/rejected": 0.10224980115890503, "logps/chosen": -1.5308917760849, "logps/rejected": -2.108527421951294, "loss": 0.7486, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5308917760849, "rewards/margins": 0.577635645866394, "rewards/rejected": -2.108527421951294, "sft_loss": 1.5484187602996826, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 10.415479021324261, "learning_rate": 5.474293620429946e-07, "logits/chosen": -0.16104844212532043, "logits/rejected": 0.06276810169219971, "logps/chosen": -1.5397355556488037, "logps/rejected": -2.3774197101593018, "loss": 0.6839, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5397355556488037, "rewards/margins": 0.837684154510498, "rewards/rejected": -2.3774197101593018, "sft_loss": 1.6458343267440796, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 8.997652098298424, "learning_rate": 5.458787545326018e-07, "logits/chosen": -0.11595894396305084, "logits/rejected": 0.058186840265989304, "logps/chosen": -1.6195943355560303, "logps/rejected": -2.1812405586242676, "loss": 0.7404, "rewards/accuracies": 0.625, "rewards/chosen": -1.6195943355560303, "rewards/margins": 0.561646044254303, "rewards/rejected": -2.1812405586242676, "sft_loss": 1.6610116958618164, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 7.71715712661338, "learning_rate": 5.443277019063311e-07, "logits/chosen": -0.11065827310085297, "logits/rejected": 0.1022631898522377, "logps/chosen": -1.5852737426757812, "logps/rejected": -2.3204193115234375, "loss": 0.7106, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5852737426757812, "rewards/margins": 0.735145628452301, "rewards/rejected": -2.3204193115234375, "sft_loss": 1.65309739112854, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 15.091753849332113, "learning_rate": 5.427762192125023e-07, "logits/chosen": -0.09177873283624649, "logits/rejected": 0.08104687929153442, "logps/chosen": -1.5999292135238647, "logps/rejected": -2.0878443717956543, "loss": 0.7536, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5999292135238647, "rewards/margins": 0.4879148006439209, "rewards/rejected": -2.0878443717956543, "sft_loss": 1.6035125255584717, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 22.54646841959308, "learning_rate": 5.41224321503607e-07, "logits/chosen": 0.005368401296436787, "logits/rejected": 0.3275861442089081, "logps/chosen": -1.5233429670333862, "logps/rejected": -2.2439920902252197, "loss": 0.6857, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5233429670333862, "rewards/margins": 0.7206490635871887, "rewards/rejected": -2.2439920902252197, "sft_loss": 1.5576990842819214, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 13.152873965233374, "learning_rate": 5.396720238361637e-07, "logits/chosen": 0.012000272050499916, "logits/rejected": 0.15233470499515533, "logps/chosen": -1.6005465984344482, "logps/rejected": -2.160879373550415, "loss": 0.7435, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6005465984344482, "rewards/margins": 0.5603323578834534, "rewards/rejected": -2.160879373550415, "sft_loss": 1.6554343700408936, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 10.002911805575492, "learning_rate": 5.381193412705711e-07, "logits/chosen": -0.10680749267339706, "logits/rejected": 0.06608447432518005, "logps/chosen": -1.5578012466430664, "logps/rejected": -2.1227807998657227, "loss": 0.7188, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5578012466430664, "rewards/margins": 0.5649798512458801, "rewards/rejected": -2.1227807998657227, "sft_loss": 1.590313196182251, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 9.120498789279779, "learning_rate": 5.365662888709622e-07, "logits/chosen": -0.0703551322221756, "logits/rejected": 0.057837922126054764, "logps/chosen": -1.5598151683807373, "logps/rejected": -2.1630189418792725, "loss": 0.7325, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5598151683807373, "rewards/margins": 0.6032034754753113, "rewards/rejected": -2.1630189418792725, "sft_loss": 1.5481512546539307, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 15.83886646989159, "learning_rate": 5.350128817050585e-07, "logits/chosen": -0.11105934530496597, "logits/rejected": 0.1290782392024994, "logps/chosen": -1.6484237909317017, "logps/rejected": -2.2263407707214355, "loss": 0.7675, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6484237909317017, "rewards/margins": 0.5779169797897339, "rewards/rejected": -2.2263407707214355, "sft_loss": 1.693275809288025, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 9.629084997760502, "learning_rate": 5.334591348440229e-07, "logits/chosen": -0.07688424736261368, "logits/rejected": 0.1146201491355896, "logps/chosen": -1.6321853399276733, "logps/rejected": -2.309251308441162, "loss": 0.7146, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6321853399276733, "rewards/margins": 0.6770658493041992, "rewards/rejected": -2.309251308441162, "sft_loss": 1.7109416723251343, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 7.69970358075047, "learning_rate": 5.319050633623141e-07, "logits/chosen": -0.18153062462806702, "logits/rejected": 0.02972468174993992, "logps/chosen": -1.7193416357040405, "logps/rejected": -2.289445400238037, "loss": 0.7264, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7193416357040405, "rewards/margins": 0.5701041221618652, "rewards/rejected": -2.289445400238037, "sft_loss": 1.728664755821228, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 8.442508891796157, "learning_rate": 5.303506823375409e-07, "logits/chosen": -0.1056077852845192, "logits/rejected": 0.14674147963523865, "logps/chosen": -1.7464758157730103, "logps/rejected": -2.368039846420288, "loss": 0.7621, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7464758157730103, "rewards/margins": 0.6215642094612122, "rewards/rejected": -2.368039846420288, "sft_loss": 1.684819221496582, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 15.152948975679237, "learning_rate": 5.287960068503143e-07, "logits/chosen": -0.13494433462619781, "logits/rejected": 0.1010039821267128, "logps/chosen": -1.587498664855957, "logps/rejected": -2.339174747467041, "loss": 0.6993, "rewards/accuracies": 0.71875, "rewards/chosen": -1.587498664855957, "rewards/margins": 0.7516759634017944, "rewards/rejected": -2.339174747467041, "sft_loss": 1.612090826034546, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 10.88110209533411, "learning_rate": 5.272410519841032e-07, "logits/chosen": -0.08616335690021515, "logits/rejected": 0.06338141858577728, "logps/chosen": -1.6676971912384033, "logps/rejected": -2.4814209938049316, "loss": 0.7093, "rewards/accuracies": 0.75, "rewards/chosen": -1.6676971912384033, "rewards/margins": 0.8137239217758179, "rewards/rejected": -2.4814209938049316, "sft_loss": 1.7446247339248657, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 7.351027664152738, "learning_rate": 5.256858328250861e-07, "logits/chosen": -0.13348999619483948, "logits/rejected": 0.05874284356832504, "logps/chosen": -1.6815016269683838, "logps/rejected": -2.2286417484283447, "loss": 0.7572, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6815016269683838, "rewards/margins": 0.5471398234367371, "rewards/rejected": -2.2286417484283447, "sft_loss": 1.6341221332550049, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 17.21850548194868, "learning_rate": 5.241303644620063e-07, "logits/chosen": -0.194489985704422, "logits/rejected": -0.003558636410161853, "logps/chosen": -1.5862547159194946, "logps/rejected": -2.069319248199463, "loss": 0.7609, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5862547159194946, "rewards/margins": 0.4830645024776459, "rewards/rejected": -2.069319248199463, "sft_loss": 1.5764394998550415, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 10.152980276891444, "learning_rate": 5.225746619860248e-07, "logits/chosen": -0.17317108809947968, "logits/rejected": -0.007814708165824413, "logps/chosen": -1.5412657260894775, "logps/rejected": -1.9843921661376953, "loss": 0.7719, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5412657260894775, "rewards/margins": 0.443126380443573, "rewards/rejected": -1.9843921661376953, "sft_loss": 1.5622650384902954, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 11.628604252931284, "learning_rate": 5.210187404905735e-07, "logits/chosen": 0.041035737842321396, "logits/rejected": 0.13420096039772034, "logps/chosen": -1.6166772842407227, "logps/rejected": -2.1054153442382812, "loss": 0.7499, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6166772842407227, "rewards/margins": 0.48873797059059143, "rewards/rejected": -2.1054153442382812, "sft_loss": 1.6266248226165771, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 10.677801732145445, "learning_rate": 5.194626150712098e-07, "logits/chosen": -0.14725860953330994, "logits/rejected": 0.03786567598581314, "logps/chosen": -1.5627483129501343, "logps/rejected": -2.031207323074341, "loss": 0.7359, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5627483129501343, "rewards/margins": 0.46845898032188416, "rewards/rejected": -2.031207323074341, "sft_loss": 1.6287370920181274, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 8.211168708387598, "learning_rate": 5.179063008254695e-07, "logits/chosen": -0.09281010180711746, "logits/rejected": 0.11317907273769379, "logps/chosen": -1.4985650777816772, "logps/rejected": -1.9761158227920532, "loss": 0.738, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4985650777816772, "rewards/margins": 0.47755080461502075, "rewards/rejected": -1.9761158227920532, "sft_loss": 1.5786664485931396, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 9.826802624639164, "learning_rate": 5.163498128527199e-07, "logits/chosen": -0.04950065538287163, "logits/rejected": 0.1300159990787506, "logps/chosen": -1.6472562551498413, "logps/rejected": -2.155564785003662, "loss": 0.7509, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6472562551498413, "rewards/margins": 0.5083084106445312, "rewards/rejected": -2.155564785003662, "sft_loss": 1.6684767007827759, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 10.039723075571207, "learning_rate": 5.147931662540144e-07, "logits/chosen": 0.0622626356780529, "logits/rejected": 0.23000629246234894, "logps/chosen": -1.5437476634979248, "logps/rejected": -1.917470932006836, "loss": 0.7518, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5437476634979248, "rewards/margins": 0.3737230896949768, "rewards/rejected": -1.917470932006836, "sft_loss": 1.5594230890274048, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 8.66338888171497, "learning_rate": 5.132363761319449e-07, "logits/chosen": -0.06233688443899155, "logits/rejected": 0.02511647343635559, "logps/chosen": -1.4620157480239868, "logps/rejected": -2.1915090084075928, "loss": 0.6797, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4620157480239868, "rewards/margins": 0.7294934391975403, "rewards/rejected": -2.1915090084075928, "sft_loss": 1.5150892734527588, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 33.19015661003358, "learning_rate": 5.116794575904962e-07, "logits/chosen": -0.03809171915054321, "logits/rejected": 0.08693098276853561, "logps/chosen": -1.538657546043396, "logps/rejected": -1.9789533615112305, "loss": 0.7794, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.538657546043396, "rewards/margins": 0.44029584527015686, "rewards/rejected": -1.9789533615112305, "sft_loss": 1.5494117736816406, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 9.710219275005215, "learning_rate": 5.101224257348987e-07, "logits/chosen": -0.1227768287062645, "logits/rejected": 0.06207343190908432, "logps/chosen": -1.640450119972229, "logps/rejected": -2.286729335784912, "loss": 0.7091, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.640450119972229, "rewards/margins": 0.6462793946266174, "rewards/rejected": -2.286729335784912, "sft_loss": 1.6855093240737915, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 7.64992324306241, "learning_rate": 5.085652956714823e-07, "logits/chosen": -0.13107867538928986, "logits/rejected": 0.04695403203368187, "logps/chosen": -1.6148920059204102, "logps/rejected": -2.221074104309082, "loss": 0.7089, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6148920059204102, "rewards/margins": 0.6061822772026062, "rewards/rejected": -2.221074104309082, "sft_loss": 1.6260604858398438, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 7.225659083832635, "learning_rate": 5.070080825075298e-07, "logits/chosen": -0.11677054315805435, "logits/rejected": 0.11139710992574692, "logps/chosen": -1.6001787185668945, "logps/rejected": -2.085768461227417, "loss": 0.7826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6001787185668945, "rewards/margins": 0.48558980226516724, "rewards/rejected": -2.085768461227417, "sft_loss": 1.655072808265686, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 12.028425228450923, "learning_rate": 5.0545080135113e-07, "logits/chosen": -0.02619919739663601, "logits/rejected": 0.04093898832798004, "logps/chosen": -1.6137205362319946, "logps/rejected": -2.2553040981292725, "loss": 0.7294, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6137205362319946, "rewards/margins": 0.6415835618972778, "rewards/rejected": -2.2553040981292725, "sft_loss": 1.6506626605987549, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 14.30040771508223, "learning_rate": 5.038934673110316e-07, "logits/chosen": -0.1637105792760849, "logits/rejected": -0.012656033039093018, "logps/chosen": -1.6590261459350586, "logps/rejected": -2.2071521282196045, "loss": 0.7604, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6590261459350586, "rewards/margins": 0.5481262803077698, "rewards/rejected": -2.2071521282196045, "sft_loss": 1.7005878686904907, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 6.847851947774549, "learning_rate": 5.023360954964963e-07, "logits/chosen": -0.17357437312602997, "logits/rejected": -0.07817217707633972, "logps/chosen": -1.522201418876648, "logps/rejected": -2.0808849334716797, "loss": 0.7081, "rewards/accuracies": 0.6875, "rewards/chosen": -1.522201418876648, "rewards/margins": 0.5586836934089661, "rewards/rejected": -2.0808849334716797, "sft_loss": 1.5187063217163086, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 9.896574320479443, "learning_rate": 5.007787010171524e-07, "logits/chosen": -0.21580497920513153, "logits/rejected": 0.021061724051833153, "logps/chosen": -1.4339303970336914, "logps/rejected": -2.0267107486724854, "loss": 0.6919, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4339303970336914, "rewards/margins": 0.5927804112434387, "rewards/rejected": -2.0267107486724854, "sft_loss": 1.5159814357757568, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 8.070256083142056, "learning_rate": 4.992212989828477e-07, "logits/chosen": -0.015930231660604477, "logits/rejected": 0.00949743203818798, "logps/chosen": -1.5202354192733765, "logps/rejected": -2.0487263202667236, "loss": 0.7195, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5202354192733765, "rewards/margins": 0.5284907817840576, "rewards/rejected": -2.0487263202667236, "sft_loss": 1.5544708967208862, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 8.767068096028254, "learning_rate": 4.976639045035036e-07, "logits/chosen": 0.014556407928466797, "logits/rejected": 0.11699094623327255, "logps/chosen": -1.532111644744873, "logps/rejected": -1.9391781091690063, "loss": 0.7994, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.532111644744873, "rewards/margins": 0.4070662558078766, "rewards/rejected": -1.9391781091690063, "sft_loss": 1.6094253063201904, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 7.499007977839553, "learning_rate": 4.961065326889683e-07, "logits/chosen": -0.04810868948698044, "logits/rejected": 0.14180199801921844, "logps/chosen": -1.5714137554168701, "logps/rejected": -2.147458553314209, "loss": 0.7115, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5714137554168701, "rewards/margins": 0.5760447382926941, "rewards/rejected": -2.147458553314209, "sft_loss": 1.582482933998108, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 12.741223489847583, "learning_rate": 4.9454919864887e-07, "logits/chosen": -0.19680993258953094, "logits/rejected": -0.025873342528939247, "logps/chosen": -1.5794672966003418, "logps/rejected": -2.1392455101013184, "loss": 0.7386, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5794672966003418, "rewards/margins": 0.5597780346870422, "rewards/rejected": -2.1392455101013184, "sft_loss": 1.6871474981307983, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 11.64444072045427, "learning_rate": 4.929919174924701e-07, "logits/chosen": -0.16362957656383514, "logits/rejected": 0.07465249300003052, "logps/chosen": -1.6116749048233032, "logps/rejected": -2.0694832801818848, "loss": 0.7634, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6116749048233032, "rewards/margins": 0.45780810713768005, "rewards/rejected": -2.0694832801818848, "sft_loss": 1.6741182804107666, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 8.842478503554599, "learning_rate": 4.914347043285177e-07, "logits/chosen": -0.051214706152677536, "logits/rejected": 0.09491660445928574, "logps/chosen": -1.5871381759643555, "logps/rejected": -2.105633020401001, "loss": 0.7358, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5871381759643555, "rewards/margins": 0.518494725227356, "rewards/rejected": -2.105633020401001, "sft_loss": 1.525151252746582, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 8.646857072225027, "learning_rate": 4.898775742651013e-07, "logits/chosen": 0.013488218188285828, "logits/rejected": 0.1442359834909439, "logps/chosen": -1.605381965637207, "logps/rejected": -2.238438129425049, "loss": 0.7149, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.605381965637207, "rewards/margins": 0.6330560445785522, "rewards/rejected": -2.238438129425049, "sft_loss": 1.663658857345581, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 8.00563810698579, "learning_rate": 4.883205424095037e-07, "logits/chosen": -0.15756067633628845, "logits/rejected": 0.04317054525017738, "logps/chosen": -1.6351125240325928, "logps/rejected": -2.2063910961151123, "loss": 0.7586, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6351125240325928, "rewards/margins": 0.5712786912918091, "rewards/rejected": -2.2063910961151123, "sft_loss": 1.637192726135254, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 8.539282062318039, "learning_rate": 4.86763623868055e-07, "logits/chosen": -0.047394294291734695, "logits/rejected": 0.10682227462530136, "logps/chosen": -1.7094351053237915, "logps/rejected": -2.266960382461548, "loss": 0.7403, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7094351053237915, "rewards/margins": 0.5575251579284668, "rewards/rejected": -2.266960382461548, "sft_loss": 1.6307144165039062, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 9.88475333502435, "learning_rate": 4.852068337459856e-07, "logits/chosen": -0.030721943825483322, "logits/rejected": 0.1795428842306137, "logps/chosen": -1.6747875213623047, "logps/rejected": -2.227417469024658, "loss": 0.7219, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6747875213623047, "rewards/margins": 0.5526295900344849, "rewards/rejected": -2.227417469024658, "sft_loss": 1.7008603811264038, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 8.549635107719798, "learning_rate": 4.8365018714728e-07, "logits/chosen": -0.014549210667610168, "logits/rejected": 0.07728904485702515, "logps/chosen": -1.710875153541565, "logps/rejected": -2.208728790283203, "loss": 0.7606, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.710875153541565, "rewards/margins": 0.4978538453578949, "rewards/rejected": -2.208728790283203, "sft_loss": 1.6226184368133545, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 7.858687836270272, "learning_rate": 4.820936991745304e-07, "logits/chosen": -0.2910242974758148, "logits/rejected": -0.10606098175048828, "logps/chosen": -1.500328540802002, "logps/rejected": -1.9997621774673462, "loss": 0.7297, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.500328540802002, "rewards/margins": 0.4994335174560547, "rewards/rejected": -1.9997621774673462, "sft_loss": 1.5537258386611938, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 7.063378485587355, "learning_rate": 4.8053738492879e-07, "logits/chosen": -0.06125012785196304, "logits/rejected": 0.11608482897281647, "logps/chosen": -1.5468113422393799, "logps/rejected": -2.197829484939575, "loss": 0.7199, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5468113422393799, "rewards/margins": 0.651017963886261, "rewards/rejected": -2.197829484939575, "sft_loss": 1.5807876586914062, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 8.691030419998123, "learning_rate": 4.789812595094265e-07, "logits/chosen": -0.21434545516967773, "logits/rejected": -0.06286247074604034, "logps/chosen": -1.6448936462402344, "logps/rejected": -2.1703684329986572, "loss": 0.7258, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6448936462402344, "rewards/margins": 0.5254749059677124, "rewards/rejected": -2.1703684329986572, "sft_loss": 1.6046804189682007, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 8.485977699520364, "learning_rate": 4.774253380139752e-07, "logits/chosen": -0.21592874825000763, "logits/rejected": -0.06825269013643265, "logps/chosen": -1.4869664907455444, "logps/rejected": -2.121763229370117, "loss": 0.7039, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4869664907455444, "rewards/margins": 0.6347967982292175, "rewards/rejected": -2.121763229370117, "sft_loss": 1.5187429189682007, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 14.075088351700078, "learning_rate": 4.758696355379936e-07, "logits/chosen": -0.09149859100580215, "logits/rejected": -0.08967911452054977, "logps/chosen": -1.6454637050628662, "logps/rejected": -2.2641594409942627, "loss": 0.7319, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6454637050628662, "rewards/margins": 0.6186956167221069, "rewards/rejected": -2.2641594409942627, "sft_loss": 1.7238661050796509, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 9.115391425215483, "learning_rate": 4.743141671749138e-07, "logits/chosen": -0.22340884804725647, "logits/rejected": -0.08090896904468536, "logps/chosen": -1.6274245977401733, "logps/rejected": -2.083270311355591, "loss": 0.7834, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6274245977401733, "rewards/margins": 0.4558456838130951, "rewards/rejected": -2.083270311355591, "sft_loss": 1.6869560480117798, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 12.515919571307833, "learning_rate": 4.727589480158968e-07, "logits/chosen": -0.15428565442562103, "logits/rejected": -0.021116364747285843, "logps/chosen": -1.6140978336334229, "logps/rejected": -2.2516989707946777, "loss": 0.7169, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6140978336334229, "rewards/margins": 0.6376012563705444, "rewards/rejected": -2.2516989707946777, "sft_loss": 1.651226282119751, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 19.604246359387243, "learning_rate": 4.712039931496855e-07, "logits/chosen": -0.15744802355766296, "logits/rejected": 0.00397085165604949, "logps/chosen": -1.6822433471679688, "logps/rejected": -2.0983026027679443, "loss": 0.8278, "rewards/accuracies": 0.625, "rewards/chosen": -1.6822433471679688, "rewards/margins": 0.4160594344139099, "rewards/rejected": -2.0983026027679443, "sft_loss": 1.68734872341156, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 8.951990832674902, "learning_rate": 4.6964931766245905e-07, "logits/chosen": 0.0071522025391459465, "logits/rejected": 0.07327570021152496, "logps/chosen": -1.6191751956939697, "logps/rejected": -2.2935404777526855, "loss": 0.7138, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6191751956939697, "rewards/margins": 0.6743653416633606, "rewards/rejected": -2.2935404777526855, "sft_loss": 1.6083351373672485, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 9.500669218110408, "learning_rate": 4.6809493663768575e-07, "logits/chosen": -0.05228153616189957, "logits/rejected": -0.007633596658706665, "logps/chosen": -1.5852936506271362, "logps/rejected": -1.9452216625213623, "loss": 0.7766, "rewards/accuracies": 0.625, "rewards/chosen": -1.5852936506271362, "rewards/margins": 0.3599281907081604, "rewards/rejected": -1.9452216625213623, "sft_loss": 1.6326818466186523, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 9.60478058128669, "learning_rate": 4.6654086515597716e-07, "logits/chosen": -0.1722680628299713, "logits/rejected": 0.04278699681162834, "logps/chosen": -1.5222604274749756, "logps/rejected": -2.1924681663513184, "loss": 0.666, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5222604274749756, "rewards/margins": 0.6702078580856323, "rewards/rejected": -2.1924681663513184, "sft_loss": 1.5173310041427612, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 9.074362481757733, "learning_rate": 4.6498711829494154e-07, "logits/chosen": -0.19109642505645752, "logits/rejected": -0.05392248556017876, "logps/chosen": -1.521126389503479, "logps/rejected": -2.118821620941162, "loss": 0.7024, "rewards/accuracies": 0.6875, "rewards/chosen": -1.521126389503479, "rewards/margins": 0.5976952910423279, "rewards/rejected": -2.118821620941162, "sft_loss": 1.5061622858047485, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 11.58671564141073, "learning_rate": 4.6343371112903777e-07, "logits/chosen": -0.043043818324804306, "logits/rejected": 0.14518333971500397, "logps/chosen": -1.6486995220184326, "logps/rejected": -2.25911283493042, "loss": 0.7597, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.6486995220184326, "rewards/margins": 0.6104133129119873, "rewards/rejected": -2.25911283493042, "sft_loss": 1.648297667503357, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.2796221971511841, "eval_logits/rejected": 0.3926002085208893, "eval_logps/chosen": -1.6049264669418335, "eval_logps/rejected": -2.1441104412078857, "eval_loss": 0.7437512874603271, "eval_rewards/accuracies": 0.6632047295570374, "eval_rewards/chosen": -1.6049264669418335, "eval_rewards/margins": 0.5391839742660522, "eval_rewards/rejected": -2.1441104412078857, "eval_runtime": 48.6934, "eval_samples_per_second": 27.622, "eval_sft_loss": 1.6039412021636963, "eval_steps_per_second": 6.921, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 6.520148146017283, "learning_rate": 4.618806587294291e-07, "logits/chosen": -0.22467903792858124, "logits/rejected": -0.07522128522396088, "logps/chosen": -1.6090145111083984, "logps/rejected": -2.1922872066497803, "loss": 0.7259, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6090145111083984, "rewards/margins": 0.5832728743553162, "rewards/rejected": -2.1922872066497803, "sft_loss": 1.6034557819366455, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 10.76071020612093, "learning_rate": 4.603279761638365e-07, "logits/chosen": -0.1721169799566269, "logits/rejected": -0.03359944000840187, "logps/chosen": -1.5757770538330078, "logps/rejected": -2.0993614196777344, "loss": 0.7491, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5757770538330078, "rewards/margins": 0.5235844850540161, "rewards/rejected": -2.0993614196777344, "sft_loss": 1.59102463722229, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 10.747026810083767, "learning_rate": 4.5877567849639315e-07, "logits/chosen": -0.135072261095047, "logits/rejected": 0.02064352110028267, "logps/chosen": -1.5834659337997437, "logps/rejected": -2.164900779724121, "loss": 0.7289, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5834659337997437, "rewards/margins": 0.5814345479011536, "rewards/rejected": -2.164900779724121, "sft_loss": 1.6022837162017822, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 9.265646737938422, "learning_rate": 4.572237807874979e-07, "logits/chosen": -0.17663605511188507, "logits/rejected": 0.11963503062725067, "logps/chosen": -1.7275232076644897, "logps/rejected": -2.317758321762085, "loss": 0.7812, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.7275232076644897, "rewards/margins": 0.5902352929115295, "rewards/rejected": -2.317758321762085, "sft_loss": 1.674320936203003, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 9.563256964109765, "learning_rate": 4.5567229809366895e-07, "logits/chosen": -0.13121530413627625, "logits/rejected": 0.03350965678691864, "logps/chosen": -1.5004379749298096, "logps/rejected": -2.140740156173706, "loss": 0.6999, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5004379749298096, "rewards/margins": 0.6403024792671204, "rewards/rejected": -2.140740156173706, "sft_loss": 1.548069715499878, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 10.60559119027785, "learning_rate": 4.541212454673984e-07, "logits/chosen": -0.14714549481868744, "logits/rejected": 0.06583376228809357, "logps/chosen": -1.6301219463348389, "logps/rejected": -2.4229018688201904, "loss": 0.6771, "rewards/accuracies": 0.75, "rewards/chosen": -1.6301219463348389, "rewards/margins": 0.792779803276062, "rewards/rejected": -2.4229018688201904, "sft_loss": 1.6081111431121826, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 8.336806235329222, "learning_rate": 4.525706379570055e-07, "logits/chosen": -0.16307315230369568, "logits/rejected": -0.05455092340707779, "logps/chosen": -1.5915212631225586, "logps/rejected": -2.1753382682800293, "loss": 0.7194, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5915212631225586, "rewards/margins": 0.583817183971405, "rewards/rejected": -2.1753382682800293, "sft_loss": 1.6298892498016357, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 8.855830947925009, "learning_rate": 4.510204906064911e-07, "logits/chosen": -0.055436838418245316, "logits/rejected": 0.10494302213191986, "logps/chosen": -1.5890477895736694, "logps/rejected": -2.2761149406433105, "loss": 0.697, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5890477895736694, "rewards/margins": 0.687066912651062, "rewards/rejected": -2.2761149406433105, "sft_loss": 1.506320595741272, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 12.010404760983135, "learning_rate": 4.4947081845539177e-07, "logits/chosen": -0.2767189145088196, "logits/rejected": -0.0948868840932846, "logps/chosen": -1.6528947353363037, "logps/rejected": -2.2487998008728027, "loss": 0.733, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6528947353363037, "rewards/margins": 0.5959050059318542, "rewards/rejected": -2.2487998008728027, "sft_loss": 1.6089842319488525, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 7.832474103457955, "learning_rate": 4.479216365386333e-07, "logits/chosen": -0.05546053126454353, "logits/rejected": 0.1495744287967682, "logps/chosen": -1.6067962646484375, "logps/rejected": -2.2411015033721924, "loss": 0.7194, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6067962646484375, "rewards/margins": 0.6343055963516235, "rewards/rejected": -2.2411015033721924, "sft_loss": 1.603760004043579, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 6.924236912977645, "learning_rate": 4.4637295988638555e-07, "logits/chosen": -0.07914148271083832, "logits/rejected": 0.028579026460647583, "logps/chosen": -1.6794124841690063, "logps/rejected": -2.1807055473327637, "loss": 0.7535, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6794124841690063, "rewards/margins": 0.5012931227684021, "rewards/rejected": -2.1807055473327637, "sft_loss": 1.6679092645645142, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 9.406958739837947, "learning_rate": 4.4482480352391623e-07, "logits/chosen": -0.18839971721172333, "logits/rejected": -0.01849387027323246, "logps/chosen": -1.5993618965148926, "logps/rejected": -2.131162643432617, "loss": 0.7302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5993618965148926, "rewards/margins": 0.5318008661270142, "rewards/rejected": -2.131162643432617, "sft_loss": 1.6234018802642822, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 16.190287619308688, "learning_rate": 4.4327718247144507e-07, "logits/chosen": -0.07291006296873093, "logits/rejected": 0.06860268115997314, "logps/chosen": -1.599198818206787, "logps/rejected": -2.202462673187256, "loss": 0.7322, "rewards/accuracies": 0.71875, "rewards/chosen": -1.599198818206787, "rewards/margins": 0.6032637357711792, "rewards/rejected": -2.202462673187256, "sft_loss": 1.6434195041656494, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 9.156115628999908, "learning_rate": 4.417301117439984e-07, "logits/chosen": -0.1281352937221527, "logits/rejected": 0.03374654799699783, "logps/chosen": -1.5201386213302612, "logps/rejected": -2.1830506324768066, "loss": 0.7015, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5201386213302612, "rewards/margins": 0.6629122495651245, "rewards/rejected": -2.1830506324768066, "sft_loss": 1.542698860168457, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 9.06287627466859, "learning_rate": 4.401836063512631e-07, "logits/chosen": -0.18511958420276642, "logits/rejected": 0.20474426448345184, "logps/chosen": -1.5909534692764282, "logps/rejected": -2.216071367263794, "loss": 0.7095, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5909534692764282, "rewards/margins": 0.6251178979873657, "rewards/rejected": -2.216071367263794, "sft_loss": 1.6472440958023071, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 14.944593884056676, "learning_rate": 4.386376812974413e-07, "logits/chosen": -0.13688163459300995, "logits/rejected": -0.02319144643843174, "logps/chosen": -1.4944909811019897, "logps/rejected": -2.0645744800567627, "loss": 0.7145, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4944909811019897, "rewards/margins": 0.5700834393501282, "rewards/rejected": -2.0645744800567627, "sft_loss": 1.5962297916412354, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 8.227143331935864, "learning_rate": 4.370923515811048e-07, "logits/chosen": -0.17151449620723724, "logits/rejected": 0.10024061053991318, "logps/chosen": -1.552542805671692, "logps/rejected": -2.147279977798462, "loss": 0.7221, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.552542805671692, "rewards/margins": 0.59473717212677, "rewards/rejected": -2.147279977798462, "sft_loss": 1.5952181816101074, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 15.216888266947088, "learning_rate": 4.35547632195049e-07, "logits/chosen": -0.10421431064605713, "logits/rejected": 0.034745991230010986, "logps/chosen": -1.5860973596572876, "logps/rejected": -2.113673448562622, "loss": 0.7487, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.5860973596572876, "rewards/margins": 0.5275761485099792, "rewards/rejected": -2.113673448562622, "sft_loss": 1.6562366485595703, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 9.276452593318462, "learning_rate": 4.340035381261484e-07, "logits/chosen": -0.11463622003793716, "logits/rejected": -0.01022398192435503, "logps/chosen": -1.7121713161468506, "logps/rejected": -2.2425670623779297, "loss": 0.7871, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.7121713161468506, "rewards/margins": 0.5303958058357239, "rewards/rejected": -2.2425670623779297, "sft_loss": 1.719866394996643, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 8.52513266080087, "learning_rate": 4.324600843552104e-07, "logits/chosen": -0.22735516726970673, "logits/rejected": -0.04663518816232681, "logps/chosen": -1.721483826637268, "logps/rejected": -2.361401081085205, "loss": 0.7489, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.721483826637268, "rewards/margins": 0.6399173736572266, "rewards/rejected": -2.361401081085205, "sft_loss": 1.7590858936309814, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 12.69721828776736, "learning_rate": 4.309172858568302e-07, "logits/chosen": -0.226566880941391, "logits/rejected": -0.027831286191940308, "logps/chosen": -1.6754512786865234, "logps/rejected": -2.2710180282592773, "loss": 0.7446, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6754512786865234, "rewards/margins": 0.5955665707588196, "rewards/rejected": -2.2710180282592773, "sft_loss": 1.68548583984375, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 7.786749533027135, "learning_rate": 4.293751575992455e-07, "logits/chosen": -0.01849212870001793, "logits/rejected": 0.04187586531043053, "logps/chosen": -1.6516926288604736, "logps/rejected": -2.2091779708862305, "loss": 0.7265, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6516926288604736, "rewards/margins": 0.5574851632118225, "rewards/rejected": -2.2091779708862305, "sft_loss": 1.6806981563568115, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 14.530591047648015, "learning_rate": 4.278337145441916e-07, "logits/chosen": -0.21565723419189453, "logits/rejected": -0.0042025139555335045, "logps/chosen": -1.6339657306671143, "logps/rejected": -2.2094879150390625, "loss": 0.7418, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6339657306671143, "rewards/margins": 0.575522243976593, "rewards/rejected": -2.2094879150390625, "sft_loss": 1.6413185596466064, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 7.577448208379862, "learning_rate": 4.262929716467556e-07, "logits/chosen": -0.12442121654748917, "logits/rejected": 0.13959330320358276, "logps/chosen": -1.6186530590057373, "logps/rejected": -2.416656970977783, "loss": 0.6994, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6186530590057373, "rewards/margins": 0.7980039715766907, "rewards/rejected": -2.416656970977783, "sft_loss": 1.662672996520996, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 14.763651419144733, "learning_rate": 4.247529438552321e-07, "logits/chosen": -0.19086572527885437, "logits/rejected": 0.041380513459444046, "logps/chosen": -1.6585109233856201, "logps/rejected": -2.293794870376587, "loss": 0.7289, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6585109233856201, "rewards/margins": 0.6352839469909668, "rewards/rejected": -2.293794870376587, "sft_loss": 1.7602535486221313, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 8.90927158777695, "learning_rate": 4.232136461109773e-07, "logits/chosen": -0.09383013844490051, "logits/rejected": 0.047669991850852966, "logps/chosen": -1.5358823537826538, "logps/rejected": -2.2465291023254395, "loss": 0.7023, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5358823537826538, "rewards/margins": 0.7106468677520752, "rewards/rejected": -2.2465291023254395, "sft_loss": 1.5927324295043945, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 22.34485181722508, "learning_rate": 4.216750933482646e-07, "logits/chosen": -0.11399248987436295, "logits/rejected": 0.08677612245082855, "logps/chosen": -1.712977409362793, "logps/rejected": -2.2527875900268555, "loss": 0.7571, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.712977409362793, "rewards/margins": 0.5398100018501282, "rewards/rejected": -2.2527875900268555, "sft_loss": 1.6614840030670166, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 9.774901505386355, "learning_rate": 4.2013730049413986e-07, "logits/chosen": -0.1074158325791359, "logits/rejected": 0.0926705077290535, "logps/chosen": -1.5266141891479492, "logps/rejected": -2.18959903717041, "loss": 0.7093, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5266141891479492, "rewards/margins": 0.6629847288131714, "rewards/rejected": -2.18959903717041, "sft_loss": 1.5803487300872803, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 9.243887206549184, "learning_rate": 4.1860028246827594e-07, "logits/chosen": -0.1385643184185028, "logits/rejected": 0.08555924892425537, "logps/chosen": -1.4835153818130493, "logps/rejected": -2.0727763175964355, "loss": 0.7044, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4835153818130493, "rewards/margins": 0.5892609357833862, "rewards/rejected": -2.0727763175964355, "sft_loss": 1.5492403507232666, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 9.748515057483266, "learning_rate": 4.170640541828285e-07, "logits/chosen": -0.2937970757484436, "logits/rejected": -0.11782214790582657, "logps/chosen": -1.6570533514022827, "logps/rejected": -2.187161445617676, "loss": 0.731, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6570533514022827, "rewards/margins": 0.5301080942153931, "rewards/rejected": -2.187161445617676, "sft_loss": 1.668857216835022, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 12.468510967657643, "learning_rate": 4.1552863054229116e-07, "logits/chosen": -0.00016920268535614014, "logits/rejected": 0.0627494752407074, "logps/chosen": -1.714073896408081, "logps/rejected": -2.1507229804992676, "loss": 0.8385, "rewards/accuracies": 0.625, "rewards/chosen": -1.714073896408081, "rewards/margins": 0.43664899468421936, "rewards/rejected": -2.1507229804992676, "sft_loss": 1.680690050125122, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 10.816664188289334, "learning_rate": 4.139940264433508e-07, "logits/chosen": -0.19193141162395477, "logits/rejected": 0.083371601998806, "logps/chosen": -1.5172638893127441, "logps/rejected": -2.1189401149749756, "loss": 0.7186, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5172638893127441, "rewards/margins": 0.601676344871521, "rewards/rejected": -2.1189401149749756, "sft_loss": 1.5433123111724854, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 7.2096050935175455, "learning_rate": 4.1246025677474303e-07, "logits/chosen": -0.19457030296325684, "logits/rejected": 0.0140345748513937, "logps/chosen": -1.6426728963851929, "logps/rejected": -2.2261767387390137, "loss": 0.7211, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6426728963851929, "rewards/margins": 0.5835039019584656, "rewards/rejected": -2.2261767387390137, "sft_loss": 1.6838672161102295, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 11.153525567971421, "learning_rate": 4.10927336417108e-07, "logits/chosen": -0.17745575308799744, "logits/rejected": 0.022489432245492935, "logps/chosen": -1.6385818719863892, "logps/rejected": -2.1156294345855713, "loss": 0.7869, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6385818719863892, "rewards/margins": 0.4770474433898926, "rewards/rejected": -2.1156294345855713, "sft_loss": 1.6342036724090576, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 12.66522340055357, "learning_rate": 4.093952802428457e-07, "logits/chosen": 0.00498633086681366, "logits/rejected": 0.08237729221582413, "logps/chosen": -1.7283580303192139, "logps/rejected": -2.21094012260437, "loss": 0.8066, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.7283580303192139, "rewards/margins": 0.4825819134712219, "rewards/rejected": -2.21094012260437, "sft_loss": 1.706624984741211, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 7.307867617871973, "learning_rate": 4.0786410311597184e-07, "logits/chosen": -0.22959277033805847, "logits/rejected": -0.036974240094423294, "logps/chosen": -1.628914475440979, "logps/rejected": -2.2404022216796875, "loss": 0.7054, "rewards/accuracies": 0.6875, "rewards/chosen": -1.628914475440979, "rewards/margins": 0.6114878058433533, "rewards/rejected": -2.2404022216796875, "sft_loss": 1.599521279335022, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 8.851439286575424, "learning_rate": 4.063338198919737e-07, "logits/chosen": -0.19177380204200745, "logits/rejected": -0.1527838408946991, "logps/chosen": -1.661956548690796, "logps/rejected": -2.119969367980957, "loss": 0.7723, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.661956548690796, "rewards/margins": 0.45801275968551636, "rewards/rejected": -2.119969367980957, "sft_loss": 1.679813027381897, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 13.11729626000739, "learning_rate": 4.0480444541766575e-07, "logits/chosen": -0.16784217953681946, "logits/rejected": 0.002752959728240967, "logps/chosen": -1.7143226861953735, "logps/rejected": -2.2099227905273438, "loss": 0.7708, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7143226861953735, "rewards/margins": 0.4956004023551941, "rewards/rejected": -2.2099227905273438, "sft_loss": 1.7196691036224365, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 10.855422150552018, "learning_rate": 4.0327599453104606e-07, "logits/chosen": -0.19284026324748993, "logits/rejected": -0.042516712099313736, "logps/chosen": -1.536863923072815, "logps/rejected": -2.1849122047424316, "loss": 0.7037, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.536863923072815, "rewards/margins": 0.6480484008789062, "rewards/rejected": -2.1849122047424316, "sft_loss": 1.534368872642517, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 9.3550135915996, "learning_rate": 4.017484820611514e-07, "logits/chosen": -0.13389965891838074, "logits/rejected": 0.018958495929837227, "logps/chosen": -1.6331771612167358, "logps/rejected": -2.198584794998169, "loss": 0.7293, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6331771612167358, "rewards/margins": 0.5654075741767883, "rewards/rejected": -2.198584794998169, "sft_loss": 1.6358585357666016, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 14.793942672748782, "learning_rate": 4.002219228279148e-07, "logits/chosen": -0.15781688690185547, "logits/rejected": 0.03489111736416817, "logps/chosen": -1.5986695289611816, "logps/rejected": -2.1426074504852295, "loss": 0.7259, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5986695289611816, "rewards/margins": 0.5439378619194031, "rewards/rejected": -2.1426074504852295, "sft_loss": 1.6520202159881592, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 11.874229551619395, "learning_rate": 3.9869633164202045e-07, "logits/chosen": -0.18533504009246826, "logits/rejected": 0.11039905250072479, "logps/chosen": -1.746463418006897, "logps/rejected": -2.2957587242126465, "loss": 0.7474, "rewards/accuracies": 0.65625, "rewards/chosen": -1.746463418006897, "rewards/margins": 0.5492954254150391, "rewards/rejected": -2.2957587242126465, "sft_loss": 1.7126915454864502, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 11.922904515853077, "learning_rate": 3.9717172330476077e-07, "logits/chosen": -0.1678035408258438, "logits/rejected": -0.01894417032599449, "logps/chosen": -1.643080711364746, "logps/rejected": -2.282376766204834, "loss": 0.7468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.643080711364746, "rewards/margins": 0.6392959356307983, "rewards/rejected": -2.282376766204834, "sft_loss": 1.7416213750839233, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 19.361638691693887, "learning_rate": 3.956481126078927e-07, "logits/chosen": -0.07469090074300766, "logits/rejected": 0.07822644710540771, "logps/chosen": -1.7850029468536377, "logps/rejected": -2.4024927616119385, "loss": 0.7958, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7850029468536377, "rewards/margins": 0.6174899339675903, "rewards/rejected": -2.4024927616119385, "sft_loss": 1.801253318786621, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 6.7495910239271595, "learning_rate": 3.941255143334937e-07, "logits/chosen": -0.1647300273180008, "logits/rejected": -0.10051450878381729, "logps/chosen": -1.5987623929977417, "logps/rejected": -2.183371067047119, "loss": 0.7356, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5987623929977417, "rewards/margins": 0.5846089124679565, "rewards/rejected": -2.183371067047119, "sft_loss": 1.591897964477539, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 9.559968354812593, "learning_rate": 3.9260394325381895e-07, "logits/chosen": -0.15312853455543518, "logits/rejected": 0.007118262350559235, "logps/chosen": -1.6157350540161133, "logps/rejected": -2.346029758453369, "loss": 0.7096, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6157350540161133, "rewards/margins": 0.7302943468093872, "rewards/rejected": -2.346029758453369, "sft_loss": 1.627651572227478, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 11.70742765504433, "learning_rate": 3.9108341413115784e-07, "logits/chosen": -0.16065241396427155, "logits/rejected": -0.028475571423768997, "logps/chosen": -1.5399911403656006, "logps/rejected": -2.1506786346435547, "loss": 0.6784, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5399911403656006, "rewards/margins": 0.6106875538825989, "rewards/rejected": -2.1506786346435547, "sft_loss": 1.5866378545761108, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 11.309235328745057, "learning_rate": 3.895639417176905e-07, "logits/chosen": -0.22967243194580078, "logits/rejected": -0.12530867755413055, "logps/chosen": -1.5206987857818604, "logps/rejected": -2.219179630279541, "loss": 0.7208, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5206987857818604, "rewards/margins": 0.6984809041023254, "rewards/rejected": -2.219179630279541, "sft_loss": 1.54509437084198, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 8.405946283092145, "learning_rate": 3.8804554075534497e-07, "logits/chosen": -0.22225873172283173, "logits/rejected": 0.04461182653903961, "logps/chosen": -1.5875943899154663, "logps/rejected": -2.1397206783294678, "loss": 0.7422, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5875943899154663, "rewards/margins": 0.5521259307861328, "rewards/rejected": -2.1397206783294678, "sft_loss": 1.6139698028564453, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 11.081759832394587, "learning_rate": 3.8652822597565403e-07, "logits/chosen": -0.3076205849647522, "logits/rejected": -0.0845259353518486, "logps/chosen": -1.608341932296753, "logps/rejected": -2.245190143585205, "loss": 0.7247, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.608341932296753, "rewards/margins": 0.636847972869873, "rewards/rejected": -2.245190143585205, "sft_loss": 1.6670973300933838, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 9.42170591031814, "learning_rate": 3.850120120996123e-07, "logits/chosen": -0.16034550964832306, "logits/rejected": 0.06474418938159943, "logps/chosen": -1.7657172679901123, "logps/rejected": -2.383849620819092, "loss": 0.7574, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7657172679901123, "rewards/margins": 0.6181322932243347, "rewards/rejected": -2.383849620819092, "sft_loss": 1.7854160070419312, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 15.985883747128684, "learning_rate": 3.8349691383753356e-07, "logits/chosen": -0.05079017952084541, "logits/rejected": 0.09796609729528427, "logps/chosen": -1.614545226097107, "logps/rejected": -2.2385549545288086, "loss": 0.7393, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.614545226097107, "rewards/margins": 0.6240096092224121, "rewards/rejected": -2.2385549545288086, "sft_loss": 1.6241651773452759, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 8.210830961618273, "learning_rate": 3.819829458889078e-07, "logits/chosen": -0.16164612770080566, "logits/rejected": -0.008709913119673729, "logps/chosen": -1.5105348825454712, "logps/rejected": -2.0109338760375977, "loss": 0.72, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5105348825454712, "rewards/margins": 0.5003989934921265, "rewards/rejected": -2.0109338760375977, "sft_loss": 1.5298774242401123, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 10.400200694153305, "learning_rate": 3.804701229422585e-07, "logits/chosen": -0.19967922568321228, "logits/rejected": -0.08786850422620773, "logps/chosen": -1.694928526878357, "logps/rejected": -2.2611489295959473, "loss": 0.7577, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.694928526878357, "rewards/margins": 0.5662199258804321, "rewards/rejected": -2.2611489295959473, "sft_loss": 1.7095798254013062, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 11.781657369534488, "learning_rate": 3.789584596750007e-07, "logits/chosen": -0.2163340151309967, "logits/rejected": -0.1382766216993332, "logps/chosen": -1.5853474140167236, "logps/rejected": -2.1211342811584473, "loss": 0.7288, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5853474140167236, "rewards/margins": 0.5357868671417236, "rewards/rejected": -2.1211342811584473, "sft_loss": 1.5868964195251465, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 8.556276996858685, "learning_rate": 3.77447970753298e-07, "logits/chosen": -0.06274105608463287, "logits/rejected": -0.015728969126939774, "logps/chosen": -1.625140905380249, "logps/rejected": -2.1532254219055176, "loss": 0.7413, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.625140905380249, "rewards/margins": 0.5280844569206238, "rewards/rejected": -2.1532254219055176, "sft_loss": 1.6544675827026367, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 9.7140360461685, "learning_rate": 3.7593867083192057e-07, "logits/chosen": -0.09756962954998016, "logits/rejected": 0.050882868468761444, "logps/chosen": -1.5445709228515625, "logps/rejected": -2.0798630714416504, "loss": 0.7344, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5445709228515625, "rewards/margins": 0.5352921485900879, "rewards/rejected": -2.0798630714416504, "sft_loss": 1.620279312133789, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 12.862299848033977, "learning_rate": 3.7443057455410276e-07, "logits/chosen": -0.1269254982471466, "logits/rejected": 0.027856886386871338, "logps/chosen": -1.5682541131973267, "logps/rejected": -2.0286543369293213, "loss": 0.7453, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.5682541131973267, "rewards/margins": 0.4604003429412842, "rewards/rejected": -2.0286543369293213, "sft_loss": 1.6590089797973633, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 9.287614219077945, "learning_rate": 3.7292369655140145e-07, "logits/chosen": -0.21697914600372314, "logits/rejected": -0.011072764173150063, "logps/chosen": -1.6104047298431396, "logps/rejected": -2.086874008178711, "loss": 0.737, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6104047298431396, "rewards/margins": 0.47646933794021606, "rewards/rejected": -2.086874008178711, "sft_loss": 1.6542373895645142, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 8.172638924344986, "learning_rate": 3.714180514435534e-07, "logits/chosen": -0.1281202882528305, "logits/rejected": 0.05985052511096001, "logps/chosen": -1.5847066640853882, "logps/rejected": -2.2846851348876953, "loss": 0.7077, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5847066640853882, "rewards/margins": 0.6999785304069519, "rewards/rejected": -2.2846851348876953, "sft_loss": 1.6472995281219482, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 13.041564552120994, "learning_rate": 3.6991365383833426e-07, "logits/chosen": -0.1288767009973526, "logits/rejected": 0.04675941914319992, "logps/chosen": -1.6174272298812866, "logps/rejected": -2.2540555000305176, "loss": 0.7145, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6174272298812866, "rewards/margins": 0.636628270149231, "rewards/rejected": -2.2540555000305176, "sft_loss": 1.6466875076293945, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 13.182624895205818, "learning_rate": 3.684105183314162e-07, "logits/chosen": -0.16028103232383728, "logits/rejected": -0.04944011569023132, "logps/chosen": -1.5418676137924194, "logps/rejected": -2.121840238571167, "loss": 0.713, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5418676137924194, "rewards/margins": 0.5799726247787476, "rewards/rejected": -2.121840238571167, "sft_loss": 1.583237886428833, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 13.957512823459624, "learning_rate": 3.669086595062263e-07, "logits/chosen": -0.1579606831073761, "logits/rejected": 0.08689354360103607, "logps/chosen": -1.5945956707000732, "logps/rejected": -2.1859207153320312, "loss": 0.7234, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.5945956707000732, "rewards/margins": 0.5913248658180237, "rewards/rejected": -2.1859207153320312, "sft_loss": 1.580637812614441, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 11.239450267497457, "learning_rate": 3.654080919338056e-07, "logits/chosen": -0.22359701991081238, "logits/rejected": -0.03328874707221985, "logps/chosen": -1.6039142608642578, "logps/rejected": -2.236027717590332, "loss": 0.7241, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6039142608642578, "rewards/margins": 0.632113516330719, "rewards/rejected": -2.236027717590332, "sft_loss": 1.6580703258514404, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 7.487628594128432, "learning_rate": 3.639088301726673e-07, "logits/chosen": -0.1239256039261818, "logits/rejected": 0.11735118925571442, "logps/chosen": -1.5950149297714233, "logps/rejected": -2.178715944290161, "loss": 0.7307, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5950149297714233, "rewards/margins": 0.5837005376815796, "rewards/rejected": -2.178715944290161, "sft_loss": 1.6609176397323608, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 17.631147416346465, "learning_rate": 3.624108887686556e-07, "logits/chosen": -0.11558713763952255, "logits/rejected": -0.0242562647908926, "logps/chosen": -1.5977771282196045, "logps/rejected": -2.1724162101745605, "loss": 0.7244, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5977771282196045, "rewards/margins": 0.574639081954956, "rewards/rejected": -2.1724162101745605, "sft_loss": 1.6741511821746826, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 8.703289576203192, "learning_rate": 3.6091428225480433e-07, "logits/chosen": -0.228302001953125, "logits/rejected": -0.058668047189712524, "logps/chosen": -1.5291993618011475, "logps/rejected": -2.162330150604248, "loss": 0.7218, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.5291993618011475, "rewards/margins": 0.6331309080123901, "rewards/rejected": -2.162330150604248, "sft_loss": 1.597410798072815, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 12.434773303946738, "learning_rate": 3.5941902515119674e-07, "logits/chosen": -0.18907403945922852, "logits/rejected": 0.08992107957601547, "logps/chosen": -1.5999428033828735, "logps/rejected": -2.0549299716949463, "loss": 0.7676, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.5999428033828735, "rewards/margins": 0.45498746633529663, "rewards/rejected": -2.0549299716949463, "sft_loss": 1.6349014043807983, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 11.567324660099407, "learning_rate": 3.5792513196482373e-07, "logits/chosen": -0.33280640840530396, "logits/rejected": 0.004661875776946545, "logps/chosen": -1.5515568256378174, "logps/rejected": -2.1178009510040283, "loss": 0.7036, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5515568256378174, "rewards/margins": 0.5662440061569214, "rewards/rejected": -2.1178009510040283, "sft_loss": 1.5804178714752197, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 9.273665867070601, "learning_rate": 3.5643261718944346e-07, "logits/chosen": -0.07865197211503983, "logits/rejected": 0.034912388771772385, "logps/chosen": -1.5886871814727783, "logps/rejected": -2.0912280082702637, "loss": 0.737, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5886871814727783, "rewards/margins": 0.5025407075881958, "rewards/rejected": -2.0912280082702637, "sft_loss": 1.5304675102233887, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 10.028149528997467, "learning_rate": 3.5494149530544087e-07, "logits/chosen": -0.2463723123073578, "logits/rejected": -0.06868429481983185, "logps/chosen": -1.5848485231399536, "logps/rejected": -2.174999952316284, "loss": 0.7598, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5848485231399536, "rewards/margins": 0.5901514291763306, "rewards/rejected": -2.174999952316284, "sft_loss": 1.5741559267044067, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 11.297721119997345, "learning_rate": 3.534517807796871e-07, "logits/chosen": -0.14862100780010223, "logits/rejected": -0.02488403208553791, "logps/chosen": -1.5863415002822876, "logps/rejected": -2.1816811561584473, "loss": 0.7172, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5863415002822876, "rewards/margins": 0.5953398942947388, "rewards/rejected": -2.1816811561584473, "sft_loss": 1.6118249893188477, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 10.01247977820727, "learning_rate": 3.519634880653988e-07, "logits/chosen": -0.12059799581766129, "logits/rejected": -0.019911272451281548, "logps/chosen": -1.552743673324585, "logps/rejected": -2.2916336059570312, "loss": 0.6982, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.552743673324585, "rewards/margins": 0.738889753818512, "rewards/rejected": -2.2916336059570312, "sft_loss": 1.6109209060668945, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 10.301064630411307, "learning_rate": 3.504766316019987e-07, "logits/chosen": -0.19101211428642273, "logits/rejected": 0.011400984600186348, "logps/chosen": -1.568054437637329, "logps/rejected": -2.1744346618652344, "loss": 0.6961, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.568054437637329, "rewards/margins": 0.6063801646232605, "rewards/rejected": -2.1744346618652344, "sft_loss": 1.5899173021316528, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 8.708574303907081, "learning_rate": 3.489912258149745e-07, "logits/chosen": -0.06535448879003525, "logits/rejected": 0.08843336254358292, "logps/chosen": -1.5790965557098389, "logps/rejected": -2.213759660720825, "loss": 0.7487, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5790965557098389, "rewards/margins": 0.634662926197052, "rewards/rejected": -2.213759660720825, "sft_loss": 1.5902222394943237, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 7.99429280784537, "learning_rate": 3.475072851157397e-07, "logits/chosen": -0.143599733710289, "logits/rejected": -0.071458600461483, "logps/chosen": -1.5827624797821045, "logps/rejected": -2.3023605346679688, "loss": 0.6703, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5827624797821045, "rewards/margins": 0.7195979356765747, "rewards/rejected": -2.3023605346679688, "sft_loss": 1.6041113138198853, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 8.851930764690566, "learning_rate": 3.460248239014936e-07, "logits/chosen": -0.04648059606552124, "logits/rejected": 0.030413443222641945, "logps/chosen": -1.6942846775054932, "logps/rejected": -2.2820076942443848, "loss": 0.7292, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6942846775054932, "rewards/margins": 0.5877227783203125, "rewards/rejected": -2.2820076942443848, "sft_loss": 1.758832573890686, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 10.512011212870283, "learning_rate": 3.4454385655508134e-07, "logits/chosen": -0.10294010490179062, "logits/rejected": -0.016281917691230774, "logps/chosen": -1.650456190109253, "logps/rejected": -2.09480357170105, "loss": 0.792, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.650456190109253, "rewards/margins": 0.4443473815917969, "rewards/rejected": -2.09480357170105, "sft_loss": 1.6763296127319336, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 8.29278536004856, "learning_rate": 3.4306439744485447e-07, "logits/chosen": -0.259746789932251, "logits/rejected": -0.006632113363593817, "logps/chosen": -1.6425392627716064, "logps/rejected": -2.242915630340576, "loss": 0.7187, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6425392627716064, "rewards/margins": 0.6003759503364563, "rewards/rejected": -2.242915630340576, "sft_loss": 1.5589948892593384, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 9.818591184948843, "learning_rate": 3.415864609245322e-07, "logits/chosen": -0.08305975794792175, "logits/rejected": 0.1297585815191269, "logps/chosen": -1.6400636434555054, "logps/rejected": -2.415120840072632, "loss": 0.7128, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6400636434555054, "rewards/margins": 0.7750571370124817, "rewards/rejected": -2.415120840072632, "sft_loss": 1.6632683277130127, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.2486271858215332, "eval_logits/rejected": 0.36073699593544006, "eval_logps/chosen": -1.6446138620376587, "eval_logps/rejected": -2.2337498664855957, "eval_loss": 0.7399167418479919, "eval_rewards/accuracies": 0.6780415177345276, "eval_rewards/chosen": -1.6446138620376587, "eval_rewards/margins": 0.5891358852386475, "eval_rewards/rejected": -2.2337498664855957, "eval_runtime": 50.9846, "eval_samples_per_second": 26.381, "eval_sft_loss": 1.636842131614685, "eval_steps_per_second": 6.61, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 9.743572514859725, "learning_rate": 3.401100613330605e-07, "logits/chosen": -0.19039756059646606, "logits/rejected": -0.13929222524166107, "logps/chosen": -1.6055266857147217, "logps/rejected": -2.085793972015381, "loss": 0.7706, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6055266857147217, "rewards/margins": 0.480267196893692, "rewards/rejected": -2.085793972015381, "sft_loss": 1.6365588903427124, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 9.211179775419433, "learning_rate": 3.3863521299447514e-07, "logits/chosen": -0.1625978946685791, "logits/rejected": 0.0010643110144883394, "logps/chosen": -1.6014732122421265, "logps/rejected": -2.2072250843048096, "loss": 0.7251, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6014732122421265, "rewards/margins": 0.6057518720626831, "rewards/rejected": -2.2072250843048096, "sft_loss": 1.6735950708389282, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 8.629299950344242, "learning_rate": 3.371619302177609e-07, "logits/chosen": -0.062279112637043, "logits/rejected": 0.09005732834339142, "logps/chosen": -1.6360442638397217, "logps/rejected": -2.2158823013305664, "loss": 0.7235, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6360442638397217, "rewards/margins": 0.5798379778862, "rewards/rejected": -2.2158823013305664, "sft_loss": 1.6420892477035522, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 12.925281665633173, "learning_rate": 3.3569022729671393e-07, "logits/chosen": -0.14776435494422913, "logits/rejected": -0.056493986397981644, "logps/chosen": -1.698695182800293, "logps/rejected": -2.1507368087768555, "loss": 0.7807, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.698695182800293, "rewards/margins": 0.45204171538352966, "rewards/rejected": -2.1507368087768555, "sft_loss": 1.7433712482452393, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 9.326921785744677, "learning_rate": 3.342201185098024e-07, "logits/chosen": -0.04479864984750748, "logits/rejected": -0.024631520733237267, "logps/chosen": -1.60796320438385, "logps/rejected": -2.109769105911255, "loss": 0.7503, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.60796320438385, "rewards/margins": 0.5018059015274048, "rewards/rejected": -2.109769105911255, "sft_loss": 1.6425793170928955, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 13.694928102784823, "learning_rate": 3.3275161812002807e-07, "logits/chosen": -0.14936873316764832, "logits/rejected": -0.08626748621463776, "logps/chosen": -1.664642572402954, "logps/rejected": -2.2107229232788086, "loss": 0.798, "rewards/accuracies": 0.59375, "rewards/chosen": -1.664642572402954, "rewards/margins": 0.5460804104804993, "rewards/rejected": -2.2107229232788086, "sft_loss": 1.7481571435928345, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 9.31883285307852, "learning_rate": 3.312847403747883e-07, "logits/chosen": -0.20505651831626892, "logits/rejected": -0.08992957323789597, "logps/chosen": -1.5795940160751343, "logps/rejected": -2.1468145847320557, "loss": 0.7259, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5795940160751343, "rewards/margins": 0.5672203302383423, "rewards/rejected": -2.1468145847320557, "sft_loss": 1.6363407373428345, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 8.676289732907712, "learning_rate": 3.2981949950573733e-07, "logits/chosen": -0.11893777549266815, "logits/rejected": 0.004642007406800985, "logps/chosen": -1.6742401123046875, "logps/rejected": -2.0781495571136475, "loss": 0.7648, "rewards/accuracies": 0.625, "rewards/chosen": -1.6742401123046875, "rewards/margins": 0.4039096236228943, "rewards/rejected": -2.0781495571136475, "sft_loss": 1.6985105276107788, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 8.979181147880695, "learning_rate": 3.283559097286486e-07, "logits/chosen": -0.16905741393566132, "logits/rejected": -0.00790588092058897, "logps/chosen": -1.6887584924697876, "logps/rejected": -2.0864083766937256, "loss": 0.7726, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6887584924697876, "rewards/margins": 0.39764994382858276, "rewards/rejected": -2.0864083766937256, "sft_loss": 1.7341880798339844, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 8.325053628677033, "learning_rate": 3.268939852432765e-07, "logits/chosen": -0.22554373741149902, "logits/rejected": -0.10361931473016739, "logps/chosen": -1.5729840993881226, "logps/rejected": -2.0182149410247803, "loss": 0.7542, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.5729840993881226, "rewards/margins": 0.4452306628227234, "rewards/rejected": -2.0182149410247803, "sft_loss": 1.6342014074325562, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 11.510864078425929, "learning_rate": 3.254337402332187e-07, "logits/chosen": -0.1654638648033142, "logits/rejected": -0.009549139998853207, "logps/chosen": -1.6484769582748413, "logps/rejected": -2.1484692096710205, "loss": 0.7537, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6484769582748413, "rewards/margins": 0.4999920427799225, "rewards/rejected": -2.1484692096710205, "sft_loss": 1.631192922592163, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 11.863486153114126, "learning_rate": 3.239751888657788e-07, "logits/chosen": -0.1989479959011078, "logits/rejected": -0.03458351269364357, "logps/chosen": -1.514382004737854, "logps/rejected": -2.0508694648742676, "loss": 0.7357, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.514382004737854, "rewards/margins": 0.5364874601364136, "rewards/rejected": -2.0508694648742676, "sft_loss": 1.5978753566741943, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 11.62994681963042, "learning_rate": 3.2251834529182856e-07, "logits/chosen": -0.14714178442955017, "logits/rejected": -0.015805020928382874, "logps/chosen": -1.4970061779022217, "logps/rejected": -2.067319393157959, "loss": 0.7302, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4970061779022217, "rewards/margins": 0.5703133344650269, "rewards/rejected": -2.067319393157959, "sft_loss": 1.4799169301986694, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 8.976874663242901, "learning_rate": 3.2106322364567075e-07, "logits/chosen": -0.1882084310054779, "logits/rejected": -0.008587071672081947, "logps/chosen": -1.5499083995819092, "logps/rejected": -2.2772908210754395, "loss": 0.686, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5499083995819092, "rewards/margins": 0.7273823618888855, "rewards/rejected": -2.2772908210754395, "sft_loss": 1.654982566833496, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 7.922361268708236, "learning_rate": 3.1960983804490183e-07, "logits/chosen": -0.18709039688110352, "logits/rejected": -0.010397210717201233, "logps/chosen": -1.7090257406234741, "logps/rejected": -2.423692464828491, "loss": 0.7259, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7090257406234741, "rewards/margins": 0.7146669626235962, "rewards/rejected": -2.423692464828491, "sft_loss": 1.7446457147598267, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 14.319351094302018, "learning_rate": 3.1815820259027537e-07, "logits/chosen": -0.16515375673770905, "logits/rejected": -0.01795039139688015, "logps/chosen": -1.4703328609466553, "logps/rejected": -2.050914764404297, "loss": 0.6851, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4703328609466553, "rewards/margins": 0.5805819034576416, "rewards/rejected": -2.050914764404297, "sft_loss": 1.5185655355453491, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 11.050482268166853, "learning_rate": 3.16708331365565e-07, "logits/chosen": -0.1832316368818283, "logits/rejected": -0.07136061787605286, "logps/chosen": -1.5768686532974243, "logps/rejected": -2.217076301574707, "loss": 0.7243, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5768686532974243, "rewards/margins": 0.6402075290679932, "rewards/rejected": -2.217076301574707, "sft_loss": 1.6429831981658936, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 8.170718381875641, "learning_rate": 3.152602384374275e-07, "logits/chosen": -0.15254482626914978, "logits/rejected": 0.059175778180360794, "logps/chosen": -1.6513283252716064, "logps/rejected": -2.270390748977661, "loss": 0.7317, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6513283252716064, "rewards/margins": 0.6190625429153442, "rewards/rejected": -2.270390748977661, "sft_loss": 1.6093699932098389, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 9.142529234355417, "learning_rate": 3.1381393785526697e-07, "logits/chosen": -0.11047704517841339, "logits/rejected": -0.032970573753118515, "logps/chosen": -1.7152369022369385, "logps/rejected": -2.294235944747925, "loss": 0.7648, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7152369022369385, "rewards/margins": 0.5789992213249207, "rewards/rejected": -2.294235944747925, "sft_loss": 1.7734344005584717, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 12.195345417014321, "learning_rate": 3.123694436510979e-07, "logits/chosen": -0.08410428464412689, "logits/rejected": 0.057231903076171875, "logps/chosen": -1.5722923278808594, "logps/rejected": -2.154660224914551, "loss": 0.7226, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5722923278808594, "rewards/margins": 0.5823677182197571, "rewards/rejected": -2.154660224914551, "sft_loss": 1.5988538265228271, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 10.394917134769837, "learning_rate": 3.1092676983940946e-07, "logits/chosen": -0.14630797505378723, "logits/rejected": -0.046213530004024506, "logps/chosen": -1.6012938022613525, "logps/rejected": -2.228280782699585, "loss": 0.7102, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6012938022613525, "rewards/margins": 0.626987099647522, "rewards/rejected": -2.228280782699585, "sft_loss": 1.6041920185089111, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 10.88254208383536, "learning_rate": 3.094859304170293e-07, "logits/chosen": 0.013003816828131676, "logits/rejected": 0.09070932865142822, "logps/chosen": -1.6228872537612915, "logps/rejected": -2.129692792892456, "loss": 0.7764, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6228872537612915, "rewards/margins": 0.5068058967590332, "rewards/rejected": -2.129692792892456, "sft_loss": 1.6901829242706299, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 8.453213022807756, "learning_rate": 3.0804693936298795e-07, "logits/chosen": -0.06357358396053314, "logits/rejected": 0.02723897434771061, "logps/chosen": -1.6235229969024658, "logps/rejected": -2.2506234645843506, "loss": 0.7382, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6235229969024658, "rewards/margins": 0.6271007061004639, "rewards/rejected": -2.2506234645843506, "sft_loss": 1.678961992263794, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 6.814735982915753, "learning_rate": 3.066098106383826e-07, "logits/chosen": -0.11387670040130615, "logits/rejected": 0.0014181584119796753, "logps/chosen": -1.5900146961212158, "logps/rejected": -2.045799493789673, "loss": 0.7537, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5900146961212158, "rewards/margins": 0.4557846188545227, "rewards/rejected": -2.045799493789673, "sft_loss": 1.5816245079040527, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 8.01633318648078, "learning_rate": 3.0517455818624263e-07, "logits/chosen": -0.18409691751003265, "logits/rejected": -0.05851252004504204, "logps/chosen": -1.6137990951538086, "logps/rejected": -2.1909549236297607, "loss": 0.7526, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6137990951538086, "rewards/margins": 0.5771559476852417, "rewards/rejected": -2.1909549236297607, "sft_loss": 1.6941196918487549, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 8.7458228859743, "learning_rate": 3.037411959313936e-07, "logits/chosen": -0.08741353452205658, "logits/rejected": 0.08174075931310654, "logps/chosen": -1.5408705472946167, "logps/rejected": -2.1303999423980713, "loss": 0.6924, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5408705472946167, "rewards/margins": 0.589529275894165, "rewards/rejected": -2.1303999423980713, "sft_loss": 1.5628222227096558, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 14.424928119569573, "learning_rate": 3.023097377803224e-07, "logits/chosen": -0.01694369874894619, "logits/rejected": 0.09473001956939697, "logps/chosen": -1.6902602910995483, "logps/rejected": -2.113116502761841, "loss": 0.8413, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6902602910995483, "rewards/margins": 0.42285624146461487, "rewards/rejected": -2.113116502761841, "sft_loss": 1.695860505104065, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 9.045701970726677, "learning_rate": 3.008801976210423e-07, "logits/chosen": -0.023767894133925438, "logits/rejected": 0.06364504247903824, "logps/chosen": -1.6748148202896118, "logps/rejected": -2.125621795654297, "loss": 0.7746, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.6748148202896118, "rewards/margins": 0.4508071839809418, "rewards/rejected": -2.125621795654297, "sft_loss": 1.646785020828247, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 10.526317797792855, "learning_rate": 2.994525893229581e-07, "logits/chosen": -0.08777356147766113, "logits/rejected": 0.03373908996582031, "logps/chosen": -1.6140108108520508, "logps/rejected": -2.1882472038269043, "loss": 0.7086, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6140108108520508, "rewards/margins": 0.5742363929748535, "rewards/rejected": -2.1882472038269043, "sft_loss": 1.6249473094940186, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 9.544501434112822, "learning_rate": 2.98026926736732e-07, "logits/chosen": -0.16061075031757355, "logits/rejected": -0.046558480709791183, "logps/chosen": -1.4760711193084717, "logps/rejected": -2.1706199645996094, "loss": 0.6736, "rewards/accuracies": 0.75, "rewards/chosen": -1.4760711193084717, "rewards/margins": 0.6945487856864929, "rewards/rejected": -2.1706199645996094, "sft_loss": 1.5252652168273926, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 9.822006896178703, "learning_rate": 2.9660322369414846e-07, "logits/chosen": -0.1253000646829605, "logits/rejected": 0.01346740871667862, "logps/chosen": -1.5189131498336792, "logps/rejected": -2.2918996810913086, "loss": 0.6703, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5189131498336792, "rewards/margins": 0.7729868292808533, "rewards/rejected": -2.2918996810913086, "sft_loss": 1.6150785684585571, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 8.25146401257379, "learning_rate": 2.9518149400798063e-07, "logits/chosen": -0.22532884776592255, "logits/rejected": -0.17570312321186066, "logps/chosen": -1.5567013025283813, "logps/rejected": -2.308267116546631, "loss": 0.6949, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5567013025283813, "rewards/margins": 0.7515658140182495, "rewards/rejected": -2.308267116546631, "sft_loss": 1.6419492959976196, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 13.946633835937762, "learning_rate": 2.9376175147185633e-07, "logits/chosen": -0.11414532363414764, "logits/rejected": 0.12219560146331787, "logps/chosen": -1.6283546686172485, "logps/rejected": -2.325040340423584, "loss": 0.7075, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6283546686172485, "rewards/margins": 0.6966856718063354, "rewards/rejected": -2.325040340423584, "sft_loss": 1.6282196044921875, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 11.936664389061903, "learning_rate": 2.9234400986012376e-07, "logits/chosen": -0.22026380896568298, "logits/rejected": -0.010797997005283833, "logps/chosen": -1.5182678699493408, "logps/rejected": -2.436857223510742, "loss": 0.6426, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5182678699493408, "rewards/margins": 0.9185894727706909, "rewards/rejected": -2.436857223510742, "sft_loss": 1.5795509815216064, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 14.084417770595923, "learning_rate": 2.9092828292771817e-07, "logits/chosen": -0.15429076552391052, "logits/rejected": -0.08392228931188583, "logps/chosen": -1.6110146045684814, "logps/rejected": -2.2458596229553223, "loss": 0.7145, "rewards/accuracies": 0.625, "rewards/chosen": -1.6110146045684814, "rewards/margins": 0.6348448395729065, "rewards/rejected": -2.2458596229553223, "sft_loss": 1.621387243270874, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 9.05248102840996, "learning_rate": 2.8951458441002875e-07, "logits/chosen": -0.1064993366599083, "logits/rejected": -0.055342577397823334, "logps/chosen": -1.5859105587005615, "logps/rejected": -2.264536142349243, "loss": 0.7211, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.5859105587005615, "rewards/margins": 0.6786257028579712, "rewards/rejected": -2.264536142349243, "sft_loss": 1.6530290842056274, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 5.9943899448908935, "learning_rate": 2.881029280227643e-07, "logits/chosen": -0.14914092421531677, "logits/rejected": 0.02509469911456108, "logps/chosen": -1.6400749683380127, "logps/rejected": -2.381646156311035, "loss": 0.6754, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6400749683380127, "rewards/margins": 0.7415715456008911, "rewards/rejected": -2.381646156311035, "sft_loss": 1.6350305080413818, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 6.607980718059979, "learning_rate": 2.8669332746182177e-07, "logits/chosen": -0.23991665244102478, "logits/rejected": -0.029693976044654846, "logps/chosen": -1.5858159065246582, "logps/rejected": -2.3997271060943604, "loss": 0.6581, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5858159065246582, "rewards/margins": 0.8139110803604126, "rewards/rejected": -2.3997271060943604, "sft_loss": 1.6728794574737549, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 10.855979185892906, "learning_rate": 2.8528579640315156e-07, "logits/chosen": -0.11668483167886734, "logits/rejected": -0.08222740888595581, "logps/chosen": -1.5457876920700073, "logps/rejected": -2.0951225757598877, "loss": 0.7339, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5457876920700073, "rewards/margins": 0.5493348240852356, "rewards/rejected": -2.0951225757598877, "sft_loss": 1.6347955465316772, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 9.34789152427309, "learning_rate": 2.8388034850262646e-07, "logits/chosen": -0.15070918202400208, "logits/rejected": 0.005772867705672979, "logps/chosen": -1.6197843551635742, "logps/rejected": -2.3066611289978027, "loss": 0.693, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6197843551635742, "rewards/margins": 0.686876654624939, "rewards/rejected": -2.3066611289978027, "sft_loss": 1.7020184993743896, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 10.759852837899983, "learning_rate": 2.824769973959079e-07, "logits/chosen": -0.12711107730865479, "logits/rejected": 0.025174003094434738, "logps/chosen": -1.5201184749603271, "logps/rejected": -2.1760687828063965, "loss": 0.6808, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5201184749603271, "rewards/margins": 0.6559504270553589, "rewards/rejected": -2.1760687828063965, "sft_loss": 1.5555031299591064, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 9.549992790534587, "learning_rate": 2.81075756698315e-07, "logits/chosen": 0.00995289720594883, "logits/rejected": 0.11867634207010269, "logps/chosen": -1.5571832656860352, "logps/rejected": -2.370918035507202, "loss": 0.6539, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5571832656860352, "rewards/margins": 0.8137348294258118, "rewards/rejected": -2.370918035507202, "sft_loss": 1.5516964197158813, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 9.913230599596586, "learning_rate": 2.7967664000469035e-07, "logits/chosen": -0.24721892178058624, "logits/rejected": -0.09468363225460052, "logps/chosen": -1.6414110660552979, "logps/rejected": -2.2487573623657227, "loss": 0.7084, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6414110660552979, "rewards/margins": 0.6073463559150696, "rewards/rejected": -2.2487573623657227, "sft_loss": 1.6050937175750732, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 9.856460794103564, "learning_rate": 2.7827966088927095e-07, "logits/chosen": -0.2382887899875641, "logits/rejected": 0.02291843853890896, "logps/chosen": -1.6796343326568604, "logps/rejected": -2.3517861366271973, "loss": 0.7292, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6796343326568604, "rewards/margins": 0.6721519231796265, "rewards/rejected": -2.3517861366271973, "sft_loss": 1.7107181549072266, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 10.82517782525042, "learning_rate": 2.768848329055538e-07, "logits/chosen": -0.15518540143966675, "logits/rejected": -0.02807353064417839, "logps/chosen": -1.5908386707305908, "logps/rejected": -2.292300224304199, "loss": 0.6735, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5908386707305908, "rewards/margins": 0.7014617919921875, "rewards/rejected": -2.292300224304199, "sft_loss": 1.6439765691757202, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 9.927374100447528, "learning_rate": 2.7549216958616657e-07, "logits/chosen": -0.2502720355987549, "logits/rejected": -0.06395257264375687, "logps/chosen": -1.6771581172943115, "logps/rejected": -2.4648046493530273, "loss": 0.6705, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6771581172943115, "rewards/margins": 0.7876464128494263, "rewards/rejected": -2.4648046493530273, "sft_loss": 1.6874353885650635, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 11.726373203482728, "learning_rate": 2.741016844427344e-07, "logits/chosen": -0.14782342314720154, "logits/rejected": 0.06055955961346626, "logps/chosen": -1.621753454208374, "logps/rejected": -2.3838016986846924, "loss": 0.6632, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.621753454208374, "rewards/margins": 0.7620481252670288, "rewards/rejected": -2.3838016986846924, "sft_loss": 1.6636728048324585, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 9.645147855479866, "learning_rate": 2.7271339096575073e-07, "logits/chosen": -0.09425531327724457, "logits/rejected": 0.05427926778793335, "logps/chosen": -1.5416395664215088, "logps/rejected": -2.3300321102142334, "loss": 0.6617, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5416395664215088, "rewards/margins": 0.7883927822113037, "rewards/rejected": -2.3300321102142334, "sft_loss": 1.5825732946395874, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 7.261028813388567, "learning_rate": 2.713273026244446e-07, "logits/chosen": -0.3189387917518616, "logits/rejected": -0.02189609408378601, "logps/chosen": -1.703832983970642, "logps/rejected": -2.442976713180542, "loss": 0.6794, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.703832983970642, "rewards/margins": 0.7391437888145447, "rewards/rejected": -2.442976713180542, "sft_loss": 1.7219880819320679, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 8.274575532361197, "learning_rate": 2.6994343286665156e-07, "logits/chosen": -0.20712879300117493, "logits/rejected": 0.01394024956971407, "logps/chosen": -1.6701123714447021, "logps/rejected": -2.320915937423706, "loss": 0.7261, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6701123714447021, "rewards/margins": 0.6508036851882935, "rewards/rejected": -2.320915937423706, "sft_loss": 1.7097814083099365, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 12.660211365598903, "learning_rate": 2.6856179511868156e-07, "logits/chosen": -0.1191878691315651, "logits/rejected": 0.11547447741031647, "logps/chosen": -1.6440610885620117, "logps/rejected": -2.5760855674743652, "loss": 0.6444, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6440610885620117, "rewards/margins": 0.9320244789123535, "rewards/rejected": -2.5760855674743652, "sft_loss": 1.6310327053070068, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 11.922363740692255, "learning_rate": 2.6718240278519056e-07, "logits/chosen": -0.12891000509262085, "logits/rejected": 0.06500129401683807, "logps/chosen": -1.5909931659698486, "logps/rejected": -2.4072279930114746, "loss": 0.6704, "rewards/accuracies": 0.75, "rewards/chosen": -1.5909931659698486, "rewards/margins": 0.8162347078323364, "rewards/rejected": -2.4072279930114746, "sft_loss": 1.5814789533615112, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 15.16829133384325, "learning_rate": 2.6580526924904866e-07, "logits/chosen": -0.2943059802055359, "logits/rejected": -0.07183589041233063, "logps/chosen": -1.6762807369232178, "logps/rejected": -2.294644832611084, "loss": 0.7346, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6762807369232178, "rewards/margins": 0.6183642148971558, "rewards/rejected": -2.294644832611084, "sft_loss": 1.6995090246200562, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 9.08603571545962, "learning_rate": 2.6443040787121186e-07, "logits/chosen": -0.19647717475891113, "logits/rejected": -0.10413942486047745, "logps/chosen": -1.50521981716156, "logps/rejected": -2.1868722438812256, "loss": 0.6737, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.50521981716156, "rewards/margins": 0.681652307510376, "rewards/rejected": -2.1868722438812256, "sft_loss": 1.5386106967926025, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 9.272370920971673, "learning_rate": 2.6305783199059084e-07, "logits/chosen": -0.17041842639446259, "logits/rejected": -0.04400542378425598, "logps/chosen": -1.6328155994415283, "logps/rejected": -2.3400864601135254, "loss": 0.6699, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6328155994415283, "rewards/margins": 0.707270622253418, "rewards/rejected": -2.3400864601135254, "sft_loss": 1.699509859085083, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 12.675105207998715, "learning_rate": 2.6168755492392324e-07, "logits/chosen": -0.21715426445007324, "logits/rejected": -0.008628025650978088, "logps/chosen": -1.4647619724273682, "logps/rejected": -2.2710070610046387, "loss": 0.6337, "rewards/accuracies": 0.75, "rewards/chosen": -1.4647619724273682, "rewards/margins": 0.8062450289726257, "rewards/rejected": -2.2710070610046387, "sft_loss": 1.4784409999847412, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 9.127856675239261, "learning_rate": 2.6031958996564274e-07, "logits/chosen": -0.23247845470905304, "logits/rejected": -0.05331588536500931, "logps/chosen": -1.532557487487793, "logps/rejected": -2.414947032928467, "loss": 0.6549, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.532557487487793, "rewards/margins": 0.8823897242546082, "rewards/rejected": -2.414947032928467, "sft_loss": 1.5929491519927979, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 10.438562436524752, "learning_rate": 2.589539503877518e-07, "logits/chosen": -0.10926779359579086, "logits/rejected": 0.005693843122571707, "logps/chosen": -1.6112648248672485, "logps/rejected": -2.278351068496704, "loss": 0.741, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6112648248672485, "rewards/margins": 0.6670863032341003, "rewards/rejected": -2.278351068496704, "sft_loss": 1.6648328304290771, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 8.872938895579125, "learning_rate": 2.5759064943969125e-07, "logits/chosen": -0.22083063423633575, "logits/rejected": 0.06206611916422844, "logps/chosen": -1.601793646812439, "logps/rejected": -2.283752679824829, "loss": 0.7114, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.601793646812439, "rewards/margins": 0.6819590330123901, "rewards/rejected": -2.283752679824829, "sft_loss": 1.5994501113891602, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 10.146576735489928, "learning_rate": 2.562297003482131e-07, "logits/chosen": -0.052188485860824585, "logits/rejected": -0.022229081019759178, "logps/chosen": -1.5832908153533936, "logps/rejected": -2.2681241035461426, "loss": 0.6884, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5832908153533936, "rewards/margins": 0.6848331689834595, "rewards/rejected": -2.2681241035461426, "sft_loss": 1.6359937191009521, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 8.826215200445214, "learning_rate": 2.548711163172512e-07, "logits/chosen": -0.12511709332466125, "logits/rejected": -0.012917520478367805, "logps/chosen": -1.6173568964004517, "logps/rejected": -2.2473435401916504, "loss": 0.7244, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6173568964004517, "rewards/margins": 0.6299864053726196, "rewards/rejected": -2.2473435401916504, "sft_loss": 1.5940289497375488, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 8.510765257469792, "learning_rate": 2.53514910527794e-07, "logits/chosen": -0.11667634546756744, "logits/rejected": 0.04485910013318062, "logps/chosen": -1.49495267868042, "logps/rejected": -2.172240734100342, "loss": 0.6798, "rewards/accuracies": 0.71875, "rewards/chosen": -1.49495267868042, "rewards/margins": 0.6772879958152771, "rewards/rejected": -2.172240734100342, "sft_loss": 1.531685471534729, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 9.4471047808534, "learning_rate": 2.5216109613775573e-07, "logits/chosen": -0.18877005577087402, "logits/rejected": 0.007182592060416937, "logps/chosen": -1.6653181314468384, "logps/rejected": -2.3829312324523926, "loss": 0.7058, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6653181314468384, "rewards/margins": 0.7176133394241333, "rewards/rejected": -2.3829312324523926, "sft_loss": 1.6994606256484985, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 12.79899007848401, "learning_rate": 2.5080968628184993e-07, "logits/chosen": -0.18659181892871857, "logits/rejected": 0.0040053874254226685, "logps/chosen": -1.644914984703064, "logps/rejected": -2.571887254714966, "loss": 0.6693, "rewards/accuracies": 0.71875, "rewards/chosen": -1.644914984703064, "rewards/margins": 0.9269720911979675, "rewards/rejected": -2.571887254714966, "sft_loss": 1.6463935375213623, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 11.196069216233868, "learning_rate": 2.494606940714605e-07, "logits/chosen": -0.19748292863368988, "logits/rejected": -0.07073149085044861, "logps/chosen": -1.5426435470581055, "logps/rejected": -2.335000991821289, "loss": 0.6862, "rewards/accuracies": 0.75, "rewards/chosen": -1.5426435470581055, "rewards/margins": 0.7923575639724731, "rewards/rejected": -2.335000991821289, "sft_loss": 1.5800834894180298, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 8.961982142319261, "learning_rate": 2.4811413259451625e-07, "logits/chosen": -0.2811095714569092, "logits/rejected": -0.0877077579498291, "logps/chosen": -1.5879201889038086, "logps/rejected": -2.3596010208129883, "loss": 0.6632, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5879201889038086, "rewards/margins": 0.7716808319091797, "rewards/rejected": -2.3596010208129883, "sft_loss": 1.5789204835891724, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 11.663288656420253, "learning_rate": 2.46770014915362e-07, "logits/chosen": -0.13586275279521942, "logits/rejected": -0.05168810486793518, "logps/chosen": -1.6255013942718506, "logps/rejected": -2.4374451637268066, "loss": 0.6907, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6255013942718506, "rewards/margins": 0.8119437098503113, "rewards/rejected": -2.4374451637268066, "sft_loss": 1.5966488122940063, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 12.763786131740927, "learning_rate": 2.45428354074634e-07, "logits/chosen": -0.1542709916830063, "logits/rejected": -0.07183860242366791, "logps/chosen": -1.5464205741882324, "logps/rejected": -2.41371488571167, "loss": 0.6489, "rewards/accuracies": 0.75, "rewards/chosen": -1.5464205741882324, "rewards/margins": 0.8672944903373718, "rewards/rejected": -2.41371488571167, "sft_loss": 1.506961464881897, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 12.966817844183794, "learning_rate": 2.4408916308913105e-07, "logits/chosen": -0.2011301964521408, "logits/rejected": 0.0018308945000171661, "logps/chosen": -1.674024224281311, "logps/rejected": -2.316845417022705, "loss": 0.7418, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.674024224281311, "rewards/margins": 0.6428213119506836, "rewards/rejected": -2.316845417022705, "sft_loss": 1.68631911277771, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 12.309857540752159, "learning_rate": 2.4275245495169025e-07, "logits/chosen": -0.0920797660946846, "logits/rejected": 0.10061581432819366, "logps/chosen": -1.5836281776428223, "logps/rejected": -2.2875466346740723, "loss": 0.7015, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5836281776428223, "rewards/margins": 0.7039185166358948, "rewards/rejected": -2.2875466346740723, "sft_loss": 1.5962741374969482, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 9.20374943064197, "learning_rate": 2.414182426310597e-07, "logits/chosen": -0.24745337665081024, "logits/rejected": -0.14488694071769714, "logps/chosen": -1.5692138671875, "logps/rejected": -2.4327547550201416, "loss": 0.6808, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5692138671875, "rewards/margins": 0.8635409474372864, "rewards/rejected": -2.4327547550201416, "sft_loss": 1.6375614404678345, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 11.694428111078226, "learning_rate": 2.400865390717734e-07, "logits/chosen": -0.15586218237876892, "logits/rejected": -0.02569444850087166, "logps/chosen": -1.5978189706802368, "logps/rejected": -2.5465469360351562, "loss": 0.6587, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5978189706802368, "rewards/margins": 0.9487277865409851, "rewards/rejected": -2.5465469360351562, "sft_loss": 1.6753488779067993, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 8.756189673593619, "learning_rate": 2.3875735719402475e-07, "logits/chosen": -0.1437523514032364, "logits/rejected": -0.010031482204794884, "logps/chosen": -1.5562829971313477, "logps/rejected": -2.4190354347229004, "loss": 0.6636, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5562829971313477, "rewards/margins": 0.8627522587776184, "rewards/rejected": -2.4190354347229004, "sft_loss": 1.6413791179656982, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 7.583367625122719, "learning_rate": 2.3743070989354258e-07, "logits/chosen": -0.1638123095035553, "logits/rejected": -0.047650910913944244, "logps/chosen": -1.5771219730377197, "logps/rejected": -2.366272449493408, "loss": 0.702, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5771219730377197, "rewards/margins": 0.7891504764556885, "rewards/rejected": -2.366272449493408, "sft_loss": 1.668800950050354, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 11.147417191723994, "learning_rate": 2.3610661004146454e-07, "logits/chosen": -0.14589455723762512, "logits/rejected": -0.0158296637237072, "logps/chosen": -1.4728577136993408, "logps/rejected": -2.1111085414886475, "loss": 0.6746, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4728577136993408, "rewards/margins": 0.6382508277893066, "rewards/rejected": -2.1111085414886475, "sft_loss": 1.4788661003112793, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 10.085299235554329, "learning_rate": 2.3478507048421314e-07, "logits/chosen": -0.24280306696891785, "logits/rejected": -0.13830220699310303, "logps/chosen": -1.4863742589950562, "logps/rejected": -2.326418161392212, "loss": 0.6647, "rewards/accuracies": 0.75, "rewards/chosen": -1.4863742589950562, "rewards/margins": 0.840043842792511, "rewards/rejected": -2.326418161392212, "sft_loss": 1.6096107959747314, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 18.71948108102109, "learning_rate": 2.334661040433713e-07, "logits/chosen": -0.2757745683193207, "logits/rejected": -0.13689342141151428, "logps/chosen": -1.6436183452606201, "logps/rejected": -2.450040340423584, "loss": 0.6733, "rewards/accuracies": 0.75, "rewards/chosen": -1.6436183452606201, "rewards/margins": 0.8064218759536743, "rewards/rejected": -2.450040340423584, "sft_loss": 1.6699615716934204, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 9.660197254841627, "learning_rate": 2.321497235155568e-07, "logits/chosen": -0.31278321146965027, "logits/rejected": -0.14628355205059052, "logps/chosen": -1.4739224910736084, "logps/rejected": -2.277055501937866, "loss": 0.6408, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4739224910736084, "rewards/margins": 0.8031331300735474, "rewards/rejected": -2.277055501937866, "sft_loss": 1.5410268306732178, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 16.505327024594862, "learning_rate": 2.3083594167229965e-07, "logits/chosen": -0.35794010758399963, "logits/rejected": -0.05095381662249565, "logps/chosen": -1.606528639793396, "logps/rejected": -2.441112995147705, "loss": 0.6723, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.606528639793396, "rewards/margins": 0.8345844149589539, "rewards/rejected": -2.441112995147705, "sft_loss": 1.6314605474472046, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 17.338869249257876, "learning_rate": 2.295247712599167e-07, "logits/chosen": -0.20488686859607697, "logits/rejected": -0.08215707540512085, "logps/chosen": -1.5839723348617554, "logps/rejected": -2.4989943504333496, "loss": 0.6636, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5839723348617554, "rewards/margins": 0.9150223731994629, "rewards/rejected": -2.4989943504333496, "sft_loss": 1.6238740682601929, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.19546522200107574, "eval_logits/rejected": 0.30635809898376465, "eval_logps/chosen": -1.6828129291534424, "eval_logps/rejected": -2.3161661624908447, "eval_loss": 0.7399002909660339, "eval_rewards/accuracies": 0.6780415177345276, "eval_rewards/chosen": -1.6828129291534424, "eval_rewards/margins": 0.6333534717559814, "eval_rewards/rejected": -2.3161661624908447, "eval_runtime": 48.5675, "eval_samples_per_second": 27.693, "eval_sft_loss": 1.6737545728683472, "eval_steps_per_second": 6.939, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 9.089283257616241, "learning_rate": 2.2821622499938948e-07, "logits/chosen": -0.21204760670661926, "logits/rejected": 0.04469328373670578, "logps/chosen": -1.738537073135376, "logps/rejected": -2.423532009124756, "loss": 0.7258, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.738537073135376, "rewards/margins": 0.6849948763847351, "rewards/rejected": -2.423532009124756, "sft_loss": 1.702416181564331, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 16.3995034684504, "learning_rate": 2.269103155862391e-07, "logits/chosen": -0.22659006714820862, "logits/rejected": -0.10399820655584335, "logps/chosen": -1.5988924503326416, "logps/rejected": -2.2675509452819824, "loss": 0.7064, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5988924503326416, "rewards/margins": 0.6686583757400513, "rewards/rejected": -2.2675509452819824, "sft_loss": 1.6292423009872437, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 9.753753536173049, "learning_rate": 2.2560705569040483e-07, "logits/chosen": -0.22557011246681213, "logits/rejected": 0.08333168923854828, "logps/chosen": -1.6155731678009033, "logps/rejected": -2.264874219894409, "loss": 0.7224, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6155731678009033, "rewards/margins": 0.6493009924888611, "rewards/rejected": -2.264874219894409, "sft_loss": 1.6614946126937866, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 10.633328417592894, "learning_rate": 2.2430645795611963e-07, "logits/chosen": -0.3499174118041992, "logits/rejected": -0.17866572737693787, "logps/chosen": -1.681825041770935, "logps/rejected": -2.414431095123291, "loss": 0.7056, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.681825041770935, "rewards/margins": 0.7326058149337769, "rewards/rejected": -2.414431095123291, "sft_loss": 1.759488821029663, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 10.017370630557496, "learning_rate": 2.230085350017884e-07, "logits/chosen": -0.21839971840381622, "logits/rejected": -0.0883932113647461, "logps/chosen": -1.5945594310760498, "logps/rejected": -2.3205671310424805, "loss": 0.6954, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5945594310760498, "rewards/margins": 0.726007878780365, "rewards/rejected": -2.3205671310424805, "sft_loss": 1.593219518661499, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 14.062331915159096, "learning_rate": 2.2171329941986554e-07, "logits/chosen": -0.25459155440330505, "logits/rejected": -0.15023931860923767, "logps/chosen": -1.58211350440979, "logps/rejected": -2.3791286945343018, "loss": 0.6693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.58211350440979, "rewards/margins": 0.7970150113105774, "rewards/rejected": -2.3791286945343018, "sft_loss": 1.6424058675765991, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 12.19362998197408, "learning_rate": 2.2042076377673202e-07, "logits/chosen": -0.22388438880443573, "logits/rejected": -0.1847975254058838, "logps/chosen": -1.495990514755249, "logps/rejected": -2.0582988262176514, "loss": 0.7051, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.495990514755249, "rewards/margins": 0.5623081922531128, "rewards/rejected": -2.0582988262176514, "sft_loss": 1.5875338315963745, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 7.7462470698924175, "learning_rate": 2.1913094061257476e-07, "logits/chosen": -0.23963327705860138, "logits/rejected": -0.1977156698703766, "logps/chosen": -1.48516047000885, "logps/rejected": -2.183156967163086, "loss": 0.6861, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.48516047000885, "rewards/margins": 0.6979966163635254, "rewards/rejected": -2.183156967163086, "sft_loss": 1.5076370239257812, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 11.570653412037872, "learning_rate": 2.178438424412633e-07, "logits/chosen": -0.17418940365314484, "logits/rejected": -0.01625337265431881, "logps/chosen": -1.6110522747039795, "logps/rejected": -2.29060697555542, "loss": 0.7095, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6110522747039795, "rewards/margins": 0.6795545816421509, "rewards/rejected": -2.29060697555542, "sft_loss": 1.6591590642929077, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 11.108327293962244, "learning_rate": 2.165594817502302e-07, "logits/chosen": -0.29471340775489807, "logits/rejected": -0.16111920773983002, "logps/chosen": -1.6958866119384766, "logps/rejected": -2.2696728706359863, "loss": 0.7537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6958866119384766, "rewards/margins": 0.5737861394882202, "rewards/rejected": -2.2696728706359863, "sft_loss": 1.7458206415176392, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 11.059441725129613, "learning_rate": 2.1527787100034806e-07, "logits/chosen": -0.16642776131629944, "logits/rejected": -0.09998462349176407, "logps/chosen": -1.63600754737854, "logps/rejected": -2.110837697982788, "loss": 0.7238, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.63600754737854, "rewards/margins": 0.4748304486274719, "rewards/rejected": -2.110837697982788, "sft_loss": 1.6510950326919556, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 11.461735940176967, "learning_rate": 2.1399902262581037e-07, "logits/chosen": -0.08211231976747513, "logits/rejected": 0.06658978760242462, "logps/chosen": -1.5464023351669312, "logps/rejected": -2.2698678970336914, "loss": 0.6847, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5464023351669312, "rewards/margins": 0.7234654426574707, "rewards/rejected": -2.2698678970336914, "sft_loss": 1.6170175075531006, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 10.929882032578224, "learning_rate": 2.127229490340094e-07, "logits/chosen": -0.3041019141674042, "logits/rejected": -0.20276598632335663, "logps/chosen": -1.6240959167480469, "logps/rejected": -2.41873836517334, "loss": 0.6792, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6240959167480469, "rewards/margins": 0.7946425676345825, "rewards/rejected": -2.41873836517334, "sft_loss": 1.6536413431167603, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 13.384930493545731, "learning_rate": 2.1144966260541698e-07, "logits/chosen": -0.16803541779518127, "logits/rejected": 0.09081745892763138, "logps/chosen": -1.5781902074813843, "logps/rejected": -2.513279438018799, "loss": 0.6684, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5781902074813843, "rewards/margins": 0.9350892305374146, "rewards/rejected": -2.513279438018799, "sft_loss": 1.664940595626831, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 11.810652806574588, "learning_rate": 2.1017917569346332e-07, "logits/chosen": -0.2417813241481781, "logits/rejected": -0.00826177280396223, "logps/chosen": -1.688616394996643, "logps/rejected": -2.296610116958618, "loss": 0.7243, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.688616394996643, "rewards/margins": 0.6079936623573303, "rewards/rejected": -2.296610116958618, "sft_loss": 1.6804134845733643, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 8.969856358830445, "learning_rate": 2.0891150062441837e-07, "logits/chosen": -0.25203242897987366, "logits/rejected": -0.08781935274600983, "logps/chosen": -1.6907424926757812, "logps/rejected": -2.571897029876709, "loss": 0.6832, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6907424926757812, "rewards/margins": 0.8811545372009277, "rewards/rejected": -2.571897029876709, "sft_loss": 1.7461318969726562, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 8.935248990910313, "learning_rate": 2.0764664969727086e-07, "logits/chosen": -0.18559393286705017, "logits/rejected": -0.08009595423936844, "logps/chosen": -1.604569435119629, "logps/rejected": -2.222858190536499, "loss": 0.7066, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.604569435119629, "rewards/margins": 0.6182886362075806, "rewards/rejected": -2.222858190536499, "sft_loss": 1.6059370040893555, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 11.555302285116815, "learning_rate": 2.0638463518361033e-07, "logits/chosen": -0.31439024209976196, "logits/rejected": -0.06748132407665253, "logps/chosen": -1.5718104839324951, "logps/rejected": -2.2686119079589844, "loss": 0.7032, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5718104839324951, "rewards/margins": 0.6968012452125549, "rewards/rejected": -2.2686119079589844, "sft_loss": 1.6103675365447998, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 8.931975821247107, "learning_rate": 2.0512546932750702e-07, "logits/chosen": -0.2708786427974701, "logits/rejected": -0.1651076227426529, "logps/chosen": -1.6807162761688232, "logps/rejected": -2.320239543914795, "loss": 0.7174, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6807162761688232, "rewards/margins": 0.6395233273506165, "rewards/rejected": -2.320239543914795, "sft_loss": 1.7137653827667236, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 9.7832355608753, "learning_rate": 2.0386916434539343e-07, "logits/chosen": -0.165264293551445, "logits/rejected": 0.013227510266005993, "logps/chosen": -1.5037528276443481, "logps/rejected": -2.413733959197998, "loss": 0.6348, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5037528276443481, "rewards/margins": 0.909981369972229, "rewards/rejected": -2.413733959197998, "sft_loss": 1.6315758228302002, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 12.629690742827133, "learning_rate": 2.0261573242594627e-07, "logits/chosen": -0.2250804454088211, "logits/rejected": 0.0018769055604934692, "logps/chosen": -1.769735336303711, "logps/rejected": -2.4452288150787354, "loss": 0.7142, "rewards/accuracies": 0.71875, "rewards/chosen": -1.769735336303711, "rewards/margins": 0.6754934191703796, "rewards/rejected": -2.4452288150787354, "sft_loss": 1.7322639226913452, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 22.970218454512974, "learning_rate": 2.0136518572996724e-07, "logits/chosen": -0.20109529793262482, "logits/rejected": 0.012713703326880932, "logps/chosen": -1.6316086053848267, "logps/rejected": -2.40693998336792, "loss": 0.7072, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6316086053848267, "rewards/margins": 0.7753316164016724, "rewards/rejected": -2.40693998336792, "sft_loss": 1.7024939060211182, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 10.81391742744317, "learning_rate": 2.0011753639026617e-07, "logits/chosen": -0.15072837471961975, "logits/rejected": -0.07874925434589386, "logps/chosen": -1.641068458557129, "logps/rejected": -2.4472856521606445, "loss": 0.6658, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.641068458557129, "rewards/margins": 0.8062170743942261, "rewards/rejected": -2.4472856521606445, "sft_loss": 1.682905912399292, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 10.522910503559332, "learning_rate": 1.988727965115421e-07, "logits/chosen": -0.2065780609846115, "logits/rejected": -0.09320701658725739, "logps/chosen": -1.5824291706085205, "logps/rejected": -2.4108710289001465, "loss": 0.6657, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5824291706085205, "rewards/margins": 0.8284417986869812, "rewards/rejected": -2.4108710289001465, "sft_loss": 1.6705067157745361, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 11.691147036563297, "learning_rate": 1.9763097817026713e-07, "logits/chosen": -0.2945500314235687, "logits/rejected": -0.03571607917547226, "logps/chosen": -1.5922911167144775, "logps/rejected": -2.5326988697052, "loss": 0.6343, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5922911167144775, "rewards/margins": 0.9404075741767883, "rewards/rejected": -2.5326988697052, "sft_loss": 1.6695963144302368, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 12.373728906345356, "learning_rate": 1.9639209341456796e-07, "logits/chosen": -0.1745511144399643, "logits/rejected": -0.06725641340017319, "logps/chosen": -1.644447684288025, "logps/rejected": -2.4574968814849854, "loss": 0.6955, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.644447684288025, "rewards/margins": 0.8130491971969604, "rewards/rejected": -2.4574968814849854, "sft_loss": 1.707310438156128, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 9.957640400601655, "learning_rate": 1.951561542641102e-07, "logits/chosen": -0.15849007666110992, "logits/rejected": -0.16887658834457397, "logps/chosen": -1.7253891229629517, "logps/rejected": -2.6144256591796875, "loss": 0.7181, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7253891229629517, "rewards/margins": 0.8890363574028015, "rewards/rejected": -2.6144256591796875, "sft_loss": 1.7500331401824951, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 10.238204940673812, "learning_rate": 1.939231727099806e-07, "logits/chosen": -0.35746997594833374, "logits/rejected": -0.25876617431640625, "logps/chosen": -1.5974210500717163, "logps/rejected": -2.3321309089660645, "loss": 0.6782, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5974210500717163, "rewards/margins": 0.7347100973129272, "rewards/rejected": -2.3321309089660645, "sft_loss": 1.628101110458374, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 13.452542837030343, "learning_rate": 1.926931607145719e-07, "logits/chosen": -0.1260557472705841, "logits/rejected": 0.038332048803567886, "logps/chosen": -1.7455084323883057, "logps/rejected": -2.481240749359131, "loss": 0.7128, "rewards/accuracies": 0.75, "rewards/chosen": -1.7455084323883057, "rewards/margins": 0.7357321977615356, "rewards/rejected": -2.481240749359131, "sft_loss": 1.7408088445663452, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 10.607047860758295, "learning_rate": 1.9146613021146564e-07, "logits/chosen": -0.21611149609088898, "logits/rejected": -0.09441132843494415, "logps/chosen": -1.5571849346160889, "logps/rejected": -2.369811773300171, "loss": 0.6783, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5571849346160889, "rewards/margins": 0.8126265406608582, "rewards/rejected": -2.369811773300171, "sft_loss": 1.6382062435150146, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 11.00911605682797, "learning_rate": 1.9024209310531736e-07, "logits/chosen": -0.15509618818759918, "logits/rejected": -0.16245296597480774, "logps/chosen": -1.590057611465454, "logps/rejected": -2.2679991722106934, "loss": 0.6856, "rewards/accuracies": 0.75, "rewards/chosen": -1.590057611465454, "rewards/margins": 0.6779417395591736, "rewards/rejected": -2.2679991722106934, "sft_loss": 1.581096887588501, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 16.781770455072472, "learning_rate": 1.890210612717401e-07, "logits/chosen": -0.2184685468673706, "logits/rejected": -0.043954916298389435, "logps/chosen": -1.705082654953003, "logps/rejected": -2.3112621307373047, "loss": 0.7153, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.705082654953003, "rewards/margins": 0.606179416179657, "rewards/rejected": -2.3112621307373047, "sft_loss": 1.7354850769042969, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 10.955631198558818, "learning_rate": 1.8780304655719054e-07, "logits/chosen": -0.22156643867492676, "logits/rejected": -0.06780660152435303, "logps/chosen": -1.6345361471176147, "logps/rejected": -2.470580816268921, "loss": 0.6868, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6345361471176147, "rewards/margins": 0.8360446095466614, "rewards/rejected": -2.470580816268921, "sft_loss": 1.694688081741333, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 16.000301335472273, "learning_rate": 1.865880607788523e-07, "logits/chosen": -0.11134836822748184, "logits/rejected": -0.03346944600343704, "logps/chosen": -1.613207221031189, "logps/rejected": -2.4353206157684326, "loss": 0.6593, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.613207221031189, "rewards/margins": 0.8221133947372437, "rewards/rejected": -2.4353206157684326, "sft_loss": 1.6618369817733765, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 11.998543075453684, "learning_rate": 1.8537611572452316e-07, "logits/chosen": -0.2220427244901657, "logits/rejected": -0.10063277184963226, "logps/chosen": -1.618257761001587, "logps/rejected": -2.1705355644226074, "loss": 0.7399, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.618257761001587, "rewards/margins": 0.5522776246070862, "rewards/rejected": -2.1705355644226074, "sft_loss": 1.6149768829345703, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 12.001727134757187, "learning_rate": 1.84167223152499e-07, "logits/chosen": -0.26687371730804443, "logits/rejected": -0.00254420330747962, "logps/chosen": -1.5691627264022827, "logps/rejected": -2.380427360534668, "loss": 0.6502, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5691627264022827, "rewards/margins": 0.8112645149230957, "rewards/rejected": -2.380427360534668, "sft_loss": 1.62030029296875, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 16.168320605262107, "learning_rate": 1.8296139479146112e-07, "logits/chosen": -0.2659373879432678, "logits/rejected": -0.23626577854156494, "logps/chosen": -1.5715490579605103, "logps/rejected": -2.2968928813934326, "loss": 0.706, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5715490579605103, "rewards/margins": 0.7253440618515015, "rewards/rejected": -2.2968928813934326, "sft_loss": 1.6026023626327515, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 12.394040160936669, "learning_rate": 1.8175864234036132e-07, "logits/chosen": -0.12356863915920258, "logits/rejected": -0.0015356764197349548, "logps/chosen": -1.5802079439163208, "logps/rejected": -2.472146511077881, "loss": 0.6625, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5802079439163208, "rewards/margins": 0.8919386863708496, "rewards/rejected": -2.472146511077881, "sft_loss": 1.6293643712997437, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 7.3784287140857145, "learning_rate": 1.805589774683094e-07, "logits/chosen": -0.3481757342815399, "logits/rejected": -0.17929382622241974, "logps/chosen": -1.6148115396499634, "logps/rejected": -2.190422534942627, "loss": 0.7513, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6148115396499634, "rewards/margins": 0.575610876083374, "rewards/rejected": -2.190422534942627, "sft_loss": 1.6746292114257812, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 12.84114132561403, "learning_rate": 1.79362411814459e-07, "logits/chosen": -0.0644916445016861, "logits/rejected": -0.07482218742370605, "logps/chosen": -1.7007497549057007, "logps/rejected": -2.3855552673339844, "loss": 0.7256, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7007497549057007, "rewards/margins": 0.6848055124282837, "rewards/rejected": -2.3855552673339844, "sft_loss": 1.708988904953003, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 8.262089297233556, "learning_rate": 1.7816895698789552e-07, "logits/chosen": -0.24868635833263397, "logits/rejected": -0.16337426006793976, "logps/chosen": -1.5385116338729858, "logps/rejected": -2.2479724884033203, "loss": 0.6664, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5385116338729858, "rewards/margins": 0.7094607353210449, "rewards/rejected": -2.2479724884033203, "sft_loss": 1.583030343055725, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 11.18944509342681, "learning_rate": 1.7697862456752271e-07, "logits/chosen": -0.2430190145969391, "logits/rejected": -0.0969865694642067, "logps/chosen": -1.616960883140564, "logps/rejected": -2.573047399520874, "loss": 0.6721, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.616960883140564, "rewards/margins": 0.9560863375663757, "rewards/rejected": -2.573047399520874, "sft_loss": 1.6809444427490234, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 11.050091982006032, "learning_rate": 1.7579142610195124e-07, "logits/chosen": -0.21678009629249573, "logits/rejected": -0.04356417804956436, "logps/chosen": -1.6071479320526123, "logps/rejected": -2.3576242923736572, "loss": 0.7038, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6071479320526123, "rewards/margins": 0.7504763007164001, "rewards/rejected": -2.3576242923736572, "sft_loss": 1.6264982223510742, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 8.942618445243067, "learning_rate": 1.7460737310938568e-07, "logits/chosen": -0.29563814401626587, "logits/rejected": -0.043746042996644974, "logps/chosen": -1.553081750869751, "logps/rejected": -2.393733501434326, "loss": 0.6662, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.553081750869751, "rewards/margins": 0.8406518697738647, "rewards/rejected": -2.393733501434326, "sft_loss": 1.6148624420166016, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 8.579546863967716, "learning_rate": 1.734264770775133e-07, "logits/chosen": -0.26378101110458374, "logits/rejected": -0.016275247558951378, "logps/chosen": -1.641641616821289, "logps/rejected": -2.4806206226348877, "loss": 0.6559, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.641641616821289, "rewards/margins": 0.8389788866043091, "rewards/rejected": -2.4806206226348877, "sft_loss": 1.6694024801254272, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 9.121218787401439, "learning_rate": 1.7224874946339241e-07, "logits/chosen": -0.2829732298851013, "logits/rejected": -0.16510283946990967, "logps/chosen": -1.679758071899414, "logps/rejected": -2.4852185249328613, "loss": 0.6986, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.679758071899414, "rewards/margins": 0.8054605722427368, "rewards/rejected": -2.4852185249328613, "sft_loss": 1.6560070514678955, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 8.282971227412126, "learning_rate": 1.7107420169334186e-07, "logits/chosen": -0.21455052495002747, "logits/rejected": -0.08861131221055984, "logps/chosen": -1.6706950664520264, "logps/rejected": -2.2545063495635986, "loss": 0.7488, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6706950664520264, "rewards/margins": 0.5838110446929932, "rewards/rejected": -2.2545063495635986, "sft_loss": 1.727036714553833, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 9.454176930131542, "learning_rate": 1.6990284516282893e-07, "logits/chosen": -0.23077169060707092, "logits/rejected": -0.08707382529973984, "logps/chosen": -1.579712152481079, "logps/rejected": -2.254929304122925, "loss": 0.6874, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.579712152481079, "rewards/margins": 0.6752171516418457, "rewards/rejected": -2.254929304122925, "sft_loss": 1.6380122900009155, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 12.97218473641976, "learning_rate": 1.687346912363602e-07, "logits/chosen": -0.3003261089324951, "logits/rejected": -0.14061914384365082, "logps/chosen": -1.6543731689453125, "logps/rejected": -2.3564724922180176, "loss": 0.724, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6543731689453125, "rewards/margins": 0.7020995020866394, "rewards/rejected": -2.3564724922180176, "sft_loss": 1.6953704357147217, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 7.848632218228047, "learning_rate": 1.675697512473697e-07, "logits/chosen": -0.21353331208229065, "logits/rejected": 0.01264294795691967, "logps/chosen": -1.6973285675048828, "logps/rejected": -2.421572208404541, "loss": 0.7139, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6973285675048828, "rewards/margins": 0.7242437601089478, "rewards/rejected": -2.421572208404541, "sft_loss": 1.670739769935608, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 19.649251903430677, "learning_rate": 1.6640803649811087e-07, "logits/chosen": -0.2555299401283264, "logits/rejected": 0.0315743163228035, "logps/chosen": -1.6559680700302124, "logps/rejected": -2.5446953773498535, "loss": 0.6706, "rewards/accuracies": 0.75, "rewards/chosen": -1.6559680700302124, "rewards/margins": 0.8887273669242859, "rewards/rejected": -2.5446953773498535, "sft_loss": 1.6526873111724854, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 10.58661586469778, "learning_rate": 1.6524955825954472e-07, "logits/chosen": -0.20800435543060303, "logits/rejected": -0.09311346709728241, "logps/chosen": -1.5613033771514893, "logps/rejected": -2.2913155555725098, "loss": 0.6753, "rewards/accuracies": 0.75, "rewards/chosen": -1.5613033771514893, "rewards/margins": 0.7300121784210205, "rewards/rejected": -2.2913155555725098, "sft_loss": 1.5293738842010498, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 7.819840049633919, "learning_rate": 1.6409432777123277e-07, "logits/chosen": -0.3474529981613159, "logits/rejected": -0.132430762052536, "logps/chosen": -1.5635740756988525, "logps/rejected": -2.394151210784912, "loss": 0.6764, "rewards/accuracies": 0.75, "rewards/chosen": -1.5635740756988525, "rewards/margins": 0.8305767774581909, "rewards/rejected": -2.394151210784912, "sft_loss": 1.6081364154815674, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 10.115975348487002, "learning_rate": 1.6294235624122577e-07, "logits/chosen": -0.1569996178150177, "logits/rejected": 0.13438673317432404, "logps/chosen": -1.651344656944275, "logps/rejected": -2.512503147125244, "loss": 0.6763, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.651344656944275, "rewards/margins": 0.8611583709716797, "rewards/rejected": -2.512503147125244, "sft_loss": 1.603162169456482, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 12.736470472350288, "learning_rate": 1.6179365484595697e-07, "logits/chosen": -0.2039128839969635, "logits/rejected": -0.0853031724691391, "logps/chosen": -1.7288665771484375, "logps/rejected": -2.4566359519958496, "loss": 0.7301, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7288665771484375, "rewards/margins": 0.7277693152427673, "rewards/rejected": -2.4566359519958496, "sft_loss": 1.746774435043335, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 9.192663304800051, "learning_rate": 1.60648234730132e-07, "logits/chosen": -0.2541688084602356, "logits/rejected": -0.13433417677879333, "logps/chosen": -1.5581443309783936, "logps/rejected": -2.3666722774505615, "loss": 0.6596, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5581443309783936, "rewards/margins": 0.808527946472168, "rewards/rejected": -2.3666722774505615, "sft_loss": 1.6160786151885986, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 12.81893248717066, "learning_rate": 1.595061070066222e-07, "logits/chosen": -0.16918596625328064, "logits/rejected": -0.18163521587848663, "logps/chosen": -1.5742213726043701, "logps/rejected": -2.3694703578948975, "loss": 0.6778, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5742213726043701, "rewards/margins": 0.7952491044998169, "rewards/rejected": -2.3694703578948975, "sft_loss": 1.6507028341293335, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 15.509438997920547, "learning_rate": 1.5836728275635542e-07, "logits/chosen": -0.30652400851249695, "logits/rejected": -0.10289986431598663, "logps/chosen": -1.724452018737793, "logps/rejected": -2.4997074604034424, "loss": 0.7092, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.724452018737793, "rewards/margins": 0.775255560874939, "rewards/rejected": -2.4997074604034424, "sft_loss": 1.710101842880249, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 11.863260227145227, "learning_rate": 1.5723177302820984e-07, "logits/chosen": -0.27409297227859497, "logits/rejected": -0.19014033675193787, "logps/chosen": -1.6777464151382446, "logps/rejected": -2.3485777378082275, "loss": 0.7086, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6777464151382446, "rewards/margins": 0.6708315014839172, "rewards/rejected": -2.3485777378082275, "sft_loss": 1.6903235912322998, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 10.093143620453924, "learning_rate": 1.5609958883890544e-07, "logits/chosen": -0.2066335380077362, "logits/rejected": -0.05350092053413391, "logps/chosen": -1.633599042892456, "logps/rejected": -2.344357490539551, "loss": 0.6945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.633599042892456, "rewards/margins": 0.7107582688331604, "rewards/rejected": -2.344357490539551, "sft_loss": 1.6154781579971313, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 19.511939789946904, "learning_rate": 1.5497074117289865e-07, "logits/chosen": -0.300477534532547, "logits/rejected": -0.16097351908683777, "logps/chosen": -1.6401588916778564, "logps/rejected": -2.6300368309020996, "loss": 0.6323, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6401588916778564, "rewards/margins": 0.9898775219917297, "rewards/rejected": -2.6300368309020996, "sft_loss": 1.723106026649475, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 10.967135347054848, "learning_rate": 1.5384524098227402e-07, "logits/chosen": -0.2538626790046692, "logits/rejected": -0.011468528769910336, "logps/chosen": -1.616067886352539, "logps/rejected": -2.612074851989746, "loss": 0.6243, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.616067886352539, "rewards/margins": 0.9960067868232727, "rewards/rejected": -2.612074851989746, "sft_loss": 1.6387971639633179, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 15.412044382189954, "learning_rate": 1.5272309918663974e-07, "logits/chosen": -0.24483045935630798, "logits/rejected": -0.046250246465206146, "logps/chosen": -1.7047497034072876, "logps/rejected": -2.3337831497192383, "loss": 0.7267, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7047497034072876, "rewards/margins": 0.6290335655212402, "rewards/rejected": -2.3337831497192383, "sft_loss": 1.7390830516815186, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 12.383114573353495, "learning_rate": 1.516043266730201e-07, "logits/chosen": -0.2617846429347992, "logits/rejected": -0.09488927572965622, "logps/chosen": -1.6723592281341553, "logps/rejected": -2.4052927494049072, "loss": 0.6872, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6723592281341553, "rewards/margins": 0.732933521270752, "rewards/rejected": -2.4052927494049072, "sft_loss": 1.6800689697265625, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 14.535813430041602, "learning_rate": 1.504889342957512e-07, "logits/chosen": -0.24740847945213318, "logits/rejected": -0.0494442954659462, "logps/chosen": -1.6466270685195923, "logps/rejected": -2.5152499675750732, "loss": 0.7061, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6466270685195923, "rewards/margins": 0.8686231374740601, "rewards/rejected": -2.5152499675750732, "sft_loss": 1.683328628540039, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 13.716543856521612, "learning_rate": 1.4937693287637453e-07, "logits/chosen": -0.22817941009998322, "logits/rejected": -0.0856315866112709, "logps/chosen": -1.7677967548370361, "logps/rejected": -2.533576726913452, "loss": 0.7393, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7677967548370361, "rewards/margins": 0.765779972076416, "rewards/rejected": -2.533576726913452, "sft_loss": 1.728803038597107, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 8.264882683874543, "learning_rate": 1.4826833320353305e-07, "logits/chosen": -0.18927092850208282, "logits/rejected": -0.10091712325811386, "logps/chosen": -1.721134901046753, "logps/rejected": -2.447286367416382, "loss": 0.7155, "rewards/accuracies": 0.71875, "rewards/chosen": -1.721134901046753, "rewards/margins": 0.7261516451835632, "rewards/rejected": -2.447286367416382, "sft_loss": 1.650465965270996, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 13.355474126324367, "learning_rate": 1.4716314603286528e-07, "logits/chosen": -0.2467312067747116, "logits/rejected": -0.02907741069793701, "logps/chosen": -1.572540044784546, "logps/rejected": -2.5249338150024414, "loss": 0.6403, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.572540044784546, "rewards/margins": 0.9523937106132507, "rewards/rejected": -2.5249338150024414, "sft_loss": 1.6242434978485107, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 23.026264795037655, "learning_rate": 1.4606138208690233e-07, "logits/chosen": -0.23782236874103546, "logits/rejected": -0.15537124872207642, "logps/chosen": -1.7547528743743896, "logps/rejected": -2.3736822605133057, "loss": 0.7578, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.7547528743743896, "rewards/margins": 0.6189290881156921, "rewards/rejected": -2.3736822605133057, "sft_loss": 1.7577035427093506, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 11.026317241141811, "learning_rate": 1.4496305205496251e-07, "logits/chosen": -0.18229186534881592, "logits/rejected": -0.10631884634494781, "logps/chosen": -1.710971474647522, "logps/rejected": -2.5552868843078613, "loss": 0.6708, "rewards/accuracies": 0.71875, "rewards/chosen": -1.710971474647522, "rewards/margins": 0.8443149328231812, "rewards/rejected": -2.5552868843078613, "sft_loss": 1.7185615301132202, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 10.286910114576408, "learning_rate": 1.4386816659304895e-07, "logits/chosen": -0.3182876408100128, "logits/rejected": -0.16705667972564697, "logps/chosen": -1.6780900955200195, "logps/rejected": -2.404869556427002, "loss": 0.6848, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6780900955200195, "rewards/margins": 0.7267793416976929, "rewards/rejected": -2.404869556427002, "sft_loss": 1.7108476161956787, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 11.72310906240698, "learning_rate": 1.4277673632374492e-07, "logits/chosen": -0.33330950140953064, "logits/rejected": -0.07562915980815887, "logps/chosen": -1.7125946283340454, "logps/rejected": -2.4305922985076904, "loss": 0.6814, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7125946283340454, "rewards/margins": 0.717997670173645, "rewards/rejected": -2.4305922985076904, "sft_loss": 1.7155481576919556, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 10.762000849075521, "learning_rate": 1.416887718361119e-07, "logits/chosen": -0.1484236717224121, "logits/rejected": -0.15011349320411682, "logps/chosen": -1.640467882156372, "logps/rejected": -2.367255687713623, "loss": 0.6956, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.640467882156372, "rewards/margins": 0.7267881035804749, "rewards/rejected": -2.367255687713623, "sft_loss": 1.6443017721176147, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 18.295307248073126, "learning_rate": 1.406042836855859e-07, "logits/chosen": -0.2094794511795044, "logits/rejected": -0.052414268255233765, "logps/chosen": -1.5577723979949951, "logps/rejected": -2.475332736968994, "loss": 0.623, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5577723979949951, "rewards/margins": 0.9175603985786438, "rewards/rejected": -2.475332736968994, "sft_loss": 1.588687539100647, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 16.91373029122271, "learning_rate": 1.3952328239387595e-07, "logits/chosen": -0.3289126753807068, "logits/rejected": -0.08697455376386642, "logps/chosen": -1.7038342952728271, "logps/rejected": -2.499950408935547, "loss": 0.7214, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7038342952728271, "rewards/margins": 0.7961161136627197, "rewards/rejected": -2.499950408935547, "sft_loss": 1.7854511737823486, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 11.023008345560633, "learning_rate": 1.3844577844886109e-07, "logits/chosen": -0.25679293274879456, "logits/rejected": -0.014876279048621655, "logps/chosen": -1.6452186107635498, "logps/rejected": -2.3947396278381348, "loss": 0.6922, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6452186107635498, "rewards/margins": 0.749521017074585, "rewards/rejected": -2.3947396278381348, "sft_loss": 1.6658779382705688, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 12.582506311602824, "learning_rate": 1.3737178230448955e-07, "logits/chosen": -0.28035983443260193, "logits/rejected": -0.10792120546102524, "logps/chosen": -1.709864616394043, "logps/rejected": -2.4221134185791016, "loss": 0.7094, "rewards/accuracies": 0.71875, "rewards/chosen": -1.709864616394043, "rewards/margins": 0.7122488021850586, "rewards/rejected": -2.4221134185791016, "sft_loss": 1.732394814491272, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 9.55516409749658, "learning_rate": 1.363013043806764e-07, "logits/chosen": -0.2662433087825775, "logits/rejected": -0.1103045791387558, "logps/chosen": -1.5945457220077515, "logps/rejected": -2.260127544403076, "loss": 0.7015, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5945457220077515, "rewards/margins": 0.6655817627906799, "rewards/rejected": -2.260127544403076, "sft_loss": 1.6504939794540405, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 11.612950267933856, "learning_rate": 1.352343550632034e-07, "logits/chosen": -0.23081263899803162, "logits/rejected": -0.04323858022689819, "logps/chosen": -1.6416513919830322, "logps/rejected": -2.5492002964019775, "loss": 0.6833, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6416513919830322, "rewards/margins": 0.9075489044189453, "rewards/rejected": -2.5492002964019775, "sft_loss": 1.6769310235977173, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 8.849384782613154, "learning_rate": 1.3417094470361722e-07, "logits/chosen": -0.29834747314453125, "logits/rejected": -0.11559565365314484, "logps/chosen": -1.7053306102752686, "logps/rejected": -2.477020025253296, "loss": 0.6929, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7053306102752686, "rewards/margins": 0.7716895341873169, "rewards/rejected": -2.477020025253296, "sft_loss": 1.738478660583496, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": 0.19015556573867798, "eval_logits/rejected": 0.3029745817184448, "eval_logps/chosen": -1.7384995222091675, "eval_logps/rejected": -2.4029359817504883, "eval_loss": 0.742077112197876, "eval_rewards/accuracies": 0.6795251965522766, "eval_rewards/chosen": -1.7384995222091675, "eval_rewards/margins": 0.6644363403320312, "eval_rewards/rejected": -2.4029359817504883, "eval_runtime": 48.1812, "eval_samples_per_second": 27.915, "eval_sft_loss": 1.7042850255966187, "eval_steps_per_second": 6.994, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 10.938624305016122, "learning_rate": 1.3311108361913015e-07, "logits/chosen": -0.29718244075775146, "logits/rejected": -0.26397815346717834, "logps/chosen": -1.6112855672836304, "logps/rejected": -2.2392969131469727, "loss": 0.7032, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6112855672836304, "rewards/margins": 0.6280113458633423, "rewards/rejected": -2.2392969131469727, "sft_loss": 1.6521110534667969, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 9.558494092108186, "learning_rate": 1.3205478209251874e-07, "logits/chosen": -0.22843368351459503, "logits/rejected": -0.16027602553367615, "logps/chosen": -1.7824885845184326, "logps/rejected": -2.7060914039611816, "loss": 0.6878, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7824885845184326, "rewards/margins": 0.9236028790473938, "rewards/rejected": -2.7060914039611816, "sft_loss": 1.8025773763656616, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 9.490276060406975, "learning_rate": 1.310020503720254e-07, "logits/chosen": -0.22319719195365906, "logits/rejected": -0.01933179423213005, "logps/chosen": -1.7116012573242188, "logps/rejected": -2.4217066764831543, "loss": 0.7191, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7116012573242188, "rewards/margins": 0.7101053595542908, "rewards/rejected": -2.4217066764831543, "sft_loss": 1.7021703720092773, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 10.479792591125774, "learning_rate": 1.2995289867125752e-07, "logits/chosen": -0.24675317108631134, "logits/rejected": -0.14040735363960266, "logps/chosen": -1.6944862604141235, "logps/rejected": -2.2922325134277344, "loss": 0.7207, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6944862604141235, "rewards/margins": 0.5977464914321899, "rewards/rejected": -2.2922325134277344, "sft_loss": 1.6854808330535889, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 17.675125023362586, "learning_rate": 1.2890733716908986e-07, "logits/chosen": -0.22842493653297424, "logits/rejected": -0.12210488319396973, "logps/chosen": -1.5011399984359741, "logps/rejected": -2.1839780807495117, "loss": 0.6608, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5011399984359741, "rewards/margins": 0.6828380823135376, "rewards/rejected": -2.1839780807495117, "sft_loss": 1.6095644235610962, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 10.901693588593412, "learning_rate": 1.2786537600956454e-07, "logits/chosen": -0.29783469438552856, "logits/rejected": -0.0860317200422287, "logps/chosen": -1.6769516468048096, "logps/rejected": -2.414076328277588, "loss": 0.6969, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6769516468048096, "rewards/margins": 0.7371248006820679, "rewards/rejected": -2.414076328277588, "sft_loss": 1.677979826927185, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 9.446393775343152, "learning_rate": 1.268270253017933e-07, "logits/chosen": -0.32861456274986267, "logits/rejected": -0.07950712740421295, "logps/chosen": -1.5789549350738525, "logps/rejected": -2.3780465126037598, "loss": 0.6552, "rewards/accuracies": 0.75, "rewards/chosen": -1.5789549350738525, "rewards/margins": 0.7990915179252625, "rewards/rejected": -2.3780465126037598, "sft_loss": 1.6623684167861938, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 12.952230692399095, "learning_rate": 1.257922951198591e-07, "logits/chosen": -0.3662447929382324, "logits/rejected": -0.09339338541030884, "logps/chosen": -1.628441572189331, "logps/rejected": -2.316749095916748, "loss": 0.7039, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.628441572189331, "rewards/margins": 0.6883074045181274, "rewards/rejected": -2.316749095916748, "sft_loss": 1.6711467504501343, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 11.267096818408705, "learning_rate": 1.24761195502719e-07, "logits/chosen": -0.29916131496429443, "logits/rejected": -0.06138089299201965, "logps/chosen": -1.6560900211334229, "logps/rejected": -2.2495615482330322, "loss": 0.7531, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6560900211334229, "rewards/margins": 0.5934714078903198, "rewards/rejected": -2.2495615482330322, "sft_loss": 1.6750061511993408, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 11.228308958888386, "learning_rate": 1.2373373645410573e-07, "logits/chosen": -0.21163494884967804, "logits/rejected": -0.044960394501686096, "logps/chosen": -1.7255054712295532, "logps/rejected": -2.5176339149475098, "loss": 0.7258, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7255054712295532, "rewards/margins": 0.792128324508667, "rewards/rejected": -2.5176339149475098, "sft_loss": 1.7074692249298096, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 12.057963469614213, "learning_rate": 1.2270992794243175e-07, "logits/chosen": -0.3196309208869934, "logits/rejected": -0.19720716774463654, "logps/chosen": -1.6557862758636475, "logps/rejected": -2.395984649658203, "loss": 0.6987, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6557862758636475, "rewards/margins": 0.7401983737945557, "rewards/rejected": -2.395984649658203, "sft_loss": 1.6996265649795532, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 12.623093507692076, "learning_rate": 1.2168977990069147e-07, "logits/chosen": -0.3484382629394531, "logits/rejected": -0.08056129515171051, "logps/chosen": -1.5996696949005127, "logps/rejected": -2.491976022720337, "loss": 0.6652, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5996696949005127, "rewards/margins": 0.8923061490058899, "rewards/rejected": -2.491976022720337, "sft_loss": 1.6900190114974976, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 12.343227820238907, "learning_rate": 1.206733022263659e-07, "logits/chosen": -0.30928876996040344, "logits/rejected": -0.08133810758590698, "logps/chosen": -1.7402416467666626, "logps/rejected": -2.418012857437134, "loss": 0.7479, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.7402416467666626, "rewards/margins": 0.6777713894844055, "rewards/rejected": -2.418012857437134, "sft_loss": 1.7204927206039429, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 13.283074394048242, "learning_rate": 1.1966050478132572e-07, "logits/chosen": -0.15687677264213562, "logits/rejected": -0.05835603550076485, "logps/chosen": -1.5474400520324707, "logps/rejected": -2.218463182449341, "loss": 0.7129, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5474400520324707, "rewards/margins": 0.6710229516029358, "rewards/rejected": -2.218463182449341, "sft_loss": 1.5909714698791504, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 12.613055993636612, "learning_rate": 1.1865139739173635e-07, "logits/chosen": -0.26195085048675537, "logits/rejected": -0.0033874064683914185, "logps/chosen": -1.6839637756347656, "logps/rejected": -2.4216296672821045, "loss": 0.6917, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6839637756347656, "rewards/margins": 0.7376658320426941, "rewards/rejected": -2.4216296672821045, "sft_loss": 1.6693741083145142, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 15.62160515212973, "learning_rate": 1.1764598984796187e-07, "logits/chosen": -0.2762789726257324, "logits/rejected": -0.15391430258750916, "logps/chosen": -1.5333744287490845, "logps/rejected": -2.166827440261841, "loss": 0.707, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5333744287490845, "rewards/margins": 0.633452832698822, "rewards/rejected": -2.166827440261841, "sft_loss": 1.5893917083740234, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 14.283728938629904, "learning_rate": 1.1664429190447095e-07, "logits/chosen": -0.22520525753498077, "logits/rejected": -0.12561757862567902, "logps/chosen": -1.6866658926010132, "logps/rejected": -2.4461896419525146, "loss": 0.6945, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6866658926010132, "rewards/margins": 0.7595240473747253, "rewards/rejected": -2.4461896419525146, "sft_loss": 1.6630300283432007, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 14.71564098593029, "learning_rate": 1.1564631327974122e-07, "logits/chosen": -0.29125526547431946, "logits/rejected": -0.036132752895355225, "logps/chosen": -1.6057840585708618, "logps/rejected": -2.514495849609375, "loss": 0.6376, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6057840585708618, "rewards/margins": 0.9087116122245789, "rewards/rejected": -2.514495849609375, "sft_loss": 1.648680329322815, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 12.968540777388107, "learning_rate": 1.1465206365616587e-07, "logits/chosen": -0.36038586497306824, "logits/rejected": -0.11205749213695526, "logps/chosen": -1.68861985206604, "logps/rejected": -2.4093689918518066, "loss": 0.6963, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.68861985206604, "rewards/margins": 0.7207491993904114, "rewards/rejected": -2.4093689918518066, "sft_loss": 1.7390022277832031, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 10.871127602202431, "learning_rate": 1.1366155267995887e-07, "logits/chosen": -0.14322242140769958, "logits/rejected": -0.1278444081544876, "logps/chosen": -1.6371939182281494, "logps/rejected": -2.3984198570251465, "loss": 0.6818, "rewards/accuracies": 0.75, "rewards/chosen": -1.6371939182281494, "rewards/margins": 0.7612259387969971, "rewards/rejected": -2.3984198570251465, "sft_loss": 1.6630172729492188, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 13.545046360376807, "learning_rate": 1.1267478996106228e-07, "logits/chosen": -0.274238646030426, "logits/rejected": -0.0696074366569519, "logps/chosen": -1.6298294067382812, "logps/rejected": -2.3537814617156982, "loss": 0.6921, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6298294067382812, "rewards/margins": 0.7239521741867065, "rewards/rejected": -2.3537814617156982, "sft_loss": 1.6520847082138062, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 8.38727094541389, "learning_rate": 1.116917850730521e-07, "logits/chosen": -0.2858451008796692, "logits/rejected": -0.11598008871078491, "logps/chosen": -1.6850965023040771, "logps/rejected": -2.3812060356140137, "loss": 0.6963, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6850965023040771, "rewards/margins": 0.6961097121238708, "rewards/rejected": -2.3812060356140137, "sft_loss": 1.6423461437225342, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 9.591233181401488, "learning_rate": 1.1071254755304637e-07, "logits/chosen": -0.28318411111831665, "logits/rejected": -0.18070612847805023, "logps/chosen": -1.5955157279968262, "logps/rejected": -2.243325710296631, "loss": 0.6991, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5955157279968262, "rewards/margins": 0.6478098630905151, "rewards/rejected": -2.243325710296631, "sft_loss": 1.6338489055633545, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 10.053127105175376, "learning_rate": 1.0973708690161143e-07, "logits/chosen": -0.25133439898490906, "logits/rejected": -0.13577647507190704, "logps/chosen": -1.6146109104156494, "logps/rejected": -2.490821599960327, "loss": 0.6434, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6146109104156494, "rewards/margins": 0.8762105703353882, "rewards/rejected": -2.490821599960327, "sft_loss": 1.6183521747589111, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 12.673363009011775, "learning_rate": 1.0876541258267119e-07, "logits/chosen": -0.33062610030174255, "logits/rejected": -0.1125609278678894, "logps/chosen": -1.716835379600525, "logps/rejected": -2.5508780479431152, "loss": 0.6938, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.716835379600525, "rewards/margins": 0.8340429067611694, "rewards/rejected": -2.5508780479431152, "sft_loss": 1.7881269454956055, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 17.89958065335537, "learning_rate": 1.0779753402341379e-07, "logits/chosen": -0.28661757707595825, "logits/rejected": -0.1711474359035492, "logps/chosen": -1.6419957876205444, "logps/rejected": -2.1811983585357666, "loss": 0.7534, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6419957876205444, "rewards/margins": 0.5392026901245117, "rewards/rejected": -2.1811983585357666, "sft_loss": 1.6192384958267212, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 9.5469357704804, "learning_rate": 1.0683346061420157e-07, "logits/chosen": -0.1380929797887802, "logits/rejected": 2.9008277124376036e-05, "logps/chosen": -1.5946447849273682, "logps/rejected": -2.3358330726623535, "loss": 0.7102, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5946447849273682, "rewards/margins": 0.7411883473396301, "rewards/rejected": -2.3358330726623535, "sft_loss": 1.6597063541412354, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 9.25758023577779, "learning_rate": 1.0587320170847874e-07, "logits/chosen": -0.17908170819282532, "logits/rejected": -0.07438255101442337, "logps/chosen": -1.5592056512832642, "logps/rejected": -2.2189433574676514, "loss": 0.7305, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5592056512832642, "rewards/margins": 0.6597377061843872, "rewards/rejected": -2.2189433574676514, "sft_loss": 1.5880261659622192, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 18.72047661633386, "learning_rate": 1.0491676662268156e-07, "logits/chosen": -0.19627758860588074, "logits/rejected": -0.04923213645815849, "logps/chosen": -1.631665587425232, "logps/rejected": -2.364366054534912, "loss": 0.7199, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.631665587425232, "rewards/margins": 0.7327004671096802, "rewards/rejected": -2.364366054534912, "sft_loss": 1.655683159828186, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 17.81074418027734, "learning_rate": 1.0396416463614732e-07, "logits/chosen": -0.3011097311973572, "logits/rejected": -0.17508465051651, "logps/chosen": -1.5420258045196533, "logps/rejected": -2.253735303878784, "loss": 0.6956, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5420258045196533, "rewards/margins": 0.7117095589637756, "rewards/rejected": -2.253735303878784, "sft_loss": 1.5664559602737427, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 7.913030849985102, "learning_rate": 1.0301540499102479e-07, "logits/chosen": -0.22462257742881775, "logits/rejected": -0.1376952975988388, "logps/chosen": -1.6933692693710327, "logps/rejected": -2.2557225227355957, "loss": 0.7405, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6933692693710327, "rewards/margins": 0.5623533129692078, "rewards/rejected": -2.2557225227355957, "sft_loss": 1.7606439590454102, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 11.166736558581775, "learning_rate": 1.0207049689218405e-07, "logits/chosen": -0.2805453836917877, "logits/rejected": -0.055518947541713715, "logps/chosen": -1.5741121768951416, "logps/rejected": -2.306300401687622, "loss": 0.7188, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5741121768951416, "rewards/margins": 0.7321882843971252, "rewards/rejected": -2.306300401687622, "sft_loss": 1.5512548685073853, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 10.179844179885663, "learning_rate": 1.0112944950712782e-07, "logits/chosen": -0.20439159870147705, "logits/rejected": -0.08071372658014297, "logps/chosen": -1.699591875076294, "logps/rejected": -2.4369356632232666, "loss": 0.6887, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.699591875076294, "rewards/margins": 0.7373435497283936, "rewards/rejected": -2.4369356632232666, "sft_loss": 1.6730149984359741, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 19.805758689130165, "learning_rate": 1.0019227196590174e-07, "logits/chosen": -0.18510176241397858, "logits/rejected": -0.0029919669032096863, "logps/chosen": -1.5924351215362549, "logps/rejected": -2.381690263748169, "loss": 0.6743, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5924351215362549, "rewards/margins": 0.7892551422119141, "rewards/rejected": -2.381690263748169, "sft_loss": 1.6224682331085205, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 13.535282511068937, "learning_rate": 9.925897336100664e-08, "logits/chosen": -0.1515834629535675, "logits/rejected": -0.07991237193346024, "logps/chosen": -1.5851836204528809, "logps/rejected": -2.3033299446105957, "loss": 0.6888, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5851836204528809, "rewards/margins": 0.7181463241577148, "rewards/rejected": -2.3033299446105957, "sft_loss": 1.6022968292236328, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 11.819139815495932, "learning_rate": 9.832956274730946e-08, "logits/chosen": -0.22652992606163025, "logits/rejected": -0.16959702968597412, "logps/chosen": -1.5638071298599243, "logps/rejected": -2.2427945137023926, "loss": 0.698, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5638071298599243, "rewards/margins": 0.6789871454238892, "rewards/rejected": -2.2427945137023926, "sft_loss": 1.6128761768341064, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 8.364769399033436, "learning_rate": 9.740404914195633e-08, "logits/chosen": -0.23054222762584686, "logits/rejected": -0.06470415741205215, "logps/chosen": -1.6527926921844482, "logps/rejected": -2.304823398590088, "loss": 0.7582, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6527926921844482, "rewards/margins": 0.6520308256149292, "rewards/rejected": -2.304823398590088, "sft_loss": 1.7377954721450806, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 10.610472355173886, "learning_rate": 9.648244152428392e-08, "logits/chosen": -0.3337964415550232, "logits/rejected": -0.12685856223106384, "logps/chosen": -1.5666965246200562, "logps/rejected": -2.203728437423706, "loss": 0.7206, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5666965246200562, "rewards/margins": 0.6370319128036499, "rewards/rejected": -2.203728437423706, "sft_loss": 1.612461805343628, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 12.46805731428514, "learning_rate": 9.556474883573379e-08, "logits/chosen": -0.2881069779396057, "logits/rejected": -0.14584079384803772, "logps/chosen": -1.6148103475570679, "logps/rejected": -2.4213805198669434, "loss": 0.7049, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6148103475570679, "rewards/margins": 0.8065702319145203, "rewards/rejected": -2.4213805198669434, "sft_loss": 1.6213207244873047, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 9.96251084940874, "learning_rate": 9.465097997976412e-08, "logits/chosen": -0.2968924343585968, "logits/rejected": 0.0007657714304514229, "logps/chosen": -1.646558165550232, "logps/rejected": -2.501307964324951, "loss": 0.6673, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.646558165550232, "rewards/margins": 0.8547500371932983, "rewards/rejected": -2.501307964324951, "sft_loss": 1.717869758605957, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 10.004923730008452, "learning_rate": 9.374114382176457e-08, "logits/chosen": -0.26072630286216736, "logits/rejected": -0.08342810720205307, "logps/chosen": -1.6531994342803955, "logps/rejected": -2.5159811973571777, "loss": 0.7062, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6531994342803955, "rewards/margins": 0.8627820014953613, "rewards/rejected": -2.5159811973571777, "sft_loss": 1.6630159616470337, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 13.090309498077668, "learning_rate": 9.283524918896945e-08, "logits/chosen": -0.29476550221443176, "logits/rejected": -0.12466315925121307, "logps/chosen": -1.6264028549194336, "logps/rejected": -2.4993860721588135, "loss": 0.6761, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6264028549194336, "rewards/margins": 0.8729833364486694, "rewards/rejected": -2.4993860721588135, "sft_loss": 1.6414413452148438, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 14.216917210951022, "learning_rate": 9.193330487037232e-08, "logits/chosen": -0.22088441252708435, "logits/rejected": -0.054220955818891525, "logps/chosen": -1.671741247177124, "logps/rejected": -2.602494239807129, "loss": 0.674, "rewards/accuracies": 0.75, "rewards/chosen": -1.671741247177124, "rewards/margins": 0.9307527542114258, "rewards/rejected": -2.602494239807129, "sft_loss": 1.7307860851287842, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 9.337860397646399, "learning_rate": 9.103531961664118e-08, "logits/chosen": -0.2169807255268097, "logits/rejected": 0.02115618623793125, "logps/chosen": -1.5316988229751587, "logps/rejected": -2.2888731956481934, "loss": 0.6685, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5316988229751587, "rewards/margins": 0.7571748495101929, "rewards/rejected": -2.2888731956481934, "sft_loss": 1.649409532546997, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 9.989340253027926, "learning_rate": 9.014130214003269e-08, "logits/chosen": -0.3478337228298187, "logits/rejected": -0.2983989417552948, "logps/chosen": -1.6109521389007568, "logps/rejected": -2.35640287399292, "loss": 0.6893, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6109521389007568, "rewards/margins": 0.7454506754875183, "rewards/rejected": -2.35640287399292, "sft_loss": 1.6475645303726196, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 10.516045821532142, "learning_rate": 8.925126111430848e-08, "logits/chosen": -0.20415540039539337, "logits/rejected": -0.10252511501312256, "logps/chosen": -1.6040103435516357, "logps/rejected": -2.3586628437042236, "loss": 0.711, "rewards/accuracies": 0.75, "rewards/chosen": -1.6040103435516357, "rewards/margins": 0.7546524405479431, "rewards/rejected": -2.3586628437042236, "sft_loss": 1.6626554727554321, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 10.829226572175655, "learning_rate": 8.83652051746504e-08, "logits/chosen": -0.15059241652488708, "logits/rejected": 0.042471010237932205, "logps/chosen": -1.6137421131134033, "logps/rejected": -2.476954221725464, "loss": 0.6785, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6137421131134033, "rewards/margins": 0.8632121086120605, "rewards/rejected": -2.476954221725464, "sft_loss": 1.636469841003418, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 10.833792903743456, "learning_rate": 8.748314291757696e-08, "logits/chosen": -0.18061409890651703, "logits/rejected": -0.05339163541793823, "logps/chosen": -1.6397415399551392, "logps/rejected": -2.4378011226654053, "loss": 0.6628, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6397415399551392, "rewards/margins": 0.7980595827102661, "rewards/rejected": -2.4378011226654053, "sft_loss": 1.656537652015686, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 11.338129202328322, "learning_rate": 8.660508290086032e-08, "logits/chosen": -0.2310868203639984, "logits/rejected": -0.06772772967815399, "logps/chosen": -1.5897670984268188, "logps/rejected": -2.280256509780884, "loss": 0.6892, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5897670984268188, "rewards/margins": 0.6904891729354858, "rewards/rejected": -2.280256509780884, "sft_loss": 1.647322416305542, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 12.925637064480494, "learning_rate": 8.573103364344231e-08, "logits/chosen": -0.2835717499256134, "logits/rejected": 0.0046598403714597225, "logps/chosen": -1.6291673183441162, "logps/rejected": -2.4596385955810547, "loss": 0.654, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6291673183441162, "rewards/margins": 0.830471396446228, "rewards/rejected": -2.4596385955810547, "sft_loss": 1.6112010478973389, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 11.348798085785033, "learning_rate": 8.486100362535292e-08, "logits/chosen": -0.2701950967311859, "logits/rejected": -0.08966036140918732, "logps/chosen": -1.617082953453064, "logps/rejected": -2.3176779747009277, "loss": 0.6909, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.617082953453064, "rewards/margins": 0.7005949020385742, "rewards/rejected": -2.3176779747009277, "sft_loss": 1.6887394189834595, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 11.25220348407038, "learning_rate": 8.399500128762693e-08, "logits/chosen": -0.2563892900943756, "logits/rejected": -0.11084787547588348, "logps/chosen": -1.6776103973388672, "logps/rejected": -2.411189317703247, "loss": 0.6893, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6776103973388672, "rewards/margins": 0.7335788011550903, "rewards/rejected": -2.411189317703247, "sft_loss": 1.6805721521377563, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 12.829778178647237, "learning_rate": 8.313303503222313e-08, "logits/chosen": -0.2448205202817917, "logits/rejected": -0.16019026935100555, "logps/chosen": -1.7305723428726196, "logps/rejected": -2.3103702068328857, "loss": 0.7656, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7305723428726196, "rewards/margins": 0.5797977447509766, "rewards/rejected": -2.3103702068328857, "sft_loss": 1.721859335899353, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 18.980117780221455, "learning_rate": 8.227511322194164e-08, "logits/chosen": -0.24948939681053162, "logits/rejected": -0.0920846164226532, "logps/chosen": -1.6394065618515015, "logps/rejected": -2.1757190227508545, "loss": 0.729, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6394065618515015, "rewards/margins": 0.5363125205039978, "rewards/rejected": -2.1757190227508545, "sft_loss": 1.669259786605835, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 14.325780136631856, "learning_rate": 8.142124418034385e-08, "logits/chosen": -0.20548641681671143, "logits/rejected": -0.013326513580977917, "logps/chosen": -1.6016470193862915, "logps/rejected": -2.3489766120910645, "loss": 0.706, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6016470193862915, "rewards/margins": 0.747329592704773, "rewards/rejected": -2.3489766120910645, "sft_loss": 1.60581374168396, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 16.19243634253581, "learning_rate": 8.057143619167073e-08, "logits/chosen": -0.17815425992012024, "logits/rejected": -0.04836118966341019, "logps/chosen": -1.6504271030426025, "logps/rejected": -2.4420382976531982, "loss": 0.6983, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6504271030426025, "rewards/margins": 0.7916114926338196, "rewards/rejected": -2.4420382976531982, "sft_loss": 1.6500117778778076, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 13.724419250218553, "learning_rate": 7.97256975007633e-08, "logits/chosen": -0.3153532147407532, "logits/rejected": -0.07037369906902313, "logps/chosen": -1.5792293548583984, "logps/rejected": -2.303767681121826, "loss": 0.6731, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5792293548583984, "rewards/margins": 0.7245380878448486, "rewards/rejected": -2.303767681121826, "sft_loss": 1.599976658821106, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 11.82833845745009, "learning_rate": 7.888403631298186e-08, "logits/chosen": -0.18549230694770813, "logits/rejected": -0.11423400789499283, "logps/chosen": -1.5666887760162354, "logps/rejected": -2.31834077835083, "loss": 0.6817, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5666887760162354, "rewards/margins": 0.7516517639160156, "rewards/rejected": -2.31834077835083, "sft_loss": 1.5825238227844238, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 9.230520002724067, "learning_rate": 7.804646079412719e-08, "logits/chosen": -0.21890421211719513, "logits/rejected": 0.005529178772121668, "logps/chosen": -1.6638309955596924, "logps/rejected": -2.418092727661133, "loss": 0.7277, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6638309955596924, "rewards/margins": 0.75426185131073, "rewards/rejected": -2.418092727661133, "sft_loss": 1.6921039819717407, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 9.89600287126039, "learning_rate": 7.72129790703604e-08, "logits/chosen": -0.3217540681362152, "logits/rejected": -0.1658337563276291, "logps/chosen": -1.5957491397857666, "logps/rejected": -2.2903361320495605, "loss": 0.7061, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.5957491397857666, "rewards/margins": 0.6945871114730835, "rewards/rejected": -2.2903361320495605, "sft_loss": 1.6473162174224854, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 12.349542384526853, "learning_rate": 7.638359922812504e-08, "logits/chosen": -0.16067500412464142, "logits/rejected": -0.09984975308179855, "logps/chosen": -1.681488037109375, "logps/rejected": -2.389871120452881, "loss": 0.711, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.681488037109375, "rewards/margins": 0.7083832621574402, "rewards/rejected": -2.389871120452881, "sft_loss": 1.6468899250030518, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 13.314471561553384, "learning_rate": 7.555832931406774e-08, "logits/chosen": -0.2832106649875641, "logits/rejected": -0.08498354256153107, "logps/chosen": -1.6763321161270142, "logps/rejected": -2.4621031284332275, "loss": 0.6813, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6763321161270142, "rewards/margins": 0.7857707738876343, "rewards/rejected": -2.4621031284332275, "sft_loss": 1.7055528163909912, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 8.05202026692747, "learning_rate": 7.47371773349611e-08, "logits/chosen": -0.2358255833387375, "logits/rejected": -0.18660102784633636, "logps/chosen": -1.7259280681610107, "logps/rejected": -2.501217842102051, "loss": 0.679, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7259280681610107, "rewards/margins": 0.7752898931503296, "rewards/rejected": -2.501217842102051, "sft_loss": 1.760166883468628, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 10.45517698259413, "learning_rate": 7.392015125762496e-08, "logits/chosen": -0.22739899158477783, "logits/rejected": -0.08847493678331375, "logps/chosen": -1.4906214475631714, "logps/rejected": -2.278884172439575, "loss": 0.6523, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4906214475631714, "rewards/margins": 0.7882627248764038, "rewards/rejected": -2.278884172439575, "sft_loss": 1.5516926050186157, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 10.438014409309998, "learning_rate": 7.310725900885018e-08, "logits/chosen": -0.2749941051006317, "logits/rejected": -0.20564429461956024, "logps/chosen": -1.6889903545379639, "logps/rejected": -2.359412670135498, "loss": 0.7336, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6889903545379639, "rewards/margins": 0.6704226732254028, "rewards/rejected": -2.359412670135498, "sft_loss": 1.70111882686615, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 9.195862626992424, "learning_rate": 7.229850847532076e-08, "logits/chosen": -0.21129819750785828, "logits/rejected": -0.06121910735964775, "logps/chosen": -1.5822350978851318, "logps/rejected": -2.4218201637268066, "loss": 0.6612, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5822350978851318, "rewards/margins": 0.8395853042602539, "rewards/rejected": -2.4218201637268066, "sft_loss": 1.6787328720092773, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 7.05855505031799, "learning_rate": 7.149390750353779e-08, "logits/chosen": -0.18115553259849548, "logits/rejected": -0.24121864140033722, "logps/chosen": -1.620356798171997, "logps/rejected": -2.3000826835632324, "loss": 0.6812, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.620356798171997, "rewards/margins": 0.6797260046005249, "rewards/rejected": -2.3000826835632324, "sft_loss": 1.6455609798431396, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 7.136352432078746, "learning_rate": 7.069346389974374e-08, "logits/chosen": -0.30187302827835083, "logits/rejected": -0.13055703043937683, "logps/chosen": -1.6646325588226318, "logps/rejected": -2.4494311809539795, "loss": 0.6795, "rewards/accuracies": 0.75, "rewards/chosen": -1.6646325588226318, "rewards/margins": 0.7847987413406372, "rewards/rejected": -2.4494311809539795, "sft_loss": 1.6976865530014038, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 10.242090280856157, "learning_rate": 6.989718542984563e-08, "logits/chosen": -0.23686742782592773, "logits/rejected": -0.1735285520553589, "logps/chosen": -1.6956369876861572, "logps/rejected": -2.452082872390747, "loss": 0.7108, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6956369876861572, "rewards/margins": 0.756445586681366, "rewards/rejected": -2.452082872390747, "sft_loss": 1.7371715307235718, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 9.065886832664216, "learning_rate": 6.9105079819341e-08, "logits/chosen": -0.2004161775112152, "logits/rejected": 0.07438705116510391, "logps/chosen": -1.6316092014312744, "logps/rejected": -2.6196351051330566, "loss": 0.6128, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6316092014312744, "rewards/margins": 0.9880256652832031, "rewards/rejected": -2.6196351051330566, "sft_loss": 1.6358451843261719, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 8.67453443220106, "learning_rate": 6.831715475324163e-08, "logits/chosen": -0.2947368025779724, "logits/rejected": -0.08691274374723434, "logps/chosen": -1.5217931270599365, "logps/rejected": -2.4924776554107666, "loss": 0.6501, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.5217931270599365, "rewards/margins": 0.9706846475601196, "rewards/rejected": -2.4924776554107666, "sft_loss": 1.581291675567627, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 9.635043302559676, "learning_rate": 6.753341787600026e-08, "logits/chosen": -0.3500779867172241, "logits/rejected": -0.18925541639328003, "logps/chosen": -1.589550495147705, "logps/rejected": -2.397437572479248, "loss": 0.6713, "rewards/accuracies": 0.75, "rewards/chosen": -1.589550495147705, "rewards/margins": 0.8078867197036743, "rewards/rejected": -2.397437572479248, "sft_loss": 1.6601976156234741, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 9.413243208610313, "learning_rate": 6.67538767914353e-08, "logits/chosen": -0.3179710805416107, "logits/rejected": -0.09681042283773422, "logps/chosen": -1.7094900608062744, "logps/rejected": -2.4280431270599365, "loss": 0.7236, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7094900608062744, "rewards/margins": 0.7185533046722412, "rewards/rejected": -2.4280431270599365, "sft_loss": 1.7506961822509766, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 12.721203130386941, "learning_rate": 6.597853906265793e-08, "logits/chosen": -0.26807981729507446, "logits/rejected": -0.1362856924533844, "logps/chosen": -1.7068068981170654, "logps/rejected": -2.4833457469940186, "loss": 0.6965, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.7068068981170654, "rewards/margins": 0.7765390276908875, "rewards/rejected": -2.4833457469940186, "sft_loss": 1.6986442804336548, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 16.281888101748613, "learning_rate": 6.5207412211998e-08, "logits/chosen": -0.13167652487754822, "logits/rejected": -0.008784021250903606, "logps/chosen": -1.642232894897461, "logps/rejected": -2.4456238746643066, "loss": 0.6883, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.642232894897461, "rewards/margins": 0.8033912777900696, "rewards/rejected": -2.4456238746643066, "sft_loss": 1.5835784673690796, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 10.766552415582913, "learning_rate": 6.444050372093186e-08, "logits/chosen": -0.2363930195569992, "logits/rejected": -0.11599330604076385, "logps/chosen": -1.6609506607055664, "logps/rejected": -2.3672306537628174, "loss": 0.6988, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6609506607055664, "rewards/margins": 0.7062800526618958, "rewards/rejected": -2.3672306537628174, "sft_loss": 1.676262617111206, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 11.899138856304777, "learning_rate": 6.367782103000873e-08, "logits/chosen": -0.23602426052093506, "logits/rejected": -0.1639963984489441, "logps/chosen": -1.6352084875106812, "logps/rejected": -2.2261688709259033, "loss": 0.721, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6352084875106812, "rewards/margins": 0.5909605622291565, "rewards/rejected": -2.2261688709259033, "sft_loss": 1.6245943307876587, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 11.467918927222362, "learning_rate": 6.29193715387798e-08, "logits/chosen": -0.3258369565010071, "logits/rejected": -0.15932396054267883, "logps/chosen": -1.6418447494506836, "logps/rejected": -2.5581870079040527, "loss": 0.6574, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6418447494506836, "rewards/margins": 0.9163424372673035, "rewards/rejected": -2.5581870079040527, "sft_loss": 1.6420872211456299, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 18.518245453618903, "learning_rate": 6.216516260572502e-08, "logits/chosen": -0.21528804302215576, "logits/rejected": -0.10906976461410522, "logps/chosen": -1.681335687637329, "logps/rejected": -2.407076120376587, "loss": 0.7049, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.681335687637329, "rewards/margins": 0.7257404327392578, "rewards/rejected": -2.407076120376587, "sft_loss": 1.67661452293396, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 8.287915224258985, "learning_rate": 6.141520154818297e-08, "logits/chosen": -0.2865227460861206, "logits/rejected": -0.1475687175989151, "logps/chosen": -1.5694164037704468, "logps/rejected": -2.24953556060791, "loss": 0.6939, "rewards/accuracies": 0.75, "rewards/chosen": -1.5694164037704468, "rewards/margins": 0.6801191568374634, "rewards/rejected": -2.24953556060791, "sft_loss": 1.6407816410064697, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.09442966431379318, "eval_logits/rejected": 0.1985771507024765, "eval_logps/chosen": -1.7077901363372803, "eval_logps/rejected": -2.3536388874053955, "eval_loss": 0.7410878539085388, "eval_rewards/accuracies": 0.675815999507904, "eval_rewards/chosen": -1.7077901363372803, "eval_rewards/margins": 0.64584881067276, "eval_rewards/rejected": -2.3536388874053955, "eval_runtime": 48.5966, "eval_samples_per_second": 27.677, "eval_sft_loss": 1.6769425868988037, "eval_steps_per_second": 6.935, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 11.810361343229172, "learning_rate": 6.066949564227897e-08, "logits/chosen": -0.351688027381897, "logits/rejected": -0.2100204974412918, "logps/chosen": -1.672376275062561, "logps/rejected": -2.4296040534973145, "loss": 0.6823, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.672376275062561, "rewards/margins": 0.7572277784347534, "rewards/rejected": -2.4296040534973145, "sft_loss": 1.6638829708099365, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 10.839954412761848, "learning_rate": 5.992805212285523e-08, "logits/chosen": -0.22825750708580017, "logits/rejected": -0.08034198731184006, "logps/chosen": -1.7043678760528564, "logps/rejected": -2.4143729209899902, "loss": 0.7325, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7043678760528564, "rewards/margins": 0.7100049257278442, "rewards/rejected": -2.4143729209899902, "sft_loss": 1.7052783966064453, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 11.20609109787132, "learning_rate": 5.9190878183399684e-08, "logits/chosen": -0.22829201817512512, "logits/rejected": -0.05746559053659439, "logps/chosen": -1.4910621643066406, "logps/rejected": -2.3956239223480225, "loss": 0.6689, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.4910621643066406, "rewards/margins": 0.9045615196228027, "rewards/rejected": -2.3956239223480225, "sft_loss": 1.569577932357788, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 14.118612454459367, "learning_rate": 5.845798097597748e-08, "logits/chosen": -0.1882447898387909, "logits/rejected": -0.09149028360843658, "logps/chosen": -1.701373815536499, "logps/rejected": -2.331512928009033, "loss": 0.7167, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.701373815536499, "rewards/margins": 0.6301389932632446, "rewards/rejected": -2.331512928009033, "sft_loss": 1.6795806884765625, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 13.107778944368144, "learning_rate": 5.772936761116026e-08, "logits/chosen": -0.21654561161994934, "logits/rejected": -0.0521714985370636, "logps/chosen": -1.5970170497894287, "logps/rejected": -2.2822751998901367, "loss": 0.6911, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5970170497894287, "rewards/margins": 0.6852580308914185, "rewards/rejected": -2.2822751998901367, "sft_loss": 1.5799387693405151, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 13.920827462594783, "learning_rate": 5.700504515795829e-08, "logits/chosen": -0.2660030126571655, "logits/rejected": -0.08644221723079681, "logps/chosen": -1.6726162433624268, "logps/rejected": -2.3386168479919434, "loss": 0.715, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6726162433624268, "rewards/margins": 0.666000485420227, "rewards/rejected": -2.3386168479919434, "sft_loss": 1.675969123840332, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 14.414613561636898, "learning_rate": 5.628502064375101e-08, "logits/chosen": -0.352024644613266, "logits/rejected": -0.12190060317516327, "logps/chosen": -1.6388378143310547, "logps/rejected": -2.5021276473999023, "loss": 0.6568, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6388378143310547, "rewards/margins": 0.863290011882782, "rewards/rejected": -2.5021276473999023, "sft_loss": 1.6602637767791748, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 14.020910356356277, "learning_rate": 5.55693010542197e-08, "logits/chosen": -0.33984386920928955, "logits/rejected": -0.09964685142040253, "logps/chosen": -1.5840517282485962, "logps/rejected": -2.4188971519470215, "loss": 0.6521, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5840517282485962, "rewards/margins": 0.8348451852798462, "rewards/rejected": -2.4188971519470215, "sft_loss": 1.614556908607483, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 10.64276328205877, "learning_rate": 5.485789333327856e-08, "logits/chosen": -0.24680741131305695, "logits/rejected": -0.13927848637104034, "logps/chosen": -1.6210944652557373, "logps/rejected": -2.321995258331299, "loss": 0.6988, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6210944652557373, "rewards/margins": 0.700900673866272, "rewards/rejected": -2.321995258331299, "sft_loss": 1.6853511333465576, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 13.272855169791058, "learning_rate": 5.4150804383008675e-08, "logits/chosen": -0.3922897279262543, "logits/rejected": -0.20143163204193115, "logps/chosen": -1.636775255203247, "logps/rejected": -2.4975388050079346, "loss": 0.6795, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.636775255203247, "rewards/margins": 0.860763669013977, "rewards/rejected": -2.4975388050079346, "sft_loss": 1.624566674232483, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 11.379822713661476, "learning_rate": 5.344804106359002e-08, "logits/chosen": -0.21656003594398499, "logits/rejected": -0.03493952378630638, "logps/chosen": -1.5528054237365723, "logps/rejected": -2.3800268173217773, "loss": 0.6826, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5528054237365723, "rewards/margins": 0.8272212743759155, "rewards/rejected": -2.3800268173217773, "sft_loss": 1.629116415977478, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 12.940410065155216, "learning_rate": 5.274961019323559e-08, "logits/chosen": -0.24888157844543457, "logits/rejected": -0.16180197894573212, "logps/chosen": -1.5373536348342896, "logps/rejected": -2.397681474685669, "loss": 0.6275, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5373536348342896, "rewards/margins": 0.8603278994560242, "rewards/rejected": -2.397681474685669, "sft_loss": 1.5879541635513306, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 7.444132464753356, "learning_rate": 5.205551854812451e-08, "logits/chosen": -0.3648526072502136, "logits/rejected": -0.24708771705627441, "logps/chosen": -1.587809681892395, "logps/rejected": -2.4187474250793457, "loss": 0.6514, "rewards/accuracies": 0.75, "rewards/chosen": -1.587809681892395, "rewards/margins": 0.8309377431869507, "rewards/rejected": -2.4187474250793457, "sft_loss": 1.612841248512268, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 12.612879012959835, "learning_rate": 5.1365772862337177e-08, "logits/chosen": -0.20833304524421692, "logits/rejected": -0.05588734894990921, "logps/chosen": -1.6059906482696533, "logps/rejected": -2.402313470840454, "loss": 0.6765, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6059906482696533, "rewards/margins": 0.7963228225708008, "rewards/rejected": -2.402313470840454, "sft_loss": 1.5987026691436768, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 11.665193011310286, "learning_rate": 5.068037982778905e-08, "logits/chosen": -0.09003891795873642, "logits/rejected": 0.00922683160752058, "logps/chosen": -1.5567409992218018, "logps/rejected": -2.5216331481933594, "loss": 0.6483, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5567409992218018, "rewards/margins": 0.964891791343689, "rewards/rejected": -2.5216331481933594, "sft_loss": 1.6186094284057617, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 12.426004221709048, "learning_rate": 4.999934609416656e-08, "logits/chosen": -0.17096517980098724, "logits/rejected": -0.019394760951399803, "logps/chosen": -1.5368496179580688, "logps/rejected": -2.4421775341033936, "loss": 0.6672, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5368496179580688, "rewards/margins": 0.905328094959259, "rewards/rejected": -2.4421775341033936, "sft_loss": 1.6107442378997803, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 11.539509863771814, "learning_rate": 4.932267826886183e-08, "logits/chosen": -0.14591090381145477, "logits/rejected": -0.06807564944028854, "logps/chosen": -1.7288970947265625, "logps/rejected": -2.544593334197998, "loss": 0.7046, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7288970947265625, "rewards/margins": 0.8156958818435669, "rewards/rejected": -2.544593334197998, "sft_loss": 1.8225723505020142, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 9.856189340462384, "learning_rate": 4.8650382916909206e-08, "logits/chosen": -0.3308108448982239, "logits/rejected": -0.1206996887922287, "logps/chosen": -1.638318657875061, "logps/rejected": -2.4511218070983887, "loss": 0.6833, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.638318657875061, "rewards/margins": 0.8128031492233276, "rewards/rejected": -2.4511218070983887, "sft_loss": 1.69119393825531, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 9.750910846025096, "learning_rate": 4.7982466560920976e-08, "logits/chosen": -0.25326764583587646, "logits/rejected": -0.14939264953136444, "logps/chosen": -1.6787688732147217, "logps/rejected": -2.234114170074463, "loss": 0.7661, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6787688732147217, "rewards/margins": 0.5553451776504517, "rewards/rejected": -2.234114170074463, "sft_loss": 1.7490723133087158, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 9.405599912021003, "learning_rate": 4.7318935681024685e-08, "logits/chosen": -0.18301703035831451, "logits/rejected": -0.019637729972600937, "logps/chosen": -1.5830962657928467, "logps/rejected": -2.3951687812805176, "loss": 0.6651, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5830962657928467, "rewards/margins": 0.8120723962783813, "rewards/rejected": -2.3951687812805176, "sft_loss": 1.6575126647949219, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 9.707569150608215, "learning_rate": 4.6659796714799745e-08, "logits/chosen": -0.2273918092250824, "logits/rejected": -0.03705140948295593, "logps/chosen": -1.6867218017578125, "logps/rejected": -2.5362789630889893, "loss": 0.6776, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6867218017578125, "rewards/margins": 0.849557101726532, "rewards/rejected": -2.5362789630889893, "sft_loss": 1.7770799398422241, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 13.358959017817625, "learning_rate": 4.60050560572155e-08, "logits/chosen": -0.23624499142169952, "logits/rejected": -0.25516921281814575, "logps/chosen": -1.6457563638687134, "logps/rejected": -2.6610262393951416, "loss": 0.6663, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6457563638687134, "rewards/margins": 1.0152698755264282, "rewards/rejected": -2.6610262393951416, "sft_loss": 1.661829948425293, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 12.459127038551248, "learning_rate": 4.535472006056834e-08, "logits/chosen": -0.19813086092472076, "logits/rejected": -0.04774543642997742, "logps/chosen": -1.514463186264038, "logps/rejected": -2.31068754196167, "loss": 0.6633, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.514463186264038, "rewards/margins": 0.7962244749069214, "rewards/rejected": -2.31068754196167, "sft_loss": 1.60591721534729, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 13.098187744041828, "learning_rate": 4.470879503442132e-08, "logits/chosen": -0.18732097744941711, "logits/rejected": -0.07609729468822479, "logps/chosen": -1.6028735637664795, "logps/rejected": -2.412036418914795, "loss": 0.6947, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6028735637664795, "rewards/margins": 0.8091629147529602, "rewards/rejected": -2.412036418914795, "sft_loss": 1.657893180847168, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 10.587091244854564, "learning_rate": 4.406728724554154e-08, "logits/chosen": -0.40807557106018066, "logits/rejected": -0.1196105107665062, "logps/chosen": -1.5978686809539795, "logps/rejected": -2.4935269355773926, "loss": 0.6627, "rewards/accuracies": 0.75, "rewards/chosen": -1.5978686809539795, "rewards/margins": 0.8956578969955444, "rewards/rejected": -2.4935269355773926, "sft_loss": 1.677520990371704, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 9.618141203611305, "learning_rate": 4.3430202917840664e-08, "logits/chosen": -0.20703010261058807, "logits/rejected": -0.0043792943470180035, "logps/chosen": -1.754251480102539, "logps/rejected": -2.6629347801208496, "loss": 0.6802, "rewards/accuracies": 0.78125, "rewards/chosen": -1.754251480102539, "rewards/margins": 0.9086835980415344, "rewards/rejected": -2.6629347801208496, "sft_loss": 1.702167272567749, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 12.43772492874109, "learning_rate": 4.279754823231346e-08, "logits/chosen": -0.28863781690597534, "logits/rejected": -0.09000730514526367, "logps/chosen": -1.5869609117507935, "logps/rejected": -2.3102962970733643, "loss": 0.7234, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5869609117507935, "rewards/margins": 0.723335325717926, "rewards/rejected": -2.3102962970733643, "sft_loss": 1.630984902381897, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 8.782445148489737, "learning_rate": 4.216932932697859e-08, "logits/chosen": -0.24556434154510498, "logits/rejected": -0.13989314436912537, "logps/chosen": -1.6189382076263428, "logps/rejected": -2.1726937294006348, "loss": 0.7113, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6189382076263428, "rewards/margins": 0.553755521774292, "rewards/rejected": -2.1726937294006348, "sft_loss": 1.6286262273788452, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 10.644498522580193, "learning_rate": 4.154555229681844e-08, "logits/chosen": -0.26942482590675354, "logits/rejected": -0.03364920616149902, "logps/chosen": -1.6211296319961548, "logps/rejected": -2.550506591796875, "loss": 0.6542, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6211296319961548, "rewards/margins": 0.9293769598007202, "rewards/rejected": -2.550506591796875, "sft_loss": 1.635293960571289, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 10.544637175544887, "learning_rate": 4.092622319372069e-08, "logits/chosen": -0.27639904618263245, "logits/rejected": -0.11306875944137573, "logps/chosen": -1.6275110244750977, "logps/rejected": -2.3331377506256104, "loss": 0.713, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6275110244750977, "rewards/margins": 0.7056267261505127, "rewards/rejected": -2.3331377506256104, "sft_loss": 1.6215118169784546, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 10.193418604379474, "learning_rate": 4.031134802641889e-08, "logits/chosen": -0.24848809838294983, "logits/rejected": -0.23636803030967712, "logps/chosen": -1.6728041172027588, "logps/rejected": -2.389587879180908, "loss": 0.7043, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6728041172027588, "rewards/margins": 0.7167838215827942, "rewards/rejected": -2.389587879180908, "sft_loss": 1.7119734287261963, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 14.095012997661698, "learning_rate": 3.970093276043468e-08, "logits/chosen": -0.1586824506521225, "logits/rejected": -0.033896416425704956, "logps/chosen": -1.5915911197662354, "logps/rejected": -2.3378329277038574, "loss": 0.6812, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5915911197662354, "rewards/margins": 0.7462421655654907, "rewards/rejected": -2.3378329277038574, "sft_loss": 1.5919703245162964, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 12.333530509473823, "learning_rate": 3.9094983318019584e-08, "logits/chosen": -0.2927999496459961, "logits/rejected": -0.15135972201824188, "logps/chosen": -1.5222244262695312, "logps/rejected": -2.381122589111328, "loss": 0.6489, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5222244262695312, "rewards/margins": 0.8588981628417969, "rewards/rejected": -2.381122589111328, "sft_loss": 1.6105819940567017, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 14.9111337401977, "learning_rate": 3.849350557809789e-08, "logits/chosen": -0.16607865691184998, "logits/rejected": -0.08409662544727325, "logps/chosen": -1.57377290725708, "logps/rejected": -2.366957426071167, "loss": 0.654, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.57377290725708, "rewards/margins": 0.7931844592094421, "rewards/rejected": -2.366957426071167, "sft_loss": 1.5438182353973389, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 13.157968562208003, "learning_rate": 3.789650537620903e-08, "logits/chosen": -0.2357436716556549, "logits/rejected": -0.1693754643201828, "logps/chosen": -1.6895902156829834, "logps/rejected": -2.3946168422698975, "loss": 0.6989, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6895902156829834, "rewards/margins": 0.7050267457962036, "rewards/rejected": -2.3946168422698975, "sft_loss": 1.6792442798614502, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 16.03682522667059, "learning_rate": 3.730398850445182e-08, "logits/chosen": -0.14633958041667938, "logits/rejected": -0.06359054148197174, "logps/chosen": -1.7830045223236084, "logps/rejected": -2.5048351287841797, "loss": 0.7291, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7830045223236084, "rewards/margins": 0.7218302488327026, "rewards/rejected": -2.5048351287841797, "sft_loss": 1.678049087524414, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 14.023170155691968, "learning_rate": 3.671596071142735e-08, "logits/chosen": -0.19137059152126312, "logits/rejected": 0.023235727101564407, "logps/chosen": -1.5731264352798462, "logps/rejected": -2.4480862617492676, "loss": 0.6874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5731264352798462, "rewards/margins": 0.8749601244926453, "rewards/rejected": -2.4480862617492676, "sft_loss": 1.5597152709960938, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 11.73239410173279, "learning_rate": 3.6132427702183996e-08, "logits/chosen": -0.34296464920043945, "logits/rejected": -0.10354363918304443, "logps/chosen": -1.598331093788147, "logps/rejected": -2.4530701637268066, "loss": 0.6543, "rewards/accuracies": 0.75, "rewards/chosen": -1.598331093788147, "rewards/margins": 0.8547390103340149, "rewards/rejected": -2.4530701637268066, "sft_loss": 1.628316879272461, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 10.077278212873187, "learning_rate": 3.555339513816147e-08, "logits/chosen": -0.2891957759857178, "logits/rejected": -0.26858973503112793, "logps/chosen": -1.666072130203247, "logps/rejected": -2.2597265243530273, "loss": 0.7306, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.666072130203247, "rewards/margins": 0.593654215335846, "rewards/rejected": -2.2597265243530273, "sft_loss": 1.7169806957244873, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 9.939029528916706, "learning_rate": 3.497886863713639e-08, "logits/chosen": -0.2657594382762909, "logits/rejected": -0.23113341629505157, "logps/chosen": -1.661736249923706, "logps/rejected": -2.4238317012786865, "loss": 0.7094, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.661736249923706, "rewards/margins": 0.7620955109596252, "rewards/rejected": -2.4238317012786865, "sft_loss": 1.6939157247543335, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 12.655342309116568, "learning_rate": 3.440885377316721e-08, "logits/chosen": -0.16649341583251953, "logits/rejected": -0.10416650772094727, "logps/chosen": -1.625309705734253, "logps/rejected": -2.2988686561584473, "loss": 0.6831, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.625309705734253, "rewards/margins": 0.6735588908195496, "rewards/rejected": -2.2988686561584473, "sft_loss": 1.6133670806884766, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 12.63819361618371, "learning_rate": 3.384335607654082e-08, "logits/chosen": -0.15031428635120392, "logits/rejected": -0.035445403307676315, "logps/chosen": -1.7651119232177734, "logps/rejected": -2.4991495609283447, "loss": 0.7385, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.7651119232177734, "rewards/margins": 0.7340375781059265, "rewards/rejected": -2.4991495609283447, "sft_loss": 1.7837398052215576, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 15.350925089829763, "learning_rate": 3.328238103371811e-08, "logits/chosen": -0.28147023916244507, "logits/rejected": -0.2025039941072464, "logps/chosen": -1.6879581212997437, "logps/rejected": -2.40765118598938, "loss": 0.6882, "rewards/accuracies": 0.75, "rewards/chosen": -1.6879581212997437, "rewards/margins": 0.7196931838989258, "rewards/rejected": -2.40765118598938, "sft_loss": 1.619342565536499, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 17.99484263315512, "learning_rate": 3.272593408728169e-08, "logits/chosen": -0.3191513419151306, "logits/rejected": -0.06911689788103104, "logps/chosen": -1.6092344522476196, "logps/rejected": -2.329237937927246, "loss": 0.6996, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6092344522476196, "rewards/margins": 0.7200034856796265, "rewards/rejected": -2.329237937927246, "sft_loss": 1.638487458229065, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 10.912108318578028, "learning_rate": 3.217402063588204e-08, "logits/chosen": -0.2827785313129425, "logits/rejected": -0.12100187689065933, "logps/chosen": -1.6438062191009521, "logps/rejected": -2.3690648078918457, "loss": 0.7027, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6438062191009521, "rewards/margins": 0.7252583503723145, "rewards/rejected": -2.3690648078918457, "sft_loss": 1.6404037475585938, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 9.731192900189654, "learning_rate": 3.162664603418608e-08, "logits/chosen": -0.2499154806137085, "logits/rejected": -0.16976672410964966, "logps/chosen": -1.640061616897583, "logps/rejected": -2.4979777336120605, "loss": 0.7051, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.640061616897583, "rewards/margins": 0.8579161763191223, "rewards/rejected": -2.4979777336120605, "sft_loss": 1.6242501735687256, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 15.801422485380732, "learning_rate": 3.1083815592824416e-08, "logits/chosen": -0.2540862560272217, "logits/rejected": -0.11490116268396378, "logps/chosen": -1.7152860164642334, "logps/rejected": -2.4072487354278564, "loss": 0.7079, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7152860164642334, "rewards/margins": 0.6919624209403992, "rewards/rejected": -2.4072487354278564, "sft_loss": 1.7464466094970703, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 8.03963607567851, "learning_rate": 3.054553457834053e-08, "logits/chosen": -0.03891091048717499, "logits/rejected": -0.0817728191614151, "logps/chosen": -1.6593310832977295, "logps/rejected": -2.370170831680298, "loss": 0.7024, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6593310832977295, "rewards/margins": 0.7108396291732788, "rewards/rejected": -2.370170831680298, "sft_loss": 1.6408748626708984, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 18.644527727020584, "learning_rate": 3.0011808213139036e-08, "logits/chosen": -0.167449489235878, "logits/rejected": -0.14476385712623596, "logps/chosen": -1.5764930248260498, "logps/rejected": -2.3111233711242676, "loss": 0.6775, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5764930248260498, "rewards/margins": 0.7346304655075073, "rewards/rejected": -2.3111233711242676, "sft_loss": 1.5921560525894165, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 7.718337021361548, "learning_rate": 2.948264167543568e-08, "logits/chosen": -0.2252693921327591, "logits/rejected": -0.13295769691467285, "logps/chosen": -1.5100374221801758, "logps/rejected": -2.2813303470611572, "loss": 0.6426, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5100374221801758, "rewards/margins": 0.7712931632995605, "rewards/rejected": -2.2813303470611572, "sft_loss": 1.5447124242782593, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 11.94908566275378, "learning_rate": 2.8958040099206216e-08, "logits/chosen": -0.3772529065608978, "logits/rejected": -0.2794039845466614, "logps/chosen": -1.5032932758331299, "logps/rejected": -2.257519245147705, "loss": 0.6729, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5032932758331299, "rewards/margins": 0.7542258501052856, "rewards/rejected": -2.257519245147705, "sft_loss": 1.552524209022522, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 13.40411856685503, "learning_rate": 2.843800857413775e-08, "logits/chosen": -0.22525887191295624, "logits/rejected": -0.14506056904792786, "logps/chosen": -1.6216704845428467, "logps/rejected": -2.291947841644287, "loss": 0.7203, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6216704845428467, "rewards/margins": 0.6702772974967957, "rewards/rejected": -2.291947841644287, "sft_loss": 1.693621039390564, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 18.3237975464237, "learning_rate": 2.7922552145578203e-08, "logits/chosen": -0.2763119339942932, "logits/rejected": 0.013517215847969055, "logps/chosen": -1.6374975442886353, "logps/rejected": -2.3659424781799316, "loss": 0.6821, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6374975442886353, "rewards/margins": 0.7284448146820068, "rewards/rejected": -2.3659424781799316, "sft_loss": 1.6454788446426392, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 10.282414122198409, "learning_rate": 2.7411675814488277e-08, "logits/chosen": -0.14273320138454437, "logits/rejected": 0.036741144955158234, "logps/chosen": -1.5218318700790405, "logps/rejected": -2.1617281436920166, "loss": 0.688, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5218318700790405, "rewards/margins": 0.6398963928222656, "rewards/rejected": -2.1617281436920166, "sft_loss": 1.5611913204193115, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 19.87979390872604, "learning_rate": 2.690538453739216e-08, "logits/chosen": -0.22723546624183655, "logits/rejected": -0.13415133953094482, "logps/chosen": -1.5647614002227783, "logps/rejected": -2.0992584228515625, "loss": 0.745, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5647614002227783, "rewards/margins": 0.5344969034194946, "rewards/rejected": -2.0992584228515625, "sft_loss": 1.6174991130828857, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 11.895663515126252, "learning_rate": 2.6403683226330298e-08, "logits/chosen": -0.26551052927970886, "logits/rejected": -0.09764693677425385, "logps/chosen": -1.638248085975647, "logps/rejected": -2.381021022796631, "loss": 0.6776, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.638248085975647, "rewards/margins": 0.74277263879776, "rewards/rejected": -2.381021022796631, "sft_loss": 1.6405537128448486, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 21.082462553367506, "learning_rate": 2.5906576748810804e-08, "logits/chosen": -0.28359124064445496, "logits/rejected": -0.1318884640932083, "logps/chosen": -1.5293941497802734, "logps/rejected": -2.449746608734131, "loss": 0.6322, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5293941497802734, "rewards/margins": 0.920352578163147, "rewards/rejected": -2.449746608734131, "sft_loss": 1.5505578517913818, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 15.496579575031447, "learning_rate": 2.5414069927763016e-08, "logits/chosen": -0.38021615147590637, "logits/rejected": -0.18842045962810516, "logps/chosen": -1.6410424709320068, "logps/rejected": -2.4443531036376953, "loss": 0.6823, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6410424709320068, "rewards/margins": 0.8033105134963989, "rewards/rejected": -2.4443531036376953, "sft_loss": 1.6692800521850586, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 13.088760069435347, "learning_rate": 2.4926167541490185e-08, "logits/chosen": -0.35648486018180847, "logits/rejected": -0.10282965749502182, "logps/chosen": -1.5907763242721558, "logps/rejected": -2.4708077907562256, "loss": 0.6587, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5907763242721558, "rewards/margins": 0.880031406879425, "rewards/rejected": -2.4708077907562256, "sft_loss": 1.6385835409164429, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 10.286685969142658, "learning_rate": 2.4442874323623574e-08, "logits/chosen": -0.15295718610286713, "logits/rejected": 0.03394906222820282, "logps/chosen": -1.5599133968353271, "logps/rejected": -2.621251344680786, "loss": 0.6292, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5599133968353271, "rewards/margins": 1.0613378286361694, "rewards/rejected": -2.621251344680786, "sft_loss": 1.586523413658142, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 20.217165534930206, "learning_rate": 2.396419496307589e-08, "logits/chosen": -0.21457111835479736, "logits/rejected": -0.024632802233099937, "logps/chosen": -1.6446702480316162, "logps/rejected": -2.4213435649871826, "loss": 0.6629, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6446702480316162, "rewards/margins": 0.7766732573509216, "rewards/rejected": -2.4213435649871826, "sft_loss": 1.6333211660385132, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 12.95252952751756, "learning_rate": 2.349013410399653e-08, "logits/chosen": -0.28488826751708984, "logits/rejected": -0.1302223652601242, "logps/chosen": -1.623578667640686, "logps/rejected": -2.376253604888916, "loss": 0.6962, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.623578667640686, "rewards/margins": 0.75267493724823, "rewards/rejected": -2.376253604888916, "sft_loss": 1.6034011840820312, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 10.045014954943717, "learning_rate": 2.3020696345725954e-08, "logits/chosen": -0.3393504023551941, "logits/rejected": -0.08003746718168259, "logps/chosen": -1.606422781944275, "logps/rejected": -2.516679286956787, "loss": 0.6514, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.606422781944275, "rewards/margins": 0.9102565050125122, "rewards/rejected": -2.516679286956787, "sft_loss": 1.6458886861801147, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 14.694710395971331, "learning_rate": 2.2555886242751398e-08, "logits/chosen": -0.2591537833213806, "logits/rejected": -0.17624430358409882, "logps/chosen": -1.7068341970443726, "logps/rejected": -2.553440570831299, "loss": 0.6541, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7068341970443726, "rewards/margins": 0.8466061353683472, "rewards/rejected": -2.553440570831299, "sft_loss": 1.6998831033706665, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 26.786870307632142, "learning_rate": 2.2095708304662453e-08, "logits/chosen": -0.3961165249347687, "logits/rejected": -0.13789108395576477, "logps/chosen": -1.6048774719238281, "logps/rejected": -2.314772844314575, "loss": 0.7073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6048774719238281, "rewards/margins": 0.7098954319953918, "rewards/rejected": -2.314772844314575, "sft_loss": 1.6947174072265625, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 9.307251995995035, "learning_rate": 2.16401669961076e-08, "logits/chosen": -0.400416761636734, "logits/rejected": -0.17295320332050323, "logps/chosen": -1.6209526062011719, "logps/rejected": -2.48704195022583, "loss": 0.6753, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6209526062011719, "rewards/margins": 0.8660893440246582, "rewards/rejected": -2.48704195022583, "sft_loss": 1.7005434036254883, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 11.93991285156542, "learning_rate": 2.1189266736750532e-08, "logits/chosen": -0.15378838777542114, "logits/rejected": -0.07503332197666168, "logps/chosen": -1.6249189376831055, "logps/rejected": -2.321840763092041, "loss": 0.6932, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6249189376831055, "rewards/margins": 0.6969217658042908, "rewards/rejected": -2.321840763092041, "sft_loss": 1.6611210107803345, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 12.216847268558698, "learning_rate": 2.0743011901227623e-08, "logits/chosen": -0.2069709748029709, "logits/rejected": -0.04932025074958801, "logps/chosen": -1.6817525625228882, "logps/rejected": -2.2973766326904297, "loss": 0.7253, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6817525625228882, "rewards/margins": 0.615623950958252, "rewards/rejected": -2.2973766326904297, "sft_loss": 1.685738205909729, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 13.076951790589266, "learning_rate": 2.030140681910508e-08, "logits/chosen": -0.22594070434570312, "logits/rejected": -0.0328756682574749, "logps/chosen": -1.575772762298584, "logps/rejected": -2.3027195930480957, "loss": 0.7097, "rewards/accuracies": 0.6875, "rewards/chosen": -1.575772762298584, "rewards/margins": 0.7269467115402222, "rewards/rejected": -2.3027195930480957, "sft_loss": 1.6151736974716187, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 8.931649669578603, "learning_rate": 1.986445577483753e-08, "logits/chosen": -0.3158307373523712, "logits/rejected": -0.166262686252594, "logps/chosen": -1.5803234577178955, "logps/rejected": -2.344715118408203, "loss": 0.6551, "rewards/accuracies": 0.75, "rewards/chosen": -1.5803234577178955, "rewards/margins": 0.7643915414810181, "rewards/rejected": -2.344715118408203, "sft_loss": 1.607454538345337, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 8.923726483592807, "learning_rate": 1.9432163007725765e-08, "logits/chosen": -0.31551894545555115, "logits/rejected": -0.19294817745685577, "logps/chosen": -1.6508829593658447, "logps/rejected": -2.359785556793213, "loss": 0.6945, "rewards/accuracies": 0.75, "rewards/chosen": -1.6508829593658447, "rewards/margins": 0.7089024782180786, "rewards/rejected": -2.359785556793213, "sft_loss": 1.7128108739852905, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 8.90495185705992, "learning_rate": 1.9004532711876297e-08, "logits/chosen": -0.2576938271522522, "logits/rejected": -0.19679024815559387, "logps/chosen": -1.5943269729614258, "logps/rejected": -2.4396283626556396, "loss": 0.6538, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5943269729614258, "rewards/margins": 0.8453014492988586, "rewards/rejected": -2.4396283626556396, "sft_loss": 1.6705970764160156, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 13.29805692369786, "learning_rate": 1.8581569036159928e-08, "logits/chosen": -0.23019611835479736, "logits/rejected": -0.004331841133534908, "logps/chosen": -1.5425282716751099, "logps/rejected": -2.3237481117248535, "loss": 0.6521, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5425282716751099, "rewards/margins": 0.7812197804450989, "rewards/rejected": -2.3237481117248535, "sft_loss": 1.5640408992767334, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 8.420886854080155, "learning_rate": 1.8163276084172285e-08, "logits/chosen": -0.24373432993888855, "logits/rejected": -0.0884752869606018, "logps/chosen": -1.6471850872039795, "logps/rejected": -2.4145665168762207, "loss": 0.7051, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6471850872039795, "rewards/margins": 0.7673813104629517, "rewards/rejected": -2.4145665168762207, "sft_loss": 1.7203576564788818, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 14.680825053132672, "learning_rate": 1.7749657914193194e-08, "logits/chosen": -0.26672497391700745, "logits/rejected": -0.1523597538471222, "logps/chosen": -1.6596254110336304, "logps/rejected": -2.4555373191833496, "loss": 0.6812, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.6596254110336304, "rewards/margins": 0.7959118485450745, "rewards/rejected": -2.4555373191833496, "sft_loss": 1.6531959772109985, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 13.081943532066477, "learning_rate": 1.7340718539148203e-08, "logits/chosen": -0.17072239518165588, "logits/rejected": -0.10030049085617065, "logps/chosen": -1.669908881187439, "logps/rejected": -2.3693387508392334, "loss": 0.7223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.669908881187439, "rewards/margins": 0.6994299292564392, "rewards/rejected": -2.3693387508392334, "sft_loss": 1.7081491947174072, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 9.223338259489795, "learning_rate": 1.6936461926568724e-08, "logits/chosen": -0.2360965758562088, "logits/rejected": -0.08316276222467422, "logps/chosen": -1.5378961563110352, "logps/rejected": -2.4518680572509766, "loss": 0.666, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5378961563110352, "rewards/margins": 0.9139717221260071, "rewards/rejected": -2.4518680572509766, "sft_loss": 1.5981072187423706, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 15.788468485005295, "learning_rate": 1.6536891998554346e-08, "logits/chosen": -0.3577505350112915, "logits/rejected": -0.15907980501651764, "logps/chosen": -1.5762702226638794, "logps/rejected": -2.365091323852539, "loss": 0.6674, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5762702226638794, "rewards/margins": 0.788821280002594, "rewards/rejected": -2.365091323852539, "sft_loss": 1.6816511154174805, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 13.28320349308605, "learning_rate": 1.6142012631734093e-08, "logits/chosen": -0.20729181170463562, "logits/rejected": -0.05018044263124466, "logps/chosen": -1.6028248071670532, "logps/rejected": -2.33538818359375, "loss": 0.6738, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6028248071670532, "rewards/margins": 0.7325634956359863, "rewards/rejected": -2.33538818359375, "sft_loss": 1.6399381160736084, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 12.385996155269273, "learning_rate": 1.575182765722949e-08, "logits/chosen": -0.3261379599571228, "logits/rejected": -0.13367603719234467, "logps/chosen": -1.5806565284729004, "logps/rejected": -2.392848253250122, "loss": 0.6831, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5806565284729004, "rewards/margins": 0.8121916651725769, "rewards/rejected": -2.392848253250122, "sft_loss": 1.6431503295898438, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.21098588407039642, "eval_logits/rejected": 0.3256378471851349, "eval_logps/chosen": -1.7130357027053833, "eval_logps/rejected": -2.369403839111328, "eval_loss": 0.7408645153045654, "eval_rewards/accuracies": 0.6765578389167786, "eval_rewards/chosen": -1.7130357027053833, "eval_rewards/margins": 0.6563680171966553, "eval_rewards/rejected": -2.369403839111328, "eval_runtime": 47.381, "eval_samples_per_second": 28.387, "eval_sft_loss": 1.6829732656478882, "eval_steps_per_second": 7.113, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 6.775665696893502, "learning_rate": 1.536634086061672e-08, "logits/chosen": -0.17758964002132416, "logits/rejected": -0.12170843780040741, "logps/chosen": -1.604562520980835, "logps/rejected": -2.277920722961426, "loss": 0.7218, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.604562520980835, "rewards/margins": 0.6733577847480774, "rewards/rejected": -2.277920722961426, "sft_loss": 1.6154165267944336, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 14.145423509667294, "learning_rate": 1.4985555981890495e-08, "logits/chosen": -0.23000183701515198, "logits/rejected": -0.10633592307567596, "logps/chosen": -1.6004230976104736, "logps/rejected": -2.4686572551727295, "loss": 0.6535, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6004230976104736, "rewards/margins": 0.868233859539032, "rewards/rejected": -2.4686572551727295, "sft_loss": 1.6132562160491943, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 9.456264058288562, "learning_rate": 1.4609476715427226e-08, "logits/chosen": -0.26327142119407654, "logits/rejected": -0.14456121623516083, "logps/chosen": -1.540578842163086, "logps/rejected": -2.390961170196533, "loss": 0.6472, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.540578842163086, "rewards/margins": 0.8503824472427368, "rewards/rejected": -2.390961170196533, "sft_loss": 1.6041126251220703, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 13.709972165607368, "learning_rate": 1.4238106709949792e-08, "logits/chosen": -0.2972463071346283, "logits/rejected": -0.2040620744228363, "logps/chosen": -1.5755565166473389, "logps/rejected": -2.4879636764526367, "loss": 0.6566, "rewards/accuracies": 0.75, "rewards/chosen": -1.5755565166473389, "rewards/margins": 0.9124069213867188, "rewards/rejected": -2.4879636764526367, "sft_loss": 1.6569610834121704, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 13.322945163574934, "learning_rate": 1.3871449568491511e-08, "logits/chosen": -0.22480659186840057, "logits/rejected": -0.051394373178482056, "logps/chosen": -1.7018409967422485, "logps/rejected": -2.549172878265381, "loss": 0.692, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.7018409967422485, "rewards/margins": 0.8473318815231323, "rewards/rejected": -2.549172878265381, "sft_loss": 1.7330865859985352, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 12.260735897881622, "learning_rate": 1.3509508848361606e-08, "logits/chosen": -0.36754292249679565, "logits/rejected": -0.1878909170627594, "logps/chosen": -1.6431642770767212, "logps/rejected": -2.3938143253326416, "loss": 0.6735, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6431642770767212, "rewards/margins": 0.7506500482559204, "rewards/rejected": -2.3938143253326416, "sft_loss": 1.6279329061508179, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 10.615049915737567, "learning_rate": 1.3152288061110517e-08, "logits/chosen": -0.3411548435688019, "logits/rejected": -0.1879447102546692, "logps/chosen": -1.632992148399353, "logps/rejected": -2.3886585235595703, "loss": 0.6766, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.632992148399353, "rewards/margins": 0.7556661367416382, "rewards/rejected": -2.3886585235595703, "sft_loss": 1.6099475622177124, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 19.553483327548474, "learning_rate": 1.2799790672495814e-08, "logits/chosen": -0.2899969220161438, "logits/rejected": -0.039637502282857895, "logps/chosen": -1.6525678634643555, "logps/rejected": -2.446349859237671, "loss": 0.7073, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6525678634643555, "rewards/margins": 0.7937819361686707, "rewards/rejected": -2.446349859237671, "sft_loss": 1.6651198863983154, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 13.917295050309697, "learning_rate": 1.2452020102448835e-08, "logits/chosen": -0.2003723382949829, "logits/rejected": -0.135641410946846, "logps/chosen": -1.6125447750091553, "logps/rejected": -2.2707650661468506, "loss": 0.7355, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.6125447750091553, "rewards/margins": 0.658220112323761, "rewards/rejected": -2.2707650661468506, "sft_loss": 1.6257193088531494, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 14.58176761823626, "learning_rate": 1.2108979725041103e-08, "logits/chosen": -0.3262428641319275, "logits/rejected": -0.15604010224342346, "logps/chosen": -1.6684221029281616, "logps/rejected": -2.409224271774292, "loss": 0.7179, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6684221029281616, "rewards/margins": 0.7408021688461304, "rewards/rejected": -2.409224271774292, "sft_loss": 1.7110408544540405, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 15.253425198950076, "learning_rate": 1.1770672868451958e-08, "logits/chosen": -0.2912864685058594, "logits/rejected": -0.033874742686748505, "logps/chosen": -1.6666243076324463, "logps/rejected": -2.4093141555786133, "loss": 0.6975, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6666243076324463, "rewards/margins": 0.7426899671554565, "rewards/rejected": -2.4093141555786133, "sft_loss": 1.6382312774658203, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 16.418111525664113, "learning_rate": 1.1437102814935872e-08, "logits/chosen": -0.2318984717130661, "logits/rejected": -0.14551374316215515, "logps/chosen": -1.6123816967010498, "logps/rejected": -2.4394803047180176, "loss": 0.6963, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6123816967010498, "rewards/margins": 0.827098548412323, "rewards/rejected": -2.4394803047180176, "sft_loss": 1.7117074728012085, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 8.812025289902227, "learning_rate": 1.1108272800791018e-08, "logits/chosen": -0.404472291469574, "logits/rejected": -0.13561639189720154, "logps/chosen": -1.8090083599090576, "logps/rejected": -2.5208821296691895, "loss": 0.7347, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.8090083599090576, "rewards/margins": 0.7118737697601318, "rewards/rejected": -2.5208821296691895, "sft_loss": 1.8155949115753174, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 10.452325524711286, "learning_rate": 1.078418601632769e-08, "logits/chosen": -0.23652303218841553, "logits/rejected": -0.06062141805887222, "logps/chosen": -1.6060326099395752, "logps/rejected": -2.429530382156372, "loss": 0.6524, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6060326099395752, "rewards/margins": 0.8234976530075073, "rewards/rejected": -2.429530382156372, "sft_loss": 1.6235265731811523, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 8.26824093370893, "learning_rate": 1.0464845605837159e-08, "logits/chosen": -0.23518164455890656, "logits/rejected": -0.057272959500551224, "logps/chosen": -1.6684112548828125, "logps/rejected": -2.4470505714416504, "loss": 0.6664, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6684112548828125, "rewards/margins": 0.7786393761634827, "rewards/rejected": -2.4470505714416504, "sft_loss": 1.6589549779891968, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 14.497711552099227, "learning_rate": 1.0150254667561642e-08, "logits/chosen": -0.24882030487060547, "logits/rejected": -0.032215673476457596, "logps/chosen": -1.7587112188339233, "logps/rejected": -2.637768268585205, "loss": 0.6734, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7587112188339233, "rewards/margins": 0.8790567517280579, "rewards/rejected": -2.637768268585205, "sft_loss": 1.7179298400878906, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 20.056276396315972, "learning_rate": 9.840416253663719e-09, "logits/chosen": -0.29785749316215515, "logits/rejected": -0.1693790853023529, "logps/chosen": -1.6061737537384033, "logps/rejected": -2.5774359703063965, "loss": 0.6605, "rewards/accuracies": 0.75, "rewards/chosen": -1.6061737537384033, "rewards/margins": 0.9712620973587036, "rewards/rejected": -2.5774359703063965, "sft_loss": 1.610002875328064, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 12.152515656069522, "learning_rate": 9.535333370197074e-09, "logits/chosen": -0.24681887030601501, "logits/rejected": -0.07183097302913666, "logps/chosen": -1.5848981142044067, "logps/rejected": -2.305112361907959, "loss": 0.6836, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5848981142044067, "rewards/margins": 0.7202144861221313, "rewards/rejected": -2.305112361907959, "sft_loss": 1.6380090713500977, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 8.044325952205341, "learning_rate": 9.23500897707713e-09, "logits/chosen": -0.3412021994590759, "logits/rejected": -0.11180172115564346, "logps/chosen": -1.7468408346176147, "logps/rejected": -2.5858652591705322, "loss": 0.695, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.7468408346176147, "rewards/margins": 0.8390243649482727, "rewards/rejected": -2.5858652591705322, "sft_loss": 1.7600462436676025, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 10.365173305913668, "learning_rate": 8.939445988052574e-09, "logits/chosen": -0.2458866536617279, "logits/rejected": -0.18141455948352814, "logps/chosen": -1.660651445388794, "logps/rejected": -2.544678211212158, "loss": 0.6607, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.660651445388794, "rewards/margins": 0.8840262293815613, "rewards/rejected": -2.544678211212158, "sft_loss": 1.6395816802978516, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 12.26419091119961, "learning_rate": 8.648647270676656e-09, "logits/chosen": -0.23464541137218475, "logits/rejected": -0.07829667627811432, "logps/chosen": -1.6506378650665283, "logps/rejected": -2.43139386177063, "loss": 0.6987, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6506378650665283, "rewards/margins": 0.7807559370994568, "rewards/rejected": -2.43139386177063, "sft_loss": 1.7268307209014893, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 8.79553536951941, "learning_rate": 8.362615646279991e-09, "logits/chosen": -0.40730661153793335, "logits/rejected": -0.11108116805553436, "logps/chosen": -1.6109867095947266, "logps/rejected": -2.552522659301758, "loss": 0.675, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6109867095947266, "rewards/margins": 0.9415359497070312, "rewards/rejected": -2.552522659301758, "sft_loss": 1.6381902694702148, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 13.83158951200596, "learning_rate": 8.081353889942466e-09, "logits/chosen": -0.16057129204273224, "logits/rejected": -0.0003592655120883137, "logps/chosen": -1.622968077659607, "logps/rejected": -2.276597261428833, "loss": 0.6955, "rewards/accuracies": 0.75, "rewards/chosen": -1.622968077659607, "rewards/margins": 0.6536290049552917, "rewards/rejected": -2.276597261428833, "sft_loss": 1.6610084772109985, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 10.156964618881265, "learning_rate": 7.804864730467042e-09, "logits/chosen": -0.17051255702972412, "logits/rejected": -0.0679701417684555, "logps/chosen": -1.604508399963379, "logps/rejected": -2.2647974491119385, "loss": 0.6981, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.604508399963379, "rewards/margins": 0.6602886915206909, "rewards/rejected": -2.2647974491119385, "sft_loss": 1.5778746604919434, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 7.571055225712559, "learning_rate": 7.533150850352665e-09, "logits/chosen": -0.23836679756641388, "logits/rejected": -0.05869082733988762, "logps/chosen": -1.685672402381897, "logps/rejected": -2.57939076423645, "loss": 0.6527, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.685672402381897, "rewards/margins": 0.8937181234359741, "rewards/rejected": -2.57939076423645, "sft_loss": 1.6912319660186768, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 14.611718078525094, "learning_rate": 7.2662148857686175e-09, "logits/chosen": -0.16602027416229248, "logits/rejected": -0.06961579620838165, "logps/chosen": -1.5615472793579102, "logps/rejected": -2.414815902709961, "loss": 0.6819, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5615472793579102, "rewards/margins": 0.8532684445381165, "rewards/rejected": -2.414815902709961, "sft_loss": 1.6263830661773682, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 14.819194457402386, "learning_rate": 7.0040594265287635e-09, "logits/chosen": -0.12699346244335175, "logits/rejected": -0.16053619980812073, "logps/chosen": -1.6247823238372803, "logps/rejected": -2.1847691535949707, "loss": 0.7503, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6247823238372803, "rewards/margins": 0.5599866509437561, "rewards/rejected": -2.1847691535949707, "sft_loss": 1.6163861751556396, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 10.111628055549842, "learning_rate": 6.746687016066566e-09, "logits/chosen": -0.20047006011009216, "logits/rejected": -0.14914670586585999, "logps/chosen": -1.604811429977417, "logps/rejected": -2.329857110977173, "loss": 0.685, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.604811429977417, "rewards/margins": 0.7250456809997559, "rewards/rejected": -2.329857110977173, "sft_loss": 1.5811692476272583, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 7.418681785444381, "learning_rate": 6.494100151410276e-09, "logits/chosen": -0.36672285199165344, "logits/rejected": -0.150108203291893, "logps/chosen": -1.5577113628387451, "logps/rejected": -2.3844823837280273, "loss": 0.6271, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.5577113628387451, "rewards/margins": 0.8267711400985718, "rewards/rejected": -2.3844823837280273, "sft_loss": 1.5964959859848022, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 13.317436232892733, "learning_rate": 6.246301283158728e-09, "logits/chosen": -0.14914193749427795, "logits/rejected": -0.16660656034946442, "logps/chosen": -1.686402678489685, "logps/rejected": -2.3276093006134033, "loss": 0.7294, "rewards/accuracies": 0.71875, "rewards/chosen": -1.686402678489685, "rewards/margins": 0.6412065029144287, "rewards/rejected": -2.3276093006134033, "sft_loss": 1.6596577167510986, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 12.046941556817272, "learning_rate": 6.0032928154576944e-09, "logits/chosen": -0.2786480486392975, "logits/rejected": -0.18549850583076477, "logps/chosen": -1.682329773902893, "logps/rejected": -2.3425230979919434, "loss": 0.7202, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.682329773902893, "rewards/margins": 0.6601935625076294, "rewards/rejected": -2.3425230979919434, "sft_loss": 1.7000534534454346, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 15.360483179444728, "learning_rate": 5.76507710597629e-09, "logits/chosen": -0.26658084988594055, "logits/rejected": -0.03695317357778549, "logps/chosen": -1.6402456760406494, "logps/rejected": -2.3740615844726562, "loss": 0.7122, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6402456760406494, "rewards/margins": 0.7338162660598755, "rewards/rejected": -2.3740615844726562, "sft_loss": 1.6912143230438232, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 8.564216533205856, "learning_rate": 5.531656465884438e-09, "logits/chosen": -0.351350337266922, "logits/rejected": -0.157076895236969, "logps/chosen": -1.6695470809936523, "logps/rejected": -2.4723823070526123, "loss": 0.6756, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6695470809936523, "rewards/margins": 0.8028355836868286, "rewards/rejected": -2.4723823070526123, "sft_loss": 1.7054131031036377, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 10.866380226607719, "learning_rate": 5.303033159830217e-09, "logits/chosen": -0.17400893568992615, "logits/rejected": -0.13609933853149414, "logps/chosen": -1.6208839416503906, "logps/rejected": -2.1606128215789795, "loss": 0.7352, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6208839416503906, "rewards/margins": 0.5397289991378784, "rewards/rejected": -2.1606128215789795, "sft_loss": 1.6140400171279907, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 11.45952499402734, "learning_rate": 5.079209405917939e-09, "logits/chosen": -0.21742463111877441, "logits/rejected": -0.11183615773916245, "logps/chosen": -1.5770115852355957, "logps/rejected": -2.577120780944824, "loss": 0.6835, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5770115852355957, "rewards/margins": 1.0001091957092285, "rewards/rejected": -2.577120780944824, "sft_loss": 1.6516717672348022, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 8.9853606169844, "learning_rate": 4.860187375686664e-09, "logits/chosen": -0.28926947712898254, "logits/rejected": -0.002605715300887823, "logps/chosen": -1.7459113597869873, "logps/rejected": -2.579491138458252, "loss": 0.6657, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7459113597869873, "rewards/margins": 0.8335798382759094, "rewards/rejected": -2.579491138458252, "sft_loss": 1.7989447116851807, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 11.134680820218726, "learning_rate": 4.64596919408905e-09, "logits/chosen": -0.17780157923698425, "logits/rejected": -0.06660255044698715, "logps/chosen": -1.6019551753997803, "logps/rejected": -2.3067893981933594, "loss": 0.6797, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6019551753997803, "rewards/margins": 0.7048341035842896, "rewards/rejected": -2.3067893981933594, "sft_loss": 1.6184184551239014, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 8.712754082552982, "learning_rate": 4.436556939470814e-09, "logits/chosen": -0.22522005438804626, "logits/rejected": -0.06678642332553864, "logps/chosen": -1.699501395225525, "logps/rejected": -2.326427698135376, "loss": 0.7205, "rewards/accuracies": 0.71875, "rewards/chosen": -1.699501395225525, "rewards/margins": 0.626926064491272, "rewards/rejected": -2.326427698135376, "sft_loss": 1.7224886417388916, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 10.18914812666347, "learning_rate": 4.23195264355064e-09, "logits/chosen": -0.41520434617996216, "logits/rejected": -0.17200851440429688, "logps/chosen": -1.5514239072799683, "logps/rejected": -2.3866875171661377, "loss": 0.6293, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5514239072799683, "rewards/margins": 0.8352635502815247, "rewards/rejected": -2.3866875171661377, "sft_loss": 1.6133817434310913, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 9.803494294622043, "learning_rate": 4.032158291400245e-09, "logits/chosen": -0.2982565760612488, "logits/rejected": -0.005096912384033203, "logps/chosen": -1.6078659296035767, "logps/rejected": -2.6576056480407715, "loss": 0.6313, "rewards/accuracies": 0.75, "rewards/chosen": -1.6078659296035767, "rewards/margins": 1.0497398376464844, "rewards/rejected": -2.6576056480407715, "sft_loss": 1.6127275228500366, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 11.397981001706375, "learning_rate": 3.837175821425398e-09, "logits/chosen": -0.18544739484786987, "logits/rejected": -0.11820479482412338, "logps/chosen": -1.823001503944397, "logps/rejected": -2.544473886489868, "loss": 0.7249, "rewards/accuracies": 0.71875, "rewards/chosen": -1.823001503944397, "rewards/margins": 0.7214723825454712, "rewards/rejected": -2.544473886489868, "sft_loss": 1.789475679397583, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 10.291723654185843, "learning_rate": 3.6470071253467683e-09, "logits/chosen": -0.1771518886089325, "logits/rejected": -0.028028875589370728, "logps/chosen": -1.6053473949432373, "logps/rejected": -2.5792415142059326, "loss": 0.6763, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6053473949432373, "rewards/margins": 0.9738942980766296, "rewards/rejected": -2.5792415142059326, "sft_loss": 1.6318649053573608, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 8.04377401934129, "learning_rate": 3.461654048181939e-09, "logits/chosen": -0.2549716830253601, "logits/rejected": -0.03750636801123619, "logps/chosen": -1.6649290323257446, "logps/rejected": -2.3399558067321777, "loss": 0.7077, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6649290323257446, "rewards/margins": 0.6750270128250122, "rewards/rejected": -2.3399558067321777, "sft_loss": 1.747473955154419, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 11.645250056115824, "learning_rate": 3.281118388227255e-09, "logits/chosen": -0.19613017141819, "logits/rejected": -0.1112993136048317, "logps/chosen": -1.6175925731658936, "logps/rejected": -2.315768003463745, "loss": 0.7134, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6175925731658936, "rewards/margins": 0.6981754302978516, "rewards/rejected": -2.315768003463745, "sft_loss": 1.6502103805541992, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 17.047614988161822, "learning_rate": 3.1054018970405048e-09, "logits/chosen": -0.21510866284370422, "logits/rejected": -0.051203738898038864, "logps/chosen": -1.666786551475525, "logps/rejected": -2.4820597171783447, "loss": 0.6645, "rewards/accuracies": 0.75, "rewards/chosen": -1.666786551475525, "rewards/margins": 0.815273106098175, "rewards/rejected": -2.4820597171783447, "sft_loss": 1.6489909887313843, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 11.854450460269762, "learning_rate": 2.9345062794238207e-09, "logits/chosen": -0.252532035112381, "logits/rejected": -0.043065041303634644, "logps/chosen": -1.6138187646865845, "logps/rejected": -2.4538750648498535, "loss": 0.6376, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6138187646865845, "rewards/margins": 0.840056300163269, "rewards/rejected": -2.4538750648498535, "sft_loss": 1.6333093643188477, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 12.566740841030384, "learning_rate": 2.7684331934072492e-09, "logits/chosen": -0.35212627053260803, "logits/rejected": -0.23056992888450623, "logps/chosen": -1.6019790172576904, "logps/rejected": -2.4452896118164062, "loss": 0.666, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6019790172576904, "rewards/margins": 0.8433103561401367, "rewards/rejected": -2.4452896118164062, "sft_loss": 1.6663410663604736, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 12.365286720932547, "learning_rate": 2.6071842502326526e-09, "logits/chosen": -0.2977662980556488, "logits/rejected": -0.14218056201934814, "logps/chosen": -1.6072008609771729, "logps/rejected": -2.3295769691467285, "loss": 0.6966, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6072008609771729, "rewards/margins": 0.7223759889602661, "rewards/rejected": -2.3295769691467285, "sft_loss": 1.668554663658142, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 7.908598604373969, "learning_rate": 2.450761014337888e-09, "logits/chosen": -0.040316391736269, "logits/rejected": -0.00786541122943163, "logps/chosen": -1.6507285833358765, "logps/rejected": -2.703517436981201, "loss": 0.6721, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6507285833358765, "rewards/margins": 1.0527892112731934, "rewards/rejected": -2.703517436981201, "sft_loss": 1.671103835105896, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 12.738237371486237, "learning_rate": 2.299165003341985e-09, "logits/chosen": -0.12731394171714783, "logits/rejected": -0.02375071682035923, "logps/chosen": -1.678500771522522, "logps/rejected": -2.51442289352417, "loss": 0.6842, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.678500771522522, "rewards/margins": 0.8359218835830688, "rewards/rejected": -2.51442289352417, "sft_loss": 1.6660511493682861, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 11.662885763472985, "learning_rate": 2.1523976880299945e-09, "logits/chosen": -0.2795417904853821, "logits/rejected": -0.07758977264165878, "logps/chosen": -1.678045630455017, "logps/rejected": -2.3479385375976562, "loss": 0.7202, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.678045630455017, "rewards/margins": 0.6698927879333496, "rewards/rejected": -2.3479385375976562, "sft_loss": 1.6801170110702515, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 9.4667501965755, "learning_rate": 2.010460492339161e-09, "logits/chosen": -0.23939184844493866, "logits/rejected": -0.10171794891357422, "logps/chosen": -1.620661973953247, "logps/rejected": -2.4587013721466064, "loss": 0.6685, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.620661973953247, "rewards/margins": 0.8380392789840698, "rewards/rejected": -2.4587013721466064, "sft_loss": 1.6676385402679443, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 8.356843634981466, "learning_rate": 1.8733547933446614e-09, "logits/chosen": -0.32459867000579834, "logits/rejected": -0.09155640751123428, "logps/chosen": -1.745084524154663, "logps/rejected": -2.3966400623321533, "loss": 0.6998, "rewards/accuracies": 0.71875, "rewards/chosen": -1.745084524154663, "rewards/margins": 0.6515554189682007, "rewards/rejected": -2.3966400623321533, "sft_loss": 1.6971805095672607, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 19.240441807451575, "learning_rate": 1.7410819212467231e-09, "logits/chosen": -0.20487406849861145, "logits/rejected": -0.1029997244477272, "logps/chosen": -1.6266998052597046, "logps/rejected": -2.2977185249328613, "loss": 0.7137, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6266998052597046, "rewards/margins": 0.6710187196731567, "rewards/rejected": -2.2977185249328613, "sft_loss": 1.6257264614105225, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 14.36525778533676, "learning_rate": 1.613643159357192e-09, "logits/chosen": -0.17981144785881042, "logits/rejected": -0.22500362992286682, "logps/chosen": -1.5425150394439697, "logps/rejected": -2.2829604148864746, "loss": 0.6873, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5425150394439697, "rewards/margins": 0.7404453158378601, "rewards/rejected": -2.2829604148864746, "sft_loss": 1.6271302700042725, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 9.035647174343278, "learning_rate": 1.4910397440875967e-09, "logits/chosen": -0.231882244348526, "logits/rejected": -0.09338878095149994, "logps/chosen": -1.6521456241607666, "logps/rejected": -2.373229503631592, "loss": 0.7056, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6521456241607666, "rewards/margins": 0.7210837602615356, "rewards/rejected": -2.373229503631592, "sft_loss": 1.700937032699585, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 10.455888219471658, "learning_rate": 1.3732728649368253e-09, "logits/chosen": -0.1631014347076416, "logits/rejected": 0.04565007612109184, "logps/chosen": -1.5792930126190186, "logps/rejected": -2.292510747909546, "loss": 0.673, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5792930126190186, "rewards/margins": 0.7132178544998169, "rewards/rejected": -2.292510747909546, "sft_loss": 1.6217149496078491, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 15.300238915117614, "learning_rate": 1.260343664479524e-09, "logits/chosen": -0.23477379977703094, "logits/rejected": -0.17607273161411285, "logps/chosen": -1.5679047107696533, "logps/rejected": -2.3857674598693848, "loss": 0.6739, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5679047107696533, "rewards/margins": 0.8178626894950867, "rewards/rejected": -2.3857674598693848, "sft_loss": 1.6498298645019531, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 9.827608153796792, "learning_rate": 1.1522532383554384e-09, "logits/chosen": -0.3342897891998291, "logits/rejected": -0.0910370871424675, "logps/chosen": -1.5775277614593506, "logps/rejected": -2.4565913677215576, "loss": 0.6527, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5775277614593506, "rewards/margins": 0.8790637254714966, "rewards/rejected": -2.4565913677215576, "sft_loss": 1.6550567150115967, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 7.955063752152563, "learning_rate": 1.049002635258256e-09, "logits/chosen": -0.18442848324775696, "logits/rejected": -0.05632861703634262, "logps/chosen": -1.7048654556274414, "logps/rejected": -2.3924946784973145, "loss": 0.7247, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7048654556274414, "rewards/margins": 0.6876288652420044, "rewards/rejected": -2.3924946784973145, "sft_loss": 1.6946197748184204, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 13.794215377977427, "learning_rate": 9.505928569258358e-10, "logits/chosen": -0.15591557323932648, "logits/rejected": -0.14203740656375885, "logps/chosen": -1.6669740676879883, "logps/rejected": -2.3792734146118164, "loss": 0.6925, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6669740676879883, "rewards/margins": 0.7122994661331177, "rewards/rejected": -2.3792734146118164, "sft_loss": 1.684309720993042, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 9.396878057251937, "learning_rate": 8.57024858130273e-10, "logits/chosen": -0.2930406928062439, "logits/rejected": -0.14096228778362274, "logps/chosen": -1.6486921310424805, "logps/rejected": -2.6836915016174316, "loss": 0.649, "rewards/accuracies": 0.75, "rewards/chosen": -1.6486921310424805, "rewards/margins": 1.0349994897842407, "rewards/rejected": -2.6836915016174316, "sft_loss": 1.6713956594467163, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 16.47893453594267, "learning_rate": 7.682995466686826e-10, "logits/chosen": -0.3719504475593567, "logits/rejected": -0.20318636298179626, "logps/chosen": -1.6200330257415771, "logps/rejected": -2.5011987686157227, "loss": 0.69, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6200330257415771, "rewards/margins": 0.8811656832695007, "rewards/rejected": -2.5011987686157227, "sft_loss": 1.6898200511932373, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 11.77005074058629, "learning_rate": 6.844177833543741e-10, "logits/chosen": -0.23018448054790497, "logits/rejected": -0.15001408755779266, "logps/chosen": -1.608838677406311, "logps/rejected": -2.278439998626709, "loss": 0.7093, "rewards/accuracies": 0.71875, "rewards/chosen": -1.608838677406311, "rewards/margins": 0.6696012616157532, "rewards/rejected": -2.278439998626709, "sft_loss": 1.6235387325286865, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 11.867183733154901, "learning_rate": 6.053803820087467e-10, "logits/chosen": -0.2350422888994217, "logits/rejected": -0.080734483897686, "logps/chosen": -1.7380975484848022, "logps/rejected": -2.6187663078308105, "loss": 0.6935, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7380975484848022, "rewards/margins": 0.8806688189506531, "rewards/rejected": -2.6187663078308105, "sft_loss": 1.7597763538360596, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 10.753011070016338, "learning_rate": 5.311881094528514e-10, "logits/chosen": -0.29815274477005005, "logits/rejected": -0.026742761954665184, "logps/chosen": -1.7312679290771484, "logps/rejected": -2.3701531887054443, "loss": 0.7204, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.7312679290771484, "rewards/margins": 0.6388850808143616, "rewards/rejected": -2.3701531887054443, "sft_loss": 1.7271066904067993, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 14.060771476595141, "learning_rate": 4.6184168550050806e-10, "logits/chosen": -0.2242444008588791, "logits/rejected": -0.1685485690832138, "logps/chosen": -1.606527328491211, "logps/rejected": -2.303962230682373, "loss": 0.7274, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.606527328491211, "rewards/margins": 0.6974350810050964, "rewards/rejected": -2.303962230682373, "sft_loss": 1.6881955862045288, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 11.791099919953691, "learning_rate": 3.973417829510328e-10, "logits/chosen": -0.34817785024642944, "logits/rejected": -0.1696995198726654, "logps/chosen": -1.6621816158294678, "logps/rejected": -2.3796112537384033, "loss": 0.7049, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6621816158294678, "rewards/margins": 0.717429518699646, "rewards/rejected": -2.3796112537384033, "sft_loss": 1.607971429824829, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 17.54746310334012, "learning_rate": 3.3768902758274377e-10, "logits/chosen": -0.24411115050315857, "logits/rejected": -0.10472285747528076, "logps/chosen": -1.560499906539917, "logps/rejected": -2.2243080139160156, "loss": 0.6908, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.560499906539917, "rewards/margins": 0.6638081073760986, "rewards/rejected": -2.2243080139160156, "sft_loss": 1.581453561782837, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 9.18716073819998, "learning_rate": 2.8288399814691e-10, "logits/chosen": -0.1307852566242218, "logits/rejected": -0.013145471923053265, "logps/chosen": -1.6680675745010376, "logps/rejected": -2.4653637409210205, "loss": 0.684, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6680675745010376, "rewards/margins": 0.7972962260246277, "rewards/rejected": -2.4653637409210205, "sft_loss": 1.7134125232696533, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 15.975153218637699, "learning_rate": 2.3292722636220066e-10, "logits/chosen": -0.2370811253786087, "logits/rejected": 0.010354919359087944, "logps/chosen": -1.7212677001953125, "logps/rejected": -2.5519747734069824, "loss": 0.6996, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7212677001953125, "rewards/margins": 0.8307073712348938, "rewards/rejected": -2.5519747734069824, "sft_loss": 1.7108638286590576, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 13.247806168647509, "learning_rate": 1.8781919690946668e-10, "logits/chosen": -0.15307848155498505, "logits/rejected": -0.12351334095001221, "logps/chosen": -1.6680986881256104, "logps/rejected": -2.2105205059051514, "loss": 0.7765, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6680986881256104, "rewards/margins": 0.5424218773841858, "rewards/rejected": -2.2105205059051514, "sft_loss": 1.7354910373687744, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 13.470890059957924, "learning_rate": 1.4756034742696711e-10, "logits/chosen": -0.28426069021224976, "logits/rejected": -0.19852329790592194, "logps/chosen": -1.6065629720687866, "logps/rejected": -2.3380188941955566, "loss": 0.6914, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6065629720687866, "rewards/margins": 0.73145592212677, "rewards/rejected": -2.3380188941955566, "sft_loss": 1.6260297298431396, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 9.677269647741532, "learning_rate": 1.12151068506261e-10, "logits/chosen": -0.2000984400510788, "logits/rejected": -0.04230368882417679, "logps/chosen": -1.562959909439087, "logps/rejected": -2.6095166206359863, "loss": 0.6456, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.562959909439087, "rewards/margins": 1.0465565919876099, "rewards/rejected": -2.6095166206359863, "sft_loss": 1.640472650527954, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 9.571074953012484, "learning_rate": 8.159170368826629e-11, "logits/chosen": -0.22703365981578827, "logits/rejected": -0.0405532531440258, "logps/chosen": -1.5831021070480347, "logps/rejected": -2.4356350898742676, "loss": 0.6828, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5831021070480347, "rewards/margins": 0.8525330424308777, "rewards/rejected": -2.4356350898742676, "sft_loss": 1.646332025527954, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 8.333944897865214, "learning_rate": 5.588254946015114e-11, "logits/chosen": -0.34473010897636414, "logits/rejected": -0.02939082682132721, "logps/chosen": -1.5874412059783936, "logps/rejected": -2.3952198028564453, "loss": 0.6764, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.5874412059783936, "rewards/margins": 0.8077786564826965, "rewards/rejected": -2.3952198028564453, "sft_loss": 1.6250718832015991, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 6.594725031948847, "learning_rate": 3.502385525216978e-11, "logits/chosen": -0.28364163637161255, "logits/rejected": -0.08140794187784195, "logps/chosen": -1.656720519065857, "logps/rejected": -2.571990489959717, "loss": 0.654, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.656720519065857, "rewards/margins": 0.9152697324752808, "rewards/rejected": -2.571990489959717, "sft_loss": 1.7422959804534912, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 8.628784604656813, "learning_rate": 1.901582343555308e-11, "logits/chosen": -0.18402740359306335, "logits/rejected": -0.11157803237438202, "logps/chosen": -1.7139402627944946, "logps/rejected": -2.429527521133423, "loss": 0.7189, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7139402627944946, "rewards/margins": 0.7155871987342834, "rewards/rejected": -2.429527521133423, "sft_loss": 1.6843442916870117, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 11.381273445991754, "learning_rate": 7.858609320232634e-12, "logits/chosen": -0.23412509262561798, "logits/rejected": -0.038752295076847076, "logps/chosen": -1.553330421447754, "logps/rejected": -2.387714385986328, "loss": 0.6587, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.553330421447754, "rewards/margins": 0.8343838453292847, "rewards/rejected": -2.387714385986328, "sft_loss": 1.6010615825653076, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 15.32720262685647, "learning_rate": 1.5523211535639624e-12, "logits/chosen": -0.22002415359020233, "logits/rejected": -0.0756683275103569, "logps/chosen": -1.6602970361709595, "logps/rejected": -2.6124515533447266, "loss": 0.6951, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6602970361709595, "rewards/margins": 0.9521546363830566, "rewards/rejected": -2.6124515533447266, "sft_loss": 1.7284297943115234, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.18746614456176758, "eval_logits/rejected": 0.3000437915325165, "eval_logps/chosen": -1.7146235704421997, "eval_logps/rejected": -2.372737169265747, "eval_loss": 0.7405083179473877, "eval_rewards/accuracies": 0.6772996783256531, "eval_rewards/chosen": -1.7146235704421997, "eval_rewards/margins": 0.6581135392189026, "eval_rewards/rejected": -2.372737169265747, "eval_runtime": 48.5263, "eval_samples_per_second": 27.717, "eval_sft_loss": 1.6847718954086304, "eval_steps_per_second": 6.945, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 0.7456102068299996, "train_runtime": 34974.0603, "train_samples_per_second": 5.129, "train_steps_per_second": 0.16 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }