{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 13.0625, "learning_rate": 1.282051282051282e-07, "logits/chosen": 88.18099975585938, "logits/rejected": 88.25153350830078, "logps/chosen": -29.073104858398438, "logps/rejected": -26.25731658935547, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 10.875, "learning_rate": 1.282051282051282e-06, "logits/chosen": 81.074951171875, "logits/rejected": 80.7809829711914, "logps/chosen": -34.20733642578125, "logps/rejected": -32.97297668457031, "loss": 0.9951, "rewards/accuracies": 0.4583333432674408, "rewards/chosen": 0.003438829444348812, "rewards/margins": 0.004895869642496109, "rewards/rejected": -0.0014570390339940786, "step": 10 }, { "epoch": 0.05, "grad_norm": 12.6875, "learning_rate": 2.564102564102564e-06, "logits/chosen": 80.64326477050781, "logits/rejected": 80.53416442871094, "logps/chosen": -33.720584869384766, "logps/rejected": -30.82167625427246, "loss": 0.9986, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.003067571669816971, "rewards/margins": 0.0013512909645214677, "rewards/rejected": -0.004418861120939255, "step": 20 }, { "epoch": 0.08, "grad_norm": 12.0, "learning_rate": 3.846153846153847e-06, "logits/chosen": 82.51445007324219, "logits/rejected": 82.54810333251953, "logps/chosen": -33.81728744506836, "logps/rejected": -31.204355239868164, "loss": 1.0024, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.019553204998373985, "rewards/margins": -0.002394508570432663, "rewards/rejected": 0.0219477117061615, "step": 30 }, { "epoch": 0.1, "grad_norm": 11.8125, "learning_rate": 4.999896948438434e-06, "logits/chosen": 81.101806640625, "logits/rejected": 81.09938049316406, "logps/chosen": -32.73223876953125, "logps/rejected": -33.143699645996094, "loss": 0.9783, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.04418279603123665, "rewards/margins": 0.02170029655098915, "rewards/rejected": 0.022482499480247498, "step": 40 }, { "epoch": 0.13, "grad_norm": 11.9375, "learning_rate": 4.987541037542187e-06, "logits/chosen": 78.82136535644531, "logits/rejected": 78.82911682128906, "logps/chosen": -30.3783016204834, "logps/rejected": -30.641677856445312, "loss": 0.9605, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.082694411277771, "rewards/margins": 0.039533428847789764, "rewards/rejected": 0.04316098242998123, "step": 50 }, { "epoch": 0.16, "grad_norm": 9.75, "learning_rate": 4.954691471941119e-06, "logits/chosen": 83.49494171142578, "logits/rejected": 83.55232238769531, "logps/chosen": -30.781469345092773, "logps/rejected": -29.190662384033203, "loss": 1.0006, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.03936903923749924, "rewards/margins": -0.0006168211111798882, "rewards/rejected": 0.039985861629247665, "step": 60 }, { "epoch": 0.18, "grad_norm": 12.0625, "learning_rate": 4.901618883413549e-06, "logits/chosen": 84.1231918334961, "logits/rejected": 84.15650939941406, "logps/chosen": -30.209863662719727, "logps/rejected": -32.619781494140625, "loss": 0.9999, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.041915904730558395, "rewards/margins": 0.000123101839562878, "rewards/rejected": 0.04179280251264572, "step": 70 }, { "epoch": 0.21, "grad_norm": 12.375, "learning_rate": 4.828760511501322e-06, "logits/chosen": 81.8515853881836, "logits/rejected": 81.83155822753906, "logps/chosen": -30.982410430908203, "logps/rejected": -30.617040634155273, "loss": 0.9618, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.05225307494401932, "rewards/margins": 0.03819245845079422, "rewards/rejected": 0.014060619287192822, "step": 80 }, { "epoch": 0.23, "grad_norm": 14.75, "learning_rate": 4.7367166013034295e-06, "logits/chosen": 78.68502807617188, "logits/rejected": 78.65934753417969, "logps/chosen": -32.18014144897461, "logps/rejected": -30.878421783447266, "loss": 0.9627, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.04580371826887131, "rewards/margins": 0.03849860280752182, "rewards/rejected": 0.007305114530026913, "step": 90 }, { "epoch": 0.26, "grad_norm": 12.375, "learning_rate": 4.626245458345211e-06, "logits/chosen": 83.6978530883789, "logits/rejected": 83.72080993652344, "logps/chosen": -33.769813537597656, "logps/rejected": -31.638240814208984, "loss": 0.9577, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.05351760983467102, "rewards/margins": 0.04228735715150833, "rewards/rejected": 0.01123025082051754, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": 98.71279907226562, "eval_logits/rejected": 98.70589447021484, "eval_logps/chosen": -32.32750701904297, "eval_logps/rejected": -35.914642333984375, "eval_loss": 0.993482768535614, "eval_rewards/accuracies": 0.5066444873809814, "eval_rewards/chosen": 0.011567190289497375, "eval_rewards/margins": 0.006378817837685347, "eval_rewards/rejected": 0.005188372451812029, "eval_runtime": 104.0772, "eval_samples_per_second": 3.296, "eval_steps_per_second": 0.413, "step": 100 }, { "epoch": 0.29, "grad_norm": 14.125, "learning_rate": 4.498257201263691e-06, "logits/chosen": 83.8444595336914, "logits/rejected": 83.72880554199219, "logps/chosen": -32.138458251953125, "logps/rejected": -32.649452209472656, "loss": 0.9122, "rewards/accuracies": 0.625, "rewards/chosen": 0.08843465149402618, "rewards/margins": 0.0877910926938057, "rewards/rejected": 0.0006435603136196733, "step": 110 }, { "epoch": 0.31, "grad_norm": 13.0, "learning_rate": 4.353806263777678e-06, "logits/chosen": 83.81868743896484, "logits/rejected": 83.92826080322266, "logps/chosen": -28.111730575561523, "logps/rejected": -35.34394454956055, "loss": 0.9297, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.08178504556417465, "rewards/margins": 0.07026515156030655, "rewards/rejected": 0.011519892141222954, "step": 120 }, { "epoch": 0.34, "grad_norm": 9.375, "learning_rate": 4.1940827077152755e-06, "logits/chosen": 80.93681335449219, "logits/rejected": 80.97130584716797, "logps/chosen": -30.195592880249023, "logps/rejected": -31.842870712280273, "loss": 0.939, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.07117608934640884, "rewards/margins": 0.06230046600103378, "rewards/rejected": 0.008875617757439613, "step": 130 }, { "epoch": 0.36, "grad_norm": 11.0625, "learning_rate": 4.0204024186666215e-06, "logits/chosen": 81.7886962890625, "logits/rejected": 81.80296325683594, "logps/chosen": -26.81143569946289, "logps/rejected": -32.920143127441406, "loss": 0.8867, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0635889396071434, "rewards/margins": 0.11326569318771362, "rewards/rejected": -0.04967674985527992, "step": 140 }, { "epoch": 0.39, "grad_norm": 12.1875, "learning_rate": 3.834196265035119e-06, "logits/chosen": 80.10134887695312, "logits/rejected": 80.06209564208984, "logps/chosen": -28.821029663085938, "logps/rejected": -33.226470947265625, "loss": 0.8772, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.05680803209543228, "rewards/margins": 0.1228049248456955, "rewards/rejected": -0.06599690765142441, "step": 150 }, { "epoch": 0.42, "grad_norm": 13.4375, "learning_rate": 3.636998309800573e-06, "logits/chosen": 81.63700866699219, "logits/rejected": 81.66020202636719, "logps/chosen": -34.080284118652344, "logps/rejected": -30.817296981811523, "loss": 0.9079, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.005646559409797192, "rewards/margins": 0.09212217479944229, "rewards/rejected": -0.08647561073303223, "step": 160 }, { "epoch": 0.44, "grad_norm": 15.125, "learning_rate": 3.4304331721118078e-06, "logits/chosen": 82.27827453613281, "logits/rejected": 82.23299407958984, "logps/chosen": -31.016094207763672, "logps/rejected": -33.03407669067383, "loss": 0.8743, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.024319300428032875, "rewards/margins": 0.12572081387043, "rewards/rejected": -0.10140150785446167, "step": 170 }, { "epoch": 0.47, "grad_norm": 12.5, "learning_rate": 3.2162026428305436e-06, "logits/chosen": 79.44108581542969, "logits/rejected": 79.4173355102539, "logps/chosen": -30.948467254638672, "logps/rejected": -32.03376007080078, "loss": 0.8875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.02486853487789631, "rewards/margins": 0.1125236377120018, "rewards/rejected": -0.08765510469675064, "step": 180 }, { "epoch": 0.49, "grad_norm": 9.6875, "learning_rate": 2.996071664294641e-06, "logits/chosen": 80.90037536621094, "logits/rejected": 80.8733901977539, "logps/chosen": -30.740875244140625, "logps/rejected": -31.226177215576172, "loss": 0.9221, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.002553999889642, "rewards/margins": 0.07836906611919403, "rewards/rejected": -0.07581506669521332, "step": 190 }, { "epoch": 0.52, "grad_norm": 14.4375, "learning_rate": 2.7718537898066833e-06, "logits/chosen": 76.02571105957031, "logits/rejected": 75.9735107421875, "logps/chosen": -34.12778854370117, "logps/rejected": -33.30614471435547, "loss": 0.8562, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0624953992664814, "rewards/margins": 0.14376921951770782, "rewards/rejected": -0.08127383887767792, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": 97.97191619873047, "eval_logits/rejected": 97.95458984375, "eval_logps/chosen": -32.94685363769531, "eval_logps/rejected": -36.8216552734375, "eval_loss": 0.9646754860877991, "eval_rewards/accuracies": 0.5676910281181335, "eval_rewards/chosen": -0.05036771669983864, "eval_rewards/margins": 0.035145342350006104, "eval_rewards/rejected": -0.08551305532455444, "eval_runtime": 103.8205, "eval_samples_per_second": 3.304, "eval_steps_per_second": 0.414, "step": 200 }, { "epoch": 0.55, "grad_norm": 18.625, "learning_rate": 2.5453962426402006e-06, "logits/chosen": 78.56858825683594, "logits/rejected": 78.47276306152344, "logps/chosen": -33.586204528808594, "logps/rejected": -36.05295944213867, "loss": 0.8633, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0359516479074955, "rewards/margins": 0.1426382064819336, "rewards/rejected": -0.1066865548491478, "step": 210 }, { "epoch": 0.57, "grad_norm": 14.6875, "learning_rate": 2.3185646976551794e-06, "logits/chosen": 80.53726959228516, "logits/rejected": 80.62950134277344, "logps/chosen": -31.454355239868164, "logps/rejected": -32.02970504760742, "loss": 0.8286, "rewards/accuracies": 0.6875, "rewards/chosen": 0.049460187554359436, "rewards/margins": 0.17953529953956604, "rewards/rejected": -0.1300750970840454, "step": 220 }, { "epoch": 0.6, "grad_norm": 14.0, "learning_rate": 2.0932279108998323e-06, "logits/chosen": 77.57215881347656, "logits/rejected": 77.61759948730469, "logps/chosen": -32.74131393432617, "logps/rejected": -35.29151153564453, "loss": 0.8713, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.006443141493946314, "rewards/margins": 0.13437876105308533, "rewards/rejected": -0.12793561816215515, "step": 230 }, { "epoch": 0.62, "grad_norm": 16.5, "learning_rate": 1.8712423238279358e-06, "logits/chosen": 79.76509094238281, "logits/rejected": 80.07305908203125, "logps/chosen": -31.252161026000977, "logps/rejected": -32.75459671020508, "loss": 0.8345, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.04359050467610359, "rewards/margins": 0.16584597527980804, "rewards/rejected": -0.12225550413131714, "step": 240 }, { "epoch": 0.65, "grad_norm": 15.0625, "learning_rate": 1.6544367689701824e-06, "logits/chosen": 78.32225036621094, "logits/rejected": 78.37224578857422, "logps/chosen": -27.815731048583984, "logps/rejected": -31.077539443969727, "loss": 0.9043, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02198370359838009, "rewards/margins": 0.09569612890481949, "rewards/rejected": -0.11767983436584473, "step": 250 }, { "epoch": 0.68, "grad_norm": 13.75, "learning_rate": 1.4445974030621963e-06, "logits/chosen": 75.31268310546875, "logits/rejected": 75.46823120117188, "logps/chosen": -31.16106605529785, "logps/rejected": -38.22980880737305, "loss": 0.7589, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.039992958307266235, "rewards/margins": 0.2528113126754761, "rewards/rejected": -0.21281830966472626, "step": 260 }, { "epoch": 0.7, "grad_norm": 13.1875, "learning_rate": 1.243452991757889e-06, "logits/chosen": 74.42684173583984, "logits/rejected": 74.45647430419922, "logps/chosen": -31.796234130859375, "logps/rejected": -33.113197326660156, "loss": 0.8432, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0017309554386883974, "rewards/margins": 0.15904627740383148, "rewards/rejected": -0.1607772409915924, "step": 270 }, { "epoch": 0.73, "grad_norm": 19.75, "learning_rate": 1.0526606671603523e-06, "logits/chosen": 77.391845703125, "logits/rejected": 77.1694564819336, "logps/chosen": -32.110328674316406, "logps/rejected": -30.959293365478516, "loss": 0.9109, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.030043601989746094, "rewards/margins": 0.09918614476919174, "rewards/rejected": -0.12922975420951843, "step": 280 }, { "epoch": 0.75, "grad_norm": 13.1875, "learning_rate": 8.737922755071455e-07, "logits/chosen": 77.39460754394531, "logits/rejected": 77.31553649902344, "logps/chosen": -33.84746551513672, "logps/rejected": -34.28327178955078, "loss": 0.7586, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.02567802369594574, "rewards/margins": 0.2523185908794403, "rewards/rejected": -0.22664058208465576, "step": 290 }, { "epoch": 0.78, "grad_norm": 14.8125, "learning_rate": 7.08321427484816e-07, "logits/chosen": 73.05286407470703, "logits/rejected": 73.1805648803711, "logps/chosen": -33.01818084716797, "logps/rejected": -30.520904541015625, "loss": 0.8271, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.03875284641981125, "rewards/margins": 0.18064062297344208, "rewards/rejected": -0.14188775420188904, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": 97.71524047851562, "eval_logits/rejected": 97.68975067138672, "eval_logps/chosen": -33.206886291503906, "eval_logps/rejected": -37.196205139160156, "eval_loss": 0.9533767700195312, "eval_rewards/accuracies": 0.5714285373687744, "eval_rewards/chosen": -0.07637124508619308, "eval_rewards/margins": 0.04659651592373848, "eval_rewards/rejected": -0.12296776473522186, "eval_runtime": 104.0124, "eval_samples_per_second": 3.298, "eval_steps_per_second": 0.413, "step": 300 }, { "epoch": 0.81, "grad_norm": 13.0, "learning_rate": 5.576113578589035e-07, "logits/chosen": 80.51168060302734, "logits/rejected": 80.51959228515625, "logps/chosen": -30.940738677978516, "logps/rejected": -33.976158142089844, "loss": 0.841, "rewards/accuracies": 0.6875, "rewards/chosen": -0.00772014120593667, "rewards/margins": 0.1699191778898239, "rewards/rejected": -0.17763930559158325, "step": 310 }, { "epoch": 0.83, "grad_norm": 14.9375, "learning_rate": 4.229036944380913e-07, "logits/chosen": 77.665771484375, "logits/rejected": 77.6807861328125, "logps/chosen": -31.073196411132812, "logps/rejected": -30.321044921875, "loss": 0.8202, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.05253634601831436, "rewards/margins": 0.19269177317619324, "rewards/rejected": -0.14015543460845947, "step": 320 }, { "epoch": 0.86, "grad_norm": 16.375, "learning_rate": 3.053082288996112e-07, "logits/chosen": 74.7007064819336, "logits/rejected": 74.74031829833984, "logps/chosen": -29.66641616821289, "logps/rejected": -34.42815399169922, "loss": 0.7549, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.06861907988786697, "rewards/margins": 0.24794825911521912, "rewards/rejected": -0.17932915687561035, "step": 330 }, { "epoch": 0.88, "grad_norm": 17.125, "learning_rate": 2.0579377374915805e-07, "logits/chosen": 79.32682800292969, "logits/rejected": 79.36729431152344, "logps/chosen": -33.05027770996094, "logps/rejected": -35.377281188964844, "loss": 0.7972, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.01618681475520134, "rewards/margins": 0.21704569458961487, "rewards/rejected": -0.20085887610912323, "step": 340 }, { "epoch": 0.91, "grad_norm": 15.5, "learning_rate": 1.2518018074041684e-07, "logits/chosen": 78.2139663696289, "logits/rejected": 78.22695922851562, "logps/chosen": -33.175575256347656, "logps/rejected": -34.91777801513672, "loss": 0.7788, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.04681625962257385, "rewards/margins": 0.22941403090953827, "rewards/rejected": -0.18259775638580322, "step": 350 }, { "epoch": 0.94, "grad_norm": 11.6875, "learning_rate": 6.41315865106129e-08, "logits/chosen": 79.90480041503906, "logits/rejected": 79.93946075439453, "logps/chosen": -28.85941505432129, "logps/rejected": -33.032447814941406, "loss": 0.8014, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.06927979737520218, "rewards/margins": 0.20753948390483856, "rewards/rejected": -0.13825969398021698, "step": 360 }, { "epoch": 0.96, "grad_norm": 12.8125, "learning_rate": 2.3150941078050325e-08, "logits/chosen": 79.3287582397461, "logits/rejected": 79.33840942382812, "logps/chosen": -33.099693298339844, "logps/rejected": -37.02233123779297, "loss": 0.8425, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.022353025153279305, "rewards/margins": 0.16894304752349854, "rewards/rejected": -0.1912960708141327, "step": 370 }, { "epoch": 0.99, "grad_norm": 13.25, "learning_rate": 2.575864278703266e-09, "logits/chosen": 72.94795227050781, "logits/rejected": 72.81517791748047, "logps/chosen": -30.674768447875977, "logps/rejected": -29.731273651123047, "loss": 0.8564, "rewards/accuracies": 0.6875, "rewards/chosen": -0.009864235296845436, "rewards/margins": 0.14720600843429565, "rewards/rejected": -0.15707024931907654, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.88636748450143, "train_runtime": 2554.6221, "train_samples_per_second": 1.205, "train_steps_per_second": 0.151 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }