{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-08, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.2820512820512818e-07, "logits/chosen": -1.8663291931152344, "logits/rejected": -1.870633602142334, "logps/chosen": -37.00596237182617, "logps/rejected": -33.65876007080078, "loss": 0.688, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.0002556463878136128, "rewards/margins": 0.011729677207767963, "rewards/rejected": -0.011474031955003738, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.5641025641025636e-07, "logits/chosen": -1.997443437576294, "logits/rejected": -2.000075340270996, "logps/chosen": -29.63150405883789, "logps/rejected": -29.064098358154297, "loss": 0.6915, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.004276981111615896, "rewards/margins": 0.004539952147752047, "rewards/rejected": -0.0002629714144859463, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.8461538461538463e-07, "logits/chosen": -1.920101523399353, "logits/rejected": -1.9174143075942993, "logps/chosen": -31.427581787109375, "logps/rejected": -33.214088439941406, "loss": 0.6971, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.004594600293785334, "rewards/margins": -0.006341943051666021, "rewards/rejected": 0.0017473434563726187, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438433e-07, "logits/chosen": -2.0168609619140625, "logits/rejected": -2.008105754852295, "logps/chosen": -32.59783935546875, "logps/rejected": -32.4984130859375, "loss": 0.701, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.008411901071667671, "rewards/margins": -0.014114728197455406, "rewards/rejected": 0.00570282619446516, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542186e-07, "logits/chosen": -1.864262342453003, "logits/rejected": -1.8534845113754272, "logps/chosen": -33.57244873046875, "logps/rejected": -35.4432373046875, "loss": 0.6987, "rewards/accuracies": 0.4375, "rewards/chosen": -0.005986724980175495, "rewards/margins": -0.00962964165955782, "rewards/rejected": 0.0036429166793823242, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941118e-07, "logits/chosen": -1.9451322555541992, "logits/rejected": -1.9470863342285156, "logps/chosen": -32.58552551269531, "logps/rejected": -33.180747985839844, "loss": 0.689, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.005862490274012089, "rewards/margins": 0.009610554203391075, "rewards/rejected": -0.003748063463717699, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413548e-07, "logits/chosen": -2.0796828269958496, "logits/rejected": -2.0846545696258545, "logps/chosen": -33.98699188232422, "logps/rejected": -36.566322326660156, "loss": 0.6942, "rewards/accuracies": 0.5, "rewards/chosen": 0.000597482081502676, "rewards/margins": -0.00042274221777915955, "rewards/rejected": 0.0010202232515439391, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-07, "logits/chosen": -1.9422571659088135, "logits/rejected": -1.945412278175354, "logps/chosen": -34.415496826171875, "logps/rejected": -34.577476501464844, "loss": 0.6947, "rewards/accuracies": 0.5, "rewards/chosen": 0.004924021661281586, "rewards/margins": -0.0010487461695447564, "rewards/rejected": 0.00597276771441102, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.736716601303429e-07, "logits/chosen": -1.9508498907089233, "logits/rejected": -1.955348253250122, "logps/chosen": -32.44896697998047, "logps/rejected": -32.381385803222656, "loss": 0.6855, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0073806666769087315, "rewards/margins": 0.01637081615626812, "rewards/rejected": -0.00899015087634325, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.62624545834521e-07, "logits/chosen": -2.0491819381713867, "logits/rejected": -2.0471725463867188, "logps/chosen": -32.225120544433594, "logps/rejected": -31.267704010009766, "loss": 0.6904, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0062875039875507355, "rewards/margins": 0.006883688271045685, "rewards/rejected": -0.0005961857968941331, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.243363618850708, "eval_logits/rejected": -2.2384824752807617, "eval_logps/chosen": -34.026004791259766, "eval_logps/rejected": -37.51805114746094, "eval_loss": 0.6918829679489136, "eval_rewards/accuracies": 0.5427741408348083, "eval_rewards/chosen": 0.0034199401270598173, "eval_rewards/margins": 0.003991155419498682, "eval_rewards/rejected": -0.0005712147103622556, "eval_runtime": 146.1635, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.294, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.4982572012636904e-07, "logits/chosen": -2.0055806636810303, "logits/rejected": -2.003166675567627, "logps/chosen": -33.227996826171875, "logps/rejected": -34.02782440185547, "loss": 0.6912, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0063266693614423275, "rewards/margins": 0.0057087307795882225, "rewards/rejected": 0.0006179373594932258, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777677e-07, "logits/chosen": -2.016270160675049, "logits/rejected": -2.0078837871551514, "logps/chosen": -32.447757720947266, "logps/rejected": -32.19353485107422, "loss": 0.6881, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0011401770170778036, "rewards/margins": 0.011673182249069214, "rewards/rejected": -0.012813357636332512, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.194082707715275e-07, "logits/chosen": -2.04642653465271, "logits/rejected": -2.038364887237549, "logps/chosen": -30.48968505859375, "logps/rejected": -32.04593276977539, "loss": 0.6979, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.005588879343122244, "rewards/margins": -0.008417831733822823, "rewards/rejected": 0.0028289512265473604, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.020402418666621e-07, "logits/chosen": -1.9769046306610107, "logits/rejected": -1.9871702194213867, "logps/chosen": -31.396442413330078, "logps/rejected": -32.558387756347656, "loss": 0.6847, "rewards/accuracies": 0.5625, "rewards/chosen": 0.009287373162806034, "rewards/margins": 0.01869189366698265, "rewards/rejected": -0.009404524229466915, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.8341962650351185e-07, "logits/chosen": -1.8908485174179077, "logits/rejected": -1.8919233083724976, "logps/chosen": -34.16999053955078, "logps/rejected": -34.78479766845703, "loss": 0.6841, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.009080578573048115, "rewards/margins": 0.02009742520749569, "rewards/rejected": -0.011016845703125, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800572e-07, "logits/chosen": -1.942665457725525, "logits/rejected": -1.9391899108886719, "logps/chosen": -36.12689971923828, "logps/rejected": -32.72255325317383, "loss": 0.685, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.017910802736878395, "rewards/margins": 0.01783105731010437, "rewards/rejected": 7.97472894191742e-05, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.430433172111807e-07, "logits/chosen": -2.042165756225586, "logits/rejected": -2.0347695350646973, "logps/chosen": -33.787109375, "logps/rejected": -31.368520736694336, "loss": 0.6889, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.004300374537706375, "rewards/margins": 0.010968221351504326, "rewards/rejected": -0.006667847745120525, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.216202642830543e-07, "logits/chosen": -2.047497510910034, "logits/rejected": -2.052760601043701, "logps/chosen": -32.53215789794922, "logps/rejected": -32.50490951538086, "loss": 0.6874, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0050529674626886845, "rewards/margins": 0.01296031754463911, "rewards/rejected": -0.007907351478934288, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.9960716642946403e-07, "logits/chosen": -2.0487968921661377, "logits/rejected": -2.0460152626037598, "logps/chosen": -31.495594024658203, "logps/rejected": -31.3470401763916, "loss": 0.6896, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.002655787393450737, "rewards/margins": 0.008874570950865746, "rewards/rejected": -0.011530356481671333, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.771853789806683e-07, "logits/chosen": -1.9187023639678955, "logits/rejected": -1.9233903884887695, "logps/chosen": -31.589069366455078, "logps/rejected": -32.78667449951172, "loss": 0.6912, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.006885607726871967, "rewards/margins": 0.0051818834617733955, "rewards/rejected": 0.0017037258949130774, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.2433600425720215, "eval_logits/rejected": -2.238468885421753, "eval_logps/chosen": -34.01414489746094, "eval_logps/rejected": -37.50386428833008, "eval_loss": 0.692382276058197, "eval_rewards/accuracies": 0.4991694688796997, "eval_rewards/chosen": 0.008161893114447594, "eval_rewards/margins": 0.003057094058021903, "eval_rewards/rejected": 0.00510479835793376, "eval_runtime": 145.9559, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402e-07, "logits/chosen": -2.031435489654541, "logits/rejected": -2.042121648788452, "logps/chosen": -31.93819236755371, "logps/rejected": -33.87244415283203, "loss": 0.6855, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.013244074769318104, "rewards/margins": 0.017141219228506088, "rewards/rejected": -0.003897144692018628, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.318564697655179e-07, "logits/chosen": -1.9252116680145264, "logits/rejected": -1.9400631189346313, "logps/chosen": -30.0819149017334, "logps/rejected": -31.582805633544922, "loss": 0.6833, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.011850157752633095, "rewards/margins": 0.021701235324144363, "rewards/rejected": -0.009851074777543545, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.093227910899832e-07, "logits/chosen": -1.9831390380859375, "logits/rejected": -1.9871015548706055, "logps/chosen": -33.370384216308594, "logps/rejected": -31.524139404296875, "loss": 0.6896, "rewards/accuracies": 0.5625, "rewards/chosen": 0.018833406269550323, "rewards/margins": 0.008769527077674866, "rewards/rejected": 0.010063880123198032, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279356e-07, "logits/chosen": -1.9828351736068726, "logits/rejected": -1.9608606100082397, "logps/chosen": -34.157318115234375, "logps/rejected": -34.97474670410156, "loss": 0.6898, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0014271701220422983, "rewards/margins": 0.008153198286890984, "rewards/rejected": -0.009580368176102638, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.654436768970182e-07, "logits/chosen": -2.0243985652923584, "logits/rejected": -2.0210859775543213, "logps/chosen": -32.91604995727539, "logps/rejected": -36.21431350708008, "loss": 0.6945, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.003607477992773056, "rewards/margins": -0.0006370179471559823, "rewards/rejected": 0.004244496580213308, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.444597403062196e-07, "logits/chosen": -1.8909612894058228, "logits/rejected": -1.8885223865509033, "logps/chosen": -34.173240661621094, "logps/rejected": -35.51388931274414, "loss": 0.6892, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.006062888540327549, "rewards/margins": 0.009769372642040253, "rewards/rejected": -0.0037064836360514164, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.2434529917578887e-07, "logits/chosen": -1.8755378723144531, "logits/rejected": -1.8729900121688843, "logps/chosen": -34.416507720947266, "logps/rejected": -31.75918197631836, "loss": 0.7001, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.007562094368040562, "rewards/margins": -0.012420646846294403, "rewards/rejected": 0.004858553409576416, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603521e-07, "logits/chosen": -1.979413628578186, "logits/rejected": -1.9687983989715576, "logps/chosen": -35.34419631958008, "logps/rejected": -31.839731216430664, "loss": 0.6917, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.004978640470653772, "rewards/margins": 0.004474071320146322, "rewards/rejected": 0.0005045672878623009, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071453e-08, "logits/chosen": -2.075562000274658, "logits/rejected": -2.060526132583618, "logps/chosen": -30.924896240234375, "logps/rejected": -32.64234161376953, "loss": 0.696, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0003082419862039387, "rewards/margins": -0.004107682965695858, "rewards/rejected": 0.0037994408048689365, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-08, "logits/chosen": -1.947762131690979, "logits/rejected": -1.9452297687530518, "logps/chosen": -32.90521240234375, "logps/rejected": -30.821033477783203, "loss": 0.6888, "rewards/accuracies": 0.5, "rewards/chosen": 0.008759219199419022, "rewards/margins": 0.010448457673192024, "rewards/rejected": -0.0016892381245270371, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.2443997859954834, "eval_logits/rejected": -2.2395174503326416, "eval_logps/chosen": -34.02724838256836, "eval_logps/rejected": -37.50510787963867, "eval_loss": 0.6948643326759338, "eval_rewards/accuracies": 0.5116279125213623, "eval_rewards/chosen": 0.0029200678691267967, "eval_rewards/margins": -0.0016844052588567138, "eval_rewards/rejected": 0.00460447371006012, "eval_runtime": 145.9303, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589034e-08, "logits/chosen": -1.9295498132705688, "logits/rejected": -1.9262762069702148, "logps/chosen": -31.575878143310547, "logps/rejected": -33.75602340698242, "loss": 0.6856, "rewards/accuracies": 0.5625, "rewards/chosen": 0.010988839901983738, "rewards/margins": 0.0165377426892519, "rewards/rejected": -0.0055489009246230125, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380912e-08, "logits/chosen": -1.9806196689605713, "logits/rejected": -1.9683196544647217, "logps/chosen": -34.554603576660156, "logps/rejected": -33.58423614501953, "loss": 0.6768, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.01658753491938114, "rewards/margins": 0.034744132310152054, "rewards/rejected": -0.01815659925341606, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-08, "logits/chosen": -2.016143798828125, "logits/rejected": -2.014680862426758, "logps/chosen": -33.479766845703125, "logps/rejected": -32.474754333496094, "loss": 0.6937, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.002409630687907338, "rewards/margins": 0.0004784400516655296, "rewards/rejected": 0.0019311904907226562, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.05793773749158e-08, "logits/chosen": -2.103015899658203, "logits/rejected": -2.08721923828125, "logps/chosen": -34.18162155151367, "logps/rejected": -33.10049819946289, "loss": 0.6982, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0008922239649109542, "rewards/margins": -0.007907899096608162, "rewards/rejected": 0.0070156739093363285, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.251801807404168e-08, "logits/chosen": -1.9744770526885986, "logits/rejected": -1.9735443592071533, "logps/chosen": -33.24970245361328, "logps/rejected": -32.445960998535156, "loss": 0.6916, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.012956788763403893, "rewards/margins": 0.00528245884925127, "rewards/rejected": 0.007674329914152622, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-09, "logits/chosen": -1.9305318593978882, "logits/rejected": -1.9408973455429077, "logps/chosen": -32.2140007019043, "logps/rejected": -35.3133430480957, "loss": 0.6908, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0037243987899273634, "rewards/margins": 0.006108095403760672, "rewards/rejected": -0.00983249582350254, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050324e-09, "logits/chosen": -2.069563150405884, "logits/rejected": -2.063000202178955, "logps/chosen": -33.635009765625, "logps/rejected": -29.221439361572266, "loss": 0.6915, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.006782642100006342, "rewards/margins": 0.004788074642419815, "rewards/rejected": 0.0019945662934333086, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-10, "logits/chosen": -1.9291139841079712, "logits/rejected": -1.9312822818756104, "logps/chosen": -34.2513313293457, "logps/rejected": -30.88556480407715, "loss": 0.6938, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.004911360330879688, "rewards/margins": 0.00026298072771169245, "rewards/rejected": -0.005174341145902872, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.6908149576806403, "train_runtime": 3255.681, "train_samples_per_second": 0.946, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }