{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 91.5, "learning_rate": 1.282051282051282e-07, "logits/chosen": 88.18099975585938, "logits/rejected": 88.25153350830078, "logps/chosen": -29.073104858398438, "logps/rejected": -26.25731658935547, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 76.5, "learning_rate": 1.282051282051282e-06, "logits/chosen": 81.08255004882812, "logits/rejected": 80.78926086425781, "logps/chosen": -34.20470428466797, "logps/rejected": -33.038047790527344, "loss": 0.9368, "rewards/accuracies": 0.5, "rewards/chosen": 0.02591484785079956, "rewards/margins": 0.0816662609577179, "rewards/rejected": -0.055751409381628036, "step": 10 }, { "epoch": 0.05, "grad_norm": 60.25, "learning_rate": 2.564102564102564e-06, "logits/chosen": 80.67174530029297, "logits/rejected": 80.55998229980469, "logps/chosen": -33.60923767089844, "logps/rejected": -30.828128814697266, "loss": 0.9439, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.05647152662277222, "rewards/margins": 0.09191782772541046, "rewards/rejected": -0.035446297377347946, "step": 20 }, { "epoch": 0.08, "grad_norm": 67.5, "learning_rate": 3.846153846153847e-06, "logits/chosen": 82.49557495117188, "logits/rejected": 82.5277099609375, "logps/chosen": -33.90664291381836, "logps/rejected": -31.188213348388672, "loss": 1.0985, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.07432325184345245, "rewards/margins": -0.0906088799238205, "rewards/rejected": 0.16493213176727295, "step": 30 }, { "epoch": 0.1, "grad_norm": 75.5, "learning_rate": 4.999896948438434e-06, "logits/chosen": 81.06272888183594, "logits/rejected": 81.05645751953125, "logps/chosen": -32.722740173339844, "logps/rejected": -33.15789031982422, "loss": 0.8862, "rewards/accuracies": 0.625, "rewards/chosen": 0.3159271776676178, "rewards/margins": 0.1684812754392624, "rewards/rejected": 0.1474459171295166, "step": 40 }, { "epoch": 0.13, "grad_norm": 48.75, "learning_rate": 4.987541037542187e-06, "logits/chosen": 78.66886901855469, "logits/rejected": 78.68830871582031, "logps/chosen": -30.57431411743164, "logps/rejected": -30.798864364624023, "loss": 0.9351, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.44165611267089844, "rewards/margins": 0.24956238269805908, "rewards/rejected": 0.19209368526935577, "step": 50 }, { "epoch": 0.16, "grad_norm": 72.5, "learning_rate": 4.954691471941119e-06, "logits/chosen": 83.24879455566406, "logits/rejected": 83.29969787597656, "logps/chosen": -30.918521881103516, "logps/rejected": -29.482006072998047, "loss": 1.0359, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.17964980006217957, "rewards/margins": 0.1036890521645546, "rewards/rejected": 0.07596075534820557, "step": 60 }, { "epoch": 0.18, "grad_norm": 87.0, "learning_rate": 4.901618883413549e-06, "logits/chosen": 83.91014099121094, "logits/rejected": 83.93647766113281, "logps/chosen": -30.502422332763672, "logps/rejected": -33.12609100341797, "loss": 0.9567, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.08862228691577911, "rewards/margins": 0.15049318969249725, "rewards/rejected": -0.06187089532613754, "step": 70 }, { "epoch": 0.21, "grad_norm": 74.0, "learning_rate": 4.828760511501322e-06, "logits/chosen": 81.4886245727539, "logits/rejected": 81.47552490234375, "logps/chosen": -31.330123901367188, "logps/rejected": -31.121978759765625, "loss": 0.814, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12237221002578735, "rewards/margins": 0.3774043917655945, "rewards/rejected": -0.25503218173980713, "step": 80 }, { "epoch": 0.23, "grad_norm": 83.0, "learning_rate": 4.7367166013034295e-06, "logits/chosen": 78.25160217285156, "logits/rejected": 78.21989440917969, "logps/chosen": -32.49519729614258, "logps/rejected": -31.26288414001465, "loss": 0.8564, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.10008607059717178, "rewards/margins": 0.31807559728622437, "rewards/rejected": -0.21798951923847198, "step": 90 }, { "epoch": 0.26, "grad_norm": 82.5, "learning_rate": 4.626245458345211e-06, "logits/chosen": 83.49636840820312, "logits/rejected": 83.53155517578125, "logps/chosen": -34.187034606933594, "logps/rejected": -31.907695770263672, "loss": 0.9161, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0825684443116188, "rewards/margins": 0.19257497787475586, "rewards/rejected": -0.11000655591487885, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": 98.7728500366211, "eval_logits/rejected": 98.76228332519531, "eval_logps/chosen": -32.4925651550293, "eval_logps/rejected": -36.00117492675781, "eval_loss": 1.0662287473678589, "eval_rewards/accuracies": 0.5128737688064575, "eval_rewards/chosen": -0.03457321599125862, "eval_rewards/margins": -0.010320436209440231, "eval_rewards/rejected": -0.02425277978181839, "eval_runtime": 104.2438, "eval_samples_per_second": 3.29, "eval_steps_per_second": 0.412, "step": 100 }, { "epoch": 0.29, "grad_norm": 91.5, "learning_rate": 4.498257201263691e-06, "logits/chosen": 83.7912368774414, "logits/rejected": 83.67459106445312, "logps/chosen": -32.45995330810547, "logps/rejected": -32.786006927490234, "loss": 0.8008, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.39399290084838867, "rewards/margins": 0.4850761294364929, "rewards/rejected": -0.09108323603868484, "step": 110 }, { "epoch": 0.31, "grad_norm": 95.0, "learning_rate": 4.353806263777678e-06, "logits/chosen": 84.00267028808594, "logits/rejected": 84.11933898925781, "logps/chosen": -28.27166175842285, "logps/rejected": -35.5056037902832, "loss": 0.7011, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4605434536933899, "rewards/margins": 0.4930640757083893, "rewards/rejected": -0.03252064064145088, "step": 120 }, { "epoch": 0.34, "grad_norm": 64.5, "learning_rate": 4.1940827077152755e-06, "logits/chosen": 81.23250579833984, "logits/rejected": 81.24813842773438, "logps/chosen": -30.38728904724121, "logps/rejected": -32.12664031982422, "loss": 0.7344, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.3640448749065399, "rewards/margins": 0.5005542039871216, "rewards/rejected": -0.13650932908058167, "step": 130 }, { "epoch": 0.36, "grad_norm": 59.75, "learning_rate": 4.0204024186666215e-06, "logits/chosen": 82.44822692871094, "logits/rejected": 82.46932220458984, "logps/chosen": -27.172740936279297, "logps/rejected": -33.0168571472168, "loss": 0.7509, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.19220882654190063, "rewards/margins": 0.6076450347900391, "rewards/rejected": -0.4154362082481384, "step": 140 }, { "epoch": 0.39, "grad_norm": 54.0, "learning_rate": 3.834196265035119e-06, "logits/chosen": 80.97503662109375, "logits/rejected": 80.94606018066406, "logps/chosen": -29.037616729736328, "logps/rejected": -33.28493118286133, "loss": 0.6493, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2460467517375946, "rewards/margins": 0.7489484548568726, "rewards/rejected": -0.5029016733169556, "step": 150 }, { "epoch": 0.42, "grad_norm": 63.0, "learning_rate": 3.636998309800573e-06, "logits/chosen": 82.87214660644531, "logits/rejected": 82.87894439697266, "logps/chosen": -33.497344970703125, "logps/rejected": -30.375295639038086, "loss": 0.7389, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4475820064544678, "rewards/margins": 0.7435113191604614, "rewards/rejected": -0.2959292531013489, "step": 160 }, { "epoch": 0.44, "grad_norm": 63.25, "learning_rate": 3.4304331721118078e-06, "logits/chosen": 83.60514831542969, "logits/rejected": 83.55717468261719, "logps/chosen": -30.812519073486328, "logps/rejected": -32.62251281738281, "loss": 0.7275, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.31273749470710754, "rewards/margins": 0.7344537377357483, "rewards/rejected": -0.42171627283096313, "step": 170 }, { "epoch": 0.47, "grad_norm": 54.25, "learning_rate": 3.2162026428305436e-06, "logits/chosen": 81.30427551269531, "logits/rejected": 81.28076171875, "logps/chosen": -30.593530654907227, "logps/rejected": -31.66329574584961, "loss": 0.58, "rewards/accuracies": 0.75, "rewards/chosen": 0.4225357472896576, "rewards/margins": 0.7767966985702515, "rewards/rejected": -0.3542609214782715, "step": 180 }, { "epoch": 0.49, "grad_norm": 37.5, "learning_rate": 2.996071664294641e-06, "logits/chosen": 82.88330078125, "logits/rejected": 82.88325500488281, "logps/chosen": -30.3848934173584, "logps/rejected": -30.6480770111084, "loss": 0.8789, "rewards/accuracies": 0.625, "rewards/chosen": 0.2670655846595764, "rewards/margins": 0.39310184121131897, "rewards/rejected": -0.12603625655174255, "step": 190 }, { "epoch": 0.52, "grad_norm": 56.25, "learning_rate": 2.7718537898066833e-06, "logits/chosen": 78.49826049804688, "logits/rejected": 78.43463134765625, "logps/chosen": -33.972293853759766, "logps/rejected": -32.59092330932617, "loss": 0.8398, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.5463098287582397, "rewards/margins": 0.6145724058151245, "rewards/rejected": -0.06826266646385193, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": 98.71674346923828, "eval_logits/rejected": 98.69157409667969, "eval_logps/chosen": -32.63681411743164, "eval_logps/rejected": -36.29216003417969, "eval_loss": 1.0089657306671143, "eval_rewards/accuracies": 0.5141196250915527, "eval_rewards/chosen": -0.13554596900939941, "eval_rewards/margins": 0.09239647537469864, "eval_rewards/rejected": -0.22794245183467865, "eval_runtime": 104.0982, "eval_samples_per_second": 3.295, "eval_steps_per_second": 0.413, "step": 200 }, { "epoch": 0.55, "grad_norm": 73.5, "learning_rate": 2.5453962426402006e-06, "logits/chosen": 81.18994140625, "logits/rejected": 81.10676574707031, "logps/chosen": -33.214935302734375, "logps/rejected": -35.187870025634766, "loss": 0.6733, "rewards/accuracies": 0.75, "rewards/chosen": 0.511551022529602, "rewards/margins": 0.6527955532073975, "rewards/rejected": -0.14124450087547302, "step": 210 }, { "epoch": 0.57, "grad_norm": 53.5, "learning_rate": 2.3185646976551794e-06, "logits/chosen": 83.3000259399414, "logits/rejected": 83.38983917236328, "logps/chosen": -31.05356216430664, "logps/rejected": -31.100086212158203, "loss": 0.5573, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.6267757415771484, "rewards/margins": 0.8865677714347839, "rewards/rejected": -0.2597920000553131, "step": 220 }, { "epoch": 0.6, "grad_norm": 59.5, "learning_rate": 2.0932279108998323e-06, "logits/chosen": 80.46002197265625, "logits/rejected": 80.5117416381836, "logps/chosen": -32.2435417175293, "logps/rejected": -34.2344970703125, "loss": 0.7635, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.393540620803833, "rewards/margins": 0.5491863489151001, "rewards/rejected": -0.15564575791358948, "step": 230 }, { "epoch": 0.62, "grad_norm": 72.0, "learning_rate": 1.8712423238279358e-06, "logits/chosen": 82.81127166748047, "logits/rejected": 83.10184478759766, "logps/chosen": -30.6541748046875, "logps/rejected": -31.797021865844727, "loss": 0.5215, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.7237205505371094, "rewards/margins": 0.9092057347297668, "rewards/rejected": -0.18548506498336792, "step": 240 }, { "epoch": 0.65, "grad_norm": 76.5, "learning_rate": 1.6544367689701824e-06, "logits/chosen": 81.55049133300781, "logits/rejected": 81.6218032836914, "logps/chosen": -26.869796752929688, "logps/rejected": -30.16421890258789, "loss": 0.7649, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.5082693696022034, "rewards/margins": 0.6927058696746826, "rewards/rejected": -0.18443644046783447, "step": 250 }, { "epoch": 0.68, "grad_norm": 52.5, "learning_rate": 1.4445974030621963e-06, "logits/chosen": 78.84456634521484, "logits/rejected": 78.9933090209961, "logps/chosen": -30.304040908813477, "logps/rejected": -36.30915069580078, "loss": 0.4765, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.8798693418502808, "rewards/margins": 1.0251331329345703, "rewards/rejected": -0.1452637016773224, "step": 260 }, { "epoch": 0.7, "grad_norm": 57.0, "learning_rate": 1.243452991757889e-06, "logits/chosen": 78.18994903564453, "logits/rejected": 78.21639251708984, "logps/chosen": -30.825641632080078, "logps/rejected": -31.800342559814453, "loss": 0.6623, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.6672974228858948, "rewards/margins": 0.8737428784370422, "rewards/rejected": -0.20644548535346985, "step": 270 }, { "epoch": 0.73, "grad_norm": 76.0, "learning_rate": 1.0526606671603523e-06, "logits/chosen": 80.78196716308594, "logits/rejected": 80.56169128417969, "logps/chosen": -30.946773529052734, "logps/rejected": -29.854522705078125, "loss": 0.6812, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6041832566261292, "rewards/margins": 0.7354522943496704, "rewards/rejected": -0.13126906752586365, "step": 280 }, { "epoch": 0.75, "grad_norm": 60.25, "learning_rate": 8.737922755071455e-07, "logits/chosen": 80.99944305419922, "logits/rejected": 80.91734313964844, "logps/chosen": -32.94629669189453, "logps/rejected": -32.49176025390625, "loss": 0.5405, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.8105659484863281, "rewards/margins": 1.1429924964904785, "rewards/rejected": -0.3324265480041504, "step": 290 }, { "epoch": 0.78, "grad_norm": 72.5, "learning_rate": 7.08321427484816e-07, "logits/chosen": 76.61840057373047, "logits/rejected": 76.69322204589844, "logps/chosen": -32.09418487548828, "logps/rejected": -29.156299591064453, "loss": 0.733, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.9180667996406555, "rewards/margins": 0.9560586214065552, "rewards/rejected": -0.03799174353480339, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": 98.73394012451172, "eval_logits/rejected": 98.70979309082031, "eval_logps/chosen": -32.53404235839844, "eval_logps/rejected": -36.16787338256836, "eval_loss": 1.0123846530914307, "eval_rewards/accuracies": 0.5398671627044678, "eval_rewards/chosen": -0.06360965222120285, "eval_rewards/margins": 0.07733342051506042, "eval_rewards/rejected": -0.14094306528568268, "eval_runtime": 104.0616, "eval_samples_per_second": 3.296, "eval_steps_per_second": 0.413, "step": 300 }, { "epoch": 0.81, "grad_norm": 63.0, "learning_rate": 5.576113578589035e-07, "logits/chosen": 83.6187515258789, "logits/rejected": 83.65609741210938, "logps/chosen": -30.124670028686523, "logps/rejected": -32.455326080322266, "loss": 0.6913, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5172038078308105, "rewards/margins": 0.6960991024971008, "rewards/rejected": -0.1788952797651291, "step": 310 }, { "epoch": 0.83, "grad_norm": 53.5, "learning_rate": 4.229036944380913e-07, "logits/chosen": 81.21861267089844, "logits/rejected": 81.21974182128906, "logps/chosen": -30.413522720336914, "logps/rejected": -29.050277709960938, "loss": 0.5746, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.8295270204544067, "rewards/margins": 0.921076774597168, "rewards/rejected": -0.09154972434043884, "step": 320 }, { "epoch": 0.86, "grad_norm": 42.5, "learning_rate": 3.053082288996112e-07, "logits/chosen": 78.42388153076172, "logits/rejected": 78.47276306152344, "logps/chosen": -28.912973403930664, "logps/rejected": -32.85163116455078, "loss": 0.5118, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 1.007744550704956, "rewards/margins": 1.1594822406768799, "rewards/rejected": -0.151737779378891, "step": 330 }, { "epoch": 0.88, "grad_norm": 78.0, "learning_rate": 2.0579377374915805e-07, "logits/chosen": 82.58137512207031, "logits/rejected": 82.63166046142578, "logps/chosen": -32.287044525146484, "logps/rejected": -33.664268493652344, "loss": 0.7276, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6475721597671509, "rewards/margins": 0.8544757962226868, "rewards/rejected": -0.2069036066532135, "step": 340 }, { "epoch": 0.91, "grad_norm": 56.75, "learning_rate": 1.2518018074041684e-07, "logits/chosen": 81.68583679199219, "logits/rejected": 81.69776916503906, "logps/chosen": -32.62708282470703, "logps/rejected": -33.35443115234375, "loss": 0.626, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7116604447364807, "rewards/margins": 0.8955025672912598, "rewards/rejected": -0.18384216725826263, "step": 350 }, { "epoch": 0.94, "grad_norm": 45.5, "learning_rate": 6.41315865106129e-08, "logits/chosen": 83.15383911132812, "logits/rejected": 83.18663024902344, "logps/chosen": -28.314571380615234, "logps/rejected": -31.6760196685791, "loss": 0.6231, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8663473129272461, "rewards/margins": 0.8846683502197266, "rewards/rejected": -0.01832098886370659, "step": 360 }, { "epoch": 0.96, "grad_norm": 76.0, "learning_rate": 2.3150941078050325e-08, "logits/chosen": 82.54973602294922, "logits/rejected": 82.57438659667969, "logps/chosen": -31.776432037353516, "logps/rejected": -35.274253845214844, "loss": 0.6836, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.7698130011558533, "rewards/margins": 0.8852267265319824, "rewards/rejected": -0.11541371047496796, "step": 370 }, { "epoch": 0.99, "grad_norm": 72.0, "learning_rate": 2.575864278703266e-09, "logits/chosen": 76.53568267822266, "logits/rejected": 76.40200805664062, "logps/chosen": -29.737590789794922, "logps/rejected": -28.21639060974121, "loss": 0.7558, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5869752168655396, "rewards/margins": 0.6260467767715454, "rewards/rejected": -0.03907149285078049, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.7429099231571347, "train_runtime": 2555.3187, "train_samples_per_second": 1.205, "train_steps_per_second": 0.151 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }