{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-08, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 0.3906, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.2820512820512818e-07, "logits/chosen": -1.8662992715835571, "logits/rejected": -1.8706117868423462, "logps/chosen": -36.97681427001953, "logps/rejected": -33.66523361206055, "loss": 0.3447, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.023829741403460503, "rewards/margins": 0.05195777863264084, "rewards/rejected": -0.028128040954470634, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.5641025641025636e-07, "logits/chosen": -1.997193694114685, "logits/rejected": -1.9998573064804077, "logps/chosen": -29.65359878540039, "logps/rejected": -29.054311752319336, "loss": 0.4534, "rewards/accuracies": 0.4375, "rewards/chosen": -0.009124360978603363, "rewards/margins": -0.016431041061878204, "rewards/rejected": 0.007306680083274841, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.8461538461538463e-07, "logits/chosen": -1.9197280406951904, "logits/rejected": -1.9170429706573486, "logps/chosen": -31.414775848388672, "logps/rejected": -33.24064254760742, "loss": 0.4025, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0010542668169364333, "rewards/margins": 0.018799806013703346, "rewards/rejected": -0.01774553768336773, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438433e-07, "logits/chosen": -2.0171010494232178, "logits/rejected": -2.008350372314453, "logps/chosen": -32.59648513793945, "logps/rejected": -32.50862121582031, "loss": 0.4532, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.015740757808089256, "rewards/margins": -0.018979396671056747, "rewards/rejected": 0.0032386414241045713, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542186e-07, "logits/chosen": -1.8645904064178467, "logits/rejected": -1.8538070917129517, "logps/chosen": -33.542823791503906, "logps/rejected": -35.43744659423828, "loss": 0.4304, "rewards/accuracies": 0.5, "rewards/chosen": 0.011724205687642097, "rewards/margins": -0.00019102543592453003, "rewards/rejected": 0.011915231123566628, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941118e-07, "logits/chosen": -1.9461469650268555, "logits/rejected": -1.948094129562378, "logps/chosen": -32.587215423583984, "logps/rejected": -33.21445846557617, "loss": 0.3539, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.010374858975410461, "rewards/margins": 0.04483799636363983, "rewards/rejected": -0.03446313738822937, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413548e-07, "logits/chosen": -2.079591989517212, "logits/rejected": -2.0845742225646973, "logps/chosen": -34.00908279418945, "logps/rejected": -36.58150863647461, "loss": 0.4506, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.01648041605949402, "rewards/margins": -0.006370754446834326, "rewards/rejected": -0.010109663009643555, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-07, "logits/chosen": -1.9423307180404663, "logits/rejected": -1.9454963207244873, "logps/chosen": -34.39698028564453, "logps/rejected": -34.59761428833008, "loss": 0.3889, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.024663064628839493, "rewards/margins": 0.028829574584960938, "rewards/rejected": -0.00416650902479887, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.736716601303429e-07, "logits/chosen": -1.9508873224258423, "logits/rejected": -1.9553953409194946, "logps/chosen": -32.48583221435547, "logps/rejected": -32.35867691040039, "loss": 0.4445, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.014726865105330944, "rewards/margins": -0.014909917488694191, "rewards/rejected": 0.00018304325931239873, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.62624545834521e-07, "logits/chosen": -2.0491719245910645, "logits/rejected": -2.047179937362671, "logps/chosen": -32.22635269165039, "logps/rejected": -31.287487030029297, "loss": 0.3805, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.011584864929318428, "rewards/margins": 0.02860759198665619, "rewards/rejected": -0.01702272891998291, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.244148015975952, "eval_logits/rejected": -2.2392663955688477, "eval_logps/chosen": -34.01713943481445, "eval_logps/rejected": -37.514495849609375, "eval_loss": 0.4061162769794464, "eval_rewards/accuracies": 0.5394518375396729, "eval_rewards/chosen": 0.013928660191595554, "eval_rewards/margins": 0.012230273336172104, "eval_rewards/rejected": 0.0016983875539153814, "eval_runtime": 146.1091, "eval_samples_per_second": 2.348, "eval_steps_per_second": 0.294, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.4982572012636904e-07, "logits/chosen": -2.005580186843872, "logits/rejected": -2.0031564235687256, "logps/chosen": -33.24415588378906, "logps/rejected": -33.99993133544922, "loss": 0.4792, "rewards/accuracies": 0.4375, "rewards/chosen": -0.00026987865567207336, "rewards/margins": -0.02382112666964531, "rewards/rejected": 0.023551244288682938, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777677e-07, "logits/chosen": -2.01680326461792, "logits/rejected": -2.0084121227264404, "logps/chosen": -32.45465850830078, "logps/rejected": -32.1729850769043, "loss": 0.4286, "rewards/accuracies": 0.4375, "rewards/chosen": -0.007803081069141626, "rewards/margins": 0.0013800703454762697, "rewards/rejected": -0.009183152578771114, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.194082707715275e-07, "logits/chosen": -2.0466856956481934, "logits/rejected": -2.0386409759521484, "logps/chosen": -30.494335174560547, "logps/rejected": -32.051963806152344, "loss": 0.4497, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.014895597472786903, "rewards/margins": -0.015729816630482674, "rewards/rejected": 0.0008342192741110921, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.020402418666621e-07, "logits/chosen": -1.977065086364746, "logits/rejected": -1.987343430519104, "logps/chosen": -31.37868309020996, "logps/rejected": -32.54730224609375, "loss": 0.3589, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.03278205543756485, "rewards/margins": 0.04271895810961723, "rewards/rejected": -0.009936909191310406, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.8341962650351185e-07, "logits/chosen": -1.8916466236114502, "logits/rejected": -1.8927490711212158, "logps/chosen": -34.209651947021484, "logps/rejected": -34.766143798828125, "loss": 0.4325, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.013569498434662819, "rewards/margins": -0.00645996630191803, "rewards/rejected": -0.007109532598406076, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800572e-07, "logits/chosen": -1.9427303075790405, "logits/rejected": -1.939252495765686, "logps/chosen": -36.14452362060547, "logps/rejected": -32.73284149169922, "loss": 0.3796, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.021723434329032898, "rewards/margins": 0.02979486621916294, "rewards/rejected": -0.008071433752775192, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.430433172111807e-07, "logits/chosen": -2.0424513816833496, "logits/rejected": -2.035060405731201, "logps/chosen": -33.786170959472656, "logps/rejected": -31.34820556640625, "loss": 0.4172, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.009350698441267014, "rewards/margins": 0.006434415467083454, "rewards/rejected": 0.0029162843711674213, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.216202642830543e-07, "logits/chosen": -2.0475661754608154, "logits/rejected": -2.052834987640381, "logps/chosen": -32.528114318847656, "logps/rejected": -32.50902557373047, "loss": 0.3766, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.013338183984160423, "rewards/margins": 0.03244578838348389, "rewards/rejected": -0.019107606261968613, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.9960716642946403e-07, "logits/chosen": -2.048661231994629, "logits/rejected": -2.045872926712036, "logps/chosen": -31.496755599975586, "logps/rejected": -31.314464569091797, "loss": 0.4475, "rewards/accuracies": 0.5, "rewards/chosen": -0.00624021515250206, "rewards/margins": -0.009242123924195766, "rewards/rejected": 0.003001909703016281, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.771853789806683e-07, "logits/chosen": -1.9193611145019531, "logits/rejected": -1.9240529537200928, "logps/chosen": -31.579212188720703, "logps/rejected": -32.80603790283203, "loss": 0.371, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.02165621519088745, "rewards/margins": 0.033739686012268066, "rewards/rejected": -0.012083468958735466, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.243856191635132, "eval_logits/rejected": -2.2389791011810303, "eval_logps/chosen": -34.023136138916016, "eval_logps/rejected": -37.501853942871094, "eval_loss": 0.4322855770587921, "eval_rewards/accuracies": 0.5253322720527649, "eval_rewards/chosen": 0.009133166633546352, "eval_rewards/margins": -0.002679171971976757, "eval_rewards/rejected": 0.01181233860552311, "eval_runtime": 145.9618, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402e-07, "logits/chosen": -2.0320448875427246, "logits/rejected": -2.0427393913269043, "logps/chosen": -31.947372436523438, "logps/rejected": -33.899864196777344, "loss": 0.3622, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.019143681973218918, "rewards/margins": 0.04887578636407852, "rewards/rejected": -0.029732098802924156, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.318564697655179e-07, "logits/chosen": -1.9257709980010986, "logits/rejected": -1.940639853477478, "logps/chosen": -30.079341888427734, "logps/rejected": -31.5616397857666, "loss": 0.3884, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.025758206844329834, "rewards/margins": 0.028527002781629562, "rewards/rejected": -0.0027687971014529467, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.093227910899832e-07, "logits/chosen": -1.9829959869384766, "logits/rejected": -1.9869670867919922, "logps/chosen": -33.4053955078125, "logps/rejected": -31.562353134155273, "loss": 0.3918, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.009657363407313824, "rewards/margins": 0.02010050043463707, "rewards/rejected": -0.010443134233355522, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279356e-07, "logits/chosen": -1.9836658239364624, "logits/rejected": -1.9616800546646118, "logps/chosen": -34.175201416015625, "logps/rejected": -34.966102600097656, "loss": 0.4346, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.017159918323159218, "rewards/margins": -0.004913450218737125, "rewards/rejected": -0.012246469967067242, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.654436768970182e-07, "logits/chosen": -2.0244596004486084, "logits/rejected": -2.021144390106201, "logps/chosen": -32.932373046875, "logps/rejected": -36.2264289855957, "loss": 0.4397, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.005845203995704651, "rewards/margins": -0.004640025552362204, "rewards/rejected": -0.001205177279189229, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.444597403062196e-07, "logits/chosen": -1.891283392906189, "logits/rejected": -1.8888483047485352, "logps/chosen": -34.20048522949219, "logps/rejected": -35.507022857666016, "loss": 0.4441, "rewards/accuracies": 0.5625, "rewards/chosen": -0.009669994935393333, "rewards/margins": -0.0077504729852080345, "rewards/rejected": -0.0019195213681086898, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.2434529917578887e-07, "logits/chosen": -1.8760111331939697, "logits/rejected": -1.8734772205352783, "logps/chosen": -34.39020919799805, "logps/rejected": -31.75579261779785, "loss": 0.4295, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0059113698080182076, "rewards/margins": -0.006519269198179245, "rewards/rejected": 0.012430639937520027, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603521e-07, "logits/chosen": -1.9797089099884033, "logits/rejected": -1.9690707921981812, "logps/chosen": -35.31258773803711, "logps/rejected": -31.837697982788086, "loss": 0.3868, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.03524213656783104, "rewards/margins": 0.0326063297688961, "rewards/rejected": 0.0026358035393059254, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071453e-08, "logits/chosen": -2.0755209922790527, "logits/rejected": -2.060497760772705, "logps/chosen": -30.90865135192871, "logps/rejected": -32.64521026611328, "loss": 0.41, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.012379061430692673, "rewards/margins": 0.007080032490193844, "rewards/rejected": 0.005299028940498829, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-08, "logits/chosen": -1.9463651180267334, "logits/rejected": -1.9438308477401733, "logps/chosen": -32.89823532104492, "logps/rejected": -30.81850814819336, "loss": 0.3839, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.02309919334948063, "rewards/margins": 0.024458223953843117, "rewards/rejected": -0.0013590289745479822, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.2435548305511475, "eval_logits/rejected": -2.2386720180511475, "eval_logps/chosen": -34.023826599121094, "eval_logps/rejected": -37.50039291381836, "eval_loss": 0.4344586431980133, "eval_rewards/accuracies": 0.4808970093727112, "eval_rewards/chosen": 0.00858243927359581, "eval_rewards/margins": -0.004399063065648079, "eval_rewards/rejected": 0.012981505133211613, "eval_runtime": 145.9473, "eval_samples_per_second": 2.35, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589034e-08, "logits/chosen": -1.9286285638809204, "logits/rejected": -1.9253568649291992, "logps/chosen": -31.583232879638672, "logps/rejected": -33.75123596191406, "loss": 0.3881, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.016091840341687202, "rewards/margins": 0.023359118029475212, "rewards/rejected": -0.007267280016094446, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380912e-08, "logits/chosen": -1.9803975820541382, "logits/rejected": -1.9680900573730469, "logps/chosen": -34.58079147338867, "logps/rejected": -33.575767517089844, "loss": 0.3778, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.012225466780364513, "rewards/margins": 0.041770923882722855, "rewards/rejected": -0.029545456171035767, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-08, "logits/chosen": -2.015634298324585, "logits/rejected": -2.0141713619232178, "logps/chosen": -33.45996856689453, "logps/rejected": -32.48029327392578, "loss": 0.4032, "rewards/accuracies": 0.5625, "rewards/chosen": 0.020660031586885452, "rewards/margins": 0.021225089207291603, "rewards/rejected": -0.0005650619277730584, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.05793773749158e-08, "logits/chosen": -2.1035873889923096, "logits/rejected": -2.0877768993377686, "logps/chosen": -34.162208557128906, "logps/rejected": -33.095733642578125, "loss": 0.439, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.013746557757258415, "rewards/margins": -0.004091509617865086, "rewards/rejected": 0.017838066443800926, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.251801807404168e-08, "logits/chosen": -1.9746344089508057, "logits/rejected": -1.9736888408660889, "logps/chosen": -33.253448486328125, "logps/rejected": -32.458499908447266, "loss": 0.3954, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.02291743829846382, "rewards/margins": 0.017600122839212418, "rewards/rejected": 0.005317316390573978, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-09, "logits/chosen": -1.9309253692626953, "logits/rejected": -1.941303014755249, "logps/chosen": -32.21305465698242, "logps/rejected": -35.309688568115234, "loss": 0.4095, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.006690672133117914, "rewards/margins": 0.010053041391074657, "rewards/rejected": -0.01674371212720871, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050324e-09, "logits/chosen": -2.069108009338379, "logits/rejected": -2.0625574588775635, "logps/chosen": -33.637176513671875, "logps/rejected": -29.221187591552734, "loss": 0.4059, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.011833530850708485, "rewards/margins": 0.00764369685202837, "rewards/rejected": 0.004189834464341402, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-10, "logits/chosen": -1.9292182922363281, "logits/rejected": -1.9313886165618896, "logps/chosen": -34.24732971191406, "logps/rejected": -30.893402099609375, "loss": 0.4086, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.006622877903282642, "rewards/margins": 0.009996414184570312, "rewards/rejected": -0.01661929115653038, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.4092799360101873, "train_runtime": 3256.9261, "train_samples_per_second": 0.945, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }