{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 100, "global_step": 1540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 23.75, "learning_rate": 3.2467532467532474e-08, "logits/chosen": -2.7358343601226807, "logits/rejected": -2.7480404376983643, "logps/chosen": -27.35565757751465, "logps/rejected": -21.06114387512207, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 38.5, "learning_rate": 3.2467532467532465e-07, "logits/chosen": -3.009772777557373, "logits/rejected": -2.999285936355591, "logps/chosen": -33.21327209472656, "logps/rejected": -31.971134185791016, "loss": 0.7026, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": -0.019398299977183342, "rewards/margins": -0.015061982907354832, "rewards/rejected": -0.0043363189324736595, "step": 10 }, { "epoch": 0.05, "grad_norm": 27.375, "learning_rate": 6.493506493506493e-07, "logits/chosen": -2.89970064163208, "logits/rejected": -2.8947174549102783, "logps/chosen": -32.48947525024414, "logps/rejected": -28.9757080078125, "loss": 0.7014, "rewards/accuracies": 0.4375, "rewards/chosen": -0.008624804206192493, "rewards/margins": -0.012319705449044704, "rewards/rejected": 0.0036949000786989927, "step": 20 }, { "epoch": 0.08, "grad_norm": 25.25, "learning_rate": 9.740259740259742e-07, "logits/chosen": -3.09592866897583, "logits/rejected": -3.107868194580078, "logps/chosen": -32.89606857299805, "logps/rejected": -30.1730899810791, "loss": 0.6998, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.007569731678813696, "rewards/margins": -0.006207291968166828, "rewards/rejected": 0.013777022249996662, "step": 30 }, { "epoch": 0.1, "grad_norm": 27.125, "learning_rate": 1.2987012987012986e-06, "logits/chosen": -2.8652756214141846, "logits/rejected": -2.8561177253723145, "logps/chosen": -31.76981544494629, "logps/rejected": -32.3731575012207, "loss": 0.6743, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.03630450740456581, "rewards/margins": 0.04572600871324539, "rewards/rejected": -0.009421499446034431, "step": 40 }, { "epoch": 0.13, "grad_norm": 21.125, "learning_rate": 1.6233766233766235e-06, "logits/chosen": -2.8860361576080322, "logits/rejected": -2.8838560581207275, "logps/chosen": -29.639389038085938, "logps/rejected": -30.066131591796875, "loss": 0.6763, "rewards/accuracies": 0.5, "rewards/chosen": 0.06046304851770401, "rewards/margins": 0.04677456617355347, "rewards/rejected": 0.013688492588698864, "step": 50 }, { "epoch": 0.16, "grad_norm": 23.25, "learning_rate": 1.9480519480519483e-06, "logits/chosen": -2.915588855743408, "logits/rejected": -2.9170215129852295, "logps/chosen": -30.02178382873535, "logps/rejected": -27.954660415649414, "loss": 0.6659, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.06479138135910034, "rewards/margins": 0.06417764723300934, "rewards/rejected": 0.0006137322634458542, "step": 60 }, { "epoch": 0.18, "grad_norm": 36.75, "learning_rate": 2.2727272727272728e-06, "logits/chosen": -2.995821475982666, "logits/rejected": -3.002281665802002, "logps/chosen": -29.207416534423828, "logps/rejected": -30.850021362304688, "loss": 0.6903, "rewards/accuracies": 0.5, "rewards/chosen": 0.044131673872470856, "rewards/margins": 0.019063914194703102, "rewards/rejected": 0.025067755952477455, "step": 70 }, { "epoch": 0.21, "grad_norm": 29.875, "learning_rate": 2.597402597402597e-06, "logits/chosen": -2.8110384941101074, "logits/rejected": -2.827033519744873, "logps/chosen": -29.421245574951172, "logps/rejected": -29.7005558013916, "loss": 0.6571, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.07168709486722946, "rewards/margins": 0.08745081722736359, "rewards/rejected": -0.015763718634843826, "step": 80 }, { "epoch": 0.23, "grad_norm": 25.0, "learning_rate": 2.922077922077922e-06, "logits/chosen": -2.89847993850708, "logits/rejected": -2.8813650608062744, "logps/chosen": -32.741859436035156, "logps/rejected": -30.076095581054688, "loss": 0.6708, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0567438118159771, "rewards/margins": 0.08941256999969482, "rewards/rejected": -0.03266875073313713, "step": 90 }, { "epoch": 0.26, "grad_norm": 23.0, "learning_rate": 3.246753246753247e-06, "logits/chosen": -3.0026650428771973, "logits/rejected": -3.0032451152801514, "logps/chosen": -31.96946144104004, "logps/rejected": -30.823959350585938, "loss": 0.6746, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.052449680864810944, "rewards/margins": 0.05712243169546127, "rewards/rejected": -0.00467275083065033, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.8084466457366943, "eval_logits/rejected": -2.8058016300201416, "eval_logps/chosen": -31.25157928466797, "eval_logps/rejected": -34.729000091552734, "eval_loss": 0.6828339099884033, "eval_rewards/accuracies": 0.5693521499633789, "eval_rewards/chosen": 0.01852412335574627, "eval_rewards/margins": 0.0370328426361084, "eval_rewards/rejected": -0.01850871555507183, "eval_runtime": 113.0199, "eval_samples_per_second": 3.035, "eval_steps_per_second": 0.38, "step": 100 }, { "epoch": 0.29, "grad_norm": 30.75, "learning_rate": 3.5714285714285718e-06, "logits/chosen": -2.9552407264709473, "logits/rejected": -2.931591510772705, "logps/chosen": -32.020774841308594, "logps/rejected": -31.225658416748047, "loss": 0.6329, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11761404573917389, "rewards/margins": 0.1576995849609375, "rewards/rejected": -0.040085554122924805, "step": 110 }, { "epoch": 0.31, "grad_norm": 23.75, "learning_rate": 3.896103896103897e-06, "logits/chosen": -3.03885817527771, "logits/rejected": -3.067966938018799, "logps/chosen": -28.88214683532715, "logps/rejected": -34.20409393310547, "loss": 0.621, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.16905571520328522, "rewards/margins": 0.20298466086387634, "rewards/rejected": -0.03392895311117172, "step": 120 }, { "epoch": 0.34, "grad_norm": 19.375, "learning_rate": 4.220779220779221e-06, "logits/chosen": -2.741839647293091, "logits/rejected": -2.737023115158081, "logps/chosen": -28.76812171936035, "logps/rejected": -30.218597412109375, "loss": 0.6385, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.1239342913031578, "rewards/margins": 0.17480790615081787, "rewards/rejected": -0.050873614847660065, "step": 130 }, { "epoch": 0.36, "grad_norm": 20.875, "learning_rate": 4.5454545454545455e-06, "logits/chosen": -3.016112804412842, "logits/rejected": -3.013633966445923, "logps/chosen": -27.28360366821289, "logps/rejected": -31.76962661743164, "loss": 0.6517, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.12157317250967026, "rewards/margins": 0.18928703665733337, "rewards/rejected": -0.06771388649940491, "step": 140 }, { "epoch": 0.39, "grad_norm": 20.0, "learning_rate": 4.870129870129871e-06, "logits/chosen": -2.8128550052642822, "logits/rejected": -2.807783603668213, "logps/chosen": -27.434728622436523, "logps/rejected": -31.449132919311523, "loss": 0.5642, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.23446612060070038, "rewards/margins": 0.3697589635848999, "rewards/rejected": -0.13529284298419952, "step": 150 }, { "epoch": 0.42, "grad_norm": 26.5, "learning_rate": 4.999768804644796e-06, "logits/chosen": -3.129559278488159, "logits/rejected": -3.1123318672180176, "logps/chosen": -31.945110321044922, "logps/rejected": -29.27609634399414, "loss": 0.5266, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.32796427607536316, "rewards/margins": 0.5057547688484192, "rewards/rejected": -0.17779052257537842, "step": 160 }, { "epoch": 0.44, "grad_norm": 23.875, "learning_rate": 4.998356098992574e-06, "logits/chosen": -2.942965030670166, "logits/rejected": -2.950735569000244, "logps/chosen": -29.63945960998535, "logps/rejected": -31.524499893188477, "loss": 0.5615, "rewards/accuracies": 0.6875, "rewards/chosen": 0.15309393405914307, "rewards/margins": 0.408103883266449, "rewards/rejected": -0.2550099492073059, "step": 170 }, { "epoch": 0.47, "grad_norm": 23.25, "learning_rate": 4.9956598544545566e-06, "logits/chosen": -2.7956109046936035, "logits/rejected": -2.793761730194092, "logps/chosen": -29.341201782226562, "logps/rejected": -30.051944732666016, "loss": 0.5951, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.19718877971172333, "rewards/margins": 0.3674519658088684, "rewards/rejected": -0.17026321589946747, "step": 180 }, { "epoch": 0.49, "grad_norm": 14.0625, "learning_rate": 4.991681456235483e-06, "logits/chosen": -2.9083733558654785, "logits/rejected": -2.904571533203125, "logps/chosen": -29.67318344116211, "logps/rejected": -28.667194366455078, "loss": 0.5737, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.30811822414398193, "rewards/margins": 0.4939153790473938, "rewards/rejected": -0.18579718470573425, "step": 190 }, { "epoch": 0.52, "grad_norm": 13.375, "learning_rate": 4.986422948250881e-06, "logits/chosen": -2.9793169498443604, "logits/rejected": -2.967294216156006, "logps/chosen": -33.094722747802734, "logps/rejected": -30.4979248046875, "loss": 0.6195, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.36257725954055786, "rewards/margins": 0.40088003873825073, "rewards/rejected": -0.03830284625291824, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.8200676441192627, "eval_logits/rejected": -2.8176209926605225, "eval_logps/chosen": -31.358734130859375, "eval_logps/rejected": -34.91850280761719, "eval_loss": 0.6735296845436096, "eval_rewards/accuracies": 0.5510797500610352, "eval_rewards/chosen": -0.04576955363154411, "eval_rewards/margins": 0.08643829822540283, "eval_rewards/rejected": -0.13220785558223724, "eval_runtime": 112.8088, "eval_samples_per_second": 3.041, "eval_steps_per_second": 0.381, "step": 200 }, { "epoch": 0.55, "grad_norm": 20.125, "learning_rate": 4.9798870320769884e-06, "logits/chosen": -2.9180569648742676, "logits/rejected": -2.918910264968872, "logps/chosen": -32.37510299682617, "logps/rejected": -34.01622772216797, "loss": 0.5638, "rewards/accuracies": 0.6875, "rewards/chosen": 0.39498284459114075, "rewards/margins": 0.48783811926841736, "rewards/rejected": -0.0928553119301796, "step": 210 }, { "epoch": 0.57, "grad_norm": 16.125, "learning_rate": 4.9720770655628216e-06, "logits/chosen": -2.898266553878784, "logits/rejected": -2.913907527923584, "logps/chosen": -29.420312881469727, "logps/rejected": -28.794261932373047, "loss": 0.5663, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.45463672280311584, "rewards/margins": 0.6142338514328003, "rewards/rejected": -0.15959712862968445, "step": 220 }, { "epoch": 0.6, "grad_norm": 18.625, "learning_rate": 4.96299706110506e-06, "logits/chosen": -2.944633960723877, "logits/rejected": -2.948894500732422, "logps/chosen": -30.680644989013672, "logps/rejected": -31.86056137084961, "loss": 0.6034, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.28273314237594604, "rewards/margins": 0.34417831897735596, "rewards/rejected": -0.061445169150829315, "step": 230 }, { "epoch": 0.62, "grad_norm": 18.875, "learning_rate": 4.952651683586668e-06, "logits/chosen": -3.0014634132385254, "logits/rejected": -3.009148597717285, "logps/chosen": -29.685632705688477, "logps/rejected": -30.464202880859375, "loss": 0.4269, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.7315748333930969, "rewards/margins": 0.868951141834259, "rewards/rejected": -0.13737639784812927, "step": 240 }, { "epoch": 0.65, "grad_norm": 21.875, "learning_rate": 4.9410462479802945e-06, "logits/chosen": -2.8361315727233887, "logits/rejected": -2.826373338699341, "logps/chosen": -26.182703018188477, "logps/rejected": -29.741592407226562, "loss": 0.5404, "rewards/accuracies": 0.75, "rewards/chosen": 0.46290817856788635, "rewards/margins": 0.5888797640800476, "rewards/rejected": -0.12597161531448364, "step": 250 }, { "epoch": 0.68, "grad_norm": 12.3125, "learning_rate": 4.928186716617686e-06, "logits/chosen": -2.8184003829956055, "logits/rejected": -2.837860345840454, "logps/chosen": -28.799774169921875, "logps/rejected": -34.532344818115234, "loss": 0.5319, "rewards/accuracies": 0.75, "rewards/chosen": 0.6181110739707947, "rewards/margins": 0.8330680131912231, "rewards/rejected": -0.21495695412158966, "step": 260 }, { "epoch": 0.7, "grad_norm": 16.125, "learning_rate": 4.914079696126526e-06, "logits/chosen": -2.9628233909606934, "logits/rejected": -2.968907356262207, "logps/chosen": -29.94136619567871, "logps/rejected": -30.401519775390625, "loss": 0.4633, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.5039950609207153, "rewards/margins": 0.8257501721382141, "rewards/rejected": -0.32175517082214355, "step": 270 }, { "epoch": 0.73, "grad_norm": 13.5, "learning_rate": 4.8987324340362445e-06, "logits/chosen": -2.9749794006347656, "logits/rejected": -2.962801218032837, "logps/chosen": -29.97719955444336, "logps/rejected": -28.926509857177734, "loss": 0.5875, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.4466908574104309, "rewards/margins": 0.623786449432373, "rewards/rejected": -0.17709562182426453, "step": 280 }, { "epoch": 0.75, "grad_norm": 13.9375, "learning_rate": 4.882152815054587e-06, "logits/chosen": -2.903925657272339, "logits/rejected": -2.886340618133545, "logps/chosen": -31.33095932006836, "logps/rejected": -31.375301361083984, "loss": 0.3679, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.7832067608833313, "rewards/margins": 1.2944037914276123, "rewards/rejected": -0.511197030544281, "step": 290 }, { "epoch": 0.78, "grad_norm": 25.125, "learning_rate": 4.864349357016816e-06, "logits/chosen": -2.9058868885040283, "logits/rejected": -2.9024853706359863, "logps/chosen": -31.235179901123047, "logps/rejected": -28.030147552490234, "loss": 0.5567, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5924834609031677, "rewards/margins": 0.8484223484992981, "rewards/rejected": -0.25593873858451843, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.8230674266815186, "eval_logits/rejected": -2.8203024864196777, "eval_logps/chosen": -31.48798370361328, "eval_logps/rejected": -35.10240936279297, "eval_loss": 0.6810446381568909, "eval_rewards/accuracies": 0.5722591280937195, "eval_rewards/chosen": -0.12331710010766983, "eval_rewards/margins": 0.11923385411500931, "eval_rewards/rejected": -0.24255095422267914, "eval_runtime": 112.9994, "eval_samples_per_second": 3.035, "eval_steps_per_second": 0.381, "step": 300 }, { "epoch": 0.81, "grad_norm": 21.875, "learning_rate": 4.84533120650964e-06, "logits/chosen": -2.7906689643859863, "logits/rejected": -2.8061537742614746, "logps/chosen": -28.591060638427734, "logps/rejected": -31.535472869873047, "loss": 0.4812, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3830157220363617, "rewards/margins": 0.8650951385498047, "rewards/rejected": -0.48207932710647583, "step": 310 }, { "epoch": 0.83, "grad_norm": 13.6875, "learning_rate": 4.825108134172131e-06, "logits/chosen": -3.033379554748535, "logits/rejected": -3.0197348594665527, "logps/chosen": -29.19796371459961, "logps/rejected": -29.172719955444336, "loss": 0.4534, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6081727147102356, "rewards/margins": 1.120539903640747, "rewards/rejected": -0.5123672485351562, "step": 320 }, { "epoch": 0.86, "grad_norm": 8.0625, "learning_rate": 4.80369052967602e-06, "logits/chosen": -2.951292037963867, "logits/rejected": -2.9344191551208496, "logps/chosen": -27.382125854492188, "logps/rejected": -31.329797744750977, "loss": 0.4283, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5879670977592468, "rewards/margins": 1.1508591175079346, "rewards/rejected": -0.5628920793533325, "step": 330 }, { "epoch": 0.88, "grad_norm": 23.5, "learning_rate": 4.781089396387968e-06, "logits/chosen": -3.1658730506896973, "logits/rejected": -3.1736338138580322, "logps/chosen": -30.635177612304688, "logps/rejected": -33.81396484375, "loss": 0.4196, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.602415919303894, "rewards/margins": 1.2731926441192627, "rewards/rejected": -0.6707767248153687, "step": 340 }, { "epoch": 0.91, "grad_norm": 11.25, "learning_rate": 4.757316345716554e-06, "logits/chosen": -3.046379327774048, "logits/rejected": -3.0505118370056152, "logps/chosen": -29.554956436157227, "logps/rejected": -32.34663009643555, "loss": 0.4581, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.7511544823646545, "rewards/margins": 1.251146912574768, "rewards/rejected": -0.4999924600124359, "step": 350 }, { "epoch": 0.94, "grad_norm": 12.125, "learning_rate": 4.73238359114687e-06, "logits/chosen": -2.885371446609497, "logits/rejected": -2.887463331222534, "logps/chosen": -27.74961280822754, "logps/rejected": -30.798757553100586, "loss": 0.4613, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.5400681495666504, "rewards/margins": 1.1207592487335205, "rewards/rejected": -0.5806912183761597, "step": 360 }, { "epoch": 0.96, "grad_norm": 30.25, "learning_rate": 4.706303941965804e-06, "logits/chosen": -2.9629409313201904, "logits/rejected": -2.9607956409454346, "logps/chosen": -29.71516990661621, "logps/rejected": -32.788673400878906, "loss": 0.4285, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6428869962692261, "rewards/margins": 1.1592637300491333, "rewards/rejected": -0.5163766741752625, "step": 370 }, { "epoch": 0.99, "grad_norm": 16.125, "learning_rate": 4.679090796681225e-06, "logits/chosen": -2.915102005004883, "logits/rejected": -2.8993585109710693, "logps/chosen": -27.930139541625977, "logps/rejected": -28.983240127563477, "loss": 0.3961, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.6089375615119934, "rewards/margins": 1.2651110887527466, "rewards/rejected": -0.6561735272407532, "step": 380 }, { "epoch": 1.01, "grad_norm": 9.0625, "learning_rate": 4.650758136138454e-06, "logits/chosen": -3.20560884475708, "logits/rejected": -3.1781651973724365, "logps/chosen": -28.22250747680664, "logps/rejected": -36.23353576660156, "loss": 0.2827, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.8412960171699524, "rewards/margins": 1.9863579273223877, "rewards/rejected": -1.1450618505477905, "step": 390 }, { "epoch": 1.04, "grad_norm": 5.125, "learning_rate": 4.621320516337559e-06, "logits/chosen": -2.9704136848449707, "logits/rejected": -2.9771697521209717, "logps/chosen": -30.30599594116211, "logps/rejected": -31.767135620117188, "loss": 0.2251, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.1859118938446045, "rewards/margins": 2.4125428199768066, "rewards/rejected": -1.226630687713623, "step": 400 }, { "epoch": 1.04, "eval_logits/chosen": -2.8203864097595215, "eval_logits/rejected": -2.8175127506256104, "eval_logps/chosen": -31.82402229309082, "eval_logps/rejected": -35.526432037353516, "eval_loss": 0.6779412627220154, "eval_rewards/accuracies": 0.6013289093971252, "eval_rewards/chosen": -0.32494381070137024, "eval_rewards/margins": 0.17202156782150269, "eval_rewards/rejected": -0.4969654083251953, "eval_runtime": 112.9877, "eval_samples_per_second": 3.036, "eval_steps_per_second": 0.381, "step": 400 }, { "epoch": 1.06, "grad_norm": 10.5, "learning_rate": 4.590793060955158e-06, "logits/chosen": -2.91953706741333, "logits/rejected": -2.9032034873962402, "logps/chosen": -26.497915267944336, "logps/rejected": -30.3842716217041, "loss": 0.1921, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7503674030303955, "rewards/margins": 2.4338011741638184, "rewards/rejected": -1.6834341287612915, "step": 410 }, { "epoch": 1.09, "grad_norm": 3.71875, "learning_rate": 4.559191453574582e-06, "logits/chosen": -2.958489179611206, "logits/rejected": -2.975358486175537, "logps/chosen": -30.06867027282715, "logps/rejected": -29.58856773376465, "loss": 0.2889, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 1.2172677516937256, "rewards/margins": 2.2790169715881348, "rewards/rejected": -1.0617492198944092, "step": 420 }, { "epoch": 1.12, "grad_norm": 6.4375, "learning_rate": 4.52653192962838e-06, "logits/chosen": -2.9180357456207275, "logits/rejected": -2.9390196800231934, "logps/chosen": -27.29958152770996, "logps/rejected": -33.08103942871094, "loss": 0.1982, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8723928332328796, "rewards/margins": 2.6052730083465576, "rewards/rejected": -1.7328803539276123, "step": 430 }, { "epoch": 1.14, "grad_norm": 7.875, "learning_rate": 4.492831268057307e-06, "logits/chosen": -3.005812168121338, "logits/rejected": -2.991055727005005, "logps/chosen": -32.43821716308594, "logps/rejected": -34.362449645996094, "loss": 0.1944, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1018731594085693, "rewards/margins": 2.664522886276245, "rewards/rejected": -1.5626493692398071, "step": 440 }, { "epoch": 1.17, "grad_norm": 7.4375, "learning_rate": 4.458106782690094e-06, "logits/chosen": -2.835895538330078, "logits/rejected": -2.841238021850586, "logps/chosen": -27.996952056884766, "logps/rejected": -33.16737365722656, "loss": 0.1935, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.2257856130599976, "rewards/margins": 2.535287857055664, "rewards/rejected": -1.309502124786377, "step": 450 }, { "epoch": 1.19, "grad_norm": 4.75, "learning_rate": 4.422376313348405e-06, "logits/chosen": -2.908581256866455, "logits/rejected": -2.8960087299346924, "logps/chosen": -28.098934173583984, "logps/rejected": -37.033470153808594, "loss": 0.1398, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1988991498947144, "rewards/margins": 3.1904969215393066, "rewards/rejected": -1.9915975332260132, "step": 460 }, { "epoch": 1.22, "grad_norm": 35.5, "learning_rate": 4.3856582166815696e-06, "logits/chosen": -2.961662530899048, "logits/rejected": -2.967067241668701, "logps/chosen": -30.011981964111328, "logps/rejected": -33.761478424072266, "loss": 0.2217, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.2641057968139648, "rewards/margins": 2.9333133697509766, "rewards/rejected": -1.6692078113555908, "step": 470 }, { "epoch": 1.25, "grad_norm": 6.375, "learning_rate": 4.347971356735789e-06, "logits/chosen": -2.897602081298828, "logits/rejected": -2.903383255004883, "logps/chosen": -26.06301498413086, "logps/rejected": -32.249534606933594, "loss": 0.2337, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.9800997972488403, "rewards/margins": 2.5127570629119873, "rewards/rejected": -1.532657504081726, "step": 480 }, { "epoch": 1.27, "grad_norm": 4.9375, "learning_rate": 4.309335095262675e-06, "logits/chosen": -2.9619221687316895, "logits/rejected": -2.9760589599609375, "logps/chosen": -30.95633888244629, "logps/rejected": -33.317893981933594, "loss": 0.1951, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0749385356903076, "rewards/margins": 2.807677745819092, "rewards/rejected": -1.7327392101287842, "step": 490 }, { "epoch": 1.3, "grad_norm": 25.25, "learning_rate": 4.269769281772082e-06, "logits/chosen": -3.1099023818969727, "logits/rejected": -3.0993502140045166, "logps/chosen": -27.969009399414062, "logps/rejected": -36.54082489013672, "loss": 0.2082, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.0757564306259155, "rewards/margins": 3.2704696655273438, "rewards/rejected": -2.1947131156921387, "step": 500 }, { "epoch": 1.3, "eval_logits/chosen": -2.8486714363098145, "eval_logits/rejected": -2.8475496768951416, "eval_logps/chosen": -31.971736907958984, "eval_logps/rejected": -35.81857681274414, "eval_loss": 0.6858980655670166, "eval_rewards/accuracies": 0.6092192530632019, "eval_rewards/chosen": -0.413571298122406, "eval_rewards/margins": 0.2586813271045685, "eval_rewards/rejected": -0.6722525954246521, "eval_runtime": 112.9971, "eval_samples_per_second": 3.035, "eval_steps_per_second": 0.381, "step": 500 }, { "epoch": 1.32, "grad_norm": 13.8125, "learning_rate": 4.22929424333435e-06, "logits/chosen": -2.9114999771118164, "logits/rejected": -2.9171700477600098, "logps/chosen": -28.95807456970215, "logps/rejected": -32.554012298583984, "loss": 0.1718, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.2146211862564087, "rewards/margins": 3.0521254539489746, "rewards/rejected": -1.8375046253204346, "step": 510 }, { "epoch": 1.35, "grad_norm": 9.5, "learning_rate": 4.1879307741372085e-06, "logits/chosen": -2.8482823371887207, "logits/rejected": -2.851170063018799, "logps/chosen": -27.4453182220459, "logps/rejected": -32.90190124511719, "loss": 0.2558, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.3344898223876953, "rewards/margins": 3.184781789779663, "rewards/rejected": -1.8502919673919678, "step": 520 }, { "epoch": 1.38, "grad_norm": 4.53125, "learning_rate": 4.145700124802693e-06, "logits/chosen": -2.9950766563415527, "logits/rejected": -2.980553150177002, "logps/chosen": -28.1324462890625, "logps/rejected": -33.559776306152344, "loss": 0.1474, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.1568902730941772, "rewards/margins": 3.404855728149414, "rewards/rejected": -2.2479655742645264, "step": 530 }, { "epoch": 1.4, "grad_norm": 5.15625, "learning_rate": 4.102623991469562e-06, "logits/chosen": -3.148716688156128, "logits/rejected": -3.149186134338379, "logps/chosen": -27.875808715820312, "logps/rejected": -33.190120697021484, "loss": 0.2048, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.2375982999801636, "rewards/margins": 2.9263930320739746, "rewards/rejected": -1.688794493675232, "step": 540 }, { "epoch": 1.43, "grad_norm": 15.375, "learning_rate": 4.058724504646834e-06, "logits/chosen": -3.111207962036133, "logits/rejected": -3.115915536880493, "logps/chosen": -29.704254150390625, "logps/rejected": -30.918773651123047, "loss": 0.2088, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.1263028383255005, "rewards/margins": 2.9283792972564697, "rewards/rejected": -1.8020765781402588, "step": 550 }, { "epoch": 1.45, "grad_norm": 1.6328125, "learning_rate": 4.014024217844167e-06, "logits/chosen": -2.8809523582458496, "logits/rejected": -2.8608622550964355, "logps/chosen": -28.932674407958984, "logps/rejected": -31.60831642150879, "loss": 0.2277, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.2164714336395264, "rewards/margins": 2.8543777465820312, "rewards/rejected": -1.6379063129425049, "step": 560 }, { "epoch": 1.48, "grad_norm": 9.5625, "learning_rate": 3.968546095984911e-06, "logits/chosen": -3.127673625946045, "logits/rejected": -3.1156859397888184, "logps/chosen": -28.24026870727539, "logps/rejected": -30.688241958618164, "loss": 0.203, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.3376219272613525, "rewards/margins": 2.8865649700164795, "rewards/rejected": -1.548943042755127, "step": 570 }, { "epoch": 1.51, "grad_norm": 6.9375, "learning_rate": 3.922313503607806e-06, "logits/chosen": -3.031052827835083, "logits/rejected": -3.0235180854797363, "logps/chosen": -27.095932006835938, "logps/rejected": -34.802921295166016, "loss": 0.1872, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.1634767055511475, "rewards/margins": 3.093201160430908, "rewards/rejected": -1.9297244548797607, "step": 580 }, { "epoch": 1.53, "grad_norm": 6.40625, "learning_rate": 3.875350192863368e-06, "logits/chosen": -2.850231885910034, "logits/rejected": -2.8259904384613037, "logps/chosen": -26.238988876342773, "logps/rejected": -30.916696548461914, "loss": 0.1868, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7704905271530151, "rewards/margins": 2.6334352493286133, "rewards/rejected": -1.8629448413848877, "step": 590 }, { "epoch": 1.56, "grad_norm": 5.875, "learning_rate": 3.8276802913111436e-06, "logits/chosen": -2.9241130352020264, "logits/rejected": -2.9332339763641357, "logps/chosen": -27.865304946899414, "logps/rejected": -32.76335525512695, "loss": 0.2119, "rewards/accuracies": 0.9375, "rewards/chosen": 1.113793134689331, "rewards/margins": 3.0938680171966553, "rewards/rejected": -1.9800748825073242, "step": 600 }, { "epoch": 1.56, "eval_logits/chosen": -2.8321738243103027, "eval_logits/rejected": -2.8301031589508057, "eval_logps/chosen": -32.18595886230469, "eval_logps/rejected": -36.014678955078125, "eval_loss": 0.699341893196106, "eval_rewards/accuracies": 0.5926079750061035, "eval_rewards/chosen": -0.5421043038368225, "eval_rewards/margins": 0.24781100451946259, "eval_rewards/rejected": -0.7899153232574463, "eval_runtime": 112.7941, "eval_samples_per_second": 3.041, "eval_steps_per_second": 0.381, "step": 600 }, { "epoch": 1.58, "grad_norm": 13.5625, "learning_rate": 3.7793282895240927e-06, "logits/chosen": -2.946056842803955, "logits/rejected": -2.9427828788757324, "logps/chosen": -30.479598999023438, "logps/rejected": -33.436546325683594, "loss": 0.1429, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.316603660583496, "rewards/margins": 3.323195219039917, "rewards/rejected": -2.006591558456421, "step": 610 }, { "epoch": 1.61, "grad_norm": 9.625, "learning_rate": 3.730319028506478e-06, "logits/chosen": -3.014833927154541, "logits/rejected": -3.022573947906494, "logps/chosen": -28.9743595123291, "logps/rejected": -34.54629898071289, "loss": 0.1341, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.335627555847168, "rewards/margins": 3.2767157554626465, "rewards/rejected": -1.941088080406189, "step": 620 }, { "epoch": 1.64, "grad_norm": 22.125, "learning_rate": 3.6806776869317074e-06, "logits/chosen": -2.894308090209961, "logits/rejected": -2.8933069705963135, "logps/chosen": -27.416006088256836, "logps/rejected": -31.5345401763916, "loss": 0.2028, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4153752326965332, "rewards/margins": 3.449214458465576, "rewards/rejected": -2.033839702606201, "step": 630 }, { "epoch": 1.66, "grad_norm": 8.375, "learning_rate": 3.6304297682067146e-06, "logits/chosen": -2.82157301902771, "logits/rejected": -2.8144097328186035, "logps/chosen": -28.129154205322266, "logps/rejected": -32.73756408691406, "loss": 0.1817, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2250679731369019, "rewards/margins": 3.056783676147461, "rewards/rejected": -1.8317155838012695, "step": 640 }, { "epoch": 1.69, "grad_norm": 8.625, "learning_rate": 3.579601087369492e-06, "logits/chosen": -3.041685104370117, "logits/rejected": -3.0316648483276367, "logps/chosen": -29.226932525634766, "logps/rejected": -35.155330657958984, "loss": 0.1016, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.4800013303756714, "rewards/margins": 3.3574302196502686, "rewards/rejected": -1.877428650856018, "step": 650 }, { "epoch": 1.71, "grad_norm": 34.0, "learning_rate": 3.5282177578265295e-06, "logits/chosen": -2.9072232246398926, "logits/rejected": -2.9152984619140625, "logps/chosen": -29.22675132751465, "logps/rejected": -32.41046905517578, "loss": 0.1796, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.361181616783142, "rewards/margins": 3.326385974884033, "rewards/rejected": -1.9652040004730225, "step": 660 }, { "epoch": 1.74, "grad_norm": 5.46875, "learning_rate": 3.476306177936961e-06, "logits/chosen": -2.8980605602264404, "logits/rejected": -2.9036636352539062, "logps/chosen": -28.326217651367188, "logps/rejected": -33.606929779052734, "loss": 0.0961, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4718984365463257, "rewards/margins": 3.5891189575195312, "rewards/rejected": -2.117220401763916, "step": 670 }, { "epoch": 1.77, "grad_norm": 7.34375, "learning_rate": 3.423893017450324e-06, "logits/chosen": -3.068453311920166, "logits/rejected": -3.0703165531158447, "logps/chosen": -26.69721031188965, "logps/rejected": -35.200775146484375, "loss": 0.1478, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6269474029541016, "rewards/margins": 3.727607011795044, "rewards/rejected": -2.1006596088409424, "step": 680 }, { "epoch": 1.79, "grad_norm": 9.8125, "learning_rate": 3.3710052038048794e-06, "logits/chosen": -3.1004300117492676, "logits/rejected": -3.0753302574157715, "logps/chosen": -29.50313377380371, "logps/rejected": -35.621055603027344, "loss": 0.1709, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.4712846279144287, "rewards/margins": 3.5642027854919434, "rewards/rejected": -2.0929179191589355, "step": 690 }, { "epoch": 1.82, "grad_norm": 7.125, "learning_rate": 3.3176699082935546e-06, "logits/chosen": -3.0888259410858154, "logits/rejected": -3.082875967025757, "logps/chosen": -26.177494049072266, "logps/rejected": -36.16416549682617, "loss": 0.1579, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.642321228981018, "rewards/margins": 3.7625675201416016, "rewards/rejected": -2.120246410369873, "step": 700 }, { "epoch": 1.82, "eval_logits/chosen": -2.828380584716797, "eval_logits/rejected": -2.8261237144470215, "eval_logps/chosen": -32.29281997680664, "eval_logps/rejected": -36.07339096069336, "eval_loss": 0.7178195118904114, "eval_rewards/accuracies": 0.5805647969245911, "eval_rewards/chosen": -0.6062225699424744, "eval_rewards/margins": 0.21891874074935913, "eval_rewards/rejected": -0.8251413702964783, "eval_runtime": 112.945, "eval_samples_per_second": 3.037, "eval_steps_per_second": 0.381, "step": 700 }, { "epoch": 1.84, "grad_norm": 4.875, "learning_rate": 3.2639145321045933e-06, "logits/chosen": -2.907222270965576, "logits/rejected": -2.915740728378296, "logps/chosen": -29.818323135375977, "logps/rejected": -35.702247619628906, "loss": 0.184, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.5511611700057983, "rewards/margins": 3.508523464202881, "rewards/rejected": -1.9573619365692139, "step": 710 }, { "epoch": 1.87, "grad_norm": 13.25, "learning_rate": 3.2097666922441107e-06, "logits/chosen": -3.004633665084839, "logits/rejected": -3.0026512145996094, "logps/chosen": -30.316162109375, "logps/rejected": -33.810691833496094, "loss": 0.1853, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.3776910305023193, "rewards/margins": 3.3225979804992676, "rewards/rejected": -1.9449069499969482, "step": 720 }, { "epoch": 1.9, "grad_norm": 7.4375, "learning_rate": 3.1552542073477554e-06, "logits/chosen": -2.9351017475128174, "logits/rejected": -2.942718982696533, "logps/chosen": -25.93600082397461, "logps/rejected": -32.751060485839844, "loss": 0.1238, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3286634683609009, "rewards/margins": 3.64039945602417, "rewards/rejected": -2.3117361068725586, "step": 730 }, { "epoch": 1.92, "grad_norm": 5.59375, "learning_rate": 3.100405083388799e-06, "logits/chosen": -3.0043129920959473, "logits/rejected": -3.0114331245422363, "logps/chosen": -28.998254776000977, "logps/rejected": -40.23419952392578, "loss": 0.1188, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4500622749328613, "rewards/margins": 4.418519020080566, "rewards/rejected": -2.968456745147705, "step": 740 }, { "epoch": 1.95, "grad_norm": 9.9375, "learning_rate": 3.0452474992899645e-06, "logits/chosen": -3.0003299713134766, "logits/rejected": -2.9911446571350098, "logps/chosen": -31.06940269470215, "logps/rejected": -36.11074447631836, "loss": 0.2631, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.3671112060546875, "rewards/margins": 3.5888779163360596, "rewards/rejected": -2.221766233444214, "step": 750 }, { "epoch": 1.97, "grad_norm": 5.90625, "learning_rate": 2.989809792446417e-06, "logits/chosen": -2.9182653427124023, "logits/rejected": -2.9267349243164062, "logps/chosen": -28.694416046142578, "logps/rejected": -32.506526947021484, "loss": 0.146, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3879867792129517, "rewards/margins": 3.4621334075927734, "rewards/rejected": -2.0741465091705322, "step": 760 }, { "epoch": 2.0, "grad_norm": 11.75, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -2.920984983444214, "logits/rejected": -2.9358911514282227, "logps/chosen": -28.243701934814453, "logps/rejected": -35.78112030029297, "loss": 0.2048, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.52691650390625, "rewards/margins": 3.398749589920044, "rewards/rejected": -1.8718332052230835, "step": 770 }, { "epoch": 2.03, "grad_norm": 1.671875, "learning_rate": 2.878208065043501e-06, "logits/chosen": -2.994871139526367, "logits/rejected": -2.987257719039917, "logps/chosen": -29.053070068359375, "logps/rejected": -33.76008605957031, "loss": 0.0836, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.9706106185913086, "rewards/margins": 4.18499755859375, "rewards/rejected": -2.214386463165283, "step": 780 }, { "epoch": 2.05, "grad_norm": 2.75, "learning_rate": 2.8221013802485974e-06, "logits/chosen": -2.9811806678771973, "logits/rejected": -2.9775445461273193, "logps/chosen": -24.44116973876953, "logps/rejected": -34.26652145385742, "loss": 0.057, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.5414597988128662, "rewards/margins": 4.266219615936279, "rewards/rejected": -2.724759817123413, "step": 790 }, { "epoch": 2.08, "grad_norm": 0.69921875, "learning_rate": 2.76582921478147e-06, "logits/chosen": -2.921146869659424, "logits/rejected": -2.902113199234009, "logps/chosen": -27.294260025024414, "logps/rejected": -36.3679084777832, "loss": 0.0649, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.8000681400299072, "rewards/margins": 4.472109794616699, "rewards/rejected": -2.672041416168213, "step": 800 }, { "epoch": 2.08, "eval_logits/chosen": -2.8270516395568848, "eval_logits/rejected": -2.824305772781372, "eval_logps/chosen": -32.480831146240234, "eval_logps/rejected": -36.364830017089844, "eval_loss": 0.7259832620620728, "eval_rewards/accuracies": 0.6071428656578064, "eval_rewards/chosen": -0.7190301418304443, "eval_rewards/margins": 0.28097450733184814, "eval_rewards/rejected": -1.0000046491622925, "eval_runtime": 112.9479, "eval_samples_per_second": 3.037, "eval_steps_per_second": 0.381, "step": 800 }, { "epoch": 2.1, "grad_norm": 0.515625, "learning_rate": 2.7094204786572254e-06, "logits/chosen": -3.0023553371429443, "logits/rejected": -3.0219783782958984, "logps/chosen": -28.510021209716797, "logps/rejected": -36.82685089111328, "loss": 0.0608, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.8590720891952515, "rewards/margins": 5.111330986022949, "rewards/rejected": -3.252258777618408, "step": 810 }, { "epoch": 2.13, "grad_norm": 4.5625, "learning_rate": 2.6529041520546072e-06, "logits/chosen": -2.9950449466705322, "logits/rejected": -2.9868369102478027, "logps/chosen": -29.983810424804688, "logps/rejected": -34.7702522277832, "loss": 0.0753, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.7618064880371094, "rewards/margins": 4.724579811096191, "rewards/rejected": -2.962772846221924, "step": 820 }, { "epoch": 2.16, "grad_norm": 3.90625, "learning_rate": 2.5963092704273302e-06, "logits/chosen": -2.873569965362549, "logits/rejected": -2.872542381286621, "logps/chosen": -28.7579345703125, "logps/rejected": -30.285070419311523, "loss": 0.0967, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7464784383773804, "rewards/margins": 4.3574628829956055, "rewards/rejected": -2.6109836101531982, "step": 830 }, { "epoch": 2.18, "grad_norm": 1.328125, "learning_rate": 2.53966490958702e-06, "logits/chosen": -2.976682662963867, "logits/rejected": -2.9772753715515137, "logps/chosen": -29.0651912689209, "logps/rejected": -31.696544647216797, "loss": 0.0447, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.118293285369873, "rewards/margins": 4.679741859436035, "rewards/rejected": -2.561448335647583, "step": 840 }, { "epoch": 2.21, "grad_norm": 13.6875, "learning_rate": 2.4830001707654135e-06, "logits/chosen": -3.0549464225769043, "logits/rejected": -3.056971549987793, "logps/chosen": -26.48697280883789, "logps/rejected": -35.12908935546875, "loss": 0.05, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.833518385887146, "rewards/margins": 4.853404521942139, "rewards/rejected": -3.0198864936828613, "step": 850 }, { "epoch": 2.23, "grad_norm": 1.3828125, "learning_rate": 2.4263441656635054e-06, "logits/chosen": -2.8957679271698, "logits/rejected": -2.9036927223205566, "logps/chosen": -21.315093994140625, "logps/rejected": -32.645179748535156, "loss": 0.0701, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7470932006835938, "rewards/margins": 4.7478485107421875, "rewards/rejected": -3.000755786895752, "step": 860 }, { "epoch": 2.26, "grad_norm": 1.2109375, "learning_rate": 2.3697260014953107e-06, "logits/chosen": -3.064497470855713, "logits/rejected": -3.0464019775390625, "logps/chosen": -28.66514015197754, "logps/rejected": -32.98708724975586, "loss": 0.0539, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.899011254310608, "rewards/margins": 4.9983906745910645, "rewards/rejected": -3.099379301071167, "step": 870 }, { "epoch": 2.29, "grad_norm": 1.953125, "learning_rate": 2.3131747660339396e-06, "logits/chosen": -2.961402416229248, "logits/rejected": -2.941373348236084, "logps/chosen": -28.94051742553711, "logps/rejected": -35.915077209472656, "loss": 0.1267, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2919337749481201, "rewards/margins": 4.449277877807617, "rewards/rejected": -3.1573433876037598, "step": 880 }, { "epoch": 2.31, "grad_norm": 4.96875, "learning_rate": 2.256719512667651e-06, "logits/chosen": -2.942389726638794, "logits/rejected": -2.9418246746063232, "logps/chosen": -30.278453826904297, "logps/rejected": -39.759918212890625, "loss": 0.0258, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6221895217895508, "rewards/margins": 5.620645523071289, "rewards/rejected": -3.998455762863159, "step": 890 }, { "epoch": 2.34, "grad_norm": 1.8671875, "learning_rate": 2.2003892454735786e-06, "logits/chosen": -2.9577229022979736, "logits/rejected": -2.975241184234619, "logps/chosen": -28.08145523071289, "logps/rejected": -37.178321838378906, "loss": 0.1014, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1683677434921265, "rewards/margins": 5.182019233703613, "rewards/rejected": -4.0136518478393555, "step": 900 }, { "epoch": 2.34, "eval_logits/chosen": -2.8304085731506348, "eval_logits/rejected": -2.827810287475586, "eval_logps/chosen": -32.957435607910156, "eval_logps/rejected": -36.925601959228516, "eval_loss": 0.775818407535553, "eval_rewards/accuracies": 0.5830564498901367, "eval_rewards/chosen": -1.0049890279769897, "eval_rewards/margins": 0.331479549407959, "eval_rewards/rejected": -1.3364684581756592, "eval_runtime": 112.9719, "eval_samples_per_second": 3.036, "eval_steps_per_second": 0.381, "step": 900 }, { "epoch": 2.36, "grad_norm": 4.4375, "learning_rate": 2.1442129043167877e-06, "logits/chosen": -2.9465219974517822, "logits/rejected": -2.936427354812622, "logps/chosen": -29.5849666595459, "logps/rejected": -38.474422454833984, "loss": 0.0556, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.0259087085723877, "rewards/margins": 5.3213396072387695, "rewards/rejected": -4.295430660247803, "step": 910 }, { "epoch": 2.39, "grad_norm": 4.5, "learning_rate": 2.088219349982323e-06, "logits/chosen": -3.0080857276916504, "logits/rejected": -2.988001823425293, "logps/chosen": -30.026147842407227, "logps/rejected": -35.326351165771484, "loss": 0.0998, "rewards/accuracies": 0.9375, "rewards/chosen": 1.7186342477798462, "rewards/margins": 5.033928871154785, "rewards/rejected": -3.3152947425842285, "step": 920 }, { "epoch": 2.42, "grad_norm": 4.375, "learning_rate": 2.0324373493478803e-06, "logits/chosen": -2.9182701110839844, "logits/rejected": -2.9180097579956055, "logps/chosen": -27.986026763916016, "logps/rejected": -38.27654266357422, "loss": 0.0346, "rewards/accuracies": 1.0, "rewards/chosen": 1.2587478160858154, "rewards/margins": 5.192648410797119, "rewards/rejected": -3.933900833129883, "step": 930 }, { "epoch": 2.44, "grad_norm": 1.484375, "learning_rate": 1.976895560604729e-06, "logits/chosen": -3.001537561416626, "logits/rejected": -3.0352466106414795, "logps/chosen": -27.144699096679688, "logps/rejected": -36.409183502197266, "loss": 0.0498, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.4297518730163574, "rewards/margins": 5.339670658111572, "rewards/rejected": -3.9099185466766357, "step": 940 }, { "epoch": 2.47, "grad_norm": 3.109375, "learning_rate": 1.921622518534466e-06, "logits/chosen": -2.944477081298828, "logits/rejected": -2.947420835494995, "logps/chosen": -27.510986328125, "logps/rejected": -38.851806640625, "loss": 0.0953, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9855767488479614, "rewards/margins": 5.092670917510986, "rewards/rejected": -4.107093334197998, "step": 950 }, { "epoch": 2.49, "grad_norm": 0.5859375, "learning_rate": 1.8666466198491794e-06, "logits/chosen": -3.0110037326812744, "logits/rejected": -2.9924447536468506, "logps/chosen": -28.574935913085938, "logps/rejected": -40.97336959838867, "loss": 0.0474, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.5641006231307983, "rewards/margins": 5.6208696365356445, "rewards/rejected": -4.056769371032715, "step": 960 }, { "epoch": 2.52, "grad_norm": 0.51953125, "learning_rate": 1.8119961086025376e-06, "logits/chosen": -3.15667462348938, "logits/rejected": -3.160064220428467, "logps/chosen": -26.655324935913086, "logps/rejected": -37.378997802734375, "loss": 0.0603, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3655481338500977, "rewards/margins": 5.361147880554199, "rewards/rejected": -3.9955997467041016, "step": 970 }, { "epoch": 2.55, "grad_norm": 4.125, "learning_rate": 1.7576990616793139e-06, "logits/chosen": -2.9752297401428223, "logits/rejected": -3.0014469623565674, "logps/chosen": -27.867412567138672, "logps/rejected": -37.458885192871094, "loss": 0.0588, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.6494334936141968, "rewards/margins": 5.35553503036499, "rewards/rejected": -3.706101655960083, "step": 980 }, { "epoch": 2.57, "grad_norm": 3.078125, "learning_rate": 1.7037833743707892e-06, "logits/chosen": -2.8809103965759277, "logits/rejected": -2.879664897918701, "logps/chosen": -31.8229923248291, "logps/rejected": -35.25844955444336, "loss": 0.1466, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.2134218215942383, "rewards/margins": 4.447299003601074, "rewards/rejected": -3.233877182006836, "step": 990 }, { "epoch": 2.6, "grad_norm": 3.421875, "learning_rate": 1.6502767460434588e-06, "logits/chosen": -3.044959783554077, "logits/rejected": -3.0451231002807617, "logps/chosen": -29.768239974975586, "logps/rejected": -38.18815231323242, "loss": 0.0425, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.273899793624878, "rewards/margins": 4.840017795562744, "rewards/rejected": -3.5661182403564453, "step": 1000 }, { "epoch": 2.6, "eval_logits/chosen": -2.82666015625, "eval_logits/rejected": -2.8237953186035156, "eval_logps/chosen": -33.11478042602539, "eval_logps/rejected": -37.10795974731445, "eval_loss": 0.7951747179031372, "eval_rewards/accuracies": 0.5826411843299866, "eval_rewards/chosen": -1.099395990371704, "eval_rewards/margins": 0.34648579359054565, "eval_rewards/rejected": -1.445881724357605, "eval_runtime": 112.9893, "eval_samples_per_second": 3.036, "eval_steps_per_second": 0.381, "step": 1000 }, { "epoch": 2.62, "grad_norm": 23.875, "learning_rate": 1.5972066659083796e-06, "logits/chosen": -2.985290288925171, "logits/rejected": -2.9671034812927246, "logps/chosen": -28.381534576416016, "logps/rejected": -37.65121078491211, "loss": 0.1117, "rewards/accuracies": 0.9375, "rewards/chosen": 1.111913800239563, "rewards/margins": 5.119040489196777, "rewards/rejected": -4.007126808166504, "step": 1010 }, { "epoch": 2.65, "grad_norm": 26.875, "learning_rate": 1.5446003988985041e-06, "logits/chosen": -2.9878487586975098, "logits/rejected": -2.9929039478302, "logps/chosen": -26.1834774017334, "logps/rejected": -35.29491424560547, "loss": 0.08, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.1799854040145874, "rewards/margins": 5.289762496948242, "rewards/rejected": -4.109776973724365, "step": 1020 }, { "epoch": 2.68, "grad_norm": 1.25, "learning_rate": 1.4924849716612211e-06, "logits/chosen": -2.9306976795196533, "logits/rejected": -2.9175031185150146, "logps/chosen": -28.533618927001953, "logps/rejected": -37.49860382080078, "loss": 0.0675, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2261769771575928, "rewards/margins": 5.032643795013428, "rewards/rejected": -3.806466579437256, "step": 1030 }, { "epoch": 2.7, "grad_norm": 3.1875, "learning_rate": 1.440887158673332e-06, "logits/chosen": -2.8356387615203857, "logits/rejected": -2.852898120880127, "logps/chosen": -30.96673011779785, "logps/rejected": -39.19948196411133, "loss": 0.0475, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2193964719772339, "rewards/margins": 5.134554862976074, "rewards/rejected": -3.9151573181152344, "step": 1040 }, { "epoch": 2.73, "grad_norm": 2.65625, "learning_rate": 1.3898334684855647e-06, "logits/chosen": -2.9140465259552, "logits/rejected": -2.9217119216918945, "logps/chosen": -26.750823974609375, "logps/rejected": -36.618751525878906, "loss": 0.0998, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.5606456995010376, "rewards/margins": 5.167923927307129, "rewards/rejected": -3.6072781085968018, "step": 1050 }, { "epoch": 2.75, "grad_norm": 6.96875, "learning_rate": 1.3393501301037245e-06, "logits/chosen": -3.0197348594665527, "logits/rejected": -3.012899875640869, "logps/chosen": -27.697092056274414, "logps/rejected": -37.08921813964844, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 1.4635608196258545, "rewards/margins": 5.2141265869140625, "rewards/rejected": -3.750566005706787, "step": 1060 }, { "epoch": 2.78, "grad_norm": 1.5234375, "learning_rate": 1.2894630795134454e-06, "logits/chosen": -2.9200081825256348, "logits/rejected": -2.934675455093384, "logps/chosen": -29.083492279052734, "logps/rejected": -38.6815071105957, "loss": 0.0528, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.4372832775115967, "rewards/margins": 5.663827419281006, "rewards/rejected": -4.226544380187988, "step": 1070 }, { "epoch": 2.81, "grad_norm": 2.15625, "learning_rate": 1.2401979463554984e-06, "logits/chosen": -2.8056931495666504, "logits/rejected": -2.7873711585998535, "logps/chosen": -29.705490112304688, "logps/rejected": -36.016380310058594, "loss": 0.1643, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.2028892040252686, "rewards/margins": 4.897210121154785, "rewards/rejected": -3.6943202018737793, "step": 1080 }, { "epoch": 2.83, "grad_norm": 1.359375, "learning_rate": 1.1915800407584705e-06, "logits/chosen": -2.8858981132507324, "logits/rejected": -2.875478506088257, "logps/chosen": -29.841075897216797, "logps/rejected": -34.505531311035156, "loss": 0.0712, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.508385419845581, "rewards/margins": 5.07808780670166, "rewards/rejected": -3.5697035789489746, "step": 1090 }, { "epoch": 2.86, "grad_norm": 4.59375, "learning_rate": 1.1436343403356019e-06, "logits/chosen": -3.0002076625823975, "logits/rejected": -2.9881558418273926, "logps/chosen": -29.46624183654785, "logps/rejected": -39.757606506347656, "loss": 0.0878, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.5060511827468872, "rewards/margins": 5.485553741455078, "rewards/rejected": -3.9795022010803223, "step": 1100 }, { "epoch": 2.86, "eval_logits/chosen": -2.8282854557037354, "eval_logits/rejected": -2.825747489929199, "eval_logps/chosen": -33.10422134399414, "eval_logps/rejected": -37.09624099731445, "eval_loss": 0.7929172515869141, "eval_rewards/accuracies": 0.5888704061508179, "eval_rewards/chosen": -1.0930629968643188, "eval_rewards/margins": 0.3457900285720825, "eval_rewards/rejected": -1.4388530254364014, "eval_runtime": 113.0018, "eval_samples_per_second": 3.035, "eval_steps_per_second": 0.381, "step": 1100 }, { "epoch": 2.88, "grad_norm": 2.296875, "learning_rate": 1.0963854773524548e-06, "logits/chosen": -2.8882246017456055, "logits/rejected": -2.8708715438842773, "logps/chosen": -30.411975860595703, "logps/rejected": -40.37923049926758, "loss": 0.0861, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3059101104736328, "rewards/margins": 5.2735981941223145, "rewards/rejected": -3.967688798904419, "step": 1110 }, { "epoch": 2.91, "grad_norm": 1.5234375, "learning_rate": 1.049857726072005e-06, "logits/chosen": -2.9631104469299316, "logits/rejected": -2.9671130180358887, "logps/chosen": -27.215435028076172, "logps/rejected": -36.89886474609375, "loss": 0.0417, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3935225009918213, "rewards/margins": 5.249666690826416, "rewards/rejected": -3.8561434745788574, "step": 1120 }, { "epoch": 2.94, "grad_norm": 2.1875, "learning_rate": 1.0040749902836508e-06, "logits/chosen": -2.940502643585205, "logits/rejected": -2.9377074241638184, "logps/chosen": -25.027814865112305, "logps/rejected": -33.21368408203125, "loss": 0.0557, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.6164283752441406, "rewards/margins": 5.1664042472839355, "rewards/rejected": -3.549975872039795, "step": 1130 }, { "epoch": 2.96, "grad_norm": 0.61328125, "learning_rate": 9.59060791022566e-07, "logits/chosen": -2.8224692344665527, "logits/rejected": -2.817772626876831, "logps/chosen": -29.64740562438965, "logps/rejected": -36.215633392333984, "loss": 0.0744, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.1733062267303467, "rewards/margins": 4.800383567810059, "rewards/rejected": -3.6270766258239746, "step": 1140 }, { "epoch": 2.99, "grad_norm": 3.171875, "learning_rate": 9.148382544856885e-07, "logits/chosen": -3.0086512565612793, "logits/rejected": -3.0138959884643555, "logps/chosen": -24.296131134033203, "logps/rejected": -36.91239547729492, "loss": 0.0624, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2671399116516113, "rewards/margins": 5.15519905090332, "rewards/rejected": -3.888059139251709, "step": 1150 }, { "epoch": 3.01, "grad_norm": 0.375, "learning_rate": 8.714301001505568e-07, "logits/chosen": -2.880089521408081, "logits/rejected": -2.867708921432495, "logps/chosen": -28.80812644958496, "logps/rejected": -40.27983093261719, "loss": 0.0588, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2912962436676025, "rewards/margins": 5.556339740753174, "rewards/rejected": -4.265043258666992, "step": 1160 }, { "epoch": 3.04, "grad_norm": 2.75, "learning_rate": 8.288586291031025e-07, "logits/chosen": -3.001692771911621, "logits/rejected": -2.9999001026153564, "logps/chosen": -26.936702728271484, "logps/rejected": -37.397525787353516, "loss": 0.0313, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.323664665222168, "rewards/margins": 5.6624555587768555, "rewards/rejected": -4.338790416717529, "step": 1170 }, { "epoch": 3.06, "grad_norm": 0.236328125, "learning_rate": 7.871457125803897e-07, "logits/chosen": -2.917046308517456, "logits/rejected": -2.911705732345581, "logps/chosen": -31.885528564453125, "logps/rejected": -39.40591049194336, "loss": 0.0893, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.519230604171753, "rewards/margins": 5.741326332092285, "rewards/rejected": -4.222095489501953, "step": 1180 }, { "epoch": 3.09, "grad_norm": 0.98828125, "learning_rate": 7.463127807341966e-07, "logits/chosen": -3.032956838607788, "logits/rejected": -3.034572124481201, "logps/chosen": -30.1025333404541, "logps/rejected": -37.68088150024414, "loss": 0.0492, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.7244186401367188, "rewards/margins": 5.53281307220459, "rewards/rejected": -3.8083953857421875, "step": 1190 }, { "epoch": 3.12, "grad_norm": 8.5625, "learning_rate": 7.063808116212021e-07, "logits/chosen": -3.031939744949341, "logits/rejected": -3.020230293273926, "logps/chosen": -27.5956974029541, "logps/rejected": -35.825416564941406, "loss": 0.0534, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.5125226974487305, "rewards/margins": 5.0801496505737305, "rewards/rejected": -3.567626953125, "step": 1200 }, { "epoch": 3.12, "eval_logits/chosen": -2.8285086154937744, "eval_logits/rejected": -2.8258352279663086, "eval_logps/chosen": -33.16932678222656, "eval_logps/rejected": -37.17423629760742, "eval_loss": 0.7996842265129089, "eval_rewards/accuracies": 0.5888704061508179, "eval_rewards/chosen": -1.1321243047714233, "eval_rewards/margins": 0.35352617502212524, "eval_rewards/rejected": -1.4856503009796143, "eval_runtime": 112.7911, "eval_samples_per_second": 3.041, "eval_steps_per_second": 0.381, "step": 1200 }, { "epoch": 3.14, "grad_norm": 15.0625, "learning_rate": 6.673703204254348e-07, "logits/chosen": -2.9277374744415283, "logits/rejected": -2.9358885288238525, "logps/chosen": -27.505834579467773, "logps/rejected": -35.87134552001953, "loss": 0.0495, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1557390689849854, "rewards/margins": 5.609216690063477, "rewards/rejected": -4.453477382659912, "step": 1210 }, { "epoch": 3.17, "grad_norm": 0.51953125, "learning_rate": 6.293013489185315e-07, "logits/chosen": -2.9534084796905518, "logits/rejected": -2.9638400077819824, "logps/chosen": -28.469470977783203, "logps/rejected": -37.100807189941406, "loss": 0.0428, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8623542785644531, "rewards/margins": 5.414062976837158, "rewards/rejected": -3.551708698272705, "step": 1220 }, { "epoch": 3.19, "grad_norm": 0.69140625, "learning_rate": 5.921934551632086e-07, "logits/chosen": -2.8607680797576904, "logits/rejected": -2.8647537231445312, "logps/chosen": -28.534021377563477, "logps/rejected": -38.18036651611328, "loss": 0.0965, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.4797532558441162, "rewards/margins": 5.64510440826416, "rewards/rejected": -4.165350914001465, "step": 1230 }, { "epoch": 3.22, "grad_norm": 0.9921875, "learning_rate": 5.560657034652405e-07, "logits/chosen": -2.927048683166504, "logits/rejected": -2.914555072784424, "logps/chosen": -28.605026245117188, "logps/rejected": -35.72632598876953, "loss": 0.0492, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7661066055297852, "rewards/margins": 5.441458702087402, "rewards/rejected": -3.675352096557617, "step": 1240 }, { "epoch": 3.25, "grad_norm": 4.09375, "learning_rate": 5.2093665457911e-07, "logits/chosen": -2.8794398307800293, "logits/rejected": -2.883221387863159, "logps/chosen": -29.48556900024414, "logps/rejected": -39.60463333129883, "loss": 0.0413, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.490134596824646, "rewards/margins": 5.7836713790893555, "rewards/rejected": -4.293536186218262, "step": 1250 }, { "epoch": 3.27, "grad_norm": 0.3359375, "learning_rate": 4.868243561723535e-07, "logits/chosen": -2.925483226776123, "logits/rejected": -2.9221901893615723, "logps/chosen": -26.7332820892334, "logps/rejected": -37.69228744506836, "loss": 0.0422, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.5008256435394287, "rewards/margins": 5.654725074768066, "rewards/rejected": -4.1538987159729, "step": 1260 }, { "epoch": 3.3, "grad_norm": 0.8203125, "learning_rate": 4.537463335535161e-07, "logits/chosen": -2.9841361045837402, "logits/rejected": -2.9754185676574707, "logps/chosen": -26.88946533203125, "logps/rejected": -36.481143951416016, "loss": 0.0543, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.6370275020599365, "rewards/margins": 5.698369026184082, "rewards/rejected": -4.061341285705566, "step": 1270 }, { "epoch": 3.32, "grad_norm": 1.671875, "learning_rate": 4.217195806684629e-07, "logits/chosen": -3.086789608001709, "logits/rejected": -3.0812721252441406, "logps/chosen": -29.082738876342773, "logps/rejected": -37.11894989013672, "loss": 0.0392, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.423052191734314, "rewards/margins": 5.66457462310791, "rewards/rejected": -4.241522312164307, "step": 1280 }, { "epoch": 3.35, "grad_norm": 0.625, "learning_rate": 3.907605513696808e-07, "logits/chosen": -3.1405227184295654, "logits/rejected": -3.136273145675659, "logps/chosen": -28.447912216186523, "logps/rejected": -41.65070724487305, "loss": 0.0934, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.576712965965271, "rewards/margins": 5.907447814941406, "rewards/rejected": -4.330735206604004, "step": 1290 }, { "epoch": 3.38, "grad_norm": 1.6171875, "learning_rate": 3.6088515096305675e-07, "logits/chosen": -3.004781723022461, "logits/rejected": -2.992069959640503, "logps/chosen": -27.656494140625, "logps/rejected": -36.953086853027344, "loss": 0.035, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8298753499984741, "rewards/margins": 5.944838047027588, "rewards/rejected": -4.114962577819824, "step": 1300 }, { "epoch": 3.38, "eval_logits/chosen": -2.829094409942627, "eval_logits/rejected": -2.8265905380249023, "eval_logps/chosen": -33.189910888671875, "eval_logps/rejected": -37.201393127441406, "eval_loss": 0.8024306297302246, "eval_rewards/accuracies": 0.5888704061508179, "eval_rewards/chosen": -1.144473671913147, "eval_rewards/margins": 0.35747113823890686, "eval_rewards/rejected": -1.5019447803497314, "eval_runtime": 112.8427, "eval_samples_per_second": 3.04, "eval_steps_per_second": 0.381, "step": 1300 }, { "epoch": 3.4, "grad_norm": 0.2265625, "learning_rate": 3.321087280364757e-07, "logits/chosen": -2.9668993949890137, "logits/rejected": -2.9619083404541016, "logps/chosen": -25.407943725585938, "logps/rejected": -39.095542907714844, "loss": 0.0595, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6230905055999756, "rewards/margins": 5.879204750061035, "rewards/rejected": -4.256114482879639, "step": 1310 }, { "epoch": 3.43, "grad_norm": 2.609375, "learning_rate": 3.044460665744284e-07, "logits/chosen": -2.921325206756592, "logits/rejected": -2.9301552772521973, "logps/chosen": -27.841144561767578, "logps/rejected": -38.21234130859375, "loss": 0.034, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3703900575637817, "rewards/margins": 5.775041580200195, "rewards/rejected": -4.404651165008545, "step": 1320 }, { "epoch": 3.45, "grad_norm": 0.5, "learning_rate": 2.779113783626916e-07, "logits/chosen": -2.928009510040283, "logits/rejected": -2.9167959690093994, "logps/chosen": -27.176036834716797, "logps/rejected": -38.40895462036133, "loss": 0.0451, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.5559582710266113, "rewards/margins": 5.725183963775635, "rewards/rejected": -4.169225692749023, "step": 1330 }, { "epoch": 3.48, "grad_norm": 2.03125, "learning_rate": 2.5251829568697204e-07, "logits/chosen": -3.1457560062408447, "logits/rejected": -3.1327528953552246, "logps/chosen": -28.73006248474121, "logps/rejected": -34.102195739746094, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": 1.7201951742172241, "rewards/margins": 5.409661293029785, "rewards/rejected": -3.689465284347534, "step": 1340 }, { "epoch": 3.51, "grad_norm": 0.7578125, "learning_rate": 2.2827986432927774e-07, "logits/chosen": -3.0063319206237793, "logits/rejected": -2.9997639656066895, "logps/chosen": -27.629440307617188, "logps/rejected": -38.41633605957031, "loss": 0.0178, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9246017932891846, "rewards/margins": 6.342138290405273, "rewards/rejected": -4.417536735534668, "step": 1350 }, { "epoch": 3.53, "grad_norm": 5.21875, "learning_rate": 2.0520853686560177e-07, "logits/chosen": -2.905151844024658, "logits/rejected": -2.89668345451355, "logps/chosen": -29.729381561279297, "logps/rejected": -39.10436248779297, "loss": 0.0216, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1899113655090332, "rewards/margins": 5.677424907684326, "rewards/rejected": -4.487513542175293, "step": 1360 }, { "epoch": 3.56, "grad_norm": 0.625, "learning_rate": 1.833161662683672e-07, "logits/chosen": -2.9205312728881836, "logits/rejected": -2.9178478717803955, "logps/chosen": -27.772390365600586, "logps/rejected": -35.85506057739258, "loss": 0.0327, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.4194756746292114, "rewards/margins": 5.484891414642334, "rewards/rejected": -4.065415859222412, "step": 1370 }, { "epoch": 3.58, "grad_norm": 0.5859375, "learning_rate": 1.626139998169246e-07, "logits/chosen": -2.941387414932251, "logits/rejected": -2.953021287918091, "logps/chosen": -26.46402931213379, "logps/rejected": -34.60562515258789, "loss": 0.0423, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3287296295166016, "rewards/margins": 5.227254390716553, "rewards/rejected": -3.8985252380371094, "step": 1380 }, { "epoch": 3.61, "grad_norm": 3.6875, "learning_rate": 1.4311267331922535e-07, "logits/chosen": -3.097482681274414, "logits/rejected": -3.1019105911254883, "logps/chosen": -27.129032135009766, "logps/rejected": -37.47324752807617, "loss": 0.0338, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2118251323699951, "rewards/margins": 5.151190757751465, "rewards/rejected": -3.939366579055786, "step": 1390 }, { "epoch": 3.64, "grad_norm": 0.90234375, "learning_rate": 1.2482220564763669e-07, "logits/chosen": -2.914987564086914, "logits/rejected": -2.896317958831787, "logps/chosen": -30.679147720336914, "logps/rejected": -36.49955368041992, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": 1.818377137184143, "rewards/margins": 5.521285533905029, "rewards/rejected": -3.702908754348755, "step": 1400 }, { "epoch": 3.64, "eval_logits/chosen": -2.8293960094451904, "eval_logits/rejected": -2.8266942501068115, "eval_logps/chosen": -33.2208366394043, "eval_logps/rejected": -37.21278381347656, "eval_loss": 0.8125642538070679, "eval_rewards/accuracies": 0.5859634280204773, "eval_rewards/chosen": -1.1630306243896484, "eval_rewards/margins": 0.3457449674606323, "eval_rewards/rejected": -1.5087755918502808, "eval_runtime": 112.9366, "eval_samples_per_second": 3.037, "eval_steps_per_second": 0.381, "step": 1400 }, { "epoch": 3.66, "grad_norm": 1.4140625, "learning_rate": 1.0775199359171346e-07, "logits/chosen": -2.8738229274749756, "logits/rejected": -2.89139986038208, "logps/chosen": -29.04506492614746, "logps/rejected": -38.35009002685547, "loss": 0.0338, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.687996506690979, "rewards/margins": 5.789109230041504, "rewards/rejected": -4.101112365722656, "step": 1410 }, { "epoch": 3.69, "grad_norm": 4.0625, "learning_rate": 9.191080703056604e-08, "logits/chosen": -3.110901355743408, "logits/rejected": -3.107856273651123, "logps/chosen": -28.8006534576416, "logps/rejected": -36.73194122314453, "loss": 0.0216, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4968552589416504, "rewards/margins": 5.476016044616699, "rewards/rejected": -3.979161024093628, "step": 1420 }, { "epoch": 3.71, "grad_norm": 1.609375, "learning_rate": 7.730678442730539e-08, "logits/chosen": -2.9601540565490723, "logits/rejected": -2.9479076862335205, "logps/chosen": -30.2454776763916, "logps/rejected": -38.509239196777344, "loss": 0.0347, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.5139565467834473, "rewards/margins": 5.956180572509766, "rewards/rejected": -4.44222354888916, "step": 1430 }, { "epoch": 3.74, "grad_norm": 1.7890625, "learning_rate": 6.394742864787806e-08, "logits/chosen": -2.9412589073181152, "logits/rejected": -2.9450554847717285, "logps/chosen": -28.570653915405273, "logps/rejected": -39.62348175048828, "loss": 0.035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.05934739112854, "rewards/margins": 5.714109420776367, "rewards/rejected": -4.654762268066406, "step": 1440 }, { "epoch": 3.77, "grad_norm": 1.109375, "learning_rate": 5.183960310644748e-08, "logits/chosen": -2.916991710662842, "logits/rejected": -2.9185004234313965, "logps/chosen": -30.148340225219727, "logps/rejected": -38.64313507080078, "loss": 0.0452, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.505264163017273, "rewards/margins": 5.883156776428223, "rewards/rejected": -4.37789249420166, "step": 1450 }, { "epoch": 3.79, "grad_norm": 3.25, "learning_rate": 4.098952823928693e-08, "logits/chosen": -2.9078240394592285, "logits/rejected": -2.9042093753814697, "logps/chosen": -27.049224853515625, "logps/rejected": -37.686256408691406, "loss": 0.0372, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3148393630981445, "rewards/margins": 5.67122745513916, "rewards/rejected": -4.356388568878174, "step": 1460 }, { "epoch": 3.82, "grad_norm": 0.6171875, "learning_rate": 3.1402778309014284e-08, "logits/chosen": -2.910646915435791, "logits/rejected": -2.896136999130249, "logps/chosen": -25.80971336364746, "logps/rejected": -35.506134033203125, "loss": 0.0511, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3185876607894897, "rewards/margins": 5.667901039123535, "rewards/rejected": -4.349313259124756, "step": 1470 }, { "epoch": 3.84, "grad_norm": 1.1796875, "learning_rate": 2.3084278540791427e-08, "logits/chosen": -2.8877739906311035, "logits/rejected": -2.8964595794677734, "logps/chosen": -29.988643646240234, "logps/rejected": -36.62062072753906, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 1.4086239337921143, "rewards/margins": 5.426566123962402, "rewards/rejected": -4.017942428588867, "step": 1480 }, { "epoch": 3.87, "grad_norm": 1.53125, "learning_rate": 1.6038302591975807e-08, "logits/chosen": -2.9591662883758545, "logits/rejected": -2.951643466949463, "logps/chosen": -24.451513290405273, "logps/rejected": -32.70746612548828, "loss": 0.0734, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.471919059753418, "rewards/margins": 5.120394229888916, "rewards/rejected": -3.648474931716919, "step": 1490 }, { "epoch": 3.9, "grad_norm": 3.21875, "learning_rate": 1.0268470356514237e-08, "logits/chosen": -2.883286952972412, "logits/rejected": -2.896113395690918, "logps/chosen": -27.21588706970215, "logps/rejected": -34.79141616821289, "loss": 0.0525, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.690004587173462, "rewards/margins": 5.427872657775879, "rewards/rejected": -3.737868547439575, "step": 1500 }, { "epoch": 3.9, "eval_logits/chosen": -2.829244613647461, "eval_logits/rejected": -2.8264925479888916, "eval_logps/chosen": -33.229942321777344, "eval_logps/rejected": -37.220787048339844, "eval_loss": 0.808788537979126, "eval_rewards/accuracies": 0.5917773842811584, "eval_rewards/chosen": -1.168494701385498, "eval_rewards/margins": 0.34508630633354187, "eval_rewards/rejected": -1.5135811567306519, "eval_runtime": 113.0108, "eval_samples_per_second": 3.035, "eval_steps_per_second": 0.38, "step": 1500 }, { "epoch": 3.92, "grad_norm": 0.3125, "learning_rate": 5.777746105209147e-09, "logits/chosen": -3.059047222137451, "logits/rejected": -3.0689499378204346, "logps/chosen": -30.036035537719727, "logps/rejected": -38.22625732421875, "loss": 0.0788, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.6072747707366943, "rewards/margins": 5.195981502532959, "rewards/rejected": -3.5887062549591064, "step": 1510 }, { "epoch": 3.95, "grad_norm": 2.3125, "learning_rate": 2.5684369628148352e-09, "logits/chosen": -2.882291555404663, "logits/rejected": -2.8862195014953613, "logps/chosen": -26.208663940429688, "logps/rejected": -37.17507553100586, "loss": 0.0545, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.587720274925232, "rewards/margins": 5.605459690093994, "rewards/rejected": -4.017739295959473, "step": 1520 }, { "epoch": 3.97, "grad_norm": 0.7421875, "learning_rate": 6.421917227455999e-10, "logits/chosen": -2.954590320587158, "logits/rejected": -2.955815315246582, "logps/chosen": -23.254398345947266, "logps/rejected": -32.172245025634766, "loss": 0.0378, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.555316686630249, "rewards/margins": 5.062749862670898, "rewards/rejected": -3.5074334144592285, "step": 1530 }, { "epoch": 4.0, "grad_norm": 1.8203125, "learning_rate": 0.0, "logits/chosen": -2.8720169067382812, "logits/rejected": -2.8812081813812256, "logps/chosen": -28.843318939208984, "logps/rejected": -40.02803421020508, "loss": 0.0247, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1202117204666138, "rewards/margins": 5.758116245269775, "rewards/rejected": -4.637904644012451, "step": 1540 }, { "epoch": 4.0, "step": 1540, "total_flos": 0.0, "train_loss": 0.21652605050763526, "train_runtime": 11213.836, "train_samples_per_second": 1.098, "train_steps_per_second": 0.137 } ], "logging_steps": 10, "max_steps": 1540, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }