{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 7.90625, "learning_rate": 1.282051282051282e-07, "logits/chosen": -2.7358343601226807, "logits/rejected": -2.7480404376983643, "logps/chosen": -27.35565757751465, "logps/rejected": -21.06114387512207, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 13.4375, "learning_rate": 1.282051282051282e-06, "logits/chosen": -3.009408473968506, "logits/rejected": -2.9977328777313232, "logps/chosen": -33.20761489868164, "logps/rejected": -31.960758209228516, "loss": 1.003, "rewards/accuracies": 0.375, "rewards/chosen": -0.0026669783983379602, "rewards/margins": -0.002981698838993907, "rewards/rejected": 0.00031472102273255587, "step": 10 }, { "epoch": 0.05, "grad_norm": 9.1875, "learning_rate": 2.564102564102564e-06, "logits/chosen": -2.8990566730499268, "logits/rejected": -2.894094944000244, "logps/chosen": -32.50642013549805, "logps/rejected": -28.963293075561523, "loss": 1.005, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0031326524913311005, "rewards/margins": -0.00499005988240242, "rewards/rejected": 0.0018574076239019632, "step": 20 }, { "epoch": 0.08, "grad_norm": 8.8125, "learning_rate": 3.846153846153847e-06, "logits/chosen": -3.0970427989959717, "logits/rejected": -3.108668804168701, "logps/chosen": -32.80826187133789, "logps/rejected": -30.164159774780273, "loss": 0.9931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.010042307898402214, "rewards/margins": 0.006853123661130667, "rewards/rejected": 0.0031891819089651108, "step": 30 }, { "epoch": 0.1, "grad_norm": 9.0625, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.8629202842712402, "logits/rejected": -2.8535947799682617, "logps/chosen": -31.525354385375977, "logps/rejected": -32.399391174316406, "loss": 0.9653, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.030497059226036072, "rewards/margins": 0.034690458327531815, "rewards/rejected": -0.004193395376205444, "step": 40 }, { "epoch": 0.13, "grad_norm": 8.1875, "learning_rate": 4.987541037542187e-06, "logits/chosen": -2.87955904006958, "logits/rejected": -2.877352714538574, "logps/chosen": -29.324417114257812, "logps/rejected": -30.05527114868164, "loss": 0.9618, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04157434031367302, "rewards/margins": 0.03820687159895897, "rewards/rejected": 0.0033674687147140503, "step": 50 }, { "epoch": 0.16, "grad_norm": 8.0625, "learning_rate": 4.954691471941119e-06, "logits/chosen": -2.9058096408843994, "logits/rejected": -2.90769624710083, "logps/chosen": -29.792831420898438, "logps/rejected": -27.967548370361328, "loss": 0.9651, "rewards/accuracies": 0.625, "rewards/chosen": 0.03369411081075668, "rewards/margins": 0.03488066419959068, "rewards/rejected": -0.0011865533888339996, "step": 60 }, { "epoch": 0.18, "grad_norm": 10.75, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.98918080329895, "logits/rejected": -2.9952526092529297, "logps/chosen": -29.09405517578125, "logps/rejected": -30.891315460205078, "loss": 0.9814, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.018691470846533775, "rewards/margins": 0.018642673268914223, "rewards/rejected": 4.879874177277088e-05, "step": 70 }, { "epoch": 0.21, "grad_norm": 9.375, "learning_rate": 4.828760511501322e-06, "logits/chosen": -2.804455518722534, "logits/rejected": -2.8193769454956055, "logps/chosen": -29.27066993713379, "logps/rejected": -29.863460540771484, "loss": 0.9541, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.027005871757864952, "rewards/margins": 0.04592365399003029, "rewards/rejected": -0.018917780369520187, "step": 80 }, { "epoch": 0.23, "grad_norm": 9.5, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -2.8940837383270264, "logits/rejected": -2.8764431476593018, "logps/chosen": -32.690391540527344, "logps/rejected": -30.344482421875, "loss": 0.9531, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.014603940770030022, "rewards/margins": 0.04688751697540283, "rewards/rejected": -0.03228358179330826, "step": 90 }, { "epoch": 0.26, "grad_norm": 8.4375, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.999018669128418, "logits/rejected": -2.9999501705169678, "logps/chosen": -31.824331283569336, "logps/rejected": -31.003093719482422, "loss": 0.9581, "rewards/accuracies": 0.625, "rewards/chosen": 0.023254599422216415, "rewards/margins": 0.041946638375520706, "rewards/rejected": -0.01869204081594944, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.805298089981079, "eval_logits/rejected": -2.802727460861206, "eval_logps/chosen": -31.269874572753906, "eval_logps/rejected": -34.867759704589844, "eval_loss": 0.9819382429122925, "eval_rewards/accuracies": 0.5955149531364441, "eval_rewards/chosen": 0.0012576259905472398, "eval_rewards/margins": 0.018218038603663445, "eval_rewards/rejected": -0.01696041226387024, "eval_runtime": 113.3967, "eval_samples_per_second": 3.025, "eval_steps_per_second": 0.379, "step": 100 }, { "epoch": 0.29, "grad_norm": 10.4375, "learning_rate": 4.498257201263691e-06, "logits/chosen": -2.9547231197357178, "logits/rejected": -2.9307026863098145, "logps/chosen": -31.882299423217773, "logps/rejected": -31.55330467224121, "loss": 0.9271, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.03344918042421341, "rewards/margins": 0.07289471477270126, "rewards/rejected": -0.039445530623197556, "step": 110 }, { "epoch": 0.31, "grad_norm": 8.9375, "learning_rate": 4.353806263777678e-06, "logits/chosen": -3.0366623401641846, "logits/rejected": -3.0655264854431152, "logps/chosen": -28.671463012695312, "logps/rejected": -34.533443450927734, "loss": 0.9122, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.049243707209825516, "rewards/margins": 0.0878329798579216, "rewards/rejected": -0.03858928382396698, "step": 120 }, { "epoch": 0.34, "grad_norm": 8.5625, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.741682529449463, "logits/rejected": -2.7387285232543945, "logps/chosen": -28.681751251220703, "logps/rejected": -30.781078338623047, "loss": 0.9069, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.029292941093444824, "rewards/margins": 0.09402014315128326, "rewards/rejected": -0.06472718715667725, "step": 130 }, { "epoch": 0.36, "grad_norm": 8.6875, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -3.0192549228668213, "logits/rejected": -3.0171239376068115, "logps/chosen": -27.48164939880371, "logps/rejected": -32.18421936035156, "loss": 0.9468, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00045731887803412974, "rewards/margins": 0.0532023087143898, "rewards/rejected": -0.05274498462677002, "step": 140 }, { "epoch": 0.39, "grad_norm": 8.8125, "learning_rate": 3.834196265035119e-06, "logits/chosen": -2.8165481090545654, "logits/rejected": -2.8123271465301514, "logps/chosen": -27.479320526123047, "logps/rejected": -32.02801513671875, "loss": 0.8849, "rewards/accuracies": 0.625, "rewards/chosen": 0.034618668258190155, "rewards/margins": 0.11505589634180069, "rewards/rejected": -0.08043723553419113, "step": 150 }, { "epoch": 0.42, "grad_norm": 10.25, "learning_rate": 3.636998309800573e-06, "logits/chosen": -3.137382745742798, "logits/rejected": -3.1210262775421143, "logps/chosen": -31.968801498413086, "logps/rejected": -30.1705265045166, "loss": 0.8343, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.05229174345731735, "rewards/margins": 0.17136676609516144, "rewards/rejected": -0.11907501518726349, "step": 160 }, { "epoch": 0.44, "grad_norm": 9.5625, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.9499218463897705, "logits/rejected": -2.958042621612549, "logps/chosen": -29.790802001953125, "logps/rejected": -32.55553436279297, "loss": 0.8471, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.010381256230175495, "rewards/margins": 0.15598610043525696, "rewards/rejected": -0.14560487866401672, "step": 170 }, { "epoch": 0.47, "grad_norm": 10.0625, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.810059070587158, "logits/rejected": -2.8075289726257324, "logps/chosen": -29.805904388427734, "logps/rejected": -30.83272361755371, "loss": 0.9106, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.013605661690235138, "rewards/margins": 0.09284953773021698, "rewards/rejected": -0.10645520687103271, "step": 180 }, { "epoch": 0.49, "grad_norm": 6.375, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.920551300048828, "logits/rejected": -2.917001247406006, "logps/chosen": -30.380672454833984, "logps/rejected": -29.473743438720703, "loss": 0.9132, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.019395861774683, "rewards/margins": 0.09222523123025894, "rewards/rejected": -0.11162110418081284, "step": 190 }, { "epoch": 0.52, "grad_norm": 9.75, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -2.9904088973999023, "logits/rejected": -2.9779367446899414, "logps/chosen": -33.54608917236328, "logps/rejected": -31.4923095703125, "loss": 0.8824, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.015293078497052193, "rewards/margins": 0.12111523002386093, "rewards/rejected": -0.10582216084003448, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.8315017223358154, "eval_logits/rejected": -2.8287980556488037, "eval_logps/chosen": -31.827983856201172, "eval_logps/rejected": -35.64292526245117, "eval_loss": 0.9602205753326416, "eval_rewards/accuracies": 0.6092192530632019, "eval_rewards/chosen": -0.05455298721790314, "eval_rewards/margins": 0.039924219250679016, "eval_rewards/rejected": -0.09447719901800156, "eval_runtime": 113.1607, "eval_samples_per_second": 3.031, "eval_steps_per_second": 0.38, "step": 200 }, { "epoch": 0.55, "grad_norm": 12.125, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.9306328296661377, "logits/rejected": -2.9321701526641846, "logps/chosen": -33.18594741821289, "logps/rejected": -35.648868560791016, "loss": 0.8442, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.015253487043082714, "rewards/margins": 0.16348664462566376, "rewards/rejected": -0.17874012887477875, "step": 210 }, { "epoch": 0.57, "grad_norm": 10.375, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -2.9161877632141113, "logits/rejected": -2.9325830936431885, "logps/chosen": -29.98288345336914, "logps/rejected": -30.195465087890625, "loss": 0.818, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.019515613093972206, "rewards/margins": 0.18623510003089905, "rewards/rejected": -0.1667194664478302, "step": 220 }, { "epoch": 0.6, "grad_norm": 8.9375, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -2.9633114337921143, "logits/rejected": -2.9677231311798096, "logps/chosen": -31.465539932250977, "logps/rejected": -33.331993103027344, "loss": 0.8836, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03136688470840454, "rewards/margins": 0.1260170042514801, "rewards/rejected": -0.15738390386104584, "step": 230 }, { "epoch": 0.62, "grad_norm": 10.5625, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -3.0217947959899902, "logits/rejected": -3.029099702835083, "logps/chosen": -31.11104965209961, "logps/rejected": -31.991535186767578, "loss": 0.845, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.020612526684999466, "rewards/margins": 0.15501661598682404, "rewards/rejected": -0.1756291538476944, "step": 240 }, { "epoch": 0.65, "grad_norm": 9.25, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.8557331562042236, "logits/rejected": -2.844738721847534, "logps/chosen": -27.252483367919922, "logps/rejected": -30.79751968383789, "loss": 0.9072, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.029826903715729713, "rewards/margins": 0.0967610701918602, "rewards/rejected": -0.12658795714378357, "step": 250 }, { "epoch": 0.68, "grad_norm": 10.875, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -2.8332571983337402, "logits/rejected": -2.854872226715088, "logps/chosen": -29.75779151916504, "logps/rejected": -36.265663146972656, "loss": 0.7843, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.007217098027467728, "rewards/margins": 0.21637487411499023, "rewards/rejected": -0.209157794713974, "step": 260 }, { "epoch": 0.7, "grad_norm": 11.6875, "learning_rate": 1.243452991757889e-06, "logits/chosen": -2.975635051727295, "logits/rejected": -2.9822394847869873, "logps/chosen": -31.34609031677246, "logps/rejected": -32.07633972167969, "loss": 0.8354, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.05647352337837219, "rewards/margins": 0.16463439166545868, "rewards/rejected": -0.22110792994499207, "step": 270 }, { "epoch": 0.73, "grad_norm": 10.9375, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -2.9911038875579834, "logits/rejected": -2.9766478538513184, "logps/chosen": -31.343215942382812, "logps/rejected": -30.425678253173828, "loss": 0.8864, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.06215282529592514, "rewards/margins": 0.1172800287604332, "rewards/rejected": -0.17943285405635834, "step": 280 }, { "epoch": 0.75, "grad_norm": 13.5, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.9202213287353516, "logits/rejected": -2.9021944999694824, "logps/chosen": -32.332275390625, "logps/rejected": -32.9135627746582, "loss": 0.7348, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.030402958393096924, "rewards/margins": 0.26942819356918335, "rewards/rejected": -0.23902523517608643, "step": 290 }, { "epoch": 0.78, "grad_norm": 9.0625, "learning_rate": 7.08321427484816e-07, "logits/chosen": -2.9209041595458984, "logits/rejected": -2.9169058799743652, "logps/chosen": -32.4577522277832, "logps/rejected": -29.638641357421875, "loss": 0.8245, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02350962720811367, "rewards/margins": 0.17999625205993652, "rewards/rejected": -0.20350590348243713, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.8392350673675537, "eval_logits/rejected": -2.836524486541748, "eval_logps/chosen": -32.03562545776367, "eval_logps/rejected": -35.89268112182617, "eval_loss": 0.9559255838394165, "eval_rewards/accuracies": 0.6004983186721802, "eval_rewards/chosen": -0.07531756907701492, "eval_rewards/margins": 0.044134579598903656, "eval_rewards/rejected": -0.11945215612649918, "eval_runtime": 112.8661, "eval_samples_per_second": 3.039, "eval_steps_per_second": 0.381, "step": 300 }, { "epoch": 0.81, "grad_norm": 10.1875, "learning_rate": 5.576113578589035e-07, "logits/chosen": -2.80441951751709, "logits/rejected": -2.8214964866638184, "logps/chosen": -29.70196533203125, "logps/rejected": -33.18954849243164, "loss": 0.8066, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.047254521399736404, "rewards/margins": 0.19849951565265656, "rewards/rejected": -0.24575403332710266, "step": 310 }, { "epoch": 0.83, "grad_norm": 9.5, "learning_rate": 4.229036944380913e-07, "logits/chosen": -3.0470097064971924, "logits/rejected": -3.033452033996582, "logps/chosen": -29.982385635375977, "logps/rejected": -30.24712562561035, "loss": 0.8017, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.022920172661542892, "rewards/margins": 0.21575525403022766, "rewards/rejected": -0.19283509254455566, "step": 320 }, { "epoch": 0.86, "grad_norm": 11.875, "learning_rate": 3.053082288996112e-07, "logits/chosen": -2.96113657951355, "logits/rejected": -2.9435229301452637, "logps/chosen": -28.380762100219727, "logps/rejected": -32.93931579589844, "loss": 0.7579, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0018691420555114746, "rewards/margins": 0.25289779901504517, "rewards/rejected": -0.25476694107055664, "step": 330 }, { "epoch": 0.88, "grad_norm": 10.5625, "learning_rate": 2.0579377374915805e-07, "logits/chosen": -3.182105541229248, "logits/rejected": -3.1877188682556152, "logps/chosen": -31.98989486694336, "logps/rejected": -35.27183532714844, "loss": 0.7995, "rewards/accuracies": 0.6875, "rewards/chosen": -0.035068999975919724, "rewards/margins": 0.22251394391059875, "rewards/rejected": -0.2575829327106476, "step": 340 }, { "epoch": 0.91, "grad_norm": 11.75, "learning_rate": 1.2518018074041684e-07, "logits/chosen": -3.057163953781128, "logits/rejected": -3.0605666637420654, "logps/chosen": -30.571701049804688, "logps/rejected": -33.298683166503906, "loss": 0.8066, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.023518025875091553, "rewards/margins": 0.20205526053905487, "rewards/rejected": -0.1785372495651245, "step": 350 }, { "epoch": 0.94, "grad_norm": 11.0, "learning_rate": 6.41315865106129e-08, "logits/chosen": -2.8954837322235107, "logits/rejected": -2.8972554206848145, "logps/chosen": -28.211322784423828, "logps/rejected": -31.518001556396484, "loss": 0.7939, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.04384060949087143, "rewards/margins": 0.21254661679267883, "rewards/rejected": -0.1687059849500656, "step": 360 }, { "epoch": 0.96, "grad_norm": 10.3125, "learning_rate": 2.3150941078050325e-08, "logits/chosen": -2.973878860473633, "logits/rejected": -2.9718644618988037, "logps/chosen": -30.932153701782227, "logps/rejected": -33.92184829711914, "loss": 0.8202, "rewards/accuracies": 0.6875, "rewards/chosen": -0.014550352469086647, "rewards/margins": 0.1848299205303192, "rewards/rejected": -0.1993802934885025, "step": 370 }, { "epoch": 0.99, "grad_norm": 9.8125, "learning_rate": 2.575864278703266e-09, "logits/chosen": -2.927494764328003, "logits/rejected": -2.9113755226135254, "logps/chosen": -29.455524444580078, "logps/rejected": -30.044204711914062, "loss": 0.8384, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.05104842036962509, "rewards/margins": 0.16441020369529724, "rewards/rejected": -0.21545863151550293, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.8805358874333369, "train_runtime": 2723.8224, "train_samples_per_second": 1.13, "train_steps_per_second": 0.141 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }