{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.8663169145584106, "logits/rejected": -1.870638370513916, "logps/chosen": -36.98221206665039, "logps/rejected": -33.6473503112793, "loss": 0.6788, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.01950961910188198, "rewards/margins": 0.03332838416099548, "rewards/rejected": -0.013818766921758652, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.9978923797607422, "logits/rejected": -2.0005345344543457, "logps/chosen": -29.642324447631836, "logps/rejected": -29.048343658447266, "loss": 0.7013, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.00010492801811778918, "rewards/margins": -0.012185259722173214, "rewards/rejected": 0.012080332264304161, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.9209339618682861, "logits/rejected": -1.9182507991790771, "logps/chosen": -31.401519775390625, "logps/rejected": -33.22309875488281, "loss": 0.6891, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.011660982854664326, "rewards/margins": 0.015369392931461334, "rewards/rejected": -0.003708411008119583, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.0180399417877197, "logits/rejected": -2.009289264678955, "logps/chosen": -32.559410095214844, "logps/rejected": -32.52582550048828, "loss": 0.6849, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.013918718323111534, "rewards/margins": 0.02444135770201683, "rewards/rejected": -0.010522643104195595, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.8629716634750366, "logits/rejected": -1.8522107601165771, "logps/chosen": -33.554229736328125, "logps/rejected": -35.44757080078125, "loss": 0.6987, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.002600290346890688, "rewards/margins": -0.0012185067171230912, "rewards/rejected": 0.003818795783445239, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.9408414363861084, "logits/rejected": -1.9427950382232666, "logps/chosen": -32.56097412109375, "logps/rejected": -33.213417053222656, "loss": 0.6725, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.03136637061834335, "rewards/margins": 0.06499636918306351, "rewards/rejected": -0.03362999111413956, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.072221517562866, "logits/rejected": -2.077198028564453, "logps/chosen": -33.974578857421875, "logps/rejected": -36.629173278808594, "loss": 0.6793, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.011124782264232635, "rewards/margins": 0.059367585927248, "rewards/rejected": -0.04824279993772507, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9338233470916748, "logits/rejected": -1.9369605779647827, "logps/chosen": -34.30416488647461, "logps/rejected": -34.634437561035156, "loss": 0.6423, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.09891629219055176, "rewards/margins": 0.132537841796875, "rewards/rejected": -0.03362155705690384, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.9418385028839111, "logits/rejected": -1.9463545083999634, "logps/chosen": -32.39947509765625, "logps/rejected": -32.35419464111328, "loss": 0.6785, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.05435952544212341, "rewards/margins": 0.05058818310499191, "rewards/rejected": 0.0037713423371315002, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.039783000946045, "logits/rejected": -2.037789821624756, "logps/chosen": -32.164188385009766, "logps/rejected": -31.309520721435547, "loss": 0.6538, "rewards/accuracies": 0.625, "rewards/chosen": 0.061319977045059204, "rewards/margins": 0.09596750140190125, "rewards/rejected": -0.03464752808213234, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.2338831424713135, "eval_logits/rejected": -2.229041814804077, "eval_logps/chosen": -34.03020095825195, "eval_logps/rejected": -37.52727127075195, "eval_loss": 0.6954607963562012, "eval_rewards/accuracies": 0.510797381401062, "eval_rewards/chosen": 0.0034842013847082853, "eval_rewards/margins": 0.012004716321825981, "eval_rewards/rejected": -0.008520514704287052, "eval_runtime": 146.115, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.294, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.9946181774139404, "logits/rejected": -1.992236852645874, "logps/chosen": -33.117286682128906, "logps/rejected": -34.00868225097656, "loss": 0.6852, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.10122231394052505, "rewards/margins": 0.0846698135137558, "rewards/rejected": 0.016552483662962914, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.006875514984131, "logits/rejected": -1.9985148906707764, "logps/chosen": -32.336421966552734, "logps/rejected": -32.137081146240234, "loss": 0.675, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.08678452670574188, "rewards/margins": 0.06724556535482407, "rewards/rejected": 0.019538963213562965, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.0351502895355225, "logits/rejected": -2.02717661857605, "logps/chosen": -30.30923843383789, "logps/rejected": -32.08501434326172, "loss": 0.6398, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.13318516314029694, "rewards/margins": 0.15878939628601074, "rewards/rejected": -0.025604233145713806, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.966170072555542, "logits/rejected": -1.9764087200164795, "logps/chosen": -31.215194702148438, "logps/rejected": -32.55674743652344, "loss": 0.627, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.16357474029064178, "rewards/margins": 0.18106886744499207, "rewards/rejected": -0.017494117841124535, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.8775428533554077, "logits/rejected": -1.8786998987197876, "logps/chosen": -33.92055130004883, "logps/rejected": -34.77721405029297, "loss": 0.6168, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.21771302819252014, "rewards/margins": 0.23367898166179657, "rewards/rejected": -0.015965968370437622, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.9291296005249023, "logits/rejected": -1.9257177114486694, "logps/chosen": -36.01557922363281, "logps/rejected": -32.72490692138672, "loss": 0.6444, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.12488000094890594, "rewards/margins": 0.12660440802574158, "rewards/rejected": -0.0017244067275896668, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.0295331478118896, "logits/rejected": -2.0221762657165527, "logps/chosen": -33.48248291015625, "logps/rejected": -31.408077239990234, "loss": 0.5816, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.25230517983436584, "rewards/margins": 0.2972865700721741, "rewards/rejected": -0.04498137906193733, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.036190986633301, "logits/rejected": -2.0414373874664307, "logps/chosen": -32.22594451904297, "logps/rejected": -32.46149444580078, "loss": 0.5993, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.2550733685493469, "rewards/margins": 0.2361568957567215, "rewards/rejected": 0.018916476517915726, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.0371222496032715, "logits/rejected": -2.0343565940856934, "logps/chosen": -31.28468894958496, "logps/rejected": -31.336734771728516, "loss": 0.6307, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.16341081261634827, "rewards/margins": 0.17822694778442383, "rewards/rejected": -0.014816122129559517, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.9067039489746094, "logits/rejected": -1.91135573387146, "logps/chosen": -31.312374114990234, "logps/rejected": -32.82074737548828, "loss": 0.6015, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2351258099079132, "rewards/margins": 0.25897616147994995, "rewards/rejected": -0.0238503310829401, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.232191324234009, "eval_logits/rejected": -2.22735333442688, "eval_logps/chosen": -34.060691833496094, "eval_logps/rejected": -37.56568908691406, "eval_loss": 0.695566713809967, "eval_rewards/accuracies": 0.52491694688797, "eval_rewards/chosen": -0.020911961793899536, "eval_rewards/margins": 0.01834380254149437, "eval_rewards/rejected": -0.039255764335393906, "eval_runtime": 145.8849, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.0197176933288574, "logits/rejected": -2.0303761959075928, "logps/chosen": -31.751026153564453, "logps/rejected": -33.96234893798828, "loss": 0.5969, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.17622438073158264, "rewards/margins": 0.25594404339790344, "rewards/rejected": -0.0797196701169014, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.9122480154037476, "logits/rejected": -1.9270412921905518, "logps/chosen": -29.86123275756836, "logps/rejected": -31.612594604492188, "loss": 0.6025, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.20024582743644714, "rewards/margins": 0.24377915263175964, "rewards/rejected": -0.04353334754705429, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.9689687490463257, "logits/rejected": -1.9729585647583008, "logps/chosen": -33.11440658569336, "logps/rejected": -31.650421142578125, "loss": 0.5809, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2424498349428177, "rewards/margins": 0.3233444094657898, "rewards/rejected": -0.08089461922645569, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.9670231342315674, "logits/rejected": -1.9451711177825928, "logps/chosen": -33.830162048339844, "logps/rejected": -35.1173095703125, "loss": 0.558, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2588713765144348, "rewards/margins": 0.3920826315879822, "rewards/rejected": -0.13321125507354736, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.0084152221679688, "logits/rejected": -2.005080223083496, "logps/chosen": -32.70518493652344, "logps/rejected": -36.280517578125, "loss": 0.6104, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1759072244167328, "rewards/margins": 0.22038432955741882, "rewards/rejected": -0.044477105140686035, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8755052089691162, "logits/rejected": -1.8730967044830322, "logps/chosen": -33.984092712402344, "logps/rejected": -35.538455963134766, "loss": 0.622, "rewards/accuracies": 0.75, "rewards/chosen": 0.16344432532787323, "rewards/margins": 0.19051328301429749, "rewards/rejected": -0.02706894651055336, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.8608484268188477, "logits/rejected": -1.8584181070327759, "logps/chosen": -34.17797088623047, "logps/rejected": -31.830347061157227, "loss": 0.6124, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1757020801305771, "rewards/margins": 0.22291450202465057, "rewards/rejected": -0.047212425619363785, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.9641139507293701, "logits/rejected": -1.9535942077636719, "logps/chosen": -35.01939010620117, "logps/rejected": -31.871440887451172, "loss": 0.5852, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.26980119943618774, "rewards/margins": 0.29416200518608093, "rewards/rejected": -0.024360809475183487, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.0593204498291016, "logits/rejected": -2.0443997383117676, "logps/chosen": -30.722980499267578, "logps/rejected": -32.61235809326172, "loss": 0.6599, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.16091887652873993, "rewards/margins": 0.12933868169784546, "rewards/rejected": 0.031580209732055664, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.930450201034546, "logits/rejected": -1.9279005527496338, "logps/chosen": -32.415870666503906, "logps/rejected": -30.882410049438477, "loss": 0.5385, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.4089924395084381, "rewards/margins": 0.4614754319190979, "rewards/rejected": -0.05248301103711128, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.229433536529541, "eval_logits/rejected": -2.22458553314209, "eval_logps/chosen": -34.090904235839844, "eval_logps/rejected": -37.59726333618164, "eval_loss": 0.69569993019104, "eval_rewards/accuracies": 0.5398671627044678, "eval_rewards/chosen": -0.04508008435368538, "eval_rewards/margins": 0.019436603412032127, "eval_rewards/rejected": -0.06451668590307236, "eval_runtime": 145.8403, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589035e-07, "logits/chosen": -1.9148633480072021, "logits/rejected": -1.9115928411483765, "logps/chosen": -31.324920654296875, "logps/rejected": -33.81542205810547, "loss": 0.5949, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22274336218833923, "rewards/margins": 0.28135946393013, "rewards/rejected": -0.05861610919237137, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380913e-07, "logits/chosen": -1.9647932052612305, "logits/rejected": -1.9525552988052368, "logps/chosen": -34.34864044189453, "logps/rejected": -33.66791915893555, "loss": 0.5831, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.19794727861881256, "rewards/margins": 0.3012133836746216, "rewards/rejected": -0.10326610505580902, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-07, "logits/chosen": -2.0001468658447266, "logits/rejected": -1.9986999034881592, "logps/chosen": -33.18779373168945, "logps/rejected": -32.54129409790039, "loss": 0.5895, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23840396106243134, "rewards/margins": 0.28777408599853516, "rewards/rejected": -0.04937009885907173, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.0579377374915805e-07, "logits/chosen": -2.0865254402160645, "logits/rejected": -2.0707924365997314, "logps/chosen": -33.81252670288086, "logps/rejected": -33.110015869140625, "loss": 0.5883, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2934878468513489, "rewards/margins": 0.28707200288772583, "rewards/rejected": 0.006415897514671087, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.2518018074041684e-07, "logits/chosen": -1.9590953588485718, "logits/rejected": -1.9582574367523193, "logps/chosen": -32.849937438964844, "logps/rejected": -32.53525161743164, "loss": 0.5603, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.34572547674179077, "rewards/margins": 0.4018074870109558, "rewards/rejected": -0.056082069873809814, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-08, "logits/chosen": -1.914807677268982, "logits/rejected": -1.9251015186309814, "logps/chosen": -31.8874454498291, "logps/rejected": -35.34430694580078, "loss": 0.588, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2537948489189148, "rewards/margins": 0.2982342541217804, "rewards/rejected": -0.0444394052028656, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050325e-08, "logits/chosen": -2.054311513900757, "logits/rejected": -2.047823429107666, "logps/chosen": -33.35334396362305, "logps/rejected": -29.280254364013672, "loss": 0.5847, "rewards/accuracies": 0.75, "rewards/chosen": 0.23889848589897156, "rewards/margins": 0.2819606363773346, "rewards/rejected": -0.04306213930249214, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-09, "logits/chosen": -1.9139082431793213, "logits/rejected": -1.9161239862442017, "logps/chosen": -33.855018615722656, "logps/rejected": -30.981037139892578, "loss": 0.5466, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.3072236478328705, "rewards/margins": 0.3939489424228668, "rewards/rejected": -0.08672530204057693, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.6230236078237559, "train_runtime": 3254.2307, "train_samples_per_second": 0.946, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }