{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 16.375, "learning_rate": 1.282051282051282e-07, "logits/chosen": 88.18099975585938, "logits/rejected": 88.25153350830078, "logps/chosen": -29.073104858398438, "logps/rejected": -26.25731658935547, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 8.8125, "learning_rate": 1.282051282051282e-06, "logits/chosen": 81.07431030273438, "logits/rejected": 80.78235626220703, "logps/chosen": -34.18563461303711, "logps/rejected": -33.02939987182617, "loss": 0.4904, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.028044726699590683, "rewards/margins": 0.06354151666164398, "rewards/rejected": -0.035496786236763, "step": 10 }, { "epoch": 0.05, "grad_norm": 12.1875, "learning_rate": 2.564102564102564e-06, "logits/chosen": 80.67153930664062, "logits/rejected": 80.56294250488281, "logps/chosen": -33.529541015625, "logps/rejected": -30.742996215820312, "loss": 0.4869, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.08018453419208527, "rewards/margins": 0.06293892860412598, "rewards/rejected": 0.017245600000023842, "step": 20 }, { "epoch": 0.08, "grad_norm": 14.625, "learning_rate": 3.846153846153847e-06, "logits/chosen": 82.5155029296875, "logits/rejected": 82.54915618896484, "logps/chosen": -33.7542610168457, "logps/rejected": -31.115161895751953, "loss": 0.5066, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.12928077578544617, "rewards/margins": -0.02505389414727688, "rewards/rejected": 0.15433469414710999, "step": 30 }, { "epoch": 0.1, "grad_norm": 12.5625, "learning_rate": 4.999896948438434e-06, "logits/chosen": 81.09931945800781, "logits/rejected": 81.09571838378906, "logps/chosen": -32.699241638183594, "logps/rejected": -33.092124938964844, "loss": 0.4793, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.23740839958190918, "rewards/margins": 0.0992104634642601, "rewards/rejected": 0.13819792866706848, "step": 40 }, { "epoch": 0.13, "grad_norm": 11.125, "learning_rate": 4.987541037542187e-06, "logits/chosen": 78.78851318359375, "logits/rejected": 78.79679870605469, "logps/chosen": -30.52480125427246, "logps/rejected": -30.73565673828125, "loss": 0.4662, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.34022435545921326, "rewards/margins": 0.17140880227088928, "rewards/rejected": 0.16881553828716278, "step": 50 }, { "epoch": 0.16, "grad_norm": 9.75, "learning_rate": 4.954691471941119e-06, "logits/chosen": 83.39762878417969, "logits/rejected": 83.45609283447266, "logps/chosen": -30.74808692932129, "logps/rejected": -29.20343017578125, "loss": 0.4975, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.21353764832019806, "rewards/margins": 0.019991278648376465, "rewards/rejected": 0.1935463845729828, "step": 60 }, { "epoch": 0.18, "grad_norm": 10.9375, "learning_rate": 4.901618883413549e-06, "logits/chosen": 84.00616455078125, "logits/rejected": 84.04234313964844, "logps/chosen": -30.251235961914062, "logps/rejected": -32.49794387817383, "loss": 0.5173, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.18889428675174713, "rewards/margins": -0.08099017292261124, "rewards/rejected": 0.2698844373226166, "step": 70 }, { "epoch": 0.21, "grad_norm": 12.375, "learning_rate": 4.828760511501322e-06, "logits/chosen": 81.75172424316406, "logits/rejected": 81.72709655761719, "logps/chosen": -31.1412353515625, "logps/rejected": -30.653112411499023, "loss": 0.4759, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.18185201287269592, "rewards/margins": 0.12958452105522156, "rewards/rejected": 0.05226749926805496, "step": 80 }, { "epoch": 0.23, "grad_norm": 13.9375, "learning_rate": 4.7367166013034295e-06, "logits/chosen": 78.56499481201172, "logits/rejected": 78.54066467285156, "logps/chosen": -32.319190979003906, "logps/rejected": -30.92232894897461, "loss": 0.476, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.15949341654777527, "rewards/margins": 0.14492300152778625, "rewards/rejected": 0.014570409432053566, "step": 90 }, { "epoch": 0.26, "grad_norm": 11.875, "learning_rate": 4.626245458345211e-06, "logits/chosen": 83.73712158203125, "logits/rejected": 83.75628662109375, "logps/chosen": -33.78915023803711, "logps/rejected": -31.565814971923828, "loss": 0.4665, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.2579200267791748, "rewards/margins": 0.16555462777614594, "rewards/rejected": 0.09236541390419006, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": 98.72062683105469, "eval_logits/rejected": 98.71237182617188, "eval_logps/chosen": -32.28904724121094, "eval_logps/rejected": -35.83627700805664, "eval_loss": 0.49707475304603577, "eval_rewards/accuracies": 0.5186877250671387, "eval_rewards/chosen": 0.07706477493047714, "eval_rewards/margins": 0.011940184980630875, "eval_rewards/rejected": 0.06512458622455597, "eval_runtime": 104.0818, "eval_samples_per_second": 3.295, "eval_steps_per_second": 0.413, "step": 100 }, { "epoch": 0.29, "grad_norm": 14.8125, "learning_rate": 4.498257201263691e-06, "logits/chosen": 84.00208282470703, "logits/rejected": 83.8756332397461, "logps/chosen": -32.03178405761719, "logps/rejected": -32.6201171875, "loss": 0.4084, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.49551233649253845, "rewards/margins": 0.47762519121170044, "rewards/rejected": 0.01788714900612831, "step": 110 }, { "epoch": 0.31, "grad_norm": 13.5625, "learning_rate": 4.353806263777678e-06, "logits/chosen": 84.0489273071289, "logits/rejected": 84.1673355102539, "logps/chosen": -27.966100692749023, "logps/rejected": -35.121253967285156, "loss": 0.441, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.48174038529396057, "rewards/margins": 0.31279435753822327, "rewards/rejected": 0.16894599795341492, "step": 120 }, { "epoch": 0.34, "grad_norm": 7.28125, "learning_rate": 4.1940827077152755e-06, "logits/chosen": 81.32206726074219, "logits/rejected": 81.3568115234375, "logps/chosen": -30.228296279907227, "logps/rejected": -31.748275756835938, "loss": 0.4559, "rewards/accuracies": 0.5625, "rewards/chosen": 0.33953094482421875, "rewards/margins": 0.24785713851451874, "rewards/rejected": 0.09167381376028061, "step": 130 }, { "epoch": 0.36, "grad_norm": 8.625, "learning_rate": 4.0204024186666215e-06, "logits/chosen": 82.2553939819336, "logits/rejected": 82.27656555175781, "logps/chosen": -26.800729751586914, "logps/rejected": -32.687808990478516, "loss": 0.423, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.32329756021499634, "rewards/margins": 0.455514132976532, "rewards/rejected": -0.13221657276153564, "step": 140 }, { "epoch": 0.39, "grad_norm": 12.5625, "learning_rate": 3.834196265035119e-06, "logits/chosen": 80.7685775756836, "logits/rejected": 80.73258209228516, "logps/chosen": -28.54288673400879, "logps/rejected": -32.596839904785156, "loss": 0.415, "rewards/accuracies": 0.625, "rewards/chosen": 0.423112154006958, "rewards/margins": 0.4382806718349457, "rewards/rejected": -0.01516849733889103, "step": 150 }, { "epoch": 0.42, "grad_norm": 14.125, "learning_rate": 3.636998309800573e-06, "logits/chosen": 82.48309326171875, "logits/rejected": 82.50714111328125, "logps/chosen": -33.225772857666016, "logps/rejected": -30.1079044342041, "loss": 0.4023, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.45548883080482483, "rewards/margins": 0.533171534538269, "rewards/rejected": -0.0776827409863472, "step": 160 }, { "epoch": 0.44, "grad_norm": 7.5625, "learning_rate": 3.4304331721118078e-06, "logits/chosen": 83.15037536621094, "logits/rejected": 83.10893249511719, "logps/chosen": -30.452091217041016, "logps/rejected": -32.47702407836914, "loss": 0.3953, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4035983681678772, "rewards/margins": 0.6320796012878418, "rewards/rejected": -0.2284812033176422, "step": 170 }, { "epoch": 0.47, "grad_norm": 6.65625, "learning_rate": 3.2162026428305436e-06, "logits/chosen": 80.6706314086914, "logits/rejected": 80.64241027832031, "logps/chosen": -30.28702163696289, "logps/rejected": -31.376535415649414, "loss": 0.404, "rewards/accuracies": 0.75, "rewards/chosen": 0.45506635308265686, "rewards/margins": 0.5647307634353638, "rewards/rejected": -0.10966438055038452, "step": 180 }, { "epoch": 0.49, "grad_norm": 6.21875, "learning_rate": 2.996071664294641e-06, "logits/chosen": 82.2298583984375, "logits/rejected": 82.20623779296875, "logps/chosen": -29.83859634399414, "logps/rejected": -30.299697875976562, "loss": 0.4314, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4639095664024353, "rewards/margins": 0.37974613904953003, "rewards/rejected": 0.0841633602976799, "step": 190 }, { "epoch": 0.52, "grad_norm": 6.875, "learning_rate": 2.7718537898066833e-06, "logits/chosen": 77.64122009277344, "logits/rejected": 77.57923889160156, "logps/chosen": -33.089454650878906, "logps/rejected": -32.29143524169922, "loss": 0.3678, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.8316432237625122, "rewards/margins": 0.7306608557701111, "rewards/rejected": 0.10098233073949814, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": 98.36553955078125, "eval_logits/rejected": 98.35772705078125, "eval_logps/chosen": -32.45161056518555, "eval_logps/rejected": -35.998619079589844, "eval_loss": 0.49831458926200867, "eval_rewards/accuracies": 0.5103820562362671, "eval_rewards/chosen": -0.004216374363750219, "eval_rewards/margins": 0.011831996031105518, "eval_rewards/rejected": -0.01604837365448475, "eval_runtime": 104.055, "eval_samples_per_second": 3.296, "eval_steps_per_second": 0.413, "step": 200 }, { "epoch": 0.55, "grad_norm": 15.25, "learning_rate": 2.5453962426402006e-06, "logits/chosen": 80.19715881347656, "logits/rejected": 80.11329650878906, "logps/chosen": -32.6320915222168, "logps/rejected": -34.92211151123047, "loss": 0.3902, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.656814455986023, "rewards/margins": 0.624821662902832, "rewards/rejected": 0.031992800533771515, "step": 210 }, { "epoch": 0.57, "grad_norm": 12.0, "learning_rate": 2.3185646976551794e-06, "logits/chosen": 82.21825408935547, "logits/rejected": 82.29025268554688, "logps/chosen": -30.436092376708984, "logps/rejected": -30.719776153564453, "loss": 0.3564, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.7564297318458557, "rewards/margins": 0.7518402934074402, "rewards/rejected": 0.004589465446770191, "step": 220 }, { "epoch": 0.6, "grad_norm": 9.625, "learning_rate": 2.0932279108998323e-06, "logits/chosen": 79.31720733642578, "logits/rejected": 79.36396789550781, "logps/chosen": -31.852947235107422, "logps/rejected": -34.1156120300293, "loss": 0.4099, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.476398229598999, "rewards/margins": 0.5281308889389038, "rewards/rejected": -0.05173276737332344, "step": 230 }, { "epoch": 0.62, "grad_norm": 13.5, "learning_rate": 1.8712423238279358e-06, "logits/chosen": 81.70478820800781, "logits/rejected": 81.98667907714844, "logps/chosen": -30.028610229492188, "logps/rejected": -31.500635147094727, "loss": 0.3447, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.8297265768051147, "rewards/margins": 0.8140215873718262, "rewards/rejected": 0.015705054625868797, "step": 240 }, { "epoch": 0.65, "grad_norm": 12.6875, "learning_rate": 1.6544367689701824e-06, "logits/chosen": 80.39967346191406, "logits/rejected": 80.46524810791016, "logps/chosen": -26.57110595703125, "logps/rejected": -29.88582420349121, "loss": 0.4136, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5123947858810425, "rewards/margins": 0.5049386024475098, "rewards/rejected": 0.00745623093098402, "step": 250 }, { "epoch": 0.68, "grad_norm": 9.8125, "learning_rate": 1.4445974030621963e-06, "logits/chosen": 77.56782531738281, "logits/rejected": 77.75471496582031, "logps/chosen": -29.75429916381836, "logps/rejected": -36.2445068359375, "loss": 0.3377, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.9033478498458862, "rewards/margins": 0.9747893214225769, "rewards/rejected": -0.07144142687320709, "step": 260 }, { "epoch": 0.7, "grad_norm": 7.96875, "learning_rate": 1.243452991757889e-06, "logits/chosen": 77.03749084472656, "logits/rejected": 77.07095336914062, "logps/chosen": -30.273632049560547, "logps/rejected": -31.36687660217285, "loss": 0.3778, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7526459097862244, "rewards/margins": 0.6833735704421997, "rewards/rejected": 0.06927235424518585, "step": 270 }, { "epoch": 0.73, "grad_norm": 15.25, "learning_rate": 1.0526606671603523e-06, "logits/chosen": 79.75798797607422, "logits/rejected": 79.53197479248047, "logps/chosen": -30.511974334716797, "logps/rejected": -29.195556640625, "loss": 0.4169, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6489585638046265, "rewards/margins": 0.41323956847190857, "rewards/rejected": 0.2357189655303955, "step": 280 }, { "epoch": 0.75, "grad_norm": 7.71875, "learning_rate": 8.737922755071455e-07, "logits/chosen": 79.92450714111328, "logits/rejected": 79.83580017089844, "logps/chosen": -32.37202453613281, "logps/rejected": -32.25506591796875, "loss": 0.3223, "rewards/accuracies": 0.75, "rewards/chosen": 0.8661117553710938, "rewards/margins": 0.9852114915847778, "rewards/rejected": -0.11909981071949005, "step": 290 }, { "epoch": 0.78, "grad_norm": 7.875, "learning_rate": 7.08321427484816e-07, "logits/chosen": 75.48292541503906, "logits/rejected": 75.55941009521484, "logps/chosen": -31.580333709716797, "logps/rejected": -28.897323608398438, "loss": 0.3546, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.9126860499382019, "rewards/margins": 0.8103362321853638, "rewards/rejected": 0.10234987735748291, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": 98.32223510742188, "eval_logits/rejected": 98.31561279296875, "eval_logps/chosen": -32.32727813720703, "eval_logps/rejected": -35.866024017333984, "eval_loss": 0.49702879786491394, "eval_rewards/accuracies": 0.5564784407615662, "eval_rewards/chosen": 0.05794913321733475, "eval_rewards/margins": 0.007699221838265657, "eval_rewards/rejected": 0.05024990811944008, "eval_runtime": 104.0255, "eval_samples_per_second": 3.297, "eval_steps_per_second": 0.413, "step": 300 }, { "epoch": 0.81, "grad_norm": 10.25, "learning_rate": 5.576113578589035e-07, "logits/chosen": 82.65113830566406, "logits/rejected": 82.6902847290039, "logps/chosen": -29.43534278869629, "logps/rejected": -32.302738189697266, "loss": 0.3663, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.714095413684845, "rewards/margins": 0.7655847668647766, "rewards/rejected": -0.05148933455348015, "step": 310 }, { "epoch": 0.83, "grad_norm": 9.75, "learning_rate": 4.229036944380913e-07, "logits/chosen": 80.09264373779297, "logits/rejected": 80.09661865234375, "logps/chosen": -29.789846420288086, "logps/rejected": -28.805282592773438, "loss": 0.3454, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.9043582081794739, "rewards/margins": 0.8472534418106079, "rewards/rejected": 0.05710482597351074, "step": 320 }, { "epoch": 0.86, "grad_norm": 10.5625, "learning_rate": 3.053082288996112e-07, "logits/chosen": 77.26579284667969, "logits/rejected": 77.29621124267578, "logps/chosen": -28.363937377929688, "logps/rejected": -32.58740234375, "loss": 0.3182, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.9943346977233887, "rewards/margins": 0.9706042408943176, "rewards/rejected": 0.02373054064810276, "step": 330 }, { "epoch": 0.88, "grad_norm": 12.875, "learning_rate": 2.0579377374915805e-07, "logits/chosen": 81.5138168334961, "logits/rejected": 81.5446548461914, "logps/chosen": -31.642475128173828, "logps/rejected": -33.37983322143555, "loss": 0.3675, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7848353385925293, "rewards/margins": 0.7904062271118164, "rewards/rejected": -0.0055709658190608025, "step": 340 }, { "epoch": 0.91, "grad_norm": 8.6875, "learning_rate": 1.2518018074041684e-07, "logits/chosen": 80.62837982177734, "logits/rejected": 80.64482116699219, "logps/chosen": -31.80319595336914, "logps/rejected": -32.94707107543945, "loss": 0.3582, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9202712774276733, "rewards/margins": 0.8479059338569641, "rewards/rejected": 0.07236528396606445, "step": 350 }, { "epoch": 0.94, "grad_norm": 8.9375, "learning_rate": 6.41315865106129e-08, "logits/chosen": 82.10346984863281, "logits/rejected": 82.16381072998047, "logps/chosen": -27.89191246032715, "logps/rejected": -31.463871002197266, "loss": 0.3663, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8301493525505066, "rewards/margins": 0.7371596693992615, "rewards/rejected": 0.0929897278547287, "step": 360 }, { "epoch": 0.96, "grad_norm": 9.6875, "learning_rate": 2.3150941078050325e-08, "logits/chosen": 81.55647277832031, "logits/rejected": 81.5948715209961, "logps/chosen": -31.506786346435547, "logps/rejected": -34.95966339111328, "loss": 0.3891, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6846884489059448, "rewards/margins": 0.6098321676254272, "rewards/rejected": 0.07485616207122803, "step": 370 }, { "epoch": 0.99, "grad_norm": 10.625, "learning_rate": 2.575864278703266e-09, "logits/chosen": 75.45634460449219, "logits/rejected": 75.32949829101562, "logps/chosen": -29.234609603881836, "logps/rejected": -28.104198455810547, "loss": 0.3841, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6707575917243958, "rewards/margins": 0.6425691246986389, "rewards/rejected": 0.02818852663040161, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.41032042689137643, "train_runtime": 2557.4055, "train_samples_per_second": 1.204, "train_steps_per_second": 0.151 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }