{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 328.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": 88.18099975585938, "logits/rejected": 88.25153350830078, "logps/chosen": -29.073104858398438, "logps/rejected": -26.25731658935547, "loss": 1.5625, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 250.0, "learning_rate": 1.282051282051282e-06, "logits/chosen": 81.08969116210938, "logits/rejected": 80.79109191894531, "logps/chosen": -34.12071228027344, "logps/rejected": -33.01869201660156, "loss": 1.7496, "rewards/accuracies": 0.5, "rewards/chosen": 0.04840615764260292, "rewards/margins": 0.07252107560634613, "rewards/rejected": -0.024114925414323807, "step": 10 }, { "epoch": 0.05, "grad_norm": 336.0, "learning_rate": 2.564102564102564e-06, "logits/chosen": 80.68243408203125, "logits/rejected": 80.568359375, "logps/chosen": -33.56159210205078, "logps/rejected": -30.76800537109375, "loss": 2.2924, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.05132746696472168, "rewards/margins": 0.04753490164875984, "rewards/rejected": 0.003792577190324664, "step": 20 }, { "epoch": 0.08, "grad_norm": 318.0, "learning_rate": 3.846153846153847e-06, "logits/chosen": 82.52204895019531, "logits/rejected": 82.54562377929688, "logps/chosen": -33.78315353393555, "logps/rejected": -31.24331283569336, "loss": 2.7815, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.09186722338199615, "rewards/margins": 0.019660871475934982, "rewards/rejected": 0.07220635563135147, "step": 30 }, { "epoch": 0.1, "grad_norm": 426.0, "learning_rate": 4.999896948438434e-06, "logits/chosen": 80.89669036865234, "logits/rejected": 80.89623260498047, "logps/chosen": -32.95878219604492, "logps/rejected": -33.34799575805664, "loss": 2.4607, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.08611182868480682, "rewards/margins": 0.07789957523345947, "rewards/rejected": 0.008212263695895672, "step": 40 }, { "epoch": 0.13, "grad_norm": 772.0, "learning_rate": 4.987541037542187e-06, "logits/chosen": 78.38099670410156, "logits/rejected": 78.40118408203125, "logps/chosen": -31.085933685302734, "logps/rejected": -31.042804718017578, "loss": 2.589, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.04772613197565079, "rewards/margins": 0.03553418815135956, "rewards/rejected": 0.01219194382429123, "step": 50 }, { "epoch": 0.16, "grad_norm": 249.0, "learning_rate": 4.954691471941119e-06, "logits/chosen": 82.9079818725586, "logits/rejected": 82.95909118652344, "logps/chosen": -31.034317016601562, "logps/rejected": -29.518524169921875, "loss": 2.8739, "rewards/accuracies": 0.5, "rewards/chosen": 0.05633721500635147, "rewards/margins": 0.027538930997252464, "rewards/rejected": 0.028798282146453857, "step": 60 }, { "epoch": 0.18, "grad_norm": 752.0, "learning_rate": 4.901618883413549e-06, "logits/chosen": 83.62979125976562, "logits/rejected": 83.66517639160156, "logps/chosen": -30.505615234375, "logps/rejected": -32.857994079589844, "loss": 2.9346, "rewards/accuracies": 0.5, "rewards/chosen": 0.04936310276389122, "rewards/margins": -0.02252076379954815, "rewards/rejected": 0.07188385725021362, "step": 70 }, { "epoch": 0.21, "grad_norm": 370.0, "learning_rate": 4.828760511501322e-06, "logits/chosen": 81.35310363769531, "logits/rejected": 81.34590148925781, "logps/chosen": -31.199981689453125, "logps/rejected": -30.634307861328125, "loss": 2.824, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.12198303639888763, "rewards/margins": 0.07264719158411026, "rewards/rejected": 0.049335844814777374, "step": 80 }, { "epoch": 0.23, "grad_norm": 524.0, "learning_rate": 4.7367166013034295e-06, "logits/chosen": 78.339111328125, "logits/rejected": 78.30552673339844, "logps/chosen": -32.22107696533203, "logps/rejected": -31.029504776000977, "loss": 2.6385, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.16684113442897797, "rewards/margins": 0.19805487990379333, "rewards/rejected": -0.03121376410126686, "step": 90 }, { "epoch": 0.26, "grad_norm": 225.0, "learning_rate": 4.626245458345211e-06, "logits/chosen": 83.51721954345703, "logits/rejected": 83.54518127441406, "logps/chosen": -34.10065841674805, "logps/rejected": -31.89223861694336, "loss": 2.5238, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.08173254877328873, "rewards/margins": 0.1384100764989853, "rewards/rejected": -0.056677550077438354, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": 98.82966613769531, "eval_logits/rejected": 98.81654357910156, "eval_logps/chosen": -32.419429779052734, "eval_logps/rejected": -36.027076721191406, "eval_loss": 2.6218504905700684, "eval_rewards/accuracies": 0.5481727719306946, "eval_rewards/chosen": 0.009498294442892075, "eval_rewards/margins": 0.03371964767575264, "eval_rewards/rejected": -0.024221351370215416, "eval_runtime": 104.1511, "eval_samples_per_second": 3.293, "eval_steps_per_second": 0.413, "step": 100 }, { "epoch": 0.29, "grad_norm": 576.0, "learning_rate": 4.498257201263691e-06, "logits/chosen": 83.7013931274414, "logits/rejected": 83.58271789550781, "logps/chosen": -32.618534088134766, "logps/rejected": -32.656673431396484, "loss": 2.9173, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.16170774400234222, "rewards/margins": 0.16202135384082794, "rewards/rejected": -0.00031360946013592184, "step": 110 }, { "epoch": 0.31, "grad_norm": 524.0, "learning_rate": 4.353806263777678e-06, "logits/chosen": 83.95661926269531, "logits/rejected": 84.07592010498047, "logps/chosen": -28.62860107421875, "logps/rejected": -35.55434036254883, "loss": 3.1258, "rewards/accuracies": 0.5625, "rewards/chosen": 0.12039314210414886, "rewards/margins": 0.1584714949131012, "rewards/rejected": -0.03807835653424263, "step": 120 }, { "epoch": 0.34, "grad_norm": 216.0, "learning_rate": 4.1940827077152755e-06, "logits/chosen": 81.45957946777344, "logits/rejected": 81.48210144042969, "logps/chosen": -30.638086318969727, "logps/rejected": -32.03240203857422, "loss": 2.331, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.10770728439092636, "rewards/margins": 0.14801691472530365, "rewards/rejected": -0.04030962288379669, "step": 130 }, { "epoch": 0.36, "grad_norm": 386.0, "learning_rate": 4.0204024186666215e-06, "logits/chosen": 83.00135803222656, "logits/rejected": 83.00582122802734, "logps/chosen": -27.22810935974121, "logps/rejected": -32.823734283447266, "loss": 3.1239, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.08768650889396667, "rewards/margins": 0.2478286772966385, "rewards/rejected": -0.160142183303833, "step": 140 }, { "epoch": 0.39, "grad_norm": 444.0, "learning_rate": 3.834196265035119e-06, "logits/chosen": 81.87845611572266, "logits/rejected": 81.85296630859375, "logps/chosen": -28.920028686523438, "logps/rejected": -32.752830505371094, "loss": 2.5059, "rewards/accuracies": 0.625, "rewards/chosen": 0.18763259053230286, "rewards/margins": 0.26216596364974976, "rewards/rejected": -0.07453340291976929, "step": 150 }, { "epoch": 0.42, "grad_norm": 604.0, "learning_rate": 3.636998309800573e-06, "logits/chosen": 83.88510131835938, "logits/rejected": 83.89151763916016, "logps/chosen": -33.4796257019043, "logps/rejected": -30.0944881439209, "loss": 4.2076, "rewards/accuracies": 0.6875, "rewards/chosen": 0.26284781098365784, "rewards/margins": 0.31962868571281433, "rewards/rejected": -0.056780923157930374, "step": 160 }, { "epoch": 0.44, "grad_norm": 372.0, "learning_rate": 3.4304331721118078e-06, "logits/chosen": 84.69200134277344, "logits/rejected": 84.63328552246094, "logps/chosen": -30.8580322265625, "logps/rejected": -32.00690460205078, "loss": 2.4171, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.1605023592710495, "rewards/margins": 0.15524213016033173, "rewards/rejected": 0.005260218866169453, "step": 170 }, { "epoch": 0.47, "grad_norm": 266.0, "learning_rate": 3.2162026428305436e-06, "logits/chosen": 82.51325225830078, "logits/rejected": 82.49774169921875, "logps/chosen": -30.521617889404297, "logps/rejected": -31.205219268798828, "loss": 2.9555, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.27021360397338867, "rewards/margins": 0.28941839933395386, "rewards/rejected": -0.019204800948500633, "step": 180 }, { "epoch": 0.49, "grad_norm": 316.0, "learning_rate": 2.996071664294641e-06, "logits/chosen": 84.31837463378906, "logits/rejected": 84.31514739990234, "logps/chosen": -30.445659637451172, "logps/rejected": -30.434545516967773, "loss": 3.42, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.12830288708209991, "rewards/margins": 0.11491189152002335, "rewards/rejected": 0.013391007669270039, "step": 190 }, { "epoch": 0.52, "grad_norm": 372.0, "learning_rate": 2.7718537898066833e-06, "logits/chosen": 79.8134765625, "logits/rejected": 79.75944519042969, "logps/chosen": -34.02518081665039, "logps/rejected": -32.41716766357422, "loss": 3.7491, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.2910219728946686, "rewards/margins": 0.2605269253253937, "rewards/rejected": 0.03049505315721035, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": 99.1017074584961, "eval_logits/rejected": 99.0970458984375, "eval_logps/chosen": -32.42703628540039, "eval_logps/rejected": -36.04937744140625, "eval_loss": 2.7964677810668945, "eval_rewards/accuracies": 0.5282392501831055, "eval_rewards/chosen": 0.006455874536186457, "eval_rewards/margins": 0.039595745503902435, "eval_rewards/rejected": -0.033139873296022415, "eval_runtime": 103.8428, "eval_samples_per_second": 3.303, "eval_steps_per_second": 0.414, "step": 200 }, { "epoch": 0.55, "grad_norm": 588.0, "learning_rate": 2.5453962426402006e-06, "logits/chosen": 82.32527923583984, "logits/rejected": 82.2273178100586, "logps/chosen": -33.264827728271484, "logps/rejected": -34.95970153808594, "loss": 2.4885, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.2723545730113983, "rewards/margins": 0.2617969214916229, "rewards/rejected": 0.010557657107710838, "step": 210 }, { "epoch": 0.57, "grad_norm": 328.0, "learning_rate": 2.3185646976551794e-06, "logits/chosen": 84.34782409667969, "logits/rejected": 84.42643737792969, "logps/chosen": -31.27395248413086, "logps/rejected": -30.698680877685547, "loss": 2.8757, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.27000054717063904, "rewards/margins": 0.25789040327072144, "rewards/rejected": 0.012110118754208088, "step": 220 }, { "epoch": 0.6, "grad_norm": 274.0, "learning_rate": 2.0932279108998323e-06, "logits/chosen": 81.63847351074219, "logits/rejected": 81.70894622802734, "logps/chosen": -32.306007385253906, "logps/rejected": -34.033103942871094, "loss": 3.5257, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.19989387691020966, "rewards/margins": 0.20827460289001465, "rewards/rejected": -0.008380698040127754, "step": 230 }, { "epoch": 0.62, "grad_norm": 404.0, "learning_rate": 1.8712423238279358e-06, "logits/chosen": 84.00340270996094, "logits/rejected": 84.26656341552734, "logps/chosen": -30.941274642944336, "logps/rejected": -31.65194320678711, "loss": 2.3257, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.29871702194213867, "rewards/margins": 0.3466779887676239, "rewards/rejected": -0.047960974276065826, "step": 240 }, { "epoch": 0.65, "grad_norm": 324.0, "learning_rate": 1.6544367689701824e-06, "logits/chosen": 82.73006439208984, "logits/rejected": 82.80888366699219, "logps/chosen": -27.05695152282715, "logps/rejected": -29.929241180419922, "loss": 2.6343, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.21557846665382385, "rewards/margins": 0.226979061961174, "rewards/rejected": -0.011400607414543629, "step": 250 }, { "epoch": 0.68, "grad_norm": 516.0, "learning_rate": 1.4445974030621963e-06, "logits/chosen": 80.12226104736328, "logits/rejected": 80.24698638916016, "logps/chosen": -31.07294273376465, "logps/rejected": -35.9581184387207, "loss": 2.354, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.19522134959697723, "rewards/margins": 0.13781873881816864, "rewards/rejected": 0.057402610778808594, "step": 260 }, { "epoch": 0.7, "grad_norm": 344.0, "learning_rate": 1.243452991757889e-06, "logits/chosen": 79.53446960449219, "logits/rejected": 79.55906677246094, "logps/chosen": -30.89791488647461, "logps/rejected": -31.486038208007812, "loss": 3.2552, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.3524041175842285, "rewards/margins": 0.3446514904499054, "rewards/rejected": 0.007752613630145788, "step": 270 }, { "epoch": 0.73, "grad_norm": 576.0, "learning_rate": 1.0526606671603523e-06, "logits/chosen": 81.95085144042969, "logits/rejected": 81.75164031982422, "logps/chosen": -31.267623901367188, "logps/rejected": -29.534778594970703, "loss": 3.0829, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2169070690870285, "rewards/margins": 0.16402050852775574, "rewards/rejected": 0.05288654565811157, "step": 280 }, { "epoch": 0.75, "grad_norm": 294.0, "learning_rate": 8.737922755071455e-07, "logits/chosen": 82.23690795898438, "logits/rejected": 82.1518325805664, "logps/chosen": -33.551673889160156, "logps/rejected": -31.901935577392578, "loss": 2.7131, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2210315465927124, "rewards/margins": 0.17505864799022675, "rewards/rejected": 0.04597286507487297, "step": 290 }, { "epoch": 0.78, "grad_norm": 414.0, "learning_rate": 7.08321427484816e-07, "logits/chosen": 77.79912567138672, "logits/rejected": 77.8694839477539, "logps/chosen": -32.68059539794922, "logps/rejected": -28.930795669555664, "loss": 2.7187, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.29004615545272827, "rewards/margins": 0.22155535221099854, "rewards/rejected": 0.06849084049463272, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": 99.0815658569336, "eval_logits/rejected": 99.07491302490234, "eval_logps/chosen": -32.308189392089844, "eval_logps/rejected": -35.85464859008789, "eval_loss": 2.8584375381469727, "eval_rewards/accuracies": 0.5340532064437866, "eval_rewards/chosen": 0.05399530008435249, "eval_rewards/margins": 0.009244211949408054, "eval_rewards/rejected": 0.044751089066267014, "eval_runtime": 103.9675, "eval_samples_per_second": 3.299, "eval_steps_per_second": 0.414, "step": 300 }, { "epoch": 0.81, "grad_norm": 340.0, "learning_rate": 5.576113578589035e-07, "logits/chosen": 84.72640228271484, "logits/rejected": 84.76838684082031, "logps/chosen": -30.20013999938965, "logps/rejected": -32.10908889770508, "loss": 3.4161, "rewards/accuracies": 0.6875, "rewards/chosen": 0.26535817980766296, "rewards/margins": 0.22908934950828552, "rewards/rejected": 0.03626886010169983, "step": 310 }, { "epoch": 0.83, "grad_norm": 466.0, "learning_rate": 4.229036944380913e-07, "logits/chosen": 82.32220458984375, "logits/rejected": 82.32237243652344, "logps/chosen": -30.7983341217041, "logps/rejected": -28.84746742248535, "loss": 4.1511, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.3200904428958893, "rewards/margins": 0.2912808060646057, "rewards/rejected": 0.028809573501348495, "step": 320 }, { "epoch": 0.86, "grad_norm": 340.0, "learning_rate": 3.053082288996112e-07, "logits/chosen": 79.7364501953125, "logits/rejected": 79.77400207519531, "logps/chosen": -29.450759887695312, "logps/rejected": -32.553932189941406, "loss": 2.6103, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.36073851585388184, "rewards/margins": 0.3283671438694, "rewards/rejected": 0.0323713943362236, "step": 330 }, { "epoch": 0.88, "grad_norm": 382.0, "learning_rate": 2.0579377374915805e-07, "logits/chosen": 83.68680572509766, "logits/rejected": 83.69349670410156, "logps/chosen": -32.394351959228516, "logps/rejected": -33.24668502807617, "loss": 3.0897, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.32711952924728394, "rewards/margins": 0.2783169746398926, "rewards/rejected": 0.04880258068442345, "step": 340 }, { "epoch": 0.91, "grad_norm": 166.0, "learning_rate": 1.2518018074041684e-07, "logits/chosen": 82.930419921875, "logits/rejected": 82.9115219116211, "logps/chosen": -32.881385803222656, "logps/rejected": -33.06855010986328, "loss": 2.0393, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.30494141578674316, "rewards/margins": 0.29564136266708374, "rewards/rejected": 0.00930009689182043, "step": 350 }, { "epoch": 0.94, "grad_norm": 179.0, "learning_rate": 6.41315865106129e-08, "logits/chosen": 84.22169494628906, "logits/rejected": 84.2500991821289, "logps/chosen": -28.731342315673828, "logps/rejected": -31.580791473388672, "loss": 2.2528, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3283465504646301, "rewards/margins": 0.30072346329689026, "rewards/rejected": 0.027623046189546585, "step": 360 }, { "epoch": 0.96, "grad_norm": 472.0, "learning_rate": 2.3150941078050325e-08, "logits/chosen": 83.67330932617188, "logits/rejected": 83.69649505615234, "logps/chosen": -32.3766975402832, "logps/rejected": -35.0659065246582, "loss": 3.2317, "rewards/accuracies": 0.5625, "rewards/chosen": 0.19978591799736023, "rewards/margins": 0.18239986896514893, "rewards/rejected": 0.017386028543114662, "step": 370 }, { "epoch": 0.99, "grad_norm": 324.0, "learning_rate": 2.575864278703266e-09, "logits/chosen": 77.76484680175781, "logits/rejected": 77.6435546875, "logps/chosen": -29.995555877685547, "logps/rejected": -28.028635025024414, "loss": 2.7989, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.23222878575325012, "rewards/margins": 0.17945319414138794, "rewards/rejected": 0.05277556926012039, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 2.828607769755574, "train_runtime": 2555.3286, "train_samples_per_second": 1.205, "train_steps_per_second": 0.151 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }