{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.866413950920105, "logits/rejected": -1.8707411289215088, "logps/chosen": -36.98916244506836, "logps/rejected": -33.67436981201172, "loss": 0.6701, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.01569323241710663, "rewards/margins": 0.05555717274546623, "rewards/rejected": -0.039863936603069305, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.9979650974273682, "logits/rejected": -2.0006086826324463, "logps/chosen": -29.624820709228516, "logps/rejected": -29.0762939453125, "loss": 0.6837, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.01563635841012001, "rewards/margins": 0.027204299345612526, "rewards/rejected": -0.01156794372946024, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.921021819114685, "logits/rejected": -1.9183374643325806, "logps/chosen": -31.40532875061035, "logps/rejected": -33.23241424560547, "loss": 0.6877, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00968973059207201, "rewards/margins": 0.022251319140195847, "rewards/rejected": -0.012561586685478687, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.0176353454589844, "logits/rejected": -2.008906364440918, "logps/chosen": -32.574256896972656, "logps/rejected": -32.53368377685547, "loss": 0.6874, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0022967704571783543, "rewards/margins": 0.02120940014719963, "rewards/rejected": -0.018912632018327713, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.8619186878204346, "logits/rejected": -1.85114324092865, "logps/chosen": -33.55537414550781, "logps/rejected": -35.45675277709961, "loss": 0.6957, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.001892436295747757, "rewards/margins": 0.005858602002263069, "rewards/rejected": -0.003966164775192738, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.9400945901870728, "logits/rejected": -1.9420464038848877, "logps/chosen": -32.56509780883789, "logps/rejected": -33.2406120300293, "loss": 0.6632, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.031578924506902695, "rewards/margins": 0.09388783574104309, "rewards/rejected": -0.062308914959430695, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.0712790489196777, "logits/rejected": -2.0762436389923096, "logps/chosen": -33.981910705566406, "logps/rejected": -36.62363815307617, "loss": 0.6833, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.005918038543313742, "rewards/margins": 0.05520814657211304, "rewards/rejected": -0.04929010197520256, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9327905178070068, "logits/rejected": -1.935909628868103, "logps/chosen": -34.32685470581055, "logps/rejected": -34.65606689453125, "loss": 0.639, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.09085920453071594, "rewards/margins": 0.14815348386764526, "rewards/rejected": -0.057294271886348724, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.9414918422698975, "logits/rejected": -1.946007490158081, "logps/chosen": -32.406803131103516, "logps/rejected": -32.36021041870117, "loss": 0.6792, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.054556868970394135, "rewards/margins": 0.05573350936174393, "rewards/rejected": -0.0011766403913497925, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.039034128189087, "logits/rejected": -2.0370402336120605, "logps/chosen": -32.172786712646484, "logps/rejected": -31.333194732666016, "loss": 0.6464, "rewards/accuracies": 0.625, "rewards/chosen": 0.06124376505613327, "rewards/margins": 0.12152798473834991, "rewards/rejected": -0.06028420478105545, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.2339773178100586, "eval_logits/rejected": -2.229137420654297, "eval_logps/chosen": -34.04054641723633, "eval_logps/rejected": -37.549957275390625, "eval_loss": 0.6902773976325989, "eval_rewards/accuracies": 0.5685215592384338, "eval_rewards/chosen": -0.005393954925239086, "eval_rewards/margins": 0.024608083069324493, "eval_rewards/rejected": -0.030002037063241005, "eval_runtime": 146.034, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.994192123413086, "logits/rejected": -1.9918158054351807, "logps/chosen": -33.142940521240234, "logps/rejected": -34.01188278198242, "loss": 0.6911, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.09078876674175262, "rewards/margins": 0.07505009323358536, "rewards/rejected": 0.015738680958747864, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.0053954124450684, "logits/rejected": -1.997046709060669, "logps/chosen": -32.33894348144531, "logps/rejected": -32.1308708190918, "loss": 0.6746, "rewards/accuracies": 0.5625, "rewards/chosen": 0.09536493569612503, "rewards/margins": 0.06779730319976807, "rewards/rejected": 0.027567636221647263, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.0336387157440186, "logits/rejected": -2.025650978088379, "logps/chosen": -30.345691680908203, "logps/rejected": -32.078697204589844, "loss": 0.6527, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.11702337116003036, "rewards/margins": 0.14014457166194916, "rewards/rejected": -0.023121213540434837, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.9642337560653687, "logits/rejected": -1.9744552373886108, "logps/chosen": -31.243911743164062, "logps/rejected": -32.590267181396484, "loss": 0.6171, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1581769436597824, "rewards/margins": 0.20802685618400574, "rewards/rejected": -0.04984992742538452, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.876604437828064, "logits/rejected": -1.8777605295181274, "logps/chosen": -33.938690185546875, "logps/rejected": -34.807891845703125, "loss": 0.6043, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.22860188782215118, "rewards/margins": 0.2741745412349701, "rewards/rejected": -0.0455726757645607, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.9282041788101196, "logits/rejected": -1.9247684478759766, "logps/chosen": -36.02125930786133, "logps/rejected": -32.71831130981445, "loss": 0.6454, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.13537634909152985, "rewards/margins": 0.13137592375278473, "rewards/rejected": 0.004000450484454632, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.029125928878784, "logits/rejected": -2.0217747688293457, "logps/chosen": -33.49839401245117, "logps/rejected": -31.400177001953125, "loss": 0.5828, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.26951926946640015, "rewards/margins": 0.3130132555961609, "rewards/rejected": -0.04349397122859955, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.0355944633483887, "logits/rejected": -2.040832042694092, "logps/chosen": -32.235923767089844, "logps/rejected": -32.460418701171875, "loss": 0.5943, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2779761850833893, "rewards/margins": 0.2557251751422882, "rewards/rejected": 0.02225096896290779, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.0362112522125244, "logits/rejected": -2.0334599018096924, "logps/chosen": -31.269250869750977, "logps/rejected": -31.325435638427734, "loss": 0.6245, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.19773444533348083, "rewards/margins": 0.20423230528831482, "rewards/rejected": -0.0064978525042533875, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.9060389995574951, "logits/rejected": -1.9106788635253906, "logps/chosen": -31.306299209594727, "logps/rejected": -32.81407165527344, "loss": 0.5931, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2699825167655945, "rewards/margins": 0.2908058166503906, "rewards/rejected": -0.02082330361008644, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.231553792953491, "eval_logits/rejected": -2.2267112731933594, "eval_logps/chosen": -34.07304763793945, "eval_logps/rejected": -37.57693862915039, "eval_loss": 0.6979728937149048, "eval_rewards/accuracies": 0.5157807469367981, "eval_rewards/chosen": -0.03464451804757118, "eval_rewards/margins": 0.019641490653157234, "eval_rewards/rejected": -0.054286014288663864, "eval_runtime": 145.8095, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.018519163131714, "logits/rejected": -2.0291810035705566, "logps/chosen": -31.742992401123047, "logps/rejected": -33.946937561035156, "loss": 0.5902, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2054794579744339, "rewards/margins": 0.2812942862510681, "rewards/rejected": -0.07581482082605362, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.911586046218872, "logits/rejected": -1.9263393878936768, "logps/chosen": -29.84616470336914, "logps/rejected": -31.615009307861328, "loss": 0.5879, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.23883743584156036, "rewards/margins": 0.2899848222732544, "rewards/rejected": -0.051147449761629105, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.9677941799163818, "logits/rejected": -1.9717823266983032, "logps/chosen": -33.100074768066406, "logps/rejected": -31.62213134765625, "loss": 0.5748, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.28565075993537903, "rewards/margins": 0.3511958718299866, "rewards/rejected": -0.06554517149925232, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.9661725759506226, "logits/rejected": -1.944300651550293, "logps/chosen": -33.841453552246094, "logps/rejected": -35.11375045776367, "loss": 0.5473, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.2810631990432739, "rewards/margins": 0.4277234673500061, "rewards/rejected": -0.14666026830673218, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.007416009902954, "logits/rejected": -2.0040948390960693, "logps/chosen": -32.70330810546875, "logps/rejected": -36.29412841796875, "loss": 0.5992, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1995842456817627, "rewards/margins": 0.2618715763092041, "rewards/rejected": -0.06228730082511902, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8749721050262451, "logits/rejected": -1.8725513219833374, "logps/chosen": -34.00068664550781, "logps/rejected": -35.53888702392578, "loss": 0.6254, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16894161701202393, "rewards/margins": 0.1997825801372528, "rewards/rejected": -0.030840963125228882, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.8600317239761353, "logits/rejected": -1.8576066493988037, "logps/chosen": -34.1875, "logps/rejected": -31.8159122467041, "loss": 0.616, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1890900433063507, "rewards/margins": 0.22921428084373474, "rewards/rejected": -0.04012420028448105, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.9631398916244507, "logits/rejected": -1.9526073932647705, "logps/chosen": -35.023719787597656, "logps/rejected": -31.869693756103516, "loss": 0.5782, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.29963088035583496, "rewards/margins": 0.32546472549438477, "rewards/rejected": -0.025833839550614357, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.0582926273345947, "logits/rejected": -2.0433640480041504, "logps/chosen": -30.733753204345703, "logps/rejected": -32.67460632324219, "loss": 0.6392, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.17133468389511108, "rewards/margins": 0.19182677567005157, "rewards/rejected": -0.020492086187005043, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.929610013961792, "logits/rejected": -1.9270601272583008, "logps/chosen": -32.42620086669922, "logps/rejected": -30.873455047607422, "loss": 0.5301, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.450817346572876, "rewards/margins": 0.5018006563186646, "rewards/rejected": -0.050983332097530365, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.229154348373413, "eval_logits/rejected": -2.2243051528930664, "eval_logps/chosen": -34.09621810913086, "eval_logps/rejected": -37.59999084472656, "eval_loss": 0.6972895860671997, "eval_rewards/accuracies": 0.5390365719795227, "eval_rewards/chosen": -0.05550166219472885, "eval_rewards/margins": 0.019528048112988472, "eval_rewards/rejected": -0.07502970844507217, "eval_runtime": 145.7792, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589035e-07, "logits/chosen": -1.9142345190048218, "logits/rejected": -1.9109809398651123, "logps/chosen": -31.33791732788086, "logps/rejected": -33.82014465332031, "loss": 0.5861, "rewards/accuracies": 0.75, "rewards/chosen": 0.23888680338859558, "rewards/margins": 0.30907896161079407, "rewards/rejected": -0.07019217312335968, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380913e-07, "logits/chosen": -1.9650068283081055, "logits/rejected": -1.9527791738510132, "logps/chosen": -34.34791946411133, "logps/rejected": -33.650447845458984, "loss": 0.5828, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.22334297001361847, "rewards/margins": 0.3237887918949127, "rewards/rejected": -0.10044582933187485, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-07, "logits/chosen": -2.00040602684021, "logits/rejected": -1.9989902973175049, "logps/chosen": -33.210105895996094, "logps/rejected": -32.56142807006836, "loss": 0.5803, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.24812059104442596, "rewards/margins": 0.32178014516830444, "rewards/rejected": -0.07365953922271729, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.0579377374915805e-07, "logits/chosen": -2.0870866775512695, "logits/rejected": -2.0713772773742676, "logps/chosen": -33.80995178222656, "logps/rejected": -33.120697021484375, "loss": 0.5723, "rewards/accuracies": 0.75, "rewards/chosen": 0.3324963450431824, "rewards/margins": 0.33488941192626953, "rewards/rejected": -0.0023930787574499846, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.2518018074041684e-07, "logits/chosen": -1.959240198135376, "logits/rejected": -1.9583876132965088, "logps/chosen": -32.863216400146484, "logps/rejected": -32.54397201538086, "loss": 0.5523, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.37699171900749207, "rewards/margins": 0.44793614745140076, "rewards/rejected": -0.0709443911910057, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-08, "logits/chosen": -1.9147189855575562, "logits/rejected": -1.9250224828720093, "logps/chosen": -31.902795791625977, "logps/rejected": -35.3552131652832, "loss": 0.5743, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2717086672782898, "rewards/margins": 0.33151620626449585, "rewards/rejected": -0.05980752781033516, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050325e-08, "logits/chosen": -2.05413556098938, "logits/rejected": -2.047651767730713, "logps/chosen": -33.377376556396484, "logps/rejected": -29.2799072265625, "loss": 0.5801, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.24713313579559326, "rewards/margins": 0.2952673137187958, "rewards/rejected": -0.04813414067029953, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-09, "logits/chosen": -1.9141871929168701, "logits/rejected": -1.9163949489593506, "logps/chosen": -33.87698745727539, "logps/rejected": -30.976858139038086, "loss": 0.5489, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.32585546374320984, "rewards/margins": 0.4196627140045166, "rewards/rejected": -0.09380728751420975, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.6175476637753573, "train_runtime": 3252.7839, "train_samples_per_second": 0.947, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }