{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 113.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -2.7358343601226807, "logits/rejected": -2.7480404376983643, "logps/chosen": -27.35565757751465, "logps/rejected": -21.06114387512207, "loss": 0.5102, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 146.0, "learning_rate": 1.282051282051282e-06, "logits/chosen": -3.0090532302856445, "logits/rejected": -2.998255729675293, "logps/chosen": -33.17539596557617, "logps/rejected": -31.967647552490234, "loss": 0.5411, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.0038858253974467516, "rewards/margins": 0.006502790376543999, "rewards/rejected": -0.0026169654447585344, "step": 10 }, { "epoch": 0.05, "grad_norm": 144.0, "learning_rate": 2.564102564102564e-06, "logits/chosen": -2.89949369430542, "logits/rejected": -2.8941283226013184, "logps/chosen": -32.47248077392578, "logps/rejected": -28.952869415283203, "loss": 0.6026, "rewards/accuracies": 0.5, "rewards/chosen": 0.0018292926251888275, "rewards/margins": -0.018470000475645065, "rewards/rejected": 0.02029929682612419, "step": 20 }, { "epoch": 0.08, "grad_norm": 159.0, "learning_rate": 3.846153846153847e-06, "logits/chosen": -3.096463680267334, "logits/rejected": -3.1078667640686035, "logps/chosen": -32.81493377685547, "logps/rejected": -30.128747940063477, "loss": 0.5658, "rewards/accuracies": 0.4375, "rewards/chosen": 0.06562719494104385, "rewards/margins": 0.018515314906835556, "rewards/rejected": 0.047111887484788895, "step": 30 }, { "epoch": 0.1, "grad_norm": 223.0, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.8619472980499268, "logits/rejected": -2.8528523445129395, "logps/chosen": -31.569528579711914, "logps/rejected": -32.359840393066406, "loss": 0.8775, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1825559139251709, "rewards/margins": 0.18422597646713257, "rewards/rejected": -0.0016700520645827055, "step": 40 }, { "epoch": 0.13, "grad_norm": 156.0, "learning_rate": 4.987541037542187e-06, "logits/chosen": -2.8859169483184814, "logits/rejected": -2.8837780952453613, "logps/chosen": -29.52107810974121, "logps/rejected": -30.1494083404541, "loss": 0.7059, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1533588469028473, "rewards/margins": 0.1956842839717865, "rewards/rejected": -0.042325448244810104, "step": 50 }, { "epoch": 0.16, "grad_norm": 150.0, "learning_rate": 4.954691471941119e-06, "logits/chosen": -2.9192371368408203, "logits/rejected": -2.9210586547851562, "logps/chosen": -30.085376739501953, "logps/rejected": -28.13214111328125, "loss": 0.5449, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.031074851751327515, "rewards/margins": 0.1545926332473755, "rewards/rejected": -0.12351777404546738, "step": 60 }, { "epoch": 0.18, "grad_norm": 972.0, "learning_rate": 4.901618883413549e-06, "logits/chosen": -3.001981496810913, "logits/rejected": -3.009333610534668, "logps/chosen": -29.32364273071289, "logps/rejected": -31.037628173828125, "loss": 1.1987, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.029870102182030678, "rewards/margins": 0.07220745086669922, "rewards/rejected": -0.10207755863666534, "step": 70 }, { "epoch": 0.21, "grad_norm": 192.0, "learning_rate": 4.828760511501322e-06, "logits/chosen": -2.816981792449951, "logits/rejected": -2.8327014446258545, "logps/chosen": -29.412933349609375, "logps/rejected": -29.788555145263672, "loss": 0.6831, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.08945836126804352, "rewards/margins": 0.16944995522499084, "rewards/rejected": -0.07999160140752792, "step": 80 }, { "epoch": 0.23, "grad_norm": 362.0, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -2.9031848907470703, "logits/rejected": -2.8844356536865234, "logps/chosen": -32.63589859008789, "logps/rejected": -30.085968017578125, "loss": 4.9199, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.14037345349788666, "rewards/margins": 0.18539710342884064, "rewards/rejected": -0.04502364248037338, "step": 90 }, { "epoch": 0.26, "grad_norm": 157.0, "learning_rate": 4.626245458345211e-06, "logits/chosen": -3.0054373741149902, "logits/rejected": -3.006031036376953, "logps/chosen": -31.7724552154541, "logps/rejected": -30.76922035217285, "loss": 1.1043, "rewards/accuracies": 0.5625, "rewards/chosen": 0.19909459352493286, "rewards/margins": 0.16622690856456757, "rewards/rejected": 0.03286769241094589, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.815906524658203, "eval_logits/rejected": -2.813185930252075, "eval_logps/chosen": -31.25544548034668, "eval_logps/rejected": -34.70828628540039, "eval_loss": 0.6260043978691101, "eval_rewards/accuracies": 0.5369601845741272, "eval_rewards/chosen": 0.018902836367487907, "eval_rewards/margins": 0.02599383145570755, "eval_rewards/rejected": -0.00709099555388093, "eval_runtime": 113.2921, "eval_samples_per_second": 3.028, "eval_steps_per_second": 0.38, "step": 100 }, { "epoch": 0.29, "grad_norm": 157.0, "learning_rate": 4.498257201263691e-06, "logits/chosen": -2.9622676372528076, "logits/rejected": -2.937613010406494, "logps/chosen": -31.749774932861328, "logps/rejected": -31.12813949584961, "loss": 1.4773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3269161581993103, "rewards/margins": 0.30541908740997314, "rewards/rejected": 0.02149704284965992, "step": 110 }, { "epoch": 0.31, "grad_norm": 109.0, "learning_rate": 4.353806263777678e-06, "logits/chosen": -3.043015718460083, "logits/rejected": -3.0729401111602783, "logps/chosen": -28.837779998779297, "logps/rejected": -34.10157012939453, "loss": 1.2172, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.22828666865825653, "rewards/margins": 0.19610069692134857, "rewards/rejected": 0.032185956835746765, "step": 120 }, { "epoch": 0.34, "grad_norm": 95.0, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.7426066398620605, "logits/rejected": -2.738373279571533, "logps/chosen": -28.734365463256836, "logps/rejected": -30.13399887084961, "loss": 0.9835, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.16822054982185364, "rewards/margins": 0.16835634410381317, "rewards/rejected": -0.00013580024824477732, "step": 130 }, { "epoch": 0.36, "grad_norm": 192.0, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -3.015483856201172, "logits/rejected": -3.012770652770996, "logps/chosen": -27.254077911376953, "logps/rejected": -31.771997451782227, "loss": 0.9001, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1625039130449295, "rewards/margins": 0.2431638240814209, "rewards/rejected": -0.0806598886847496, "step": 140 }, { "epoch": 0.39, "grad_norm": 147.0, "learning_rate": 3.834196265035119e-06, "logits/chosen": -2.8096275329589844, "logits/rejected": -2.804391860961914, "logps/chosen": -27.50994300842285, "logps/rejected": -31.280899047851562, "loss": 0.7365, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.22089700400829315, "rewards/margins": 0.26097819209098816, "rewards/rejected": -0.04008117690682411, "step": 150 }, { "epoch": 0.42, "grad_norm": 122.5, "learning_rate": 3.636998309800573e-06, "logits/chosen": -3.127908229827881, "logits/rejected": -3.1092982292175293, "logps/chosen": -31.936166763305664, "logps/rejected": -29.042150497436523, "loss": 1.5699, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.38888686895370483, "rewards/margins": 0.43254947662353516, "rewards/rejected": -0.043662626296281815, "step": 160 }, { "epoch": 0.44, "grad_norm": 116.0, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.9442355632781982, "logits/rejected": -2.9510912895202637, "logps/chosen": -29.370159149169922, "logps/rejected": -31.104833602905273, "loss": 1.1729, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.367119163274765, "rewards/margins": 0.37086552381515503, "rewards/rejected": -0.003746363567188382, "step": 170 }, { "epoch": 0.47, "grad_norm": 206.0, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.7904200553894043, "logits/rejected": -2.7880074977874756, "logps/chosen": -29.214679718017578, "logps/rejected": -29.80947494506836, "loss": 0.8255, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3186172544956207, "rewards/margins": 0.3475292921066284, "rewards/rejected": -0.028912032023072243, "step": 180 }, { "epoch": 0.49, "grad_norm": 264.0, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.9064321517944336, "logits/rejected": -2.9026076793670654, "logps/chosen": -29.777074813842773, "logps/rejected": -28.358760833740234, "loss": 1.8338, "rewards/accuracies": 0.625, "rewards/chosen": 0.2867472767829895, "rewards/margins": 0.28760695457458496, "rewards/rejected": -0.0008596793049946427, "step": 190 }, { "epoch": 0.52, "grad_norm": 344.0, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -2.9745757579803467, "logits/rejected": -2.9624178409576416, "logps/chosen": -33.12746810913086, "logps/rejected": -30.224645614624023, "loss": 3.0672, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.4000841975212097, "rewards/margins": 0.25347504019737244, "rewards/rejected": 0.14660920202732086, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.8127942085266113, "eval_logits/rejected": -2.8107194900512695, "eval_logps/chosen": -31.217693328857422, "eval_logps/rejected": -34.68858337402344, "eval_loss": 0.6355926990509033, "eval_rewards/accuracies": 0.5627076625823975, "eval_rewards/chosen": 0.04533065855503082, "eval_rewards/margins": 0.03862937539815903, "eval_rewards/rejected": 0.006701283622533083, "eval_runtime": 112.8257, "eval_samples_per_second": 3.04, "eval_steps_per_second": 0.381, "step": 200 }, { "epoch": 0.55, "grad_norm": 272.0, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.909015655517578, "logits/rejected": -2.909053325653076, "logps/chosen": -32.618431091308594, "logps/rejected": -33.84246063232422, "loss": 1.7638, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2904837131500244, "rewards/margins": 0.2771795690059662, "rewards/rejected": 0.013304118998348713, "step": 210 }, { "epoch": 0.57, "grad_norm": 195.0, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -2.8854432106018066, "logits/rejected": -2.901151657104492, "logps/chosen": -29.67086410522461, "logps/rejected": -28.4765567779541, "loss": 1.6395, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.3550260066986084, "rewards/margins": 0.3188283145427704, "rewards/rejected": 0.03619767725467682, "step": 220 }, { "epoch": 0.6, "grad_norm": 102.0, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -2.9346377849578857, "logits/rejected": -2.9390294551849365, "logps/chosen": -30.749608993530273, "logps/rejected": -31.755859375, "loss": 0.8936, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.28158265352249146, "rewards/margins": 0.279979407787323, "rewards/rejected": 0.0016032479470595717, "step": 230 }, { "epoch": 0.62, "grad_norm": 77.5, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -2.983455181121826, "logits/rejected": -2.9928596019744873, "logps/chosen": -30.32903480529785, "logps/rejected": -30.288427352905273, "loss": 0.9824, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.4031239151954651, "rewards/margins": 0.4403551518917084, "rewards/rejected": -0.03723124787211418, "step": 240 }, { "epoch": 0.65, "grad_norm": 282.0, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.817836046218872, "logits/rejected": -2.808262348175049, "logps/chosen": -26.585535049438477, "logps/rejected": -29.320880889892578, "loss": 1.3431, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.25807633996009827, "rewards/margins": 0.11054243892431259, "rewards/rejected": 0.14753387868404388, "step": 250 }, { "epoch": 0.68, "grad_norm": 89.0, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -2.8003945350646973, "logits/rejected": -2.8208835124969482, "logps/chosen": -29.268306732177734, "logps/rejected": -34.10762405395508, "loss": 1.0424, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.39315709471702576, "rewards/margins": 0.34663277864456177, "rewards/rejected": 0.04652435705065727, "step": 260 }, { "epoch": 0.7, "grad_norm": 54.25, "learning_rate": 1.243452991757889e-06, "logits/chosen": -2.9433441162109375, "logits/rejected": -2.9482321739196777, "logps/chosen": -30.294042587280273, "logps/rejected": -29.78550148010254, "loss": 1.0077, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.341118723154068, "rewards/margins": 0.2852852940559387, "rewards/rejected": 0.05583342909812927, "step": 270 }, { "epoch": 0.73, "grad_norm": 78.0, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -2.9577131271362305, "logits/rejected": -2.944226026535034, "logps/chosen": -30.35019874572754, "logps/rejected": -28.367935180664062, "loss": 1.9189, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2600388824939728, "rewards/margins": 0.0756484642624855, "rewards/rejected": 0.18439041078090668, "step": 280 }, { "epoch": 0.75, "grad_norm": 227.0, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.885504722595215, "logits/rejected": -2.8676083087921143, "logps/chosen": -32.159873962402344, "logps/rejected": -30.42794418334961, "loss": 1.5952, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.33349719643592834, "rewards/margins": 0.26674339175224304, "rewards/rejected": 0.06675383448600769, "step": 290 }, { "epoch": 0.78, "grad_norm": 340.0, "learning_rate": 7.08321427484816e-07, "logits/chosen": -2.889909267425537, "logits/rejected": -2.8869872093200684, "logps/chosen": -31.75823402404785, "logps/rejected": -27.508203506469727, "loss": 1.2353, "rewards/accuracies": 0.625, "rewards/chosen": 0.32509008049964905, "rewards/margins": 0.2583235204219818, "rewards/rejected": 0.06676653772592545, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.809739589691162, "eval_logits/rejected": -2.807137966156006, "eval_logps/chosen": -31.20755958557129, "eval_logps/rejected": -34.65956115722656, "eval_loss": 0.6301615834236145, "eval_rewards/accuracies": 0.531146228313446, "eval_rewards/chosen": 0.052424702793359756, "eval_rewards/margins": 0.025408506393432617, "eval_rewards/rejected": 0.02701619826257229, "eval_runtime": 113.0311, "eval_samples_per_second": 3.035, "eval_steps_per_second": 0.38, "step": 300 }, { "epoch": 0.81, "grad_norm": 251.0, "learning_rate": 5.576113578589035e-07, "logits/chosen": -2.7691893577575684, "logits/rejected": -2.787349224090576, "logps/chosen": -28.8099422454834, "logps/rejected": -30.58675765991211, "loss": 1.3555, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.2936348617076874, "rewards/margins": 0.19195988774299622, "rewards/rejected": 0.10167495906352997, "step": 310 }, { "epoch": 0.83, "grad_norm": 93.5, "learning_rate": 4.229036944380913e-07, "logits/chosen": -3.0164008140563965, "logits/rejected": -3.0019757747650146, "logps/chosen": -29.883535385131836, "logps/rejected": -28.36444664001465, "loss": 0.9933, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.22963504493236542, "rewards/margins": 0.26160377264022827, "rewards/rejected": -0.03196870535612106, "step": 320 }, { "epoch": 0.86, "grad_norm": 119.5, "learning_rate": 3.053082288996112e-07, "logits/chosen": -2.931257724761963, "logits/rejected": -2.9136319160461426, "logps/chosen": -27.954730987548828, "logps/rejected": -30.42913246154785, "loss": 0.922, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.285138338804245, "rewards/margins": 0.3113783299922943, "rewards/rejected": -0.026239976286888123, "step": 330 }, { "epoch": 0.88, "grad_norm": 131.0, "learning_rate": 2.0579377374915805e-07, "logits/chosen": -3.146080493927002, "logits/rejected": -3.152937650680542, "logps/chosen": -31.1461181640625, "logps/rejected": -32.72796630859375, "loss": 0.797, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.3451611399650574, "rewards/margins": 0.3675331473350525, "rewards/rejected": -0.022372011095285416, "step": 340 }, { "epoch": 0.91, "grad_norm": 117.0, "learning_rate": 1.2518018074041684e-07, "logits/chosen": -3.0231246948242188, "logits/rejected": -3.0266261100769043, "logps/chosen": -30.33087158203125, "logps/rejected": -31.515050888061523, "loss": 0.8954, "rewards/accuracies": 0.75, "rewards/chosen": 0.33320528268814087, "rewards/margins": 0.334422767162323, "rewards/rejected": -0.001217484474182129, "step": 350 }, { "epoch": 0.94, "grad_norm": 223.0, "learning_rate": 6.41315865106129e-08, "logits/chosen": -2.86474871635437, "logits/rejected": -2.866323947906494, "logps/chosen": -27.935626983642578, "logps/rejected": -29.81571388244629, "loss": 0.7841, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.49987268447875977, "rewards/margins": 0.4892123341560364, "rewards/rejected": 0.010660367086529732, "step": 360 }, { "epoch": 0.96, "grad_norm": 127.5, "learning_rate": 2.3150941078050325e-08, "logits/chosen": -2.9371769428253174, "logits/rejected": -2.936225414276123, "logps/chosen": -30.32781410217285, "logps/rejected": -31.941625595092773, "loss": 0.7687, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.3211846947669983, "rewards/margins": 0.3306921422481537, "rewards/rejected": -0.00950746051967144, "step": 370 }, { "epoch": 0.99, "grad_norm": 484.0, "learning_rate": 2.575864278703266e-09, "logits/chosen": -2.8952066898345947, "logits/rejected": -2.8765997886657715, "logps/chosen": -28.4578857421875, "logps/rejected": -27.789112091064453, "loss": 0.8783, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.3410065770149231, "rewards/margins": 0.2706519067287445, "rewards/rejected": 0.07035474479198456, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 1.2163990609057538, "train_runtime": 2720.9807, "train_samples_per_second": 1.132, "train_steps_per_second": 0.141 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }