{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 164.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": 88.18099975585938, "logits/rejected": 88.25153350830078, "logps/chosen": -29.073104858398438, "logps/rejected": -26.25731658935547, "loss": 0.3906, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 338.0, "learning_rate": 1.282051282051282e-06, "logits/chosen": 81.08870697021484, "logits/rejected": 80.791259765625, "logps/chosen": -34.342586517333984, "logps/rejected": -33.04642105102539, "loss": 1.0826, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": -0.08069034665822983, "rewards/margins": -0.010280769318342209, "rewards/rejected": -0.07040956616401672, "step": 10 }, { "epoch": 0.05, "grad_norm": 192.0, "learning_rate": 2.564102564102564e-06, "logits/chosen": 80.631591796875, "logits/rejected": 80.52144622802734, "logps/chosen": -33.67961883544922, "logps/rejected": -30.759857177734375, "loss": 1.1487, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.008231614716351032, "rewards/margins": -0.005876022391021252, "rewards/rejected": 0.014107631519436836, "step": 20 }, { "epoch": 0.08, "grad_norm": 320.0, "learning_rate": 3.846153846153847e-06, "logits/chosen": 82.36092376708984, "logits/rejected": 82.38922882080078, "logps/chosen": -33.96755599975586, "logps/rejected": -31.327926635742188, "loss": 1.3772, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.03620918095111847, "rewards/margins": -0.04051407054066658, "rewards/rejected": 0.07672326266765594, "step": 30 }, { "epoch": 0.1, "grad_norm": 616.0, "learning_rate": 4.999896948438434e-06, "logits/chosen": 80.7266616821289, "logits/rejected": 80.72164154052734, "logps/chosen": -33.125850677490234, "logps/rejected": -33.346900939941406, "loss": 2.0081, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.03856855630874634, "rewards/margins": 0.02127004601061344, "rewards/rejected": 0.017298510298132896, "step": 40 }, { "epoch": 0.13, "grad_norm": 173.0, "learning_rate": 4.987541037542187e-06, "logits/chosen": 78.16896057128906, "logits/rejected": 78.18494415283203, "logps/chosen": -31.25204849243164, "logps/rejected": -31.230030059814453, "loss": 1.2803, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.037441618740558624, "rewards/margins": 0.08795086294412613, "rewards/rejected": -0.12539246678352356, "step": 50 }, { "epoch": 0.16, "grad_norm": 270.0, "learning_rate": 4.954691471941119e-06, "logits/chosen": 82.7291259765625, "logits/rejected": 82.77815246582031, "logps/chosen": -31.22023582458496, "logps/rejected": -29.69997787475586, "loss": 1.5003, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0360613577067852, "rewards/margins": 0.05150580406188965, "rewards/rejected": -0.08756714314222336, "step": 60 }, { "epoch": 0.18, "grad_norm": 648.0, "learning_rate": 4.901618883413549e-06, "logits/chosen": 83.51155090332031, "logits/rejected": 83.56068420410156, "logps/chosen": -30.490299224853516, "logps/rejected": -32.93412780761719, "loss": 1.9177, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.11098086833953857, "rewards/margins": 0.028119858354330063, "rewards/rejected": 0.08286102861166, "step": 70 }, { "epoch": 0.21, "grad_norm": 294.0, "learning_rate": 4.828760511501322e-06, "logits/chosen": 81.31900787353516, "logits/rejected": 81.31642150878906, "logps/chosen": -31.447376251220703, "logps/rejected": -30.629833221435547, "loss": 2.0412, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.04605206102132797, "rewards/margins": -0.05619793012738228, "rewards/rejected": 0.10225000232458115, "step": 80 }, { "epoch": 0.23, "grad_norm": 260.0, "learning_rate": 4.7367166013034295e-06, "logits/chosen": 78.35929870605469, "logits/rejected": 78.31613159179688, "logps/chosen": -32.67215347290039, "logps/rejected": -31.034778594970703, "loss": 2.0579, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.027179216966032982, "rewards/margins": 0.039463870227336884, "rewards/rejected": -0.06664308160543442, "step": 90 }, { "epoch": 0.26, "grad_norm": 168.0, "learning_rate": 4.626245458345211e-06, "logits/chosen": 83.58841705322266, "logits/rejected": 83.63834381103516, "logps/chosen": -34.13097381591797, "logps/rejected": -31.874624252319336, "loss": 1.5488, "rewards/accuracies": 0.625, "rewards/chosen": 0.13921667635440826, "rewards/margins": 0.2384806126356125, "rewards/rejected": -0.09926395863294601, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": 98.81468963623047, "eval_logits/rejected": 98.80232238769531, "eval_logps/chosen": -32.53538513183594, "eval_logps/rejected": -36.158119201660156, "eval_loss": 1.7742832899093628, "eval_rewards/accuracies": 0.5377907156944275, "eval_rewards/chosen": -0.07376820594072342, "eval_rewards/margins": 0.0795043483376503, "eval_rewards/rejected": -0.1532725691795349, "eval_runtime": 104.2041, "eval_samples_per_second": 3.292, "eval_steps_per_second": 0.413, "step": 100 }, { "epoch": 0.29, "grad_norm": 304.0, "learning_rate": 4.498257201263691e-06, "logits/chosen": 83.73497009277344, "logits/rejected": 83.63249206542969, "logps/chosen": -32.8060417175293, "logps/rejected": -32.912742614746094, "loss": 3.7056, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.17341332137584686, "rewards/margins": 0.3788934051990509, "rewards/rejected": -0.20548005402088165, "step": 110 }, { "epoch": 0.31, "grad_norm": 326.0, "learning_rate": 4.353806263777678e-06, "logits/chosen": 83.89222717285156, "logits/rejected": 84.00482940673828, "logps/chosen": -28.764019012451172, "logps/rejected": -35.54996109008789, "loss": 1.7008, "rewards/accuracies": 0.5625, "rewards/chosen": 0.13245216012001038, "rewards/margins": 0.20510384440422058, "rewards/rejected": -0.0726516991853714, "step": 120 }, { "epoch": 0.34, "grad_norm": 190.0, "learning_rate": 4.1940827077152755e-06, "logits/chosen": 81.15211486816406, "logits/rejected": 81.18699645996094, "logps/chosen": -30.555667877197266, "logps/rejected": -32.08338928222656, "loss": 2.0248, "rewards/accuracies": 0.625, "rewards/chosen": 0.281352162361145, "rewards/margins": 0.4027627110481262, "rewards/rejected": -0.1214105486869812, "step": 130 }, { "epoch": 0.36, "grad_norm": 193.0, "learning_rate": 4.0204024186666215e-06, "logits/chosen": 82.7738037109375, "logits/rejected": 82.75199890136719, "logps/chosen": -27.381637573242188, "logps/rejected": -32.79624938964844, "loss": 2.9965, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.05254943296313286, "rewards/margins": 0.3508468568325043, "rewards/rejected": -0.2982974350452423, "step": 140 }, { "epoch": 0.39, "grad_norm": 320.0, "learning_rate": 3.834196265035119e-06, "logits/chosen": 81.73448181152344, "logits/rejected": 81.72250366210938, "logps/chosen": -29.080272674560547, "logps/rejected": -32.661338806152344, "loss": 2.5467, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.2470671832561493, "rewards/margins": 0.32294052839279175, "rewards/rejected": -0.07587336003780365, "step": 150 }, { "epoch": 0.42, "grad_norm": 470.0, "learning_rate": 3.636998309800573e-06, "logits/chosen": 83.80685424804688, "logits/rejected": 83.82804870605469, "logps/chosen": -33.54126739501953, "logps/rejected": -29.94925880432129, "loss": 3.6642, "rewards/accuracies": 0.625, "rewards/chosen": 0.4763854444026947, "rewards/margins": 0.4737597107887268, "rewards/rejected": 0.002625748049467802, "step": 160 }, { "epoch": 0.44, "grad_norm": 410.0, "learning_rate": 3.4304331721118078e-06, "logits/chosen": 84.45747375488281, "logits/rejected": 84.39142608642578, "logps/chosen": -30.8850040435791, "logps/rejected": -32.122833251953125, "loss": 2.1347, "rewards/accuracies": 0.625, "rewards/chosen": 0.2994261682033539, "rewards/margins": 0.38164275884628296, "rewards/rejected": -0.08221657574176788, "step": 170 }, { "epoch": 0.47, "grad_norm": 338.0, "learning_rate": 3.2162026428305436e-06, "logits/chosen": 82.16630554199219, "logits/rejected": 82.15303802490234, "logps/chosen": -30.96017837524414, "logps/rejected": -31.404443740844727, "loss": 2.1896, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.189576655626297, "rewards/margins": 0.38736575841903687, "rewards/rejected": -0.19778910279273987, "step": 180 }, { "epoch": 0.49, "grad_norm": 260.0, "learning_rate": 2.996071664294641e-06, "logits/chosen": 84.08841705322266, "logits/rejected": 84.07431030273438, "logps/chosen": -31.131977081298828, "logps/rejected": -30.759252548217773, "loss": 2.7916, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.29245084524154663, "rewards/margins": -0.059467971324920654, "rewards/rejected": -0.2329828441143036, "step": 190 }, { "epoch": 0.52, "grad_norm": 478.0, "learning_rate": 2.7718537898066833e-06, "logits/chosen": 79.57664489746094, "logits/rejected": 79.53712463378906, "logps/chosen": -34.537757873535156, "logps/rejected": -32.43874740600586, "loss": 3.6133, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.17198507487773895, "rewards/margins": 0.12826156616210938, "rewards/rejected": 0.04372352734208107, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": 99.06522369384766, "eval_logits/rejected": 99.04875183105469, "eval_logps/chosen": -32.56056213378906, "eval_logps/rejected": -36.141395568847656, "eval_loss": 1.8921738862991333, "eval_rewards/accuracies": 0.5166113376617432, "eval_rewards/chosen": -0.09390944987535477, "eval_rewards/margins": 0.045985639095306396, "eval_rewards/rejected": -0.13989506661891937, "eval_runtime": 103.9514, "eval_samples_per_second": 3.3, "eval_steps_per_second": 0.414, "step": 200 }, { "epoch": 0.55, "grad_norm": 414.0, "learning_rate": 2.5453962426402006e-06, "logits/chosen": 82.20321655273438, "logits/rejected": 82.10101318359375, "logps/chosen": -33.50377655029297, "logps/rejected": -34.905250549316406, "loss": 3.0392, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.3535541892051697, "rewards/margins": 0.28887811303138733, "rewards/rejected": 0.06467613577842712, "step": 210 }, { "epoch": 0.57, "grad_norm": 227.0, "learning_rate": 2.3185646976551794e-06, "logits/chosen": 84.46163177490234, "logits/rejected": 84.5374526977539, "logps/chosen": -31.67867088317871, "logps/rejected": -30.870431900024414, "loss": 2.5301, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.21622808277606964, "rewards/margins": 0.32940909266471863, "rewards/rejected": -0.11318100988864899, "step": 220 }, { "epoch": 0.6, "grad_norm": 256.0, "learning_rate": 2.0932279108998323e-06, "logits/chosen": 81.72822570800781, "logits/rejected": 81.78887939453125, "logps/chosen": -32.73725128173828, "logps/rejected": -33.852149963378906, "loss": 3.0508, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0547933354973793, "rewards/margins": -0.07320666313171387, "rewards/rejected": 0.12799999117851257, "step": 230 }, { "epoch": 0.62, "grad_norm": 264.0, "learning_rate": 1.8712423238279358e-06, "logits/chosen": 84.00396728515625, "logits/rejected": 84.2620620727539, "logps/chosen": -31.22537612915039, "logps/rejected": -31.510395050048828, "loss": 4.0684, "rewards/accuracies": 0.625, "rewards/chosen": 0.3701513409614563, "rewards/margins": 0.3528321385383606, "rewards/rejected": 0.017319146543741226, "step": 240 }, { "epoch": 0.65, "grad_norm": 185.0, "learning_rate": 1.6544367689701824e-06, "logits/chosen": 82.73152160644531, "logits/rejected": 82.81182098388672, "logps/chosen": -27.366741180419922, "logps/rejected": -29.901432037353516, "loss": 1.8356, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.18332336843013763, "rewards/margins": 0.18387673795223236, "rewards/rejected": -0.0005533635849133134, "step": 250 }, { "epoch": 0.68, "grad_norm": 1064.0, "learning_rate": 1.4445974030621963e-06, "logits/chosen": 80.05500793457031, "logits/rejected": 80.20561218261719, "logps/chosen": -31.206958770751953, "logps/rejected": -35.96382522583008, "loss": 3.0499, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.283230721950531, "rewards/margins": 0.17298570275306702, "rewards/rejected": 0.1102449893951416, "step": 260 }, { "epoch": 0.7, "grad_norm": 253.0, "learning_rate": 1.243452991757889e-06, "logits/chosen": 79.4796142578125, "logits/rejected": 79.51309967041016, "logps/chosen": -31.116466522216797, "logps/rejected": -31.364765167236328, "loss": 2.1069, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.5299628376960754, "rewards/margins": 0.4174376428127289, "rewards/rejected": 0.11252517998218536, "step": 270 }, { "epoch": 0.73, "grad_norm": 536.0, "learning_rate": 1.0526606671603523e-06, "logits/chosen": 81.92121124267578, "logits/rejected": 81.70929718017578, "logps/chosen": -31.43599510192871, "logps/rejected": -29.533435821533203, "loss": 2.4746, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.29911860823631287, "rewards/margins": 0.1922709196805954, "rewards/rejected": 0.10684768110513687, "step": 280 }, { "epoch": 0.75, "grad_norm": 426.0, "learning_rate": 8.737922755071455e-07, "logits/chosen": 82.15860748291016, "logits/rejected": 82.0548324584961, "logps/chosen": -33.60521697998047, "logps/rejected": -31.876062393188477, "loss": 3.1769, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.3992280066013336, "rewards/margins": 0.2865853011608124, "rewards/rejected": 0.11264270544052124, "step": 290 }, { "epoch": 0.78, "grad_norm": 288.0, "learning_rate": 7.08321427484816e-07, "logits/chosen": 77.75070190429688, "logits/rejected": 77.80924224853516, "logps/chosen": -32.97479248046875, "logps/rejected": -28.991891860961914, "loss": 2.1193, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.34473174810409546, "rewards/margins": 0.25662440061569214, "rewards/rejected": 0.08810728043317795, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": 98.99166870117188, "eval_logits/rejected": 98.98554229736328, "eval_logps/chosen": -32.376094818115234, "eval_logps/rejected": -35.87874221801758, "eval_loss": 1.593911051750183, "eval_rewards/accuracies": 0.5191029906272888, "eval_rewards/chosen": 0.05366762727499008, "eval_rewards/margins": -0.01655910350382328, "eval_rewards/rejected": 0.07022672146558762, "eval_runtime": 104.0583, "eval_samples_per_second": 3.296, "eval_steps_per_second": 0.413, "step": 300 }, { "epoch": 0.81, "grad_norm": 211.0, "learning_rate": 5.576113578589035e-07, "logits/chosen": 84.67102813720703, "logits/rejected": 84.72035217285156, "logps/chosen": -30.487579345703125, "logps/rejected": -32.10687255859375, "loss": 2.1669, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3007642328739166, "rewards/margins": 0.22645199298858643, "rewards/rejected": 0.0743122547864914, "step": 310 }, { "epoch": 0.83, "grad_norm": 220.0, "learning_rate": 4.229036944380913e-07, "logits/chosen": 82.1921615600586, "logits/rejected": 82.18672943115234, "logps/chosen": -31.139236450195312, "logps/rejected": -28.77066421508789, "loss": 1.4103, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.36746180057525635, "rewards/margins": 0.2483980655670166, "rewards/rejected": 0.11906375735998154, "step": 320 }, { "epoch": 0.86, "grad_norm": 246.0, "learning_rate": 3.053082288996112e-07, "logits/chosen": 79.607177734375, "logits/rejected": 79.65017700195312, "logps/chosen": -29.744531631469727, "logps/rejected": -32.44166564941406, "loss": 1.7227, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.4864615797996521, "rewards/margins": 0.3319019079208374, "rewards/rejected": 0.15455973148345947, "step": 330 }, { "epoch": 0.88, "grad_norm": 448.0, "learning_rate": 2.0579377374915805e-07, "logits/chosen": 83.59193420410156, "logits/rejected": 83.5968017578125, "logps/chosen": -32.50049591064453, "logps/rejected": -33.3798828125, "loss": 3.0512, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5693238973617554, "rewards/margins": 0.5782763361930847, "rewards/rejected": -0.008952394127845764, "step": 340 }, { "epoch": 0.91, "grad_norm": 304.0, "learning_rate": 1.2518018074041684e-07, "logits/chosen": 82.85350036621094, "logits/rejected": 82.85616302490234, "logps/chosen": -33.243656158447266, "logps/rejected": -33.08414840698242, "loss": 1.7389, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.3200667202472687, "rewards/margins": 0.31394660472869873, "rewards/rejected": 0.006120128557085991, "step": 350 }, { "epoch": 0.94, "grad_norm": 120.5, "learning_rate": 6.41315865106129e-08, "logits/chosen": 84.1597900390625, "logits/rejected": 84.19812774658203, "logps/chosen": -28.9798526763916, "logps/rejected": -31.50480079650879, "loss": 1.7363, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4578874111175537, "rewards/margins": 0.3418472409248352, "rewards/rejected": 0.11604013293981552, "step": 360 }, { "epoch": 0.96, "grad_norm": 392.0, "learning_rate": 2.3150941078050325e-08, "logits/chosen": 83.60018157958984, "logits/rejected": 83.62623596191406, "logps/chosen": -32.55873489379883, "logps/rejected": -34.760498046875, "loss": 2.6656, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.25394394993782043, "rewards/margins": -0.025154482573270798, "rewards/rejected": 0.2790984511375427, "step": 370 }, { "epoch": 0.99, "grad_norm": 338.0, "learning_rate": 2.575864278703266e-09, "logits/chosen": 77.67994689941406, "logits/rejected": 77.54925537109375, "logps/chosen": -30.067230224609375, "logps/rejected": -27.904937744140625, "loss": 2.1564, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4071175158023834, "rewards/margins": 0.20260827243328094, "rewards/rejected": 0.20450922846794128, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 2.2952296467570514, "train_runtime": 2556.4368, "train_samples_per_second": 1.204, "train_steps_per_second": 0.151 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }