{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-08, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.2820512820512818e-07, "logits/chosen": -1.866065502166748, "logits/rejected": -1.8703795671463013, "logps/chosen": -36.988380432128906, "logps/rejected": -33.66728210449219, "loss": 0.9889, "rewards/accuracies": 0.5694444179534912, "rewards/chosen": 0.0036439618561416864, "rewards/margins": 0.01108560897409916, "rewards/rejected": -0.007441645488142967, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.5641025641025636e-07, "logits/chosen": -1.997332215309143, "logits/rejected": -1.999983549118042, "logps/chosen": -29.625896453857422, "logps/rejected": -29.035802841186523, "loss": 1.0023, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0032594085205346346, "rewards/margins": -0.002268626820296049, "rewards/rejected": 0.00552803510800004, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.8461538461538463e-07, "logits/chosen": -1.9199495315551758, "logits/rejected": -1.917249321937561, "logps/chosen": -31.421478271484375, "logps/rejected": -33.2115364074707, "loss": 1.0025, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.0010770887602120638, "rewards/margins": -0.0024619889445602894, "rewards/rejected": 0.001384900533594191, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438433e-07, "logits/chosen": -2.0169284343719482, "logits/rejected": -2.008178949356079, "logps/chosen": -32.59435272216797, "logps/rejected": -32.49193572998047, "loss": 1.0077, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0035086136776953936, "rewards/margins": -0.007655883673578501, "rewards/rejected": 0.004147270228713751, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542186e-07, "logits/chosen": -1.86457097530365, "logits/rejected": -1.8537908792495728, "logps/chosen": -33.56566619873047, "logps/rejected": -35.423240661621094, "loss": 1.0075, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0016377497231587768, "rewards/margins": -0.007458710577338934, "rewards/rejected": 0.005820960737764835, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941118e-07, "logits/chosen": -1.9449050426483154, "logits/rejected": -1.9468472003936768, "logps/chosen": -32.59955596923828, "logps/rejected": -33.1828498840332, "loss": 0.9976, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00012556914589367807, "rewards/margins": 0.0024188074748963118, "rewards/rejected": -0.0022932388819754124, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413548e-07, "logits/chosen": -2.079878330230713, "logits/rejected": -2.084862232208252, "logps/chosen": -33.98878860473633, "logps/rejected": -36.574462890625, "loss": 0.9989, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -6.058474536985159e-05, "rewards/margins": 0.0010579143417999148, "rewards/rejected": -0.0011184990871697664, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-07, "logits/chosen": -1.9425569772720337, "logits/rejected": -1.9457191228866577, "logps/chosen": -34.40068054199219, "logps/rejected": -34.5762939453125, "loss": 0.9978, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.005425452254712582, "rewards/margins": 0.0022025699727237225, "rewards/rejected": 0.0032228827476501465, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.736716601303429e-07, "logits/chosen": -1.9507396221160889, "logits/rejected": -1.9552500247955322, "logps/chosen": -32.460357666015625, "logps/rejected": -32.354434967041016, "loss": 0.9995, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0014136198442429304, "rewards/margins": 0.0005185690824873745, "rewards/rejected": 0.0008950509363785386, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.62624545834521e-07, "logits/chosen": -2.0492873191833496, "logits/rejected": -2.0472888946533203, "logps/chosen": -32.23810958862305, "logps/rejected": -31.260278701782227, "loss": 1.0006, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.0005457091028802097, "rewards/margins": -0.0006409892230294645, "rewards/rejected": 0.0011866979766637087, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.243312358856201, "eval_logits/rejected": -2.238436222076416, "eval_logps/chosen": -34.023216247558594, "eval_logps/rejected": -37.49723434448242, "eval_loss": 1.0016428232192993, "eval_rewards/accuracies": 0.49833887815475464, "eval_rewards/chosen": 0.0022677299566566944, "eval_rewards/margins": -0.0016096002655103803, "eval_rewards/rejected": 0.0038773303385823965, "eval_runtime": 145.8128, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.4982572012636904e-07, "logits/chosen": -2.005169153213501, "logits/rejected": -2.0027499198913574, "logps/chosen": -33.2365837097168, "logps/rejected": -34.01953125, "loss": 1.0005, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.0014460014645010233, "rewards/margins": -0.0005222518229857087, "rewards/rejected": 0.0019682529382407665, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777677e-07, "logits/chosen": -2.0166728496551514, "logits/rejected": -2.0083022117614746, "logps/chosen": -32.457847595214844, "logps/rejected": -32.18357467651367, "loss": 0.9982, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0025884758215397596, "rewards/margins": 0.0018243432277813554, "rewards/rejected": -0.004412819631397724, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.194082707715275e-07, "logits/chosen": -2.0462448596954346, "logits/rejected": -2.038203001022339, "logps/chosen": -30.475027084350586, "logps/rejected": -32.046302795410156, "loss": 1.0012, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.00013765673793386668, "rewards/margins": -0.001203052932396531, "rewards/rejected": 0.0013407098595052958, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.020402418666621e-07, "logits/chosen": -1.976986289024353, "logits/rejected": -1.9872560501098633, "logps/chosen": -31.407278060913086, "logps/rejected": -32.543296813964844, "loss": 0.9958, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.0024768461007624865, "rewards/margins": 0.004160420503467321, "rewards/rejected": -0.0016835747519508004, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.8341962650351185e-07, "logits/chosen": -1.8905508518218994, "logits/rejected": -1.891632080078125, "logps/chosen": -34.20501708984375, "logps/rejected": -34.77235412597656, "loss": 0.9994, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.002465262543410063, "rewards/margins": 0.0005540539277717471, "rewards/rejected": -0.003019316354766488, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800572e-07, "logits/chosen": -1.94281005859375, "logits/rejected": -1.939327597618103, "logps/chosen": -36.144107818603516, "logps/rejected": -32.72822570800781, "loss": 0.9934, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.005514757242053747, "rewards/margins": 0.006609287112951279, "rewards/rejected": -0.001094528939574957, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.430433172111807e-07, "logits/chosen": -2.0420708656311035, "logits/rejected": -2.0346803665161133, "logps/chosen": -33.771934509277344, "logps/rejected": -31.371145248413086, "loss": 0.991, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.005186290480196476, "rewards/margins": 0.009045111015439034, "rewards/rejected": -0.0038588200695812702, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.216202642830543e-07, "logits/chosen": -2.0475738048553467, "logits/rejected": -2.0528526306152344, "logps/chosen": -32.524593353271484, "logps/rejected": -32.510643005371094, "loss": 0.9909, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.004038581624627113, "rewards/margins": 0.009138843044638634, "rewards/rejected": -0.005100260488688946, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.9960716642946403e-07, "logits/chosen": -2.048490524291992, "logits/rejected": -2.0457024574279785, "logps/chosen": -31.492746353149414, "logps/rejected": -31.319293975830078, "loss": 1.0005, "rewards/accuracies": 0.5, "rewards/chosen": -0.0007584737613797188, "rewards/margins": -0.0005425609415397048, "rewards/rejected": -0.0002159134455723688, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.771853789806683e-07, "logits/chosen": -1.9185683727264404, "logits/rejected": -1.9232347011566162, "logps/chosen": -31.5926513671875, "logps/rejected": -32.78697204589844, "loss": 0.9981, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.00272659445181489, "rewards/margins": 0.0019350949442014098, "rewards/rejected": 0.0007914996822364628, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.243807315826416, "eval_logits/rejected": -2.2389235496520996, "eval_logps/chosen": -34.018714904785156, "eval_logps/rejected": -37.517478942871094, "eval_loss": 0.9966108798980713, "eval_rewards/accuracies": 0.5328072905540466, "eval_rewards/chosen": 0.0031679810490459204, "eval_rewards/margins": 0.0033402685075998306, "eval_rewards/rejected": -0.00017228761862497777, "eval_runtime": 145.8204, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402e-07, "logits/chosen": -2.0318503379821777, "logits/rejected": -2.042539596557617, "logps/chosen": -31.948400497436523, "logps/rejected": -33.86983871459961, "loss": 0.994, "rewards/accuracies": 0.5, "rewards/chosen": 0.004580962937325239, "rewards/margins": 0.0060088313184678555, "rewards/rejected": -0.0014278689632192254, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.318564697655179e-07, "logits/chosen": -1.9251388311386108, "logits/rejected": -1.9399843215942383, "logps/chosen": -30.099853515625, "logps/rejected": -31.55409812927246, "loss": 0.9985, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0023371793795377016, "rewards/margins": 0.001521837548352778, "rewards/rejected": 0.0008153414237312973, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.093227910899832e-07, "logits/chosen": -1.9835479259490967, "logits/rejected": -1.9875112771987915, "logps/chosen": -33.387638092041016, "logps/rejected": -31.554845809936523, "loss": 0.9929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0059667350724339485, "rewards/margins": 0.007074916269630194, "rewards/rejected": -0.0011081816628575325, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279356e-07, "logits/chosen": -1.9826898574829102, "logits/rejected": -1.9607274532318115, "logps/chosen": -34.158443450927734, "logps/rejected": -34.963783264160156, "loss": 0.9983, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0009392915526404977, "rewards/margins": 0.0016581962117925286, "rewards/rejected": -0.0025974875316023827, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.654436768970182e-07, "logits/chosen": -2.024381160736084, "logits/rejected": -2.0210862159729004, "logps/chosen": -32.9254035949707, "logps/rejected": -36.251712799072266, "loss": 0.9947, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -6.657392077613622e-05, "rewards/margins": 0.005290796514600515, "rewards/rejected": -0.005357370711863041, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.444597403062196e-07, "logits/chosen": -1.8911311626434326, "logits/rejected": -1.8886839151382446, "logps/chosen": -34.194557189941406, "logps/rejected": -35.51445770263672, "loss": 0.9993, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0012316217180341482, "rewards/margins": 0.0007347877835854888, "rewards/rejected": -0.001966409618034959, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.2434529917578887e-07, "logits/chosen": -1.8759450912475586, "logits/rejected": -1.8734045028686523, "logps/chosen": -34.40558624267578, "logps/rejected": -31.752349853515625, "loss": 1.0054, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.0015969609376043081, "rewards/margins": -0.005393522325903177, "rewards/rejected": 0.0037965611554682255, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603521e-07, "logits/chosen": -1.980015754699707, "logits/rejected": -1.9693737030029297, "logps/chosen": -35.33230209350586, "logps/rejected": -31.845691680908203, "loss": 0.9942, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.004867845680564642, "rewards/margins": 0.005807613953948021, "rewards/rejected": -0.0009397673420608044, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071453e-08, "logits/chosen": -2.0756278038024902, "logits/rejected": -2.060606002807617, "logps/chosen": -30.907390594482422, "logps/rejected": -32.64055252075195, "loss": 0.9989, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00334669416770339, "rewards/margins": 0.0010900094639509916, "rewards/rejected": 0.002256684470921755, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-08, "logits/chosen": -1.946616768836975, "logits/rejected": -1.9440828561782837, "logps/chosen": -32.894561767578125, "logps/rejected": -30.812387466430664, "loss": 0.9944, "rewards/accuracies": 0.5, "rewards/chosen": 0.006510418839752674, "rewards/margins": 0.005625545047223568, "rewards/rejected": 0.0008848730358295143, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.2434821128845215, "eval_logits/rejected": -2.2386035919189453, "eval_logps/chosen": -34.017669677734375, "eval_logps/rejected": -37.50018310546875, "eval_loss": 0.9999422430992126, "eval_rewards/accuracies": 0.490448534488678, "eval_rewards/chosen": 0.003376962151378393, "eval_rewards/margins": 8.94198747118935e-05, "eval_rewards/rejected": 0.00328754261136055, "eval_runtime": 145.8777, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589034e-08, "logits/chosen": -1.9287067651748657, "logits/rejected": -1.925451636314392, "logps/chosen": -31.603496551513672, "logps/rejected": -33.734046936035156, "loss": 1.0017, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -2.9836966859875247e-05, "rewards/margins": -0.001651174039579928, "rewards/rejected": 0.0016213370254263282, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380912e-08, "logits/chosen": -1.9802377223968506, "logits/rejected": -1.9679291248321533, "logps/chosen": -34.585323333740234, "logps/rejected": -33.57084274291992, "loss": 0.9914, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.00215008738450706, "rewards/margins": 0.008550785481929779, "rewards/rejected": -0.006400698330253363, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-08, "logits/chosen": -2.015906810760498, "logits/rejected": -2.014427661895752, "logps/chosen": -33.49116516113281, "logps/rejected": -32.47978973388672, "loss": 1.001, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0010741351870819926, "rewards/margins": -0.0010322926100343466, "rewards/rejected": -4.1842577047646046e-05, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.05793773749158e-08, "logits/chosen": -2.1030631065368652, "logits/rejected": -2.0872654914855957, "logps/chosen": -34.18492889404297, "logps/rejected": -33.08319854736328, "loss": 1.0081, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.001107137417420745, "rewards/margins": -0.00807467382401228, "rewards/rejected": 0.006967535708099604, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.251801807404168e-08, "logits/chosen": -1.9745471477508545, "logits/rejected": -1.9736032485961914, "logps/chosen": -33.23271942138672, "logps/rejected": -32.4765510559082, "loss": 0.9878, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.00987558625638485, "rewards/margins": 0.012156149372458458, "rewards/rejected": -0.002280563348904252, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-09, "logits/chosen": -1.9305438995361328, "logits/rejected": -1.940913438796997, "logps/chosen": -32.22040939331055, "logps/rejected": -35.28728103637695, "loss": 1.0034, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.003144216490909457, "rewards/margins": -0.003439632710069418, "rewards/rejected": 0.00029541627736762166, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050324e-09, "logits/chosen": -2.069648265838623, "logits/rejected": -2.0630898475646973, "logps/chosen": -33.63695526123047, "logps/rejected": -29.226470947265625, "loss": 0.997, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0030023655854165554, "rewards/margins": 0.0030115083791315556, "rewards/rejected": -9.143399438471533e-06, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-10, "logits/chosen": -1.928865671157837, "logits/rejected": -1.9310123920440674, "logps/chosen": -34.243560791015625, "logps/rejected": -30.892742156982422, "loss": 0.9969, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0009016336989589036, "rewards/margins": 0.0031216249335557222, "rewards/rejected": -0.004023258574306965, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.998242255619594, "train_runtime": 3253.3829, "train_samples_per_second": 0.946, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }