{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-08, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 0.5102, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.2820512820512818e-07, "logits/chosen": -1.8663597106933594, "logits/rejected": -1.8706719875335693, "logps/chosen": -36.990386962890625, "logps/rejected": -33.658267974853516, "loss": 0.4771, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.011347133666276932, "rewards/margins": 0.031080076470971107, "rewards/rejected": -0.019732946529984474, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.5641025641025636e-07, "logits/chosen": -1.9978214502334595, "logits/rejected": -2.000457286834717, "logps/chosen": -29.62484359741211, "logps/rejected": -29.059850692749023, "loss": 0.5261, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.012144992128014565, "rewards/margins": 0.009631244465708733, "rewards/rejected": 0.0025137457996606827, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.8461538461538463e-07, "logits/chosen": -1.9197276830673218, "logits/rejected": -1.9170366525650024, "logps/chosen": -31.404308319091797, "logps/rejected": -33.229034423828125, "loss": 0.5076, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.008251860737800598, "rewards/margins": 0.015654325485229492, "rewards/rejected": -0.007402463350445032, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438433e-07, "logits/chosen": -2.0169289112091064, "logits/rejected": -2.008176803588867, "logps/chosen": -32.55079650878906, "logps/rejected": -32.502708435058594, "loss": 0.535, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.018207356333732605, "rewards/margins": 0.011235545389354229, "rewards/rejected": 0.006971807684749365, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542186e-07, "logits/chosen": -1.86395263671875, "logits/rejected": -1.8531732559204102, "logps/chosen": -33.559814453125, "logps/rejected": -35.43522262573242, "loss": 0.5718, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.0016335565596818924, "rewards/margins": -0.013616559095680714, "rewards/rejected": 0.011983001604676247, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941118e-07, "logits/chosen": -1.9451929330825806, "logits/rejected": -1.9471458196640015, "logps/chosen": -32.59247589111328, "logps/rejected": -33.19312286376953, "loss": 0.4982, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.005393369123339653, "rewards/margins": 0.020615221932530403, "rewards/rejected": -0.015221851877868176, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413548e-07, "logits/chosen": -2.0797886848449707, "logits/rejected": -2.08477783203125, "logps/chosen": -33.99976348876953, "logps/rejected": -36.57415771484375, "loss": 0.575, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.007893012836575508, "rewards/margins": -0.004194633569568396, "rewards/rejected": -0.0036983792670071125, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-07, "logits/chosen": -1.9419677257537842, "logits/rejected": -1.94512939453125, "logps/chosen": -34.39982986450195, "logps/rejected": -34.590721130371094, "loss": 0.5441, "rewards/accuracies": 0.5625, "rewards/chosen": 0.019587691873311996, "rewards/margins": 0.01840476132929325, "rewards/rejected": 0.0011829293798655272, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.736716601303429e-07, "logits/chosen": -1.9507859945297241, "logits/rejected": -1.9552862644195557, "logps/chosen": -32.467891693115234, "logps/rejected": -32.358272552490234, "loss": 0.5391, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.00033024101867340505, "rewards/margins": -0.0007759220898151398, "rewards/rejected": 0.00044568348675966263, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.62624545834521e-07, "logits/chosen": -2.049070358276367, "logits/rejected": -2.047074794769287, "logps/chosen": -32.25985336303711, "logps/rejected": -31.274398803710938, "loss": 0.5589, "rewards/accuracies": 0.5, "rewards/chosen": -0.013313899748027325, "rewards/margins": -0.007583809085190296, "rewards/rejected": -0.005730087868869305, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.244201421737671, "eval_logits/rejected": -2.239315986633301, "eval_logps/chosen": -34.009124755859375, "eval_logps/rejected": -37.5127067565918, "eval_loss": 0.5106843709945679, "eval_rewards/accuracies": 0.5573089718818665, "eval_rewards/chosen": 0.017796913161873817, "eval_rewards/margins": 0.015057443641126156, "eval_rewards/rejected": 0.0027394662611186504, "eval_runtime": 146.0293, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.4982572012636904e-07, "logits/chosen": -2.005356550216675, "logits/rejected": -2.0029516220092773, "logps/chosen": -33.2292366027832, "logps/rejected": -34.054996490478516, "loss": 0.4957, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.01020804513245821, "rewards/margins": 0.028141701593995094, "rewards/rejected": -0.01793365553021431, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777677e-07, "logits/chosen": -2.0166029930114746, "logits/rejected": -2.0082242488861084, "logps/chosen": -32.45597457885742, "logps/rejected": -32.18632507324219, "loss": 0.5282, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007746054325252771, "rewards/margins": 0.009627602994441986, "rewards/rejected": -0.01737365685403347, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.194082707715275e-07, "logits/chosen": -2.046288251876831, "logits/rejected": -2.038238525390625, "logps/chosen": -30.492712020874023, "logps/rejected": -32.042259216308594, "loss": 0.5777, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.011898026801645756, "rewards/margins": -0.01942119374871254, "rewards/rejected": 0.007523166947066784, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.020402418666621e-07, "logits/chosen": -1.976900339126587, "logits/rejected": -1.9871864318847656, "logps/chosen": -31.39472007751465, "logps/rejected": -32.55016326904297, "loss": 0.4864, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.01745815947651863, "rewards/margins": 0.028155237436294556, "rewards/rejected": -0.01069707702845335, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.8341962650351185e-07, "logits/chosen": -1.8908016681671143, "logits/rejected": -1.8918870687484741, "logps/chosen": -34.199378967285156, "logps/rejected": -34.751861572265625, "loss": 0.5621, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.004678909666836262, "rewards/margins": -0.008454290218651295, "rewards/rejected": 0.0037753782235085964, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800572e-07, "logits/chosen": -1.9425878524780273, "logits/rejected": -1.9391052722930908, "logps/chosen": -36.136619567871094, "logps/rejected": -32.712371826171875, "loss": 0.5148, "rewards/accuracies": 0.5, "rewards/chosen": 0.02454109489917755, "rewards/margins": 0.01727622002363205, "rewards/rejected": 0.007264876272529364, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.430433172111807e-07, "logits/chosen": -2.042212963104248, "logits/rejected": -2.0348353385925293, "logps/chosen": -33.78378677368164, "logps/rejected": -31.365230560302734, "loss": 0.5133, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.009854594245553017, "rewards/margins": 0.01922372169792652, "rewards/rejected": -0.009369125589728355, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.216202642830543e-07, "logits/chosen": -2.047489881515503, "logits/rejected": -2.0527498722076416, "logps/chosen": -32.54710388183594, "logps/rejected": -32.50310134887695, "loss": 0.5261, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.001622129580937326, "rewards/margins": 0.010947163216769695, "rewards/rejected": -0.01256929337978363, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.9960716642946403e-07, "logits/chosen": -2.0485682487487793, "logits/rejected": -2.04580020904541, "logps/chosen": -31.48910903930664, "logps/rejected": -31.318958282470703, "loss": 0.5482, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00010873023711610585, "rewards/margins": 0.0004080161452293396, "rewards/rejected": -0.000516746542416513, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.771853789806683e-07, "logits/chosen": -1.9184706211090088, "logits/rejected": -1.9231430292129517, "logps/chosen": -31.597286224365234, "logps/rejected": -32.80516815185547, "loss": 0.522, "rewards/accuracies": 0.5, "rewards/chosen": 0.006296842359006405, "rewards/margins": 0.016262350603938103, "rewards/rejected": -0.009965506382286549, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.2433319091796875, "eval_logits/rejected": -2.238459825515747, "eval_logps/chosen": -34.028812408447266, "eval_logps/rejected": -37.51158905029297, "eval_loss": 0.5422906875610352, "eval_rewards/accuracies": 0.46885380148887634, "eval_rewards/chosen": 0.00401716772466898, "eval_rewards/margins": 0.0004931418807245791, "eval_rewards/rejected": 0.0035240259021520615, "eval_runtime": 145.7818, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402e-07, "logits/chosen": -2.0319957733154297, "logits/rejected": -2.0426838397979736, "logps/chosen": -31.943634033203125, "logps/rejected": -33.902008056640625, "loss": 0.4594, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.01936788484454155, "rewards/margins": 0.046881090849637985, "rewards/rejected": -0.027513209730386734, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.318564697655179e-07, "logits/chosen": -1.9252302646636963, "logits/rejected": -1.9400790929794312, "logps/chosen": -30.092998504638672, "logps/rejected": -31.55881690979004, "loss": 0.5269, "rewards/accuracies": 0.5625, "rewards/chosen": 0.012976284138858318, "rewards/margins": 0.013424187898635864, "rewards/rejected": -0.0004478988121263683, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.093227910899832e-07, "logits/chosen": -1.983025312423706, "logits/rejected": -1.9869884252548218, "logps/chosen": -33.37274932861328, "logps/rejected": -31.550548553466797, "loss": 0.496, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.03130333498120308, "rewards/margins": 0.03217558190226555, "rewards/rejected": -0.0008722454076632857, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279356e-07, "logits/chosen": -1.983493447303772, "logits/rejected": -1.9615182876586914, "logps/chosen": -34.13421630859375, "logps/rejected": -34.97159957885742, "loss": 0.4915, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.01367081981152296, "rewards/margins": 0.02823723293840885, "rewards/rejected": -0.014566412195563316, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.654436768970182e-07, "logits/chosen": -2.02471661567688, "logits/rejected": -2.0214104652404785, "logps/chosen": -32.94930648803711, "logps/rejected": -36.21381759643555, "loss": 0.6084, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.016968127340078354, "rewards/margins": -0.02474220283329487, "rewards/rejected": 0.007774075958877802, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.444597403062196e-07, "logits/chosen": -1.8912674188613892, "logits/rejected": -1.8888145685195923, "logps/chosen": -34.208099365234375, "logps/rejected": -35.50531005859375, "loss": 0.5753, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.013791380450129509, "rewards/margins": -0.013306483626365662, "rewards/rejected": -0.00048489533946849406, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.2434529917578887e-07, "logits/chosen": -1.8766494989395142, "logits/rejected": -1.8741207122802734, "logps/chosen": -34.38150405883789, "logps/rejected": -31.744686126708984, "loss": 0.5631, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.011268051341176033, "rewards/margins": -0.007382377982139587, "rewards/rejected": 0.018650425598025322, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603521e-07, "logits/chosen": -1.9800630807876587, "logits/rejected": -1.9694359302520752, "logps/chosen": -35.32728958129883, "logps/rejected": -31.843835830688477, "loss": 0.4967, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.02054680697619915, "rewards/margins": 0.022537903860211372, "rewards/rejected": -0.001991095719859004, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071453e-08, "logits/chosen": -2.0754363536834717, "logits/rejected": -2.060412645339966, "logps/chosen": -30.902868270874023, "logps/rejected": -32.63262939453125, "loss": 0.5486, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.014878431335091591, "rewards/margins": 0.0014350058045238256, "rewards/rejected": 0.013443423435091972, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-08, "logits/chosen": -1.9468863010406494, "logits/rejected": -1.9443439245224, "logps/chosen": -32.88282775878906, "logps/rejected": -30.835247039794922, "loss": 0.4616, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.03099890425801277, "rewards/margins": 0.04390609636902809, "rewards/rejected": -0.012907189317047596, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.243941307067871, "eval_logits/rejected": -2.2390596866607666, "eval_logps/chosen": -34.027870178222656, "eval_logps/rejected": -37.49814224243164, "eval_loss": 0.5651828050613403, "eval_rewards/accuracies": 0.4808970093727112, "eval_rewards/chosen": 0.004676156677305698, "eval_rewards/margins": -0.008257162757217884, "eval_rewards/rejected": 0.012933320365846157, "eval_runtime": 145.8956, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589034e-08, "logits/chosen": -1.928396224975586, "logits/rejected": -1.9251337051391602, "logps/chosen": -31.597158432006836, "logps/rejected": -33.751304626464844, "loss": 0.5251, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.004333490040153265, "rewards/margins": 0.01073872484266758, "rewards/rejected": -0.006405232939869165, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380912e-08, "logits/chosen": -1.9806029796600342, "logits/rejected": -1.9683090448379517, "logps/chosen": -34.563812255859375, "logps/rejected": -33.55340576171875, "loss": 0.4803, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.022582050412893295, "rewards/margins": 0.03277861326932907, "rewards/rejected": -0.010196560993790627, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-08, "logits/chosen": -2.0163512229919434, "logits/rejected": -2.0149030685424805, "logps/chosen": -33.46595764160156, "logps/rejected": -32.46282196044922, "loss": 0.5443, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.013885289430618286, "rewards/margins": 0.0021549216471612453, "rewards/rejected": 0.011730367317795753, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.05793773749158e-08, "logits/chosen": -2.1030476093292236, "logits/rejected": -2.0872480869293213, "logps/chosen": -34.173221588134766, "logps/rejected": -33.08686447143555, "loss": 0.5853, "rewards/accuracies": 0.5, "rewards/chosen": 0.004321468528360128, "rewards/margins": -0.01749541237950325, "rewards/rejected": 0.021816883236169815, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.251801807404168e-08, "logits/chosen": -1.9748103618621826, "logits/rejected": -1.9738647937774658, "logps/chosen": -33.254066467285156, "logps/rejected": -32.4422721862793, "loss": 0.5603, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": 0.019621744751930237, "rewards/margins": 0.0036095953546464443, "rewards/rejected": 0.016012147068977356, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-09, "logits/chosen": -1.9305530786514282, "logits/rejected": -1.9409068822860718, "logps/chosen": -32.2257080078125, "logps/rejected": -35.29142379760742, "loss": 0.5729, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0147119564935565, "rewards/margins": -0.012842650525271893, "rewards/rejected": -0.0018693048041313887, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050324e-09, "logits/chosen": -2.069432497024536, "logits/rejected": -2.0628814697265625, "logps/chosen": -33.65316390991211, "logps/rejected": -29.218481063842773, "loss": 0.5602, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.000836326158605516, "rewards/margins": -0.006397420074790716, "rewards/rejected": 0.005561096128076315, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-10, "logits/chosen": -1.9289767742156982, "logits/rejected": -1.9311527013778687, "logps/chosen": -34.243995666503906, "logps/rejected": -30.892364501953125, "loss": 0.5254, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0034601963125169277, "rewards/margins": 0.010354455560445786, "rewards/rejected": -0.013814652338624, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.5317366457604743, "train_runtime": 3254.062, "train_samples_per_second": 0.946, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }