{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.8668782711029053, "logits/rejected": -1.8712005615234375, "logps/chosen": -36.98646545410156, "logps/rejected": -33.67870330810547, "loss": 0.6747, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": 0.012081857770681381, "rewards/margins": 0.04126200079917908, "rewards/rejected": -0.029180139303207397, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.9979515075683594, "logits/rejected": -2.0005881786346436, "logps/chosen": -29.662744522094727, "logps/rejected": -29.051654815673828, "loss": 0.7042, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.01232814695686102, "rewards/margins": -0.019403135403990746, "rewards/rejected": 0.007074988447129726, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.921088457107544, "logits/rejected": -1.9184081554412842, "logps/chosen": -31.383258819580078, "logps/rejected": -33.23828887939453, "loss": 0.6794, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.019701367244124413, "rewards/margins": 0.03159697726368904, "rewards/rejected": -0.011895612813532352, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.017509937286377, "logits/rejected": -2.0087647438049316, "logps/chosen": -32.577518463134766, "logps/rejected": -32.509830474853516, "loss": 0.6964, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.00042539089918136597, "rewards/margins": -0.0021285698749125004, "rewards/rejected": 0.001703177229501307, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.8623021841049194, "logits/rejected": -1.8515303134918213, "logps/chosen": -33.56303787231445, "logps/rejected": -35.47795867919922, "loss": 0.6896, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0033326249103993177, "rewards/margins": 0.012034483253955841, "rewards/rejected": -0.015367108397185802, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.9411529302597046, "logits/rejected": -1.943098783493042, "logps/chosen": -32.549232482910156, "logps/rejected": -33.20621109008789, "loss": 0.6747, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.030568569898605347, "rewards/margins": 0.051466990262269974, "rewards/rejected": -0.02089841663837433, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.0727076530456543, "logits/rejected": -2.07767391204834, "logps/chosen": -33.96394729614258, "logps/rejected": -36.61058807373047, "loss": 0.6813, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.014724211767315865, "rewards/margins": 0.03975607082247734, "rewards/rejected": -0.025031859055161476, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9330183267593384, "logits/rejected": -1.936171293258667, "logps/chosen": -34.318397521972656, "logps/rejected": -34.61701583862305, "loss": 0.6634, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06564848870038986, "rewards/margins": 0.08041460067033768, "rewards/rejected": -0.014766111969947815, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.9412405490875244, "logits/rejected": -1.945755958557129, "logps/chosen": -32.374977111816406, "logps/rejected": -32.33773422241211, "loss": 0.6787, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.05546582490205765, "rewards/margins": 0.04276125878095627, "rewards/rejected": 0.012704563327133656, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.0381903648376465, "logits/rejected": -2.0362119674682617, "logps/chosen": -32.13701629638672, "logps/rejected": -31.295801162719727, "loss": 0.6595, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.06229018047451973, "rewards/margins": 0.08004496991634369, "rewards/rejected": -0.017754793167114258, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.2335634231567383, "eval_logits/rejected": -2.228721857070923, "eval_logps/chosen": -34.00627899169922, "eval_logps/rejected": -37.514984130859375, "eval_loss": 0.6900185346603394, "eval_rewards/accuracies": 0.5253322720527649, "eval_rewards/chosen": 0.016964510083198547, "eval_rewards/margins": 0.015982570126652718, "eval_rewards/rejected": 0.0009819410042837262, "eval_runtime": 145.7969, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.9929395914077759, "logits/rejected": -1.99057936668396, "logps/chosen": -33.06591796875, "logps/rejected": -34.01008605957031, "loss": 0.6678, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.10673947632312775, "rewards/margins": 0.09516827017068863, "rewards/rejected": 0.011571208015084267, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.0045437812805176, "logits/rejected": -1.9962146282196045, "logps/chosen": -32.33503341674805, "logps/rejected": -32.12450408935547, "loss": 0.68, "rewards/accuracies": 0.5625, "rewards/chosen": 0.06592197716236115, "rewards/margins": 0.043721526861190796, "rewards/rejected": 0.02220045030117035, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.032952308654785, "logits/rejected": -2.024994134902954, "logps/chosen": -30.302433013916016, "logps/rejected": -32.04313278198242, "loss": 0.6609, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.10397056490182877, "rewards/margins": 0.09804753214120865, "rewards/rejected": 0.0059230271726846695, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.9635902643203735, "logits/rejected": -1.9738080501556396, "logps/chosen": -31.201534271240234, "logps/rejected": -32.556739807128906, "loss": 0.6343, "rewards/accuracies": 0.6875, "rewards/chosen": 0.13087674975395203, "rewards/margins": 0.1439923495054245, "rewards/rejected": -0.013115609996020794, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.8755298852920532, "logits/rejected": -1.876691222190857, "logps/chosen": -33.89242172241211, "logps/rejected": -34.753570556640625, "loss": 0.6272, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.1801605522632599, "rewards/margins": 0.17795029282569885, "rewards/rejected": 0.002210266888141632, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.9265025854110718, "logits/rejected": -1.9230976104736328, "logps/chosen": -35.98413848876953, "logps/rejected": -32.69154357910156, "loss": 0.6574, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.11252401769161224, "rewards/margins": 0.09379850327968597, "rewards/rejected": 0.01872551441192627, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.0262577533721924, "logits/rejected": -2.0189335346221924, "logps/chosen": -33.45969772338867, "logps/rejected": -31.372516632080078, "loss": 0.6076, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.20289841294288635, "rewards/margins": 0.21529710292816162, "rewards/rejected": -0.012398697435855865, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.0338973999023438, "logits/rejected": -2.0391509532928467, "logps/chosen": -32.194793701171875, "logps/rejected": -32.42069625854492, "loss": 0.6221, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.20999665558338165, "rewards/margins": 0.17132976651191711, "rewards/rejected": 0.03866690397262573, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.035045862197876, "logits/rejected": -2.032278060913086, "logps/chosen": -31.230976104736328, "logps/rejected": -31.29391860961914, "loss": 0.6403, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.1547853648662567, "rewards/margins": 0.14020755887031555, "rewards/rejected": 0.014577840454876423, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.9042125940322876, "logits/rejected": -1.9088417291641235, "logps/chosen": -31.285167694091797, "logps/rejected": -32.79944610595703, "loss": 0.6189, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.192669078707695, "rewards/margins": 0.1977730393409729, "rewards/rejected": -0.0051039643585681915, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.230701446533203, "eval_logits/rejected": -2.2258734703063965, "eval_logps/chosen": -34.0167350769043, "eval_logps/rejected": -37.53435134887695, "eval_loss": 0.6894960403442383, "eval_rewards/accuracies": 0.5340532064437866, "eval_rewards/chosen": 0.010691030882298946, "eval_rewards/margins": 0.02133062295615673, "eval_rewards/rejected": -0.010639593005180359, "eval_runtime": 145.5559, "eval_samples_per_second": 2.356, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.018108367919922, "logits/rejected": -2.0287561416625977, "logps/chosen": -31.745798110961914, "logps/rejected": -33.90629577636719, "loss": 0.6305, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.13530384004116058, "rewards/margins": 0.16146349906921387, "rewards/rejected": -0.026159662753343582, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.9105422496795654, "logits/rejected": -1.9253056049346924, "logps/chosen": -29.849069595336914, "logps/rejected": -31.58382797241211, "loss": 0.6251, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.15748274326324463, "rewards/margins": 0.17287404835224152, "rewards/rejected": -0.015391310676932335, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.966581106185913, "logits/rejected": -1.9705555438995361, "logps/chosen": -33.059837341308594, "logps/rejected": -31.605152130126953, "loss": 0.6, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21457910537719727, "rewards/margins": 0.2480895221233368, "rewards/rejected": -0.033510446548461914, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.9644801616668701, "logits/rejected": -1.9426641464233398, "logps/chosen": -33.819881439208984, "logps/rejected": -35.105430603027344, "loss": 0.5826, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.20031993091106415, "rewards/margins": 0.2930986285209656, "rewards/rejected": -0.09277870506048203, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.005814552307129, "logits/rejected": -2.002516746520996, "logps/chosen": -32.663055419921875, "logps/rejected": -36.233436584472656, "loss": 0.632, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.15720662474632263, "rewards/margins": 0.16231317818164825, "rewards/rejected": -0.0051065413281321526, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8735284805297852, "logits/rejected": -1.8710968494415283, "logps/chosen": -33.964805603027344, "logps/rejected": -35.50111389160156, "loss": 0.6428, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.13415411114692688, "rewards/margins": 0.13204893469810486, "rewards/rejected": 0.0021051731891930103, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.8585532903671265, "logits/rejected": -1.8561296463012695, "logps/chosen": -34.17288589477539, "logps/rejected": -31.792491912841797, "loss": 0.6391, "rewards/accuracies": 0.6875, "rewards/chosen": 0.13482868671417236, "rewards/margins": 0.14752644300460815, "rewards/rejected": -0.012697766534984112, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.9624073505401611, "logits/rejected": -1.9519150257110596, "logps/chosen": -34.95975875854492, "logps/rejected": -31.84967041015625, "loss": 0.5947, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2381301373243332, "rewards/margins": 0.2433377206325531, "rewards/rejected": -0.005207589361816645, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.056440830230713, "logits/rejected": -2.0415446758270264, "logps/chosen": -30.704137802124023, "logps/rejected": -32.5970458984375, "loss": 0.6654, "rewards/accuracies": 0.5625, "rewards/chosen": 0.13198992609977722, "rewards/margins": 0.09911760687828064, "rewards/rejected": 0.03287229686975479, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.927122712135315, "logits/rejected": -1.924602746963501, "logps/chosen": -32.37379455566406, "logps/rejected": -30.879268646240234, "loss": 0.5582, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.3319894075393677, "rewards/margins": 0.3694665729999542, "rewards/rejected": -0.03747714310884476, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.226900815963745, "eval_logits/rejected": -2.2220816612243652, "eval_logps/chosen": -34.03526306152344, "eval_logps/rejected": -37.56266403198242, "eval_loss": 0.6872054934501648, "eval_rewards/accuracies": 0.5598006844520569, "eval_rewards/chosen": -0.0004257837135810405, "eval_rewards/margins": 0.027201363816857338, "eval_rewards/rejected": -0.027627145871520042, "eval_runtime": 145.6667, "eval_samples_per_second": 2.355, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589035e-07, "logits/chosen": -1.9124987125396729, "logits/rejected": -1.9092620611190796, "logps/chosen": -31.285348892211914, "logps/rejected": -33.75069046020508, "loss": 0.6188, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.19080111384391785, "rewards/margins": 0.1959228217601776, "rewards/rejected": -0.0051217032596468925, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380913e-07, "logits/chosen": -1.9616212844848633, "logits/rejected": -1.9494349956512451, "logps/chosen": -34.30357360839844, "logps/rejected": -33.64542770385742, "loss": 0.5999, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17549821734428406, "rewards/margins": 0.23945149779319763, "rewards/rejected": -0.06395327299833298, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-07, "logits/chosen": -1.997859239578247, "logits/rejected": -1.9964158535003662, "logps/chosen": -33.130615234375, "logps/rejected": -32.511531829833984, "loss": 0.6073, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21310487389564514, "rewards/margins": 0.23227325081825256, "rewards/rejected": -0.01916835829615593, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.0579377374915805e-07, "logits/chosen": -2.083962917327881, "logits/rejected": -2.06827449798584, "logps/chosen": -33.732425689697266, "logps/rejected": -33.07551193237305, "loss": 0.5985, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2681804299354553, "rewards/margins": 0.24266552925109863, "rewards/rejected": 0.025514895096421242, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.2518018074041684e-07, "logits/chosen": -1.956364631652832, "logits/rejected": -1.9555118083953857, "logps/chosen": -32.79610824584961, "logps/rejected": -32.512969970703125, "loss": 0.5758, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2915918231010437, "rewards/margins": 0.3202829957008362, "rewards/rejected": -0.028691178187727928, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-08, "logits/chosen": -1.9118818044662476, "logits/rejected": -1.9221813678741455, "logps/chosen": -31.855304718017578, "logps/rejected": -35.32331848144531, "loss": 0.6036, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2096320390701294, "rewards/margins": 0.23036828637123108, "rewards/rejected": -0.02073623239994049, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050325e-08, "logits/chosen": -2.0520732402801514, "logits/rejected": -2.0456154346466064, "logps/chosen": -33.327049255371094, "logps/rejected": -29.233760833740234, "loss": 0.6124, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.19495204091072083, "rewards/margins": 0.19935496151447296, "rewards/rejected": -0.004402949940413237, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-09, "logits/chosen": -1.911370038986206, "logits/rejected": -1.9135433435440063, "logps/chosen": -33.83781814575195, "logps/rejected": -30.931133270263672, "loss": 0.5854, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2407412976026535, "rewards/margins": 0.27584755420684814, "rewards/rejected": -0.035106249153614044, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.6364745784115482, "train_runtime": 3251.6479, "train_samples_per_second": 0.947, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }