{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.8667490482330322, "logits/rejected": -1.8710733652114868, "logps/chosen": -36.97007369995117, "logps/rejected": -33.66944885253906, "loss": 0.9317, "rewards/accuracies": 0.5694444179534912, "rewards/chosen": 0.03287407010793686, "rewards/margins": 0.06830974668264389, "rewards/rejected": -0.03543568402528763, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.9981460571289062, "logits/rejected": -2.000789165496826, "logps/chosen": -29.641231536865234, "logps/rejected": -29.06744384765625, "loss": 0.9955, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0008672710391692817, "rewards/margins": 0.004467610269784927, "rewards/rejected": -0.0036003391724079847, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.920600175857544, "logits/rejected": -1.917925238609314, "logps/chosen": -31.395061492919922, "logps/rejected": -33.240909576416016, "loss": 0.9609, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01893135905265808, "rewards/margins": 0.03913776949048042, "rewards/rejected": -0.020206410437822342, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.017815113067627, "logits/rejected": -2.0090720653533936, "logps/chosen": -32.5806884765625, "logps/rejected": -32.515098571777344, "loss": 1.0013, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.003494268748909235, "rewards/margins": -0.0013132141903042793, "rewards/rejected": -0.002181055024266243, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.8630876541137695, "logits/rejected": -1.8523353338241577, "logps/chosen": -33.549766540527344, "logps/rejected": -35.46318435668945, "loss": 0.9833, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.006940312683582306, "rewards/margins": 0.016694897785782814, "rewards/rejected": -0.009754580445587635, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.9419746398925781, "logits/rejected": -1.943914771080017, "logps/chosen": -32.527896881103516, "logps/rejected": -33.21547317504883, "loss": 0.9153, "rewards/accuracies": 0.625, "rewards/chosen": 0.06505907326936722, "rewards/margins": 0.10474522411823273, "rewards/rejected": -0.03968615084886551, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.07257080078125, "logits/rejected": -2.0775399208068848, "logps/chosen": -34.00202560424805, "logps/rejected": -36.622886657714844, "loss": 0.9636, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.012188142165541649, "rewards/margins": 0.03642461448907852, "rewards/rejected": -0.04861275106668472, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9333629608154297, "logits/rejected": -1.9364970922470093, "logps/chosen": -34.302101135253906, "logps/rejected": -34.63160705566406, "loss": 0.862, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1131378561258316, "rewards/margins": 0.1484164148569107, "rewards/rejected": -0.0352785661816597, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.9408857822418213, "logits/rejected": -1.945412039756775, "logps/chosen": -32.36528015136719, "logps/rejected": -32.34526824951172, "loss": 0.9225, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.09192506223917007, "rewards/margins": 0.07964853197336197, "rewards/rejected": 0.012276534922420979, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.037550210952759, "logits/rejected": -2.0355725288391113, "logps/chosen": -32.142730712890625, "logps/rejected": -31.29366683959961, "loss": 0.8913, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.08829358220100403, "rewards/margins": 0.1130049005150795, "rewards/rejected": -0.02471131458878517, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.232161283493042, "eval_logits/rejected": -2.2273108959198, "eval_logps/chosen": -34.040714263916016, "eval_logps/rejected": -37.54047775268555, "eval_loss": 0.9844526052474976, "eval_rewards/accuracies": 0.5195183157920837, "eval_rewards/chosen": -0.005542654078453779, "eval_rewards/margins": 0.015924591571092606, "eval_rewards/rejected": -0.021467244252562523, "eval_runtime": 145.9018, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.9926517009735107, "logits/rejected": -1.9902753829956055, "logps/chosen": -33.12412643432617, "logps/rejected": -34.011417388916016, "loss": 0.9361, "rewards/accuracies": 0.625, "rewards/chosen": 0.10772128403186798, "rewards/margins": 0.09156213700771332, "rewards/rejected": 0.016159160062670708, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.00441312789917, "logits/rejected": -1.996093988418579, "logps/chosen": -32.33955383300781, "logps/rejected": -32.13432312011719, "loss": 0.9401, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.09481850266456604, "rewards/margins": 0.07035262137651443, "rewards/rejected": 0.024465877562761307, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.0318965911865234, "logits/rejected": -2.023927688598633, "logps/chosen": -30.336984634399414, "logps/rejected": -32.0634765625, "loss": 0.9061, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.12486012279987335, "rewards/margins": 0.1342838853597641, "rewards/rejected": -0.009423775598406792, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.9620994329452515, "logits/rejected": -1.9723354578018188, "logps/chosen": -31.222240447998047, "logps/rejected": -32.57916259765625, "loss": 0.795, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.17767605185508728, "rewards/margins": 0.21753115952014923, "rewards/rejected": -0.03985511139035225, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.8727748394012451, "logits/rejected": -1.87395441532135, "logps/chosen": -33.931861877441406, "logps/rejected": -34.79869842529297, "loss": 0.7946, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.23474939167499542, "rewards/margins": 0.272051066160202, "rewards/rejected": -0.03730170056223869, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.9248745441436768, "logits/rejected": -1.9214649200439453, "logps/chosen": -36.014469146728516, "logps/rejected": -32.73783493041992, "loss": 0.8532, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.14148668944835663, "rewards/margins": 0.15506146848201752, "rewards/rejected": -0.013574766926467419, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.025555372238159, "logits/rejected": -2.01819109916687, "logps/chosen": -33.50218200683594, "logps/rejected": -31.41971206665039, "loss": 0.7292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.26611366868019104, "rewards/margins": 0.3271873891353607, "rewards/rejected": -0.06107370927929878, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.0320167541503906, "logits/rejected": -2.037261486053467, "logps/chosen": -32.24850845336914, "logps/rejected": -32.45344924926758, "loss": 0.7865, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.2666531801223755, "rewards/margins": 0.23813048005104065, "rewards/rejected": 0.028522688895463943, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.032525062561035, "logits/rejected": -2.0297436714172363, "logps/chosen": -31.313217163085938, "logps/rejected": -31.349472045898438, "loss": 0.8387, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.1581628918647766, "rewards/margins": 0.18629543483257294, "rewards/rejected": -0.02813255414366722, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.902632713317871, "logits/rejected": -1.907284140586853, "logps/chosen": -31.320043563842773, "logps/rejected": -32.85698699951172, "loss": 0.7293, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2576131224632263, "rewards/margins": 0.31706100702285767, "rewards/rejected": -0.05944784730672836, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.228637933731079, "eval_logits/rejected": -2.2238004207611084, "eval_logps/chosen": -34.053680419921875, "eval_logps/rejected": -37.581058502197266, "eval_loss": 0.9601577520370483, "eval_rewards/accuracies": 0.5714285373687744, "eval_rewards/chosen": -0.01721162348985672, "eval_rewards/margins": 0.040783192962408066, "eval_rewards/rejected": -0.057994820177555084, "eval_runtime": 145.5388, "eval_samples_per_second": 2.357, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.0149245262145996, "logits/rejected": -2.025560140609741, "logps/chosen": -31.77438735961914, "logps/rejected": -33.95419692993164, "loss": 0.7666, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.17722666263580322, "rewards/margins": 0.25957340002059937, "rewards/rejected": -0.08234670013189316, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.906951904296875, "logits/rejected": -1.9217418432235718, "logps/chosen": -29.83829116821289, "logps/rejected": -31.636096954345703, "loss": 0.7204, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2459266185760498, "rewards/margins": 0.3160557448863983, "rewards/rejected": -0.07012919336557388, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.9629713296890259, "logits/rejected": -1.9669532775878906, "logps/chosen": -33.124656677246094, "logps/rejected": -31.630443572998047, "loss": 0.7348, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2635299265384674, "rewards/margins": 0.3365571200847626, "rewards/rejected": -0.07302714884281158, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.9614177942276, "logits/rejected": -1.9395818710327148, "logps/chosen": -33.87095260620117, "logps/rejected": -35.10104751586914, "loss": 0.6865, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.254514217376709, "rewards/margins": 0.38974156975746155, "rewards/rejected": -0.13522735238075256, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.0029492378234863, "logits/rejected": -1.999629020690918, "logps/chosen": -32.730865478515625, "logps/rejected": -36.28009796142578, "loss": 0.8055, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.17478153109550476, "rewards/margins": 0.2244400531053543, "rewards/rejected": -0.04965851828455925, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8703190088272095, "logits/rejected": -1.8679043054580688, "logps/chosen": -33.98231887817383, "logps/rejected": -35.54644775390625, "loss": 0.8042, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.18547315895557404, "rewards/margins": 0.22311437129974365, "rewards/rejected": -0.03764120861887932, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.8551464080810547, "logits/rejected": -1.852746605873108, "logps/chosen": -34.20850372314453, "logps/rejected": -31.803356170654297, "loss": 0.8096, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.170187309384346, "rewards/margins": 0.19900819659233093, "rewards/rejected": -0.02882089652121067, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.9582526683807373, "logits/rejected": -1.947749376296997, "logps/chosen": -35.0114631652832, "logps/rejected": -31.88564682006836, "loss": 0.692, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.3106640875339508, "rewards/margins": 0.3508565425872803, "rewards/rejected": -0.040192440152168274, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.053699493408203, "logits/rejected": -2.038789749145508, "logps/chosen": -30.727243423461914, "logps/rejected": -32.641685485839844, "loss": 0.8827, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.17719359695911407, "rewards/margins": 0.16805905103683472, "rewards/rejected": 0.009134533815085888, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.9243850708007812, "logits/rejected": -1.9218356609344482, "logps/chosen": -32.43050003051758, "logps/rejected": -30.8950138092041, "loss": 0.6144, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.446951299905777, "rewards/margins": 0.5173346400260925, "rewards/rejected": -0.07038338482379913, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.2248916625976562, "eval_logits/rejected": -2.2200686931610107, "eval_logps/chosen": -34.08660125732422, "eval_logps/rejected": -37.603153228759766, "eval_loss": 0.9712583422660828, "eval_rewards/accuracies": 0.5282392501831055, "eval_rewards/chosen": -0.046843186020851135, "eval_rewards/margins": 0.031036507338285446, "eval_rewards/rejected": -0.07787969708442688, "eval_runtime": 145.8399, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589035e-07, "logits/chosen": -1.9094560146331787, "logits/rejected": -1.9062097072601318, "logps/chosen": -31.33370018005371, "logps/rejected": -33.83475112915039, "loss": 0.728, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.24268333613872528, "rewards/margins": 0.32601919770240784, "rewards/rejected": -0.08333584666252136, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380913e-07, "logits/chosen": -1.9593706130981445, "logits/rejected": -1.9471466541290283, "logps/chosen": -34.3392333984375, "logps/rejected": -33.68544387817383, "loss": 0.6954, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2311573475599289, "rewards/margins": 0.36310091614723206, "rewards/rejected": -0.13194358348846436, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-07, "logits/chosen": -1.9940307140350342, "logits/rejected": -1.9925845861434937, "logps/chosen": -33.20854568481445, "logps/rejected": -32.552764892578125, "loss": 0.7359, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.24952416121959686, "rewards/margins": 0.31538745760917664, "rewards/rejected": -0.06586329638957977, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.0579377374915805e-07, "logits/chosen": -2.080763578414917, "logits/rejected": -2.065063953399658, "logps/chosen": -33.82006072998047, "logps/rejected": -33.105167388916016, "loss": 0.7407, "rewards/accuracies": 0.6875, "rewards/chosen": 0.32339948415756226, "rewards/margins": 0.31181785464286804, "rewards/rejected": 0.011581619270145893, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.2518018074041684e-07, "logits/chosen": -1.9535402059555054, "logits/rejected": -1.9526821374893188, "logps/chosen": -32.8734130859375, "logps/rejected": -32.565185546875, "loss": 0.645, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.36781299114227295, "rewards/margins": 0.4578477442264557, "rewards/rejected": -0.09003473073244095, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-08, "logits/chosen": -1.909014344215393, "logits/rejected": -1.919298768043518, "logps/chosen": -31.879894256591797, "logps/rejected": -35.34550857543945, "loss": 0.6942, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.29231759905815125, "rewards/margins": 0.34339430928230286, "rewards/rejected": -0.051076728850603104, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050325e-08, "logits/chosen": -2.0480055809020996, "logits/rejected": -2.0415189266204834, "logps/chosen": -33.37665939331055, "logps/rejected": -29.24251937866211, "loss": 0.7583, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.24777868390083313, "rewards/margins": 0.262265145778656, "rewards/rejected": -0.014486486092209816, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-09, "logits/chosen": -1.9075886011123657, "logits/rejected": -1.9097877740859985, "logps/chosen": -33.8558464050293, "logps/rejected": -30.982568740844727, "loss": 0.6567, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.3448841869831085, "rewards/margins": 0.44383174180984497, "rewards/rejected": -0.09894753992557526, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.8125879040012112, "train_runtime": 3249.3137, "train_samples_per_second": 0.948, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }