{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 66, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -3.2287240028381348, "debug/policy_chosen_logps": -169.8909912109375, "debug/policy_rejected_logits": -2.9979002475738525, "debug/policy_rejected_logps": -192.28164672851562, "debug/reference_chosen_logps": -169.8909912109375, "debug/reference_rejected_logps": -192.28164672851562, "epoch": 0.015151515151515152, "grad_norm": 15.896527575776522, "learning_rate": 1e-06, "logits/chosen": -3.2287240028381348, "logits/rejected": -2.9979002475738525, "logps/chosen": -169.8909912109375, "logps/rejected": -192.28164672851562, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -3.4281623363494873, "debug/policy_chosen_logps": -173.4019012451172, "debug/policy_rejected_logits": -2.949575662612915, "debug/policy_rejected_logps": -202.1363525390625, "debug/reference_chosen_logps": -173.52249145507812, "debug/reference_rejected_logps": -201.62171936035156, "epoch": 0.030303030303030304, "grad_norm": 16.519466000954132, "learning_rate": 1e-06, "logits/chosen": -3.4281623363494873, "logits/rejected": -2.949575662612915, "logps/chosen": -173.4019012451172, "logps/rejected": -202.1363525390625, "loss": 0.4949, "rewards/accuracies": 1.0, "rewards/chosen": 0.0012058259453624487, "rewards/margins": 0.006352271884679794, "rewards/rejected": -0.005146445706486702, "step": 2 }, { "debug/policy_chosen_logits": -3.3293609619140625, "debug/policy_chosen_logps": -177.4271240234375, "debug/policy_rejected_logits": -2.903211832046509, "debug/policy_rejected_logps": -203.11773681640625, "debug/reference_chosen_logps": -177.59429931640625, "debug/reference_rejected_logps": -201.2540283203125, "epoch": 0.045454545454545456, "grad_norm": 15.645011951900454, "learning_rate": 1e-06, "logits/chosen": -3.3293609619140625, "logits/rejected": -2.903211832046509, "logps/chosen": -177.4271240234375, "logps/rejected": -203.11773681640625, "loss": 0.4811, "rewards/accuracies": 1.0, "rewards/chosen": 0.0016717910766601562, "rewards/margins": 0.020308855921030045, "rewards/rejected": -0.01863706484436989, "step": 3 }, { "debug/policy_chosen_logits": -3.359229803085327, "debug/policy_chosen_logps": -171.1922607421875, "debug/policy_rejected_logits": -3.2270939350128174, "debug/policy_rejected_logps": -195.89881896972656, "debug/reference_chosen_logps": -171.53582763671875, "debug/reference_rejected_logps": -194.86456298828125, "epoch": 0.06060606060606061, "grad_norm": 16.812604175600832, "learning_rate": 1e-06, "logits/chosen": -3.359229803085327, "logits/rejected": -3.2270939350128174, "logps/chosen": -171.1922607421875, "logps/rejected": -195.89881896972656, "loss": 0.4723, "rewards/accuracies": 0.625, "rewards/chosen": 0.0034355544485151768, "rewards/margins": 0.01377822831273079, "rewards/rejected": -0.0103426743298769, "step": 4 }, { "debug/policy_chosen_logits": -3.296639919281006, "debug/policy_chosen_logps": -183.37974548339844, "debug/policy_rejected_logits": -3.0010054111480713, "debug/policy_rejected_logps": -212.24916076660156, "debug/reference_chosen_logps": -184.3879852294922, "debug/reference_rejected_logps": -202.4541015625, "epoch": 0.07575757575757576, "grad_norm": 17.162080884552854, "learning_rate": 1e-06, "logits/chosen": -3.296639919281006, "logits/rejected": -3.0010054111480713, "logps/chosen": -183.37974548339844, "logps/rejected": -212.24916076660156, "loss": 0.4175, "rewards/accuracies": 1.0, "rewards/chosen": 0.010082511231303215, "rewards/margins": 0.10803306102752686, "rewards/rejected": -0.09795054793357849, "step": 5 }, { "debug/policy_chosen_logits": -3.414454698562622, "debug/policy_chosen_logps": -191.16409301757812, "debug/policy_rejected_logits": -3.2533628940582275, "debug/policy_rejected_logps": -194.81231689453125, "debug/reference_chosen_logps": -192.23001098632812, "debug/reference_rejected_logps": -189.50564575195312, "epoch": 0.09090909090909091, "grad_norm": 12.93626797735514, "learning_rate": 1e-06, "logits/chosen": -3.414454698562622, "logits/rejected": -3.2533628940582275, "logps/chosen": -191.16409301757812, "logps/rejected": -194.81231689453125, "loss": 0.4248, "rewards/accuracies": 0.625, "rewards/chosen": 0.010659217834472656, "rewards/margins": 0.06372596323490143, "rewards/rejected": -0.05306674540042877, "step": 6 }, { "debug/policy_chosen_logits": -3.325504779815674, "debug/policy_chosen_logps": -174.56222534179688, "debug/policy_rejected_logits": -3.1263859272003174, "debug/policy_rejected_logps": -215.89663696289062, "debug/reference_chosen_logps": -176.48927307128906, "debug/reference_rejected_logps": -204.55831909179688, "epoch": 0.10606060606060606, "grad_norm": 16.246776246894385, "learning_rate": 1e-06, "logits/chosen": -3.325504779815674, "logits/rejected": -3.1263859272003174, "logps/chosen": -174.56222534179688, "logps/rejected": -215.89663696289062, "loss": 0.386, "rewards/accuracies": 1.0, "rewards/chosen": 0.01927059143781662, "rewards/margins": 0.13265389204025269, "rewards/rejected": -0.11338328570127487, "step": 7 }, { "debug/policy_chosen_logits": -3.3408055305480957, "debug/policy_chosen_logps": -173.92062377929688, "debug/policy_rejected_logits": -3.0370819568634033, "debug/policy_rejected_logps": -207.4774627685547, "debug/reference_chosen_logps": -176.2281951904297, "debug/reference_rejected_logps": -195.24710083007812, "epoch": 0.12121212121212122, "grad_norm": 14.588248383894227, "learning_rate": 1e-06, "logits/chosen": -3.3408055305480957, "logits/rejected": -3.0370819568634033, "logps/chosen": -173.92062377929688, "logps/rejected": -207.4774627685547, "loss": 0.3915, "rewards/accuracies": 1.0, "rewards/chosen": 0.023075714707374573, "rewards/margins": 0.1453792005777359, "rewards/rejected": -0.12230348587036133, "step": 8 }, { "debug/policy_chosen_logits": -3.3789806365966797, "debug/policy_chosen_logps": -173.42562866210938, "debug/policy_rejected_logits": -3.133667469024658, "debug/policy_rejected_logps": -227.72598266601562, "debug/reference_chosen_logps": -181.0255584716797, "debug/reference_rejected_logps": -196.7879180908203, "epoch": 0.13636363636363635, "grad_norm": 10.41812193686415, "learning_rate": 1e-06, "logits/chosen": -3.3789806365966797, "logits/rejected": -3.133667469024658, "logps/chosen": -173.42562866210938, "logps/rejected": -227.72598266601562, "loss": 0.2552, "rewards/accuracies": 0.875, "rewards/chosen": 0.07599931955337524, "rewards/margins": 0.3853800296783447, "rewards/rejected": -0.3093807101249695, "step": 9 }, { "debug/policy_chosen_logits": -3.2632176876068115, "debug/policy_chosen_logps": -168.97828674316406, "debug/policy_rejected_logits": -3.2279319763183594, "debug/policy_rejected_logps": -221.69720458984375, "debug/reference_chosen_logps": -178.62234497070312, "debug/reference_rejected_logps": -198.7471160888672, "epoch": 0.15151515151515152, "grad_norm": 9.736417746815414, "learning_rate": 1e-06, "logits/chosen": -3.2632176876068115, "logits/rejected": -3.2279319763183594, "logps/chosen": -168.97828674316406, "logps/rejected": -221.69720458984375, "loss": 0.2275, "rewards/accuracies": 0.875, "rewards/chosen": 0.09644053876399994, "rewards/margins": 0.32594138383865356, "rewards/rejected": -0.22950084507465363, "step": 10 }, { "debug/policy_chosen_logits": -3.263065814971924, "debug/policy_chosen_logps": -191.12673950195312, "debug/policy_rejected_logits": -3.136707067489624, "debug/policy_rejected_logps": -207.8048858642578, "debug/reference_chosen_logps": -203.09510803222656, "debug/reference_rejected_logps": -185.27200317382812, "epoch": 0.16666666666666666, "grad_norm": 7.102123058873711, "learning_rate": 1e-06, "logits/chosen": -3.263065814971924, "logits/rejected": -3.136707067489624, "logps/chosen": -191.12673950195312, "logps/rejected": -207.8048858642578, "loss": 0.2506, "rewards/accuracies": 0.875, "rewards/chosen": 0.11968372017145157, "rewards/margins": 0.34501251578330994, "rewards/rejected": -0.22532880306243896, "step": 11 }, { "debug/policy_chosen_logits": -3.3079724311828613, "debug/policy_chosen_logps": -158.8116455078125, "debug/policy_rejected_logits": -3.1820497512817383, "debug/policy_rejected_logps": -222.01480102539062, "debug/reference_chosen_logps": -173.60525512695312, "debug/reference_rejected_logps": -194.14846801757812, "epoch": 0.18181818181818182, "grad_norm": 8.335166240155669, "learning_rate": 1e-06, "logits/chosen": -3.3079724311828613, "logits/rejected": -3.1820497512817383, "logps/chosen": -158.8116455078125, "logps/rejected": -222.01480102539062, "loss": 0.1918, "rewards/accuracies": 1.0, "rewards/chosen": 0.1479361653327942, "rewards/margins": 0.42659950256347656, "rewards/rejected": -0.2786633372306824, "step": 12 }, { "debug/policy_chosen_logits": -3.4431967735290527, "debug/policy_chosen_logps": -163.28610229492188, "debug/policy_rejected_logits": -3.1781017780303955, "debug/policy_rejected_logps": -232.920654296875, "debug/reference_chosen_logps": -181.77667236328125, "debug/reference_rejected_logps": -198.103515625, "epoch": 0.19696969696969696, "grad_norm": 4.779904097185996, "learning_rate": 1e-06, "logits/chosen": -3.4431967735290527, "logits/rejected": -3.1781017780303955, "logps/chosen": -163.28610229492188, "logps/rejected": -232.920654296875, "loss": 0.2285, "rewards/accuracies": 0.875, "rewards/chosen": 0.18490572273731232, "rewards/margins": 0.5330770015716553, "rewards/rejected": -0.34817129373550415, "step": 13 }, { "debug/policy_chosen_logits": -3.24206280708313, "debug/policy_chosen_logps": -162.18551635742188, "debug/policy_rejected_logits": -3.1449334621429443, "debug/policy_rejected_logps": -235.12533569335938, "debug/reference_chosen_logps": -181.19454956054688, "debug/reference_rejected_logps": -199.58999633789062, "epoch": 0.21212121212121213, "grad_norm": 5.665092718277167, "learning_rate": 1e-06, "logits/chosen": -3.24206280708313, "logits/rejected": -3.1449334621429443, "logps/chosen": -162.18551635742188, "logps/rejected": -235.12533569335938, "loss": 0.1774, "rewards/accuracies": 1.0, "rewards/chosen": 0.1900903284549713, "rewards/margins": 0.5454437732696533, "rewards/rejected": -0.3553534746170044, "step": 14 }, { "debug/policy_chosen_logits": -3.2667055130004883, "debug/policy_chosen_logps": -165.58248901367188, "debug/policy_rejected_logits": -3.1148128509521484, "debug/policy_rejected_logps": -224.86569213867188, "debug/reference_chosen_logps": -187.37530517578125, "debug/reference_rejected_logps": -197.89208984375, "epoch": 0.22727272727272727, "grad_norm": 4.224713137955556, "learning_rate": 1e-06, "logits/chosen": -3.2667055130004883, "logits/rejected": -3.1148128509521484, "logps/chosen": -165.58248901367188, "logps/rejected": -224.86569213867188, "loss": 0.2135, "rewards/accuracies": 0.875, "rewards/chosen": 0.21792812645435333, "rewards/margins": 0.4876641035079956, "rewards/rejected": -0.26973599195480347, "step": 15 }, { "debug/policy_chosen_logits": -3.2505483627319336, "debug/policy_chosen_logps": -172.78692626953125, "debug/policy_rejected_logits": -3.1557655334472656, "debug/policy_rejected_logps": -225.20663452148438, "debug/reference_chosen_logps": -199.422119140625, "debug/reference_rejected_logps": -199.93321228027344, "epoch": 0.24242424242424243, "grad_norm": 3.9371884469626375, "learning_rate": 1e-06, "logits/chosen": -3.2505483627319336, "logits/rejected": -3.1557655334472656, "logps/chosen": -172.78692626953125, "logps/rejected": -225.20663452148438, "loss": 0.2126, "rewards/accuracies": 1.0, "rewards/chosen": 0.2663518786430359, "rewards/margins": 0.5190861225128174, "rewards/rejected": -0.2527342438697815, "step": 16 }, { "debug/policy_chosen_logits": -3.350160598754883, "debug/policy_chosen_logps": -134.92742919921875, "debug/policy_rejected_logits": -3.2445473670959473, "debug/policy_rejected_logps": -195.10174560546875, "debug/reference_chosen_logps": -174.5078582763672, "debug/reference_rejected_logps": -182.78952026367188, "epoch": 0.25757575757575757, "grad_norm": 5.86135399581337, "learning_rate": 1e-06, "logits/chosen": -3.350160598754883, "logits/rejected": -3.2445473670959473, "logps/chosen": -134.92742919921875, "logps/rejected": -195.10174560546875, "loss": 0.2652, "rewards/accuracies": 0.875, "rewards/chosen": 0.3958042860031128, "rewards/margins": 0.5189265012741089, "rewards/rejected": -0.12312224507331848, "step": 17 }, { "debug/policy_chosen_logits": -3.388575553894043, "debug/policy_chosen_logps": -152.7288055419922, "debug/policy_rejected_logits": -3.3196918964385986, "debug/policy_rejected_logps": -226.92054748535156, "debug/reference_chosen_logps": -195.43197631835938, "debug/reference_rejected_logps": -194.177978515625, "epoch": 0.2727272727272727, "grad_norm": 12.473151798495989, "learning_rate": 1e-06, "logits/chosen": -3.388575553894043, "logits/rejected": -3.3196918964385986, "logps/chosen": -152.7288055419922, "logps/rejected": -226.92054748535156, "loss": 0.2351, "rewards/accuracies": 1.0, "rewards/chosen": 0.42703163623809814, "rewards/margins": 0.7544572353363037, "rewards/rejected": -0.32742559909820557, "step": 18 }, { "debug/policy_chosen_logits": -3.327465534210205, "debug/policy_chosen_logps": -140.37681579589844, "debug/policy_rejected_logits": -3.172419548034668, "debug/policy_rejected_logps": -257.0501403808594, "debug/reference_chosen_logps": -184.55157470703125, "debug/reference_rejected_logps": -198.62893676757812, "epoch": 0.2878787878787879, "grad_norm": 10.236054760882395, "learning_rate": 1e-06, "logits/chosen": -3.327465534210205, "logits/rejected": -3.172419548034668, "logps/chosen": -140.37681579589844, "logps/rejected": -257.0501403808594, "loss": 0.2843, "rewards/accuracies": 1.0, "rewards/chosen": 0.4417475461959839, "rewards/margins": 1.0259594917297363, "rewards/rejected": -0.5842119455337524, "step": 19 }, { "debug/policy_chosen_logits": -3.3493270874023438, "debug/policy_chosen_logps": -138.30780029296875, "debug/policy_rejected_logits": -3.316509485244751, "debug/policy_rejected_logps": -236.43585205078125, "debug/reference_chosen_logps": -185.0295867919922, "debug/reference_rejected_logps": -193.80906677246094, "epoch": 0.30303030303030304, "grad_norm": 10.225710834896915, "learning_rate": 1e-06, "logits/chosen": -3.3493270874023438, "logits/rejected": -3.316509485244751, "logps/chosen": -138.30780029296875, "logps/rejected": -236.43585205078125, "loss": 0.3396, "rewards/accuracies": 0.75, "rewards/chosen": 0.4672178328037262, "rewards/margins": 0.8934856057167053, "rewards/rejected": -0.4262677729129791, "step": 20 }, { "debug/policy_chosen_logits": -3.4731037616729736, "debug/policy_chosen_logps": -144.64541625976562, "debug/policy_rejected_logits": -3.2627787590026855, "debug/policy_rejected_logps": -242.2275390625, "debug/reference_chosen_logps": -187.64114379882812, "debug/reference_rejected_logps": -202.5106201171875, "epoch": 0.3181818181818182, "grad_norm": 8.551620856924016, "learning_rate": 1e-06, "logits/chosen": -3.4731037616729736, "logits/rejected": -3.2627787590026855, "logps/chosen": -144.64541625976562, "logps/rejected": -242.2275390625, "loss": 0.2314, "rewards/accuracies": 0.875, "rewards/chosen": 0.42995721101760864, "rewards/margins": 0.8271263837814331, "rewards/rejected": -0.39716917276382446, "step": 21 }, { "debug/policy_chosen_logits": -3.3169944286346436, "debug/policy_chosen_logps": -134.04901123046875, "debug/policy_rejected_logits": -3.2604851722717285, "debug/policy_rejected_logps": -216.86337280273438, "debug/reference_chosen_logps": -177.80865478515625, "debug/reference_rejected_logps": -205.44650268554688, "epoch": 0.3333333333333333, "grad_norm": 6.81909021472202, "learning_rate": 1e-06, "logits/chosen": -3.3169944286346436, "logits/rejected": -3.2604851722717285, "logps/chosen": -134.04901123046875, "logps/rejected": -216.86337280273438, "loss": 0.1982, "rewards/accuracies": 0.875, "rewards/chosen": 0.43759632110595703, "rewards/margins": 0.5517649054527283, "rewards/rejected": -0.11416859924793243, "step": 22 }, { "debug/policy_chosen_logits": -3.33351731300354, "debug/policy_chosen_logps": -161.69329833984375, "debug/policy_rejected_logits": -3.3218417167663574, "debug/policy_rejected_logps": -212.47789001464844, "debug/reference_chosen_logps": -210.9875946044922, "debug/reference_rejected_logps": -190.85877990722656, "epoch": 0.3484848484848485, "grad_norm": 5.161837643427947, "learning_rate": 1e-06, "logits/chosen": -3.33351731300354, "logits/rejected": -3.3218417167663574, "logps/chosen": -161.69329833984375, "logps/rejected": -212.47789001464844, "loss": 0.1909, "rewards/accuracies": 0.875, "rewards/chosen": 0.4929431676864624, "rewards/margins": 0.7091342210769653, "rewards/rejected": -0.21619105339050293, "step": 23 }, { "debug/policy_chosen_logits": -3.4123055934906006, "debug/policy_chosen_logps": -140.40411376953125, "debug/policy_rejected_logits": -3.2541122436523438, "debug/policy_rejected_logps": -231.91970825195312, "debug/reference_chosen_logps": -181.82656860351562, "debug/reference_rejected_logps": -200.554443359375, "epoch": 0.36363636363636365, "grad_norm": 14.832304628721737, "learning_rate": 1e-06, "logits/chosen": -3.4123055934906006, "logits/rejected": -3.2541122436523438, "logps/chosen": -140.40411376953125, "logps/rejected": -231.91970825195312, "loss": 0.2353, "rewards/accuracies": 1.0, "rewards/chosen": 0.41422444581985474, "rewards/margins": 0.7278770804405212, "rewards/rejected": -0.3136526942253113, "step": 24 }, { "debug/policy_chosen_logits": -3.3668429851531982, "debug/policy_chosen_logps": -153.1930694580078, "debug/policy_rejected_logits": -3.2047929763793945, "debug/policy_rejected_logps": -235.02700805664062, "debug/reference_chosen_logps": -197.70933532714844, "debug/reference_rejected_logps": -198.66546630859375, "epoch": 0.3787878787878788, "grad_norm": 4.7098040773524765, "learning_rate": 1e-06, "logits/chosen": -3.3668429851531982, "logits/rejected": -3.2047929763793945, "logps/chosen": -153.1930694580078, "logps/rejected": -235.02700805664062, "loss": 0.1772, "rewards/accuracies": 0.875, "rewards/chosen": 0.44516265392303467, "rewards/margins": 0.808777928352356, "rewards/rejected": -0.3636153042316437, "step": 25 }, { "debug/policy_chosen_logits": -3.331206798553467, "debug/policy_chosen_logps": -144.004150390625, "debug/policy_rejected_logits": -3.1716086864471436, "debug/policy_rejected_logps": -233.64779663085938, "debug/reference_chosen_logps": -186.57241821289062, "debug/reference_rejected_logps": -198.4064483642578, "epoch": 0.3939393939393939, "grad_norm": 10.883519493191617, "learning_rate": 1e-06, "logits/chosen": -3.331206798553467, "logits/rejected": -3.1716086864471436, "logps/chosen": -144.004150390625, "logps/rejected": -233.64779663085938, "loss": 0.2811, "rewards/accuracies": 0.875, "rewards/chosen": 0.4256827235221863, "rewards/margins": 0.7780961990356445, "rewards/rejected": -0.35241347551345825, "step": 26 }, { "debug/policy_chosen_logits": -3.3798036575317383, "debug/policy_chosen_logps": -138.9322509765625, "debug/policy_rejected_logits": -3.249253273010254, "debug/policy_rejected_logps": -234.61282348632812, "debug/reference_chosen_logps": -176.48486328125, "debug/reference_rejected_logps": -200.70828247070312, "epoch": 0.4090909090909091, "grad_norm": 3.421025608338867, "learning_rate": 1e-06, "logits/chosen": -3.3798036575317383, "logits/rejected": -3.249253273010254, "logps/chosen": -138.9322509765625, "logps/rejected": -234.61282348632812, "loss": 0.166, "rewards/accuracies": 0.875, "rewards/chosen": 0.3755261301994324, "rewards/margins": 0.7145713567733765, "rewards/rejected": -0.3390452265739441, "step": 27 }, { "debug/policy_chosen_logits": -3.2142980098724365, "debug/policy_chosen_logps": -159.95425415039062, "debug/policy_rejected_logits": -3.146585464477539, "debug/policy_rejected_logps": -225.6060791015625, "debug/reference_chosen_logps": -195.47201538085938, "debug/reference_rejected_logps": -201.02882385253906, "epoch": 0.42424242424242425, "grad_norm": 4.061171240286967, "learning_rate": 1e-06, "logits/chosen": -3.2142980098724365, "logits/rejected": -3.146585464477539, "logps/chosen": -159.95425415039062, "logps/rejected": -225.6060791015625, "loss": 0.1957, "rewards/accuracies": 0.875, "rewards/chosen": 0.3551776111125946, "rewards/margins": 0.600950300693512, "rewards/rejected": -0.24577270448207855, "step": 28 }, { "debug/policy_chosen_logits": -3.552765130996704, "debug/policy_chosen_logps": -159.6048583984375, "debug/policy_rejected_logits": -3.3928418159484863, "debug/policy_rejected_logps": -209.23745727539062, "debug/reference_chosen_logps": -187.8738250732422, "debug/reference_rejected_logps": -196.34986877441406, "epoch": 0.4393939393939394, "grad_norm": 3.2133257593593862, "learning_rate": 1e-06, "logits/chosen": -3.552765130996704, "logits/rejected": -3.3928418159484863, "logps/chosen": -159.6048583984375, "logps/rejected": -209.23745727539062, "loss": 0.2314, "rewards/accuracies": 0.75, "rewards/chosen": 0.2826896905899048, "rewards/margins": 0.41156554222106934, "rewards/rejected": -0.12887582182884216, "step": 29 }, { "debug/policy_chosen_logits": -3.3832268714904785, "debug/policy_chosen_logps": -167.49896240234375, "debug/policy_rejected_logits": -3.1030924320220947, "debug/policy_rejected_logps": -252.34332275390625, "debug/reference_chosen_logps": -190.51348876953125, "debug/reference_rejected_logps": -200.29566955566406, "epoch": 0.45454545454545453, "grad_norm": 12.600772030365162, "learning_rate": 1e-06, "logits/chosen": -3.3832268714904785, "logits/rejected": -3.1030924320220947, "logps/chosen": -167.49896240234375, "logps/rejected": -252.34332275390625, "loss": 0.152, "rewards/accuracies": 1.0, "rewards/chosen": 0.23014536499977112, "rewards/margins": 0.7506218552589417, "rewards/rejected": -0.5204765200614929, "step": 30 }, { "debug/policy_chosen_logits": -3.2991068363189697, "debug/policy_chosen_logps": -165.23487854003906, "debug/policy_rejected_logits": -3.164799451828003, "debug/policy_rejected_logps": -229.68218994140625, "debug/reference_chosen_logps": -189.39071655273438, "debug/reference_rejected_logps": -195.18954467773438, "epoch": 0.4696969696969697, "grad_norm": 23.25002197120677, "learning_rate": 1e-06, "logits/chosen": -3.2991068363189697, "logits/rejected": -3.164799451828003, "logps/chosen": -165.23487854003906, "logps/rejected": -229.68218994140625, "loss": 0.1933, "rewards/accuracies": 1.0, "rewards/chosen": 0.241558238863945, "rewards/margins": 0.5864847898483276, "rewards/rejected": -0.34492653608322144, "step": 31 }, { "debug/policy_chosen_logits": -3.438469886779785, "debug/policy_chosen_logps": -150.08570861816406, "debug/policy_rejected_logits": -3.223253011703491, "debug/policy_rejected_logps": -240.90557861328125, "debug/reference_chosen_logps": -173.40936279296875, "debug/reference_rejected_logps": -196.56854248046875, "epoch": 0.48484848484848486, "grad_norm": 7.773509800300969, "learning_rate": 1e-06, "logits/chosen": -3.438469886779785, "logits/rejected": -3.223253011703491, "logps/chosen": -150.08570861816406, "logps/rejected": -240.90557861328125, "loss": 0.2104, "rewards/accuracies": 1.0, "rewards/chosen": 0.2332364320755005, "rewards/margins": 0.6766066551208496, "rewards/rejected": -0.4433702826499939, "step": 32 }, { "debug/policy_chosen_logits": -3.3407649993896484, "debug/policy_chosen_logps": -179.110595703125, "debug/policy_rejected_logits": -3.2053518295288086, "debug/policy_rejected_logps": -235.9097900390625, "debug/reference_chosen_logps": -204.107421875, "debug/reference_rejected_logps": -197.0460205078125, "epoch": 0.5, "grad_norm": 6.718642855006392, "learning_rate": 1e-06, "logits/chosen": -3.3407649993896484, "logits/rejected": -3.2053518295288086, "logps/chosen": -179.110595703125, "logps/rejected": -235.9097900390625, "loss": 0.1986, "rewards/accuracies": 1.0, "rewards/chosen": 0.2499680519104004, "rewards/margins": 0.6386057138442993, "rewards/rejected": -0.3886377215385437, "step": 33 }, { "debug/policy_chosen_logits": -3.3050708770751953, "debug/policy_chosen_logps": -152.60276794433594, "debug/policy_rejected_logits": -3.2292323112487793, "debug/policy_rejected_logps": -230.19276428222656, "debug/reference_chosen_logps": -176.83837890625, "debug/reference_rejected_logps": -195.39669799804688, "epoch": 0.5151515151515151, "grad_norm": 9.101702637374363, "learning_rate": 1e-06, "logits/chosen": -3.3050708770751953, "logits/rejected": -3.2292323112487793, "logps/chosen": -152.60276794433594, "logps/rejected": -230.19276428222656, "loss": 0.1477, "rewards/accuracies": 1.0, "rewards/chosen": 0.24235627055168152, "rewards/margins": 0.5903170108795166, "rewards/rejected": -0.3479607105255127, "step": 34 }, { "debug/policy_chosen_logits": -3.2726495265960693, "debug/policy_chosen_logps": -174.27133178710938, "debug/policy_rejected_logits": -3.203645706176758, "debug/policy_rejected_logps": -196.8633270263672, "debug/reference_chosen_logps": -198.22906494140625, "debug/reference_rejected_logps": -186.4788818359375, "epoch": 0.5303030303030303, "grad_norm": 3.0621710142938094, "learning_rate": 1e-06, "logits/chosen": -3.2726495265960693, "logits/rejected": -3.203645706176758, "logps/chosen": -174.27133178710938, "logps/rejected": -196.8633270263672, "loss": 0.2372, "rewards/accuracies": 0.5, "rewards/chosen": 0.2395772933959961, "rewards/margins": 0.3434217870235443, "rewards/rejected": -0.10384449362754822, "step": 35 }, { "debug/policy_chosen_logits": -3.384018659591675, "debug/policy_chosen_logps": -167.7943115234375, "debug/policy_rejected_logits": -3.1378824710845947, "debug/policy_rejected_logps": -248.11648559570312, "debug/reference_chosen_logps": -192.9346466064453, "debug/reference_rejected_logps": -202.7908935546875, "epoch": 0.5454545454545454, "grad_norm": 3.7803820798969134, "learning_rate": 1e-06, "logits/chosen": -3.384018659591675, "logits/rejected": -3.1378824710845947, "logps/chosen": -167.7943115234375, "logps/rejected": -248.11648559570312, "loss": 0.1955, "rewards/accuracies": 1.0, "rewards/chosen": 0.2514033913612366, "rewards/margins": 0.704659104347229, "rewards/rejected": -0.4532557725906372, "step": 36 }, { "debug/policy_chosen_logits": -3.3746280670166016, "debug/policy_chosen_logps": -157.13916015625, "debug/policy_rejected_logits": -3.148245334625244, "debug/policy_rejected_logps": -192.2917938232422, "debug/reference_chosen_logps": -179.935546875, "debug/reference_rejected_logps": -189.93637084960938, "epoch": 0.5606060606060606, "grad_norm": 3.3859231835471504, "learning_rate": 1e-06, "logits/chosen": -3.3746280670166016, "logits/rejected": -3.148245334625244, "logps/chosen": -157.13916015625, "logps/rejected": -192.2917938232422, "loss": 0.2319, "rewards/accuracies": 0.5, "rewards/chosen": 0.22796380519866943, "rewards/margins": 0.25151798129081726, "rewards/rejected": -0.02355419099330902, "step": 37 }, { "debug/policy_chosen_logits": -3.3511316776275635, "debug/policy_chosen_logps": -155.6072235107422, "debug/policy_rejected_logits": -3.144333839416504, "debug/policy_rejected_logps": -244.5394287109375, "debug/reference_chosen_logps": -181.16592407226562, "debug/reference_rejected_logps": -202.65194702148438, "epoch": 0.5757575757575758, "grad_norm": 3.505660994560735, "learning_rate": 1e-06, "logits/chosen": -3.3511316776275635, "logits/rejected": -3.144333839416504, "logps/chosen": -155.6072235107422, "logps/rejected": -244.5394287109375, "loss": 0.1577, "rewards/accuracies": 1.0, "rewards/chosen": 0.25558698177337646, "rewards/margins": 0.6744616031646729, "rewards/rejected": -0.41887468099594116, "step": 38 }, { "debug/policy_chosen_logits": -3.385699987411499, "debug/policy_chosen_logps": -155.55105590820312, "debug/policy_rejected_logits": -3.1693825721740723, "debug/policy_rejected_logps": -231.96063232421875, "debug/reference_chosen_logps": -185.80572509765625, "debug/reference_rejected_logps": -199.73977661132812, "epoch": 0.5909090909090909, "grad_norm": 3.5404015657866217, "learning_rate": 1e-06, "logits/chosen": -3.385699987411499, "logits/rejected": -3.1693825721740723, "logps/chosen": -155.55105590820312, "logps/rejected": -231.96063232421875, "loss": 0.1543, "rewards/accuracies": 1.0, "rewards/chosen": 0.3025468587875366, "rewards/margins": 0.6247553825378418, "rewards/rejected": -0.32220855355262756, "step": 39 }, { "debug/policy_chosen_logits": -3.283646583557129, "debug/policy_chosen_logps": -157.669677734375, "debug/policy_rejected_logits": -3.1286423206329346, "debug/policy_rejected_logps": -216.38865661621094, "debug/reference_chosen_logps": -190.46728515625, "debug/reference_rejected_logps": -203.07147216796875, "epoch": 0.6060606060606061, "grad_norm": 2.9051264992734738, "learning_rate": 1e-06, "logits/chosen": -3.283646583557129, "logits/rejected": -3.1286423206329346, "logps/chosen": -157.669677734375, "logps/rejected": -216.38865661621094, "loss": 0.1286, "rewards/accuracies": 0.625, "rewards/chosen": 0.32797616720199585, "rewards/margins": 0.46114811301231384, "rewards/rejected": -0.1331719607114792, "step": 40 }, { "debug/policy_chosen_logits": -3.3565878868103027, "debug/policy_chosen_logps": -153.67202758789062, "debug/policy_rejected_logits": -3.2679927349090576, "debug/policy_rejected_logps": -211.33941650390625, "debug/reference_chosen_logps": -186.74990844726562, "debug/reference_rejected_logps": -193.83908081054688, "epoch": 0.6212121212121212, "grad_norm": 3.021431271490525, "learning_rate": 1e-06, "logits/chosen": -3.3565878868103027, "logits/rejected": -3.2679927349090576, "logps/chosen": -153.67202758789062, "logps/rejected": -211.33941650390625, "loss": 0.1735, "rewards/accuracies": 0.875, "rewards/chosen": 0.3307788670063019, "rewards/margins": 0.5057821273803711, "rewards/rejected": -0.17500324547290802, "step": 41 }, { "debug/policy_chosen_logits": -3.3630802631378174, "debug/policy_chosen_logps": -147.03170776367188, "debug/policy_rejected_logits": -3.1322529315948486, "debug/policy_rejected_logps": -238.04006958007812, "debug/reference_chosen_logps": -180.31466674804688, "debug/reference_rejected_logps": -199.04287719726562, "epoch": 0.6363636363636364, "grad_norm": 2.3454946493499476, "learning_rate": 1e-06, "logits/chosen": -3.3630802631378174, "logits/rejected": -3.1322529315948486, "logps/chosen": -147.03170776367188, "logps/rejected": -238.04006958007812, "loss": 0.1765, "rewards/accuracies": 1.0, "rewards/chosen": 0.3328295946121216, "rewards/margins": 0.7228015065193176, "rewards/rejected": -0.38997191190719604, "step": 42 }, { "debug/policy_chosen_logits": -3.465418577194214, "debug/policy_chosen_logps": -149.51712036132812, "debug/policy_rejected_logits": -3.1464831829071045, "debug/policy_rejected_logps": -239.27133178710938, "debug/reference_chosen_logps": -184.30979919433594, "debug/reference_rejected_logps": -197.39755249023438, "epoch": 0.6515151515151515, "grad_norm": 2.711135795428503, "learning_rate": 1e-06, "logits/chosen": -3.465418577194214, "logits/rejected": -3.1464831829071045, "logps/chosen": -149.51712036132812, "logps/rejected": -239.27133178710938, "loss": 0.1346, "rewards/accuracies": 1.0, "rewards/chosen": 0.3479267954826355, "rewards/margins": 0.7666647434234619, "rewards/rejected": -0.41873791813850403, "step": 43 }, { "debug/policy_chosen_logits": -3.459343910217285, "debug/policy_chosen_logps": -143.04705810546875, "debug/policy_rejected_logits": -3.3008081912994385, "debug/policy_rejected_logps": -235.9581298828125, "debug/reference_chosen_logps": -176.62020874023438, "debug/reference_rejected_logps": -202.58688354492188, "epoch": 0.6666666666666666, "grad_norm": 2.6035677185715054, "learning_rate": 1e-06, "logits/chosen": -3.459343910217285, "logits/rejected": -3.3008081912994385, "logps/chosen": -143.04705810546875, "logps/rejected": -235.9581298828125, "loss": 0.1886, "rewards/accuracies": 0.875, "rewards/chosen": 0.3357314467430115, "rewards/margins": 0.6694440841674805, "rewards/rejected": -0.33371254801750183, "step": 44 }, { "debug/policy_chosen_logits": -3.319984197616577, "debug/policy_chosen_logps": -150.12997436523438, "debug/policy_rejected_logits": -3.2634334564208984, "debug/policy_rejected_logps": -229.33334350585938, "debug/reference_chosen_logps": -186.470458984375, "debug/reference_rejected_logps": -193.90338134765625, "epoch": 0.6818181818181818, "grad_norm": 2.840083699266489, "learning_rate": 1e-06, "logits/chosen": -3.319984197616577, "logits/rejected": -3.2634334564208984, "logps/chosen": -150.12997436523438, "logps/rejected": -229.33334350585938, "loss": 0.1887, "rewards/accuracies": 1.0, "rewards/chosen": 0.3634048104286194, "rewards/margins": 0.7177043557167053, "rewards/rejected": -0.3542996048927307, "step": 45 }, { "debug/policy_chosen_logits": -3.4640610218048096, "debug/policy_chosen_logps": -151.5071563720703, "debug/policy_rejected_logits": -3.2400693893432617, "debug/policy_rejected_logps": -238.11105346679688, "debug/reference_chosen_logps": -192.1048583984375, "debug/reference_rejected_logps": -193.15365600585938, "epoch": 0.696969696969697, "grad_norm": 3.01351223100777, "learning_rate": 1e-06, "logits/chosen": -3.4640610218048096, "logits/rejected": -3.2400693893432617, "logps/chosen": -151.5071563720703, "logps/rejected": -238.11105346679688, "loss": 0.1823, "rewards/accuracies": 0.875, "rewards/chosen": 0.4059770703315735, "rewards/margins": 0.8555511236190796, "rewards/rejected": -0.4495740532875061, "step": 46 }, { "debug/policy_chosen_logits": -3.3979074954986572, "debug/policy_chosen_logps": -144.83529663085938, "debug/policy_rejected_logits": -3.2323358058929443, "debug/policy_rejected_logps": -229.11749267578125, "debug/reference_chosen_logps": -183.30435180664062, "debug/reference_rejected_logps": -196.0800018310547, "epoch": 0.7121212121212122, "grad_norm": 4.231006222618645, "learning_rate": 1e-06, "logits/chosen": -3.3979074954986572, "logits/rejected": -3.2323358058929443, "logps/chosen": -144.83529663085938, "logps/rejected": -229.11749267578125, "loss": 0.2445, "rewards/accuracies": 0.875, "rewards/chosen": 0.38469067215919495, "rewards/margins": 0.715065598487854, "rewards/rejected": -0.33037489652633667, "step": 47 }, { "debug/policy_chosen_logits": -3.3939130306243896, "debug/policy_chosen_logps": -143.1968994140625, "debug/policy_rejected_logits": -3.189805507659912, "debug/policy_rejected_logps": -240.96237182617188, "debug/reference_chosen_logps": -178.94287109375, "debug/reference_rejected_logps": -197.04150390625, "epoch": 0.7272727272727273, "grad_norm": 3.471389410983741, "learning_rate": 1e-06, "logits/chosen": -3.3939130306243896, "logits/rejected": -3.189805507659912, "logps/chosen": -143.1968994140625, "logps/rejected": -240.96237182617188, "loss": 0.2116, "rewards/accuracies": 1.0, "rewards/chosen": 0.3574597239494324, "rewards/margins": 0.7966686487197876, "rewards/rejected": -0.43920889496803284, "step": 48 }, { "debug/policy_chosen_logits": -3.322723865509033, "debug/policy_chosen_logps": -175.99673461914062, "debug/policy_rejected_logits": -3.1501941680908203, "debug/policy_rejected_logps": -255.93283081054688, "debug/reference_chosen_logps": -214.01654052734375, "debug/reference_rejected_logps": -200.45352172851562, "epoch": 0.7424242424242424, "grad_norm": 3.246697882143599, "learning_rate": 1e-06, "logits/chosen": -3.322723865509033, "logits/rejected": -3.1501941680908203, "logps/chosen": -175.99673461914062, "logps/rejected": -255.93283081054688, "loss": 0.2028, "rewards/accuracies": 1.0, "rewards/chosen": 0.38019809126853943, "rewards/margins": 0.9349913597106934, "rewards/rejected": -0.5547932386398315, "step": 49 }, { "debug/policy_chosen_logits": -3.3696868419647217, "debug/policy_chosen_logps": -152.26644897460938, "debug/policy_rejected_logits": -3.2407031059265137, "debug/policy_rejected_logps": -216.9593505859375, "debug/reference_chosen_logps": -180.33203125, "debug/reference_rejected_logps": -189.9693603515625, "epoch": 0.7575757575757576, "grad_norm": 2.8372780096699377, "learning_rate": 1e-06, "logits/chosen": -3.3696868419647217, "logits/rejected": -3.2407031059265137, "logps/chosen": -152.26644897460938, "logps/rejected": -216.9593505859375, "loss": 0.2172, "rewards/accuracies": 0.875, "rewards/chosen": 0.28065598011016846, "rewards/margins": 0.550555944442749, "rewards/rejected": -0.26989996433258057, "step": 50 }, { "debug/policy_chosen_logits": -3.508178234100342, "debug/policy_chosen_logps": -141.80020141601562, "debug/policy_rejected_logits": -3.368643283843994, "debug/policy_rejected_logps": -213.55711364746094, "debug/reference_chosen_logps": -174.03500366210938, "debug/reference_rejected_logps": -191.1943359375, "epoch": 0.7727272727272727, "grad_norm": 2.861498023934926, "learning_rate": 1e-06, "logits/chosen": -3.508178234100342, "logits/rejected": -3.368643283843994, "logps/chosen": -141.80020141601562, "logps/rejected": -213.55711364746094, "loss": 0.2201, "rewards/accuracies": 0.875, "rewards/chosen": 0.32234811782836914, "rewards/margins": 0.5459758639335632, "rewards/rejected": -0.2236277461051941, "step": 51 }, { "debug/policy_chosen_logits": -3.3607146739959717, "debug/policy_chosen_logps": -142.0826416015625, "debug/policy_rejected_logits": -3.232203722000122, "debug/policy_rejected_logps": -206.24005126953125, "debug/reference_chosen_logps": -174.87051391601562, "debug/reference_rejected_logps": -195.29995727539062, "epoch": 0.7878787878787878, "grad_norm": 2.7004125968496786, "learning_rate": 1e-06, "logits/chosen": -3.3607146739959717, "logits/rejected": -3.232203722000122, "logps/chosen": -142.0826416015625, "logps/rejected": -206.24005126953125, "loss": 0.2126, "rewards/accuracies": 0.75, "rewards/chosen": 0.32787877321243286, "rewards/margins": 0.43727970123291016, "rewards/rejected": -0.10940094292163849, "step": 52 }, { "debug/policy_chosen_logits": -3.364624500274658, "debug/policy_chosen_logps": -149.24827575683594, "debug/policy_rejected_logits": -3.1964778900146484, "debug/policy_rejected_logps": -240.46517944335938, "debug/reference_chosen_logps": -178.32278442382812, "debug/reference_rejected_logps": -208.72308349609375, "epoch": 0.803030303030303, "grad_norm": 3.8062064544709258, "learning_rate": 1e-06, "logits/chosen": -3.364624500274658, "logits/rejected": -3.1964778900146484, "logps/chosen": -149.24827575683594, "logps/rejected": -240.46517944335938, "loss": 0.1199, "rewards/accuracies": 0.875, "rewards/chosen": 0.29074496030807495, "rewards/margins": 0.608165979385376, "rewards/rejected": -0.31742095947265625, "step": 53 }, { "debug/policy_chosen_logits": -3.3570804595947266, "debug/policy_chosen_logps": -162.79751586914062, "debug/policy_rejected_logits": -3.065300941467285, "debug/policy_rejected_logps": -243.37155151367188, "debug/reference_chosen_logps": -187.5218048095703, "debug/reference_rejected_logps": -200.8238067626953, "epoch": 0.8181818181818182, "grad_norm": 2.707670663836166, "learning_rate": 1e-06, "logits/chosen": -3.3570804595947266, "logits/rejected": -3.065300941467285, "logps/chosen": -162.79751586914062, "logps/rejected": -243.37155151367188, "loss": 0.1623, "rewards/accuracies": 0.875, "rewards/chosen": 0.2472427785396576, "rewards/margins": 0.672720193862915, "rewards/rejected": -0.42547738552093506, "step": 54 }, { "debug/policy_chosen_logits": -3.244579553604126, "debug/policy_chosen_logps": -176.91433715820312, "debug/policy_rejected_logits": -3.1410536766052246, "debug/policy_rejected_logps": -215.51986694335938, "debug/reference_chosen_logps": -206.104248046875, "debug/reference_rejected_logps": -194.65060424804688, "epoch": 0.8333333333333334, "grad_norm": 2.7474028289459964, "learning_rate": 1e-06, "logits/chosen": -3.244579553604126, "logits/rejected": -3.1410536766052246, "logps/chosen": -176.91433715820312, "logps/rejected": -215.51986694335938, "loss": 0.205, "rewards/accuracies": 0.875, "rewards/chosen": 0.2918991446495056, "rewards/margins": 0.5005917549133301, "rewards/rejected": -0.20869261026382446, "step": 55 }, { "debug/policy_chosen_logits": -3.334970235824585, "debug/policy_chosen_logps": -162.10260009765625, "debug/policy_rejected_logits": -3.1446890830993652, "debug/policy_rejected_logps": -219.34228515625, "debug/reference_chosen_logps": -191.03146362304688, "debug/reference_rejected_logps": -200.82135009765625, "epoch": 0.8484848484848485, "grad_norm": 2.224463162540291, "learning_rate": 1e-06, "logits/chosen": -3.334970235824585, "logits/rejected": -3.1446890830993652, "logps/chosen": -162.10260009765625, "logps/rejected": -219.34228515625, "loss": 0.1909, "rewards/accuracies": 0.75, "rewards/chosen": 0.2892886996269226, "rewards/margins": 0.47449809312820435, "rewards/rejected": -0.18520936369895935, "step": 56 }, { "debug/policy_chosen_logits": -3.3305227756500244, "debug/policy_chosen_logps": -161.7706298828125, "debug/policy_rejected_logits": -3.1771843433380127, "debug/policy_rejected_logps": -247.12884521484375, "debug/reference_chosen_logps": -195.26727294921875, "debug/reference_rejected_logps": -202.39918518066406, "epoch": 0.8636363636363636, "grad_norm": 2.8582681328128428, "learning_rate": 1e-06, "logits/chosen": -3.3305227756500244, "logits/rejected": -3.1771843433380127, "logps/chosen": -161.7706298828125, "logps/rejected": -247.12884521484375, "loss": 0.1866, "rewards/accuracies": 1.0, "rewards/chosen": 0.3349665403366089, "rewards/margins": 0.7822632193565369, "rewards/rejected": -0.4472966492176056, "step": 57 }, { "debug/policy_chosen_logits": -3.362928867340088, "debug/policy_chosen_logps": -159.98141479492188, "debug/policy_rejected_logits": -3.235872983932495, "debug/policy_rejected_logps": -231.48805236816406, "debug/reference_chosen_logps": -189.04058837890625, "debug/reference_rejected_logps": -201.61265563964844, "epoch": 0.8787878787878788, "grad_norm": 2.5581657382568643, "learning_rate": 1e-06, "logits/chosen": -3.362928867340088, "logits/rejected": -3.235872983932495, "logps/chosen": -159.98141479492188, "logps/rejected": -231.48805236816406, "loss": 0.1657, "rewards/accuracies": 1.0, "rewards/chosen": 0.290591835975647, "rewards/margins": 0.5893458127975464, "rewards/rejected": -0.2987539768218994, "step": 58 }, { "debug/policy_chosen_logits": -3.3519487380981445, "debug/policy_chosen_logps": -164.85804748535156, "debug/policy_rejected_logits": -3.295527458190918, "debug/policy_rejected_logps": -208.72828674316406, "debug/reference_chosen_logps": -194.98751831054688, "debug/reference_rejected_logps": -185.7240753173828, "epoch": 0.8939393939393939, "grad_norm": 2.771552302200925, "learning_rate": 1e-06, "logits/chosen": -3.3519487380981445, "logits/rejected": -3.295527458190918, "logps/chosen": -164.85804748535156, "logps/rejected": -208.72828674316406, "loss": 0.1484, "rewards/accuracies": 1.0, "rewards/chosen": 0.30129462480545044, "rewards/margins": 0.5313367247581482, "rewards/rejected": -0.23004208505153656, "step": 59 }, { "debug/policy_chosen_logits": -3.3647828102111816, "debug/policy_chosen_logps": -154.6871795654297, "debug/policy_rejected_logits": -3.216038703918457, "debug/policy_rejected_logps": -212.03436279296875, "debug/reference_chosen_logps": -180.7813720703125, "debug/reference_rejected_logps": -191.39483642578125, "epoch": 0.9090909090909091, "grad_norm": 2.2780886750287785, "learning_rate": 1e-06, "logits/chosen": -3.3647828102111816, "logits/rejected": -3.216038703918457, "logps/chosen": -154.6871795654297, "logps/rejected": -212.03436279296875, "loss": 0.2049, "rewards/accuracies": 0.875, "rewards/chosen": 0.26094186305999756, "rewards/margins": 0.46733713150024414, "rewards/rejected": -0.20639526844024658, "step": 60 }, { "debug/policy_chosen_logits": -3.3314597606658936, "debug/policy_chosen_logps": -185.95034790039062, "debug/policy_rejected_logits": -3.259791374206543, "debug/policy_rejected_logps": -197.266357421875, "debug/reference_chosen_logps": -213.84652709960938, "debug/reference_rejected_logps": -192.33790588378906, "epoch": 0.9242424242424242, "grad_norm": 2.531583968298835, "learning_rate": 1e-06, "logits/chosen": -3.3314597606658936, "logits/rejected": -3.259791374206543, "logps/chosen": -185.95034790039062, "logps/rejected": -197.266357421875, "loss": 0.1995, "rewards/accuracies": 0.875, "rewards/chosen": 0.2789619266986847, "rewards/margins": 0.32824641466140747, "rewards/rejected": -0.04928448051214218, "step": 61 }, { "debug/policy_chosen_logits": -3.2989447116851807, "debug/policy_chosen_logps": -156.58633422851562, "debug/policy_rejected_logits": -3.2626352310180664, "debug/policy_rejected_logps": -230.65228271484375, "debug/reference_chosen_logps": -190.15541076660156, "debug/reference_rejected_logps": -200.54278564453125, "epoch": 0.9393939393939394, "grad_norm": 2.870474029845961, "learning_rate": 1e-06, "logits/chosen": -3.2989447116851807, "logits/rejected": -3.2626352310180664, "logps/chosen": -156.58633422851562, "logps/rejected": -230.65228271484375, "loss": 0.1667, "rewards/accuracies": 1.0, "rewards/chosen": 0.335690975189209, "rewards/margins": 0.6367859244346619, "rewards/rejected": -0.30109497904777527, "step": 62 }, { "debug/policy_chosen_logits": -3.449002742767334, "debug/policy_chosen_logps": -146.80349731445312, "debug/policy_rejected_logits": -3.296942710876465, "debug/policy_rejected_logps": -192.1079864501953, "debug/reference_chosen_logps": -171.24142456054688, "debug/reference_rejected_logps": -188.4217529296875, "epoch": 0.9545454545454546, "grad_norm": 2.3229238868090785, "learning_rate": 1e-06, "logits/chosen": -3.449002742767334, "logits/rejected": -3.296942710876465, "logps/chosen": -146.80349731445312, "logps/rejected": -192.1079864501953, "loss": 0.2016, "rewards/accuracies": 0.75, "rewards/chosen": 0.2443791925907135, "rewards/margins": 0.28124135732650757, "rewards/rejected": -0.036862172186374664, "step": 63 }, { "debug/policy_chosen_logits": -3.3112168312072754, "debug/policy_chosen_logps": -145.07110595703125, "debug/policy_rejected_logits": -3.103217363357544, "debug/policy_rejected_logps": -254.41021728515625, "debug/reference_chosen_logps": -175.35618591308594, "debug/reference_rejected_logps": -202.21981811523438, "epoch": 0.9696969696969697, "grad_norm": 2.75901441970413, "learning_rate": 1e-06, "logits/chosen": -3.3112168312072754, "logits/rejected": -3.103217363357544, "logps/chosen": -145.07110595703125, "logps/rejected": -254.41021728515625, "loss": 0.207, "rewards/accuracies": 1.0, "rewards/chosen": 0.3028508126735687, "rewards/margins": 0.8247548341751099, "rewards/rejected": -0.5219039916992188, "step": 64 }, { "debug/policy_chosen_logits": -3.3073911666870117, "debug/policy_chosen_logps": -165.0496368408203, "debug/policy_rejected_logits": -3.2468745708465576, "debug/policy_rejected_logps": -212.7276153564453, "debug/reference_chosen_logps": -198.48748779296875, "debug/reference_rejected_logps": -190.78448486328125, "epoch": 0.9848484848484849, "grad_norm": 3.5148119940284666, "learning_rate": 1e-06, "logits/chosen": -3.3073911666870117, "logits/rejected": -3.2468745708465576, "logps/chosen": -165.0496368408203, "logps/rejected": -212.7276153564453, "loss": 0.1079, "rewards/accuracies": 0.875, "rewards/chosen": 0.33437857031822205, "rewards/margins": 0.5538097620010376, "rewards/rejected": -0.21943116188049316, "step": 65 }, { "debug/policy_chosen_logits": -3.316688060760498, "debug/policy_chosen_logps": -156.58001708984375, "debug/policy_rejected_logits": -3.1428608894348145, "debug/policy_rejected_logps": -250.894775390625, "debug/reference_chosen_logps": -189.58541870117188, "debug/reference_rejected_logps": -201.66758728027344, "epoch": 1.0, "grad_norm": 2.0811238439257576, "learning_rate": 1e-06, "logits/chosen": -3.316688060760498, "logits/rejected": -3.1428608894348145, "logps/chosen": -156.58001708984375, "logps/rejected": -250.894775390625, "loss": 0.1602, "rewards/accuracies": 1.0, "rewards/chosen": 0.3300541043281555, "rewards/margins": 0.8223260045051575, "rewards/rejected": -0.49227192997932434, "step": 66 }, { "epoch": 1.0, "step": 66, "total_flos": 0.0, "train_loss": 0.2294480740798242, "train_runtime": 195.0399, "train_samples_per_second": 21.406, "train_steps_per_second": 0.338 } ], "logging_steps": 1, "max_steps": 66, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }