{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 39, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -1.6231489181518555, "debug/policy_chosen_logps": -216.72048950195312, "debug/policy_rejected_logits": -1.7830783128738403, "debug/policy_rejected_logps": -218.3311004638672, "debug/reference_chosen_logps": -216.72048950195312, "debug/reference_rejected_logps": -218.3311004638672, "epoch": 0.02564102564102564, "grad_norm": 5.24784744524443, "learning_rate": 1e-06, "logits/chosen": -1.6231489181518555, "logits/rejected": -1.7830783128738403, "logps/chosen": -216.72048950195312, "logps/rejected": -218.3311004638672, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -1.8552749156951904, "debug/policy_chosen_logps": -238.62481689453125, "debug/policy_rejected_logits": -1.8094723224639893, "debug/policy_rejected_logps": -241.33612060546875, "debug/reference_chosen_logps": -238.8965301513672, "debug/reference_rejected_logps": -241.34713745117188, "epoch": 0.05128205128205128, "grad_norm": 4.903353330473321, "learning_rate": 1e-06, "logits/chosen": -1.8552749156951904, "logits/rejected": -1.8094723224639893, "logps/chosen": -238.62481689453125, "logps/rejected": -241.33612060546875, "loss": 0.5003, "rewards/accuracies": 0.625, "rewards/chosen": 0.0027172660920768976, "rewards/margins": 0.002607173752039671, "rewards/rejected": 0.00011009228182956576, "step": 2 }, { "debug/policy_chosen_logits": -1.9046015739440918, "debug/policy_chosen_logps": -221.9311981201172, "debug/policy_rejected_logits": -1.8472867012023926, "debug/policy_rejected_logps": -230.67025756835938, "debug/reference_chosen_logps": -222.35098266601562, "debug/reference_rejected_logps": -230.71707153320312, "epoch": 0.07692307692307693, "grad_norm": 5.503246384691659, "learning_rate": 1e-06, "logits/chosen": -1.9046015739440918, "logits/rejected": -1.8472867012023926, "logps/chosen": -221.9311981201172, "logps/rejected": -230.67025756835938, "loss": 0.5002, "rewards/accuracies": 0.875, "rewards/chosen": 0.004197959788143635, "rewards/margins": 0.003729667514562607, "rewards/rejected": 0.000468292273581028, "step": 3 }, { "debug/policy_chosen_logits": -1.7802233695983887, "debug/policy_chosen_logps": -223.10430908203125, "debug/policy_rejected_logits": -1.809967041015625, "debug/policy_rejected_logps": -244.6110076904297, "debug/reference_chosen_logps": -221.8607940673828, "debug/reference_rejected_logps": -243.65640258789062, "epoch": 0.10256410256410256, "grad_norm": 5.7079228521445655, "learning_rate": 1e-06, "logits/chosen": -1.7802233695983887, "logits/rejected": -1.809967041015625, "logps/chosen": -223.10430908203125, "logps/rejected": -244.6110076904297, "loss": 0.5005, "rewards/accuracies": 0.375, "rewards/chosen": -0.012435225769877434, "rewards/margins": -0.002889118157327175, "rewards/rejected": -0.009546108543872833, "step": 4 }, { "debug/policy_chosen_logits": -1.940010666847229, "debug/policy_chosen_logps": -226.70391845703125, "debug/policy_rejected_logits": -1.8683459758758545, "debug/policy_rejected_logps": -234.95077514648438, "debug/reference_chosen_logps": -226.436767578125, "debug/reference_rejected_logps": -234.722900390625, "epoch": 0.1282051282051282, "grad_norm": 5.067492903373418, "learning_rate": 1e-06, "logits/chosen": -1.940010666847229, "logits/rejected": -1.8683459758758545, "logps/chosen": -226.70391845703125, "logps/rejected": -234.95077514648438, "loss": 0.5007, "rewards/accuracies": 0.5, "rewards/chosen": -0.002671546768397093, "rewards/margins": -0.00039278031908906996, "rewards/rejected": -0.002278766594827175, "step": 5 }, { "debug/policy_chosen_logits": -2.0360376834869385, "debug/policy_chosen_logps": -212.69775390625, "debug/policy_rejected_logits": -2.021047592163086, "debug/policy_rejected_logps": -213.17205810546875, "debug/reference_chosen_logps": -212.8671417236328, "debug/reference_rejected_logps": -213.28619384765625, "epoch": 0.15384615384615385, "grad_norm": 5.41124503909242, "learning_rate": 1e-06, "logits/chosen": -2.0360376834869385, "logits/rejected": -2.021047592163086, "logps/chosen": -212.69775390625, "logps/rejected": -213.17205810546875, "loss": 0.4988, "rewards/accuracies": 0.75, "rewards/chosen": 0.0016938589978963137, "rewards/margins": 0.0005526728928089142, "rewards/rejected": 0.0011411856394261122, "step": 6 }, { "debug/policy_chosen_logits": -1.93068528175354, "debug/policy_chosen_logps": -216.78298950195312, "debug/policy_rejected_logits": -1.8857327699661255, "debug/policy_rejected_logps": -220.622314453125, "debug/reference_chosen_logps": -216.56871032714844, "debug/reference_rejected_logps": -220.99203491210938, "epoch": 0.1794871794871795, "grad_norm": 5.17099503731511, "learning_rate": 1e-06, "logits/chosen": -1.93068528175354, "logits/rejected": -1.8857327699661255, "logps/chosen": -216.78298950195312, "logps/rejected": -220.622314453125, "loss": 0.5003, "rewards/accuracies": 0.25, "rewards/chosen": -0.0021429634653031826, "rewards/margins": -0.00584003422409296, "rewards/rejected": 0.0036970707587897778, "step": 7 }, { "debug/policy_chosen_logits": -1.7973796129226685, "debug/policy_chosen_logps": -227.25387573242188, "debug/policy_rejected_logits": -1.9199271202087402, "debug/policy_rejected_logps": -210.955810546875, "debug/reference_chosen_logps": -227.18124389648438, "debug/reference_rejected_logps": -210.296142578125, "epoch": 0.20512820512820512, "grad_norm": 5.137980375498251, "learning_rate": 1e-06, "logits/chosen": -1.7973796129226685, "logits/rejected": -1.9199271202087402, "logps/chosen": -227.25387573242188, "logps/rejected": -210.955810546875, "loss": 0.4982, "rewards/accuracies": 0.875, "rewards/chosen": -0.0007261275313794613, "rewards/margins": 0.0058706095442175865, "rewards/rejected": -0.0065967366099357605, "step": 8 }, { "debug/policy_chosen_logits": -1.864752173423767, "debug/policy_chosen_logps": -213.74026489257812, "debug/policy_rejected_logits": -1.965905785560608, "debug/policy_rejected_logps": -206.05532836914062, "debug/reference_chosen_logps": -213.11065673828125, "debug/reference_rejected_logps": -205.26007080078125, "epoch": 0.23076923076923078, "grad_norm": 5.500505955132033, "learning_rate": 1e-06, "logits/chosen": -1.864752173423767, "logits/rejected": -1.965905785560608, "logps/chosen": -213.74026489257812, "logps/rejected": -206.05532836914062, "loss": 0.4991, "rewards/accuracies": 0.375, "rewards/chosen": -0.006295967381447554, "rewards/margins": 0.001656532404012978, "rewards/rejected": -0.007952500134706497, "step": 9 }, { "debug/policy_chosen_logits": -1.8028641939163208, "debug/policy_chosen_logps": -212.62191772460938, "debug/policy_rejected_logits": -1.7619720697402954, "debug/policy_rejected_logps": -224.09353637695312, "debug/reference_chosen_logps": -213.36245727539062, "debug/reference_rejected_logps": -225.09918212890625, "epoch": 0.2564102564102564, "grad_norm": 5.598797708277589, "learning_rate": 1e-06, "logits/chosen": -1.8028641939163208, "logits/rejected": -1.7619720697402954, "logps/chosen": -212.62191772460938, "logps/rejected": -224.09353637695312, "loss": 0.5006, "rewards/accuracies": 0.375, "rewards/chosen": 0.007405414246022701, "rewards/margins": -0.002651119139045477, "rewards/rejected": 0.010056533850729465, "step": 10 }, { "debug/policy_chosen_logits": -1.7873791456222534, "debug/policy_chosen_logps": -235.42039489746094, "debug/policy_rejected_logits": -1.7289907932281494, "debug/policy_rejected_logps": -256.24310302734375, "debug/reference_chosen_logps": -235.70285034179688, "debug/reference_rejected_logps": -256.4640808105469, "epoch": 0.28205128205128205, "grad_norm": 5.121711768916106, "learning_rate": 1e-06, "logits/chosen": -1.7873791456222534, "logits/rejected": -1.7289907932281494, "logps/chosen": -235.42039489746094, "logps/rejected": -256.24310302734375, "loss": 0.4969, "rewards/accuracies": 0.625, "rewards/chosen": 0.0028246305882930756, "rewards/margins": 0.0006149486871436238, "rewards/rejected": 0.0022096820175647736, "step": 11 }, { "debug/policy_chosen_logits": -1.8599488735198975, "debug/policy_chosen_logps": -235.88316345214844, "debug/policy_rejected_logits": -1.771195411682129, "debug/policy_rejected_logps": -249.5576934814453, "debug/reference_chosen_logps": -235.88832092285156, "debug/reference_rejected_logps": -249.1665802001953, "epoch": 0.3076923076923077, "grad_norm": 5.641288741238012, "learning_rate": 1e-06, "logits/chosen": -1.8599488735198975, "logits/rejected": -1.771195411682129, "logps/chosen": -235.88316345214844, "logps/rejected": -249.5576934814453, "loss": 0.4987, "rewards/accuracies": 0.625, "rewards/chosen": 5.1631766837090254e-05, "rewards/margins": 0.003962859511375427, "rewards/rejected": -0.003911228384822607, "step": 12 }, { "debug/policy_chosen_logits": -1.7387449741363525, "debug/policy_chosen_logps": -285.1178894042969, "debug/policy_rejected_logits": -1.7257730960845947, "debug/policy_rejected_logps": -238.3681640625, "debug/reference_chosen_logps": -284.8379821777344, "debug/reference_rejected_logps": -238.81504821777344, "epoch": 0.3333333333333333, "grad_norm": 5.561737325106727, "learning_rate": 1e-06, "logits/chosen": -1.7387449741363525, "logits/rejected": -1.7257730960845947, "logps/chosen": -285.1178894042969, "logps/rejected": -238.3681640625, "loss": 0.4999, "rewards/accuracies": 0.25, "rewards/chosen": -0.0027993388939648867, "rewards/margins": -0.007268181070685387, "rewards/rejected": 0.004468841943889856, "step": 13 }, { "debug/policy_chosen_logits": -1.681726098060608, "debug/policy_chosen_logps": -231.71658325195312, "debug/policy_rejected_logits": -1.6202036142349243, "debug/policy_rejected_logps": -219.25909423828125, "debug/reference_chosen_logps": -231.42608642578125, "debug/reference_rejected_logps": -218.79884338378906, "epoch": 0.358974358974359, "grad_norm": 5.232957939676945, "learning_rate": 1e-06, "logits/chosen": -1.681726098060608, "logits/rejected": -1.6202036142349243, "logps/chosen": -231.71658325195312, "logps/rejected": -219.25909423828125, "loss": 0.501, "rewards/accuracies": 0.625, "rewards/chosen": -0.002904757857322693, "rewards/margins": 0.0016978075727820396, "rewards/rejected": -0.00460256589576602, "step": 14 }, { "debug/policy_chosen_logits": -1.861380696296692, "debug/policy_chosen_logps": -224.05007934570312, "debug/policy_rejected_logits": -1.9202995300292969, "debug/policy_rejected_logps": -215.3521728515625, "debug/reference_chosen_logps": -223.88458251953125, "debug/reference_rejected_logps": -214.3697509765625, "epoch": 0.38461538461538464, "grad_norm": 5.390732710920602, "learning_rate": 1e-06, "logits/chosen": -1.861380696296692, "logits/rejected": -1.9202995300292969, "logps/chosen": -224.05007934570312, "logps/rejected": -215.3521728515625, "loss": 0.498, "rewards/accuracies": 0.625, "rewards/chosen": -0.0016548539279028773, "rewards/margins": 0.008169345557689667, "rewards/rejected": -0.009824199602007866, "step": 15 }, { "debug/policy_chosen_logits": -1.979051947593689, "debug/policy_chosen_logps": -224.90777587890625, "debug/policy_rejected_logits": -1.8672553300857544, "debug/policy_rejected_logps": -223.85275268554688, "debug/reference_chosen_logps": -225.060302734375, "debug/reference_rejected_logps": -224.13758850097656, "epoch": 0.41025641025641024, "grad_norm": 5.501909488929655, "learning_rate": 1e-06, "logits/chosen": -1.979051947593689, "logits/rejected": -1.8672553300857544, "logps/chosen": -224.90777587890625, "logps/rejected": -223.85275268554688, "loss": 0.4996, "rewards/accuracies": 0.5, "rewards/chosen": 0.001525268773548305, "rewards/margins": -0.001322993659414351, "rewards/rejected": 0.0028482628986239433, "step": 16 }, { "debug/policy_chosen_logits": -1.9620063304901123, "debug/policy_chosen_logps": -218.52392578125, "debug/policy_rejected_logits": -1.88973069190979, "debug/policy_rejected_logps": -224.32699584960938, "debug/reference_chosen_logps": -219.12481689453125, "debug/reference_rejected_logps": -225.84568786621094, "epoch": 0.4358974358974359, "grad_norm": 5.762923323191231, "learning_rate": 1e-06, "logits/chosen": -1.9620063304901123, "logits/rejected": -1.88973069190979, "logps/chosen": -218.52392578125, "logps/rejected": -224.32699584960938, "loss": 0.5035, "rewards/accuracies": 0.25, "rewards/chosen": 0.006008872762322426, "rewards/margins": -0.009178085252642632, "rewards/rejected": 0.015186958014965057, "step": 17 }, { "debug/policy_chosen_logits": -1.965958595275879, "debug/policy_chosen_logps": -205.58248901367188, "debug/policy_rejected_logits": -1.9452952146530151, "debug/policy_rejected_logps": -245.25881958007812, "debug/reference_chosen_logps": -205.2749481201172, "debug/reference_rejected_logps": -245.52975463867188, "epoch": 0.46153846153846156, "grad_norm": 5.445181305136342, "learning_rate": 1e-06, "logits/chosen": -1.965958595275879, "logits/rejected": -1.9452952146530151, "logps/chosen": -205.58248901367188, "logps/rejected": -245.25881958007812, "loss": 0.5016, "rewards/accuracies": 0.25, "rewards/chosen": -0.003075389890000224, "rewards/margins": -0.005784777924418449, "rewards/rejected": 0.0027093887329101562, "step": 18 }, { "debug/policy_chosen_logits": -1.8928056955337524, "debug/policy_chosen_logps": -230.90948486328125, "debug/policy_rejected_logits": -1.9355077743530273, "debug/policy_rejected_logps": -241.87542724609375, "debug/reference_chosen_logps": -231.4395751953125, "debug/reference_rejected_logps": -242.0914306640625, "epoch": 0.48717948717948717, "grad_norm": 5.459735471483051, "learning_rate": 1e-06, "logits/chosen": -1.8928056955337524, "logits/rejected": -1.9355077743530273, "logps/chosen": -230.90948486328125, "logps/rejected": -241.87542724609375, "loss": 0.5011, "rewards/accuracies": 0.5, "rewards/chosen": 0.005300826858729124, "rewards/margins": 0.0031407931819558144, "rewards/rejected": 0.0021600339096039534, "step": 19 }, { "debug/policy_chosen_logits": -1.7778483629226685, "debug/policy_chosen_logps": -240.5557861328125, "debug/policy_rejected_logits": -1.786712408065796, "debug/policy_rejected_logps": -226.53598022460938, "debug/reference_chosen_logps": -240.55596923828125, "debug/reference_rejected_logps": -226.1593017578125, "epoch": 0.5128205128205128, "grad_norm": 5.415236952737061, "learning_rate": 1e-06, "logits/chosen": -1.7778483629226685, "logits/rejected": -1.786712408065796, "logps/chosen": -240.5557861328125, "logps/rejected": -226.53598022460938, "loss": 0.4991, "rewards/accuracies": 0.625, "rewards/chosen": 1.79302878677845e-06, "rewards/margins": 0.003768615424633026, "rewards/rejected": -0.0037668226286768913, "step": 20 }, { "debug/policy_chosen_logits": -1.8529176712036133, "debug/policy_chosen_logps": -219.3668975830078, "debug/policy_rejected_logits": -1.7754184007644653, "debug/policy_rejected_logps": -243.0176239013672, "debug/reference_chosen_logps": -218.1999053955078, "debug/reference_rejected_logps": -241.68991088867188, "epoch": 0.5384615384615384, "grad_norm": 5.792699083134304, "learning_rate": 1e-06, "logits/chosen": -1.8529176712036133, "logits/rejected": -1.7754184007644653, "logps/chosen": -219.3668975830078, "logps/rejected": -243.0176239013672, "loss": 0.5004, "rewards/accuracies": 0.375, "rewards/chosen": -0.011669883504509926, "rewards/margins": 0.001607151236385107, "rewards/rejected": -0.013277034275233746, "step": 21 }, { "debug/policy_chosen_logits": -1.8949838876724243, "debug/policy_chosen_logps": -227.61126708984375, "debug/policy_rejected_logits": -1.820349097251892, "debug/policy_rejected_logps": -225.1505584716797, "debug/reference_chosen_logps": -227.88690185546875, "debug/reference_rejected_logps": -225.42080688476562, "epoch": 0.5641025641025641, "grad_norm": 5.378339843106799, "learning_rate": 1e-06, "logits/chosen": -1.8949838876724243, "logits/rejected": -1.820349097251892, "logps/chosen": -227.61126708984375, "logps/rejected": -225.1505584716797, "loss": 0.4998, "rewards/accuracies": 0.5, "rewards/chosen": 0.0027565001510083675, "rewards/margins": 5.413009785115719e-05, "rewards/rejected": 0.0027023698203265667, "step": 22 }, { "debug/policy_chosen_logits": -2.001783609390259, "debug/policy_chosen_logps": -226.62612915039062, "debug/policy_rejected_logits": -1.9064666032791138, "debug/policy_rejected_logps": -204.798828125, "debug/reference_chosen_logps": -228.11544799804688, "debug/reference_rejected_logps": -206.84844970703125, "epoch": 0.5897435897435898, "grad_norm": 5.577141438233455, "learning_rate": 1e-06, "logits/chosen": -2.001783609390259, "logits/rejected": -1.9064666032791138, "logps/chosen": -226.62612915039062, "logps/rejected": -204.798828125, "loss": 0.5006, "rewards/accuracies": 0.375, "rewards/chosen": 0.01489314902573824, "rewards/margins": -0.005602874793112278, "rewards/rejected": 0.020496025681495667, "step": 23 }, { "debug/policy_chosen_logits": -1.8187044858932495, "debug/policy_chosen_logps": -215.1470489501953, "debug/policy_rejected_logits": -1.8203312158584595, "debug/policy_rejected_logps": -224.79830932617188, "debug/reference_chosen_logps": -215.2783203125, "debug/reference_rejected_logps": -224.73924255371094, "epoch": 0.6153846153846154, "grad_norm": 5.562297665817565, "learning_rate": 1e-06, "logits/chosen": -1.8187044858932495, "logits/rejected": -1.8203312158584595, "logps/chosen": -215.1470489501953, "logps/rejected": -224.79830932617188, "loss": 0.4979, "rewards/accuracies": 0.5, "rewards/chosen": 0.0013126945123076439, "rewards/margins": 0.0019034575670957565, "rewards/rejected": -0.0005907632876187563, "step": 24 }, { "debug/policy_chosen_logits": -1.9102095365524292, "debug/policy_chosen_logps": -202.33090209960938, "debug/policy_rejected_logits": -1.8565008640289307, "debug/policy_rejected_logps": -219.95408630371094, "debug/reference_chosen_logps": -203.3621826171875, "debug/reference_rejected_logps": -219.01568603515625, "epoch": 0.6410256410256411, "grad_norm": 5.1175378799566165, "learning_rate": 1e-06, "logits/chosen": -1.9102095365524292, "logits/rejected": -1.8565008640289307, "logps/chosen": -202.33090209960938, "logps/rejected": -219.95408630371094, "loss": 0.4998, "rewards/accuracies": 0.875, "rewards/chosen": 0.010312843136489391, "rewards/margins": 0.019696807488799095, "rewards/rejected": -0.009383964352309704, "step": 25 }, { "debug/policy_chosen_logits": -1.7438678741455078, "debug/policy_chosen_logps": -235.56057739257812, "debug/policy_rejected_logits": -1.9471222162246704, "debug/policy_rejected_logps": -219.12709045410156, "debug/reference_chosen_logps": -236.11581420898438, "debug/reference_rejected_logps": -219.77798461914062, "epoch": 0.6666666666666666, "grad_norm": 5.822923438646737, "learning_rate": 1e-06, "logits/chosen": -1.7438678741455078, "logits/rejected": -1.9471222162246704, "logps/chosen": -235.56057739257812, "logps/rejected": -219.12709045410156, "loss": 0.4994, "rewards/accuracies": 0.5, "rewards/chosen": 0.005552348680794239, "rewards/margins": -0.0009564775973558426, "rewards/rejected": 0.006508826278150082, "step": 26 }, { "debug/policy_chosen_logits": -2.078953981399536, "debug/policy_chosen_logps": -219.88027954101562, "debug/policy_rejected_logits": -1.8341658115386963, "debug/policy_rejected_logps": -230.7357177734375, "debug/reference_chosen_logps": -219.43035888671875, "debug/reference_rejected_logps": -229.518798828125, "epoch": 0.6923076923076923, "grad_norm": 5.319213927338628, "learning_rate": 1e-06, "logits/chosen": -2.078953981399536, "logits/rejected": -1.8341658115386963, "logps/chosen": -219.88027954101562, "logps/rejected": -230.7357177734375, "loss": 0.5, "rewards/accuracies": 0.75, "rewards/chosen": -0.0044991872273385525, "rewards/margins": 0.007670154795050621, "rewards/rejected": -0.012169341556727886, "step": 27 }, { "debug/policy_chosen_logits": -1.9813232421875, "debug/policy_chosen_logps": -214.16021728515625, "debug/policy_rejected_logits": -1.8636360168457031, "debug/policy_rejected_logps": -220.3545684814453, "debug/reference_chosen_logps": -212.45709228515625, "debug/reference_rejected_logps": -218.9035186767578, "epoch": 0.717948717948718, "grad_norm": 5.582948165601855, "learning_rate": 1e-06, "logits/chosen": -1.9813232421875, "logits/rejected": -1.8636360168457031, "logps/chosen": -214.16021728515625, "logps/rejected": -220.3545684814453, "loss": 0.4988, "rewards/accuracies": 0.5, "rewards/chosen": -0.017031308263540268, "rewards/margins": -0.002520828042179346, "rewards/rejected": -0.01451047882437706, "step": 28 }, { "debug/policy_chosen_logits": -1.790213942527771, "debug/policy_chosen_logps": -244.75131225585938, "debug/policy_rejected_logits": -1.7480782270431519, "debug/policy_rejected_logps": -245.99044799804688, "debug/reference_chosen_logps": -243.73739624023438, "debug/reference_rejected_logps": -245.6822052001953, "epoch": 0.7435897435897436, "grad_norm": 6.4286661209691545, "learning_rate": 1e-06, "logits/chosen": -1.790213942527771, "logits/rejected": -1.7480782270431519, "logps/chosen": -244.75131225585938, "logps/rejected": -245.99044799804688, "loss": 0.5026, "rewards/accuracies": 0.25, "rewards/chosen": -0.010139026679098606, "rewards/margins": -0.007056655362248421, "rewards/rejected": -0.003082370851188898, "step": 29 }, { "debug/policy_chosen_logits": -1.7824077606201172, "debug/policy_chosen_logps": -231.88729858398438, "debug/policy_rejected_logits": -1.7534245252609253, "debug/policy_rejected_logps": -235.18899536132812, "debug/reference_chosen_logps": -229.94952392578125, "debug/reference_rejected_logps": -233.69874572753906, "epoch": 0.7692307692307693, "grad_norm": 6.850079704702285, "learning_rate": 1e-06, "logits/chosen": -1.7824077606201172, "logits/rejected": -1.7534245252609253, "logps/chosen": -231.88729858398438, "logps/rejected": -235.18899536132812, "loss": 0.5034, "rewards/accuracies": 0.25, "rewards/chosen": -0.019377898424863815, "rewards/margins": -0.004475403111428022, "rewards/rejected": -0.01490249577909708, "step": 30 }, { "debug/policy_chosen_logits": -1.9378036260604858, "debug/policy_chosen_logps": -225.71983337402344, "debug/policy_rejected_logits": -2.006803274154663, "debug/policy_rejected_logps": -199.27195739746094, "debug/reference_chosen_logps": -226.28076171875, "debug/reference_rejected_logps": -199.43853759765625, "epoch": 0.7948717948717948, "grad_norm": 5.961594287287365, "learning_rate": 1e-06, "logits/chosen": -1.9378036260604858, "logits/rejected": -2.006803274154663, "logps/chosen": -225.71983337402344, "logps/rejected": -199.27195739746094, "loss": 0.5022, "rewards/accuracies": 0.5, "rewards/chosen": 0.005609264597296715, "rewards/margins": 0.003943481482565403, "rewards/rejected": 0.0016657828819006681, "step": 31 }, { "debug/policy_chosen_logits": -1.8526768684387207, "debug/policy_chosen_logps": -223.19715881347656, "debug/policy_rejected_logits": -1.675136923789978, "debug/policy_rejected_logps": -227.8362579345703, "debug/reference_chosen_logps": -225.1814727783203, "debug/reference_rejected_logps": -229.39248657226562, "epoch": 0.8205128205128205, "grad_norm": 6.901356204686462, "learning_rate": 1e-06, "logits/chosen": -1.8526768684387207, "logits/rejected": -1.675136923789978, "logps/chosen": -223.19715881347656, "logps/rejected": -227.8362579345703, "loss": 0.4958, "rewards/accuracies": 0.5, "rewards/chosen": 0.019843177869915962, "rewards/margins": 0.00428071990609169, "rewards/rejected": 0.015562457963824272, "step": 32 }, { "debug/policy_chosen_logits": -1.8851096630096436, "debug/policy_chosen_logps": -215.10391235351562, "debug/policy_rejected_logits": -1.9044780731201172, "debug/policy_rejected_logps": -215.65618896484375, "debug/reference_chosen_logps": -215.9468994140625, "debug/reference_rejected_logps": -216.65481567382812, "epoch": 0.8461538461538461, "grad_norm": 5.491792264330514, "learning_rate": 1e-06, "logits/chosen": -1.8851096630096436, "logits/rejected": -1.9044780731201172, "logps/chosen": -215.10391235351562, "logps/rejected": -215.65618896484375, "loss": 0.4989, "rewards/accuracies": 0.375, "rewards/chosen": 0.00842985138297081, "rewards/margins": -0.00155616772826761, "rewards/rejected": 0.009986018761992455, "step": 33 }, { "debug/policy_chosen_logits": -1.8586148023605347, "debug/policy_chosen_logps": -208.19105529785156, "debug/policy_rejected_logits": -1.8968983888626099, "debug/policy_rejected_logps": -210.92306518554688, "debug/reference_chosen_logps": -209.490234375, "debug/reference_rejected_logps": -212.6377716064453, "epoch": 0.8717948717948718, "grad_norm": 5.509885066438828, "learning_rate": 1e-06, "logits/chosen": -1.8586148023605347, "logits/rejected": -1.8968983888626099, "logps/chosen": -208.19105529785156, "logps/rejected": -210.92306518554688, "loss": 0.4999, "rewards/accuracies": 0.5, "rewards/chosen": 0.012991733849048615, "rewards/margins": -0.004155273083597422, "rewards/rejected": 0.01714700646698475, "step": 34 }, { "debug/policy_chosen_logits": -1.859360694885254, "debug/policy_chosen_logps": -237.1796875, "debug/policy_rejected_logits": -1.7760928869247437, "debug/policy_rejected_logps": -213.73178100585938, "debug/reference_chosen_logps": -236.4244384765625, "debug/reference_rejected_logps": -212.8974609375, "epoch": 0.8974358974358975, "grad_norm": 5.143897582388409, "learning_rate": 1e-06, "logits/chosen": -1.859360694885254, "logits/rejected": -1.7760928869247437, "logps/chosen": -237.1796875, "logps/rejected": -213.73178100585938, "loss": 0.4983, "rewards/accuracies": 0.5, "rewards/chosen": -0.007552489638328552, "rewards/margins": 0.0007905767997726798, "rewards/rejected": -0.008343067020177841, "step": 35 }, { "debug/policy_chosen_logits": -1.6780697107315063, "debug/policy_chosen_logps": -204.98974609375, "debug/policy_rejected_logits": -1.738698124885559, "debug/policy_rejected_logps": -227.52330017089844, "debug/reference_chosen_logps": -205.32254028320312, "debug/reference_rejected_logps": -226.46026611328125, "epoch": 0.9230769230769231, "grad_norm": 5.773414088282702, "learning_rate": 1e-06, "logits/chosen": -1.6780697107315063, "logits/rejected": -1.738698124885559, "logps/chosen": -204.98974609375, "logps/rejected": -227.52330017089844, "loss": 0.497, "rewards/accuracies": 0.75, "rewards/chosen": 0.0033279801718890667, "rewards/margins": 0.013958434574306011, "rewards/rejected": -0.010630454868078232, "step": 36 }, { "debug/policy_chosen_logits": -1.6102544069290161, "debug/policy_chosen_logps": -214.78628540039062, "debug/policy_rejected_logits": -1.539096474647522, "debug/policy_rejected_logps": -229.92982482910156, "debug/reference_chosen_logps": -214.2623748779297, "debug/reference_rejected_logps": -228.9617919921875, "epoch": 0.9487179487179487, "grad_norm": 5.759655599429418, "learning_rate": 1e-06, "logits/chosen": -1.6102544069290161, "logits/rejected": -1.539096474647522, "logps/chosen": -214.78628540039062, "logps/rejected": -229.92982482910156, "loss": 0.4982, "rewards/accuracies": 0.625, "rewards/chosen": -0.005239181220531464, "rewards/margins": 0.004441184923052788, "rewards/rejected": -0.009680366143584251, "step": 37 }, { "debug/policy_chosen_logits": -1.7854262590408325, "debug/policy_chosen_logps": -226.05897521972656, "debug/policy_rejected_logits": -1.7675358057022095, "debug/policy_rejected_logps": -239.63931274414062, "debug/reference_chosen_logps": -225.51510620117188, "debug/reference_rejected_logps": -238.988037109375, "epoch": 0.9743589743589743, "grad_norm": 5.9701994847144295, "learning_rate": 1e-06, "logits/chosen": -1.7854262590408325, "logits/rejected": -1.7675358057022095, "logps/chosen": -226.05897521972656, "logps/rejected": -239.63931274414062, "loss": 0.4979, "rewards/accuracies": 0.625, "rewards/chosen": -0.005438614170998335, "rewards/margins": 0.0010740852449089289, "rewards/rejected": -0.00651269918307662, "step": 38 }, { "debug/policy_chosen_logits": -1.808987021446228, "debug/policy_chosen_logps": -211.41786193847656, "debug/policy_rejected_logits": -1.7692675590515137, "debug/policy_rejected_logps": -222.66329956054688, "debug/reference_chosen_logps": -212.28707885742188, "debug/reference_rejected_logps": -221.61166381835938, "epoch": 1.0, "grad_norm": 6.454924667492722, "learning_rate": 1e-06, "logits/chosen": -1.808987021446228, "logits/rejected": -1.7692675590515137, "logps/chosen": -211.41786193847656, "logps/rejected": -222.66329956054688, "loss": 0.473, "rewards/accuracies": 0.5, "rewards/chosen": 0.008692149072885513, "rewards/margins": 0.01920858398079872, "rewards/rejected": -0.010516433045268059, "step": 39 }, { "epoch": 1.0, "step": 39, "total_flos": 0.0, "train_loss": 0.4990231822698544, "train_runtime": 145.1235, "train_samples_per_second": 16.896, "train_steps_per_second": 0.269 } ], "logging_steps": 1, "max_steps": 39, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }