{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 36, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -3.389188766479492, "debug/policy_chosen_logps": -130.53399658203125, "debug/policy_rejected_logits": -3.369499444961548, "debug/policy_rejected_logps": -137.01986694335938, "debug/reference_chosen_logps": -130.53399658203125, "debug/reference_rejected_logps": -137.01986694335938, "epoch": 0.027777777777777776, "grad_norm": 3.5241658881341182, "learning_rate": 1e-06, "logits/chosen": -3.389188766479492, "logits/rejected": -3.369499444961548, "logps/chosen": -130.53399658203125, "logps/rejected": -137.01986694335938, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -3.4248387813568115, "debug/policy_chosen_logps": -126.08409118652344, "debug/policy_rejected_logits": -3.4477694034576416, "debug/policy_rejected_logps": -134.35662841796875, "debug/reference_chosen_logps": -125.94849395751953, "debug/reference_rejected_logps": -134.22088623046875, "epoch": 0.05555555555555555, "grad_norm": 3.538485770522604, "learning_rate": 1e-06, "logits/chosen": -3.4248387813568115, "logits/rejected": -3.4477694034576416, "logps/chosen": -126.08409118652344, "logps/rejected": -134.35662841796875, "loss": 0.5005, "rewards/accuracies": 0.625, "rewards/chosen": -0.0013558960054069757, "rewards/margins": 1.4685792848467827e-06, "rewards/rejected": -0.0013573648175224662, "step": 2 }, { "debug/policy_chosen_logits": -3.408137321472168, "debug/policy_chosen_logps": -135.76004028320312, "debug/policy_rejected_logits": -3.3996260166168213, "debug/policy_rejected_logps": -146.23269653320312, "debug/reference_chosen_logps": -135.80258178710938, "debug/reference_rejected_logps": -146.20411682128906, "epoch": 0.08333333333333333, "grad_norm": 3.634448059369767, "learning_rate": 1e-06, "logits/chosen": -3.408137321472168, "logits/rejected": -3.3996260166168213, "logps/chosen": -135.76004028320312, "logps/rejected": -146.23269653320312, "loss": 0.5004, "rewards/accuracies": 0.625, "rewards/chosen": 0.0004253291408531368, "rewards/margins": 0.0007111645536497235, "rewards/rejected": -0.00028583526727743447, "step": 3 }, { "debug/policy_chosen_logits": -3.3224852085113525, "debug/policy_chosen_logps": -147.47596740722656, "debug/policy_rejected_logits": -3.302865505218506, "debug/policy_rejected_logps": -139.837646484375, "debug/reference_chosen_logps": -147.6927032470703, "debug/reference_rejected_logps": -139.80435180664062, "epoch": 0.1111111111111111, "grad_norm": 3.750967977221286, "learning_rate": 1e-06, "logits/chosen": -3.3224852085113525, "logits/rejected": -3.302865505218506, "logps/chosen": -147.47596740722656, "logps/rejected": -139.837646484375, "loss": 0.4997, "rewards/accuracies": 0.625, "rewards/chosen": 0.0021673012524843216, "rewards/margins": 0.0025002670008689165, "rewards/rejected": -0.0003329657483845949, "step": 4 }, { "debug/policy_chosen_logits": -3.2819766998291016, "debug/policy_chosen_logps": -155.7947998046875, "debug/policy_rejected_logits": -3.2471601963043213, "debug/policy_rejected_logps": -147.0779266357422, "debug/reference_chosen_logps": -155.66995239257812, "debug/reference_rejected_logps": -147.07655334472656, "epoch": 0.1388888888888889, "grad_norm": 3.489411841690947, "learning_rate": 1e-06, "logits/chosen": -3.2819766998291016, "logits/rejected": -3.2471601963043213, "logps/chosen": -155.7947998046875, "logps/rejected": -147.0779266357422, "loss": 0.5002, "rewards/accuracies": 0.375, "rewards/chosen": -0.0012485121842473745, "rewards/margins": -0.0012347220908850431, "rewards/rejected": -1.3790151569992304e-05, "step": 5 }, { "debug/policy_chosen_logits": -3.2803831100463867, "debug/policy_chosen_logps": -134.0931396484375, "debug/policy_rejected_logits": -3.3686742782592773, "debug/policy_rejected_logps": -170.6842498779297, "debug/reference_chosen_logps": -134.14315795898438, "debug/reference_rejected_logps": -170.57257080078125, "epoch": 0.16666666666666666, "grad_norm": 3.6447957487493845, "learning_rate": 1e-06, "logits/chosen": -3.2803831100463867, "logits/rejected": -3.3686742782592773, "logps/chosen": -134.0931396484375, "logps/rejected": -170.6842498779297, "loss": 0.4994, "rewards/accuracies": 0.5, "rewards/chosen": 0.0005001830868422985, "rewards/margins": 0.0016169355949386954, "rewards/rejected": -0.0011167526245117188, "step": 6 }, { "debug/policy_chosen_logits": -3.2792463302612305, "debug/policy_chosen_logps": -134.1569366455078, "debug/policy_rejected_logits": -3.3762855529785156, "debug/policy_rejected_logps": -150.0790557861328, "debug/reference_chosen_logps": -133.72824096679688, "debug/reference_rejected_logps": -149.54525756835938, "epoch": 0.19444444444444445, "grad_norm": 3.7395286226939346, "learning_rate": 1e-06, "logits/chosen": -3.2792463302612305, "logits/rejected": -3.3762855529785156, "logps/chosen": -134.1569366455078, "logps/rejected": -150.0790557861328, "loss": 0.4999, "rewards/accuracies": 0.5, "rewards/chosen": -0.004286966286599636, "rewards/margins": 0.0010509965941309929, "rewards/rejected": -0.005337962880730629, "step": 7 }, { "debug/policy_chosen_logits": -3.3443524837493896, "debug/policy_chosen_logps": -134.28506469726562, "debug/policy_rejected_logits": -3.3893535137176514, "debug/policy_rejected_logps": -138.88333129882812, "debug/reference_chosen_logps": -134.07891845703125, "debug/reference_rejected_logps": -138.52908325195312, "epoch": 0.2222222222222222, "grad_norm": 3.721075864812284, "learning_rate": 1e-06, "logits/chosen": -3.3443524837493896, "logits/rejected": -3.3893535137176514, "logps/chosen": -134.28506469726562, "logps/rejected": -138.88333129882812, "loss": 0.4999, "rewards/accuracies": 0.625, "rewards/chosen": -0.002061443403363228, "rewards/margins": 0.001481065759435296, "rewards/rejected": -0.0035425089299678802, "step": 8 }, { "debug/policy_chosen_logits": -3.177027702331543, "debug/policy_chosen_logps": -148.52394104003906, "debug/policy_rejected_logits": -3.250603675842285, "debug/policy_rejected_logps": -136.55433654785156, "debug/reference_chosen_logps": -148.66184997558594, "debug/reference_rejected_logps": -136.14984130859375, "epoch": 0.25, "grad_norm": 3.618238309269385, "learning_rate": 1e-06, "logits/chosen": -3.177027702331543, "logits/rejected": -3.250603675842285, "logps/chosen": -148.52394104003906, "logps/rejected": -136.55433654785156, "loss": 0.4983, "rewards/accuracies": 0.875, "rewards/chosen": 0.0013790606753900647, "rewards/margins": 0.005423860624432564, "rewards/rejected": -0.004044800065457821, "step": 9 }, { "debug/policy_chosen_logits": -3.383765935897827, "debug/policy_chosen_logps": -130.89303588867188, "debug/policy_rejected_logits": -3.3352606296539307, "debug/policy_rejected_logps": -140.389404296875, "debug/reference_chosen_logps": -130.9444580078125, "debug/reference_rejected_logps": -140.4166717529297, "epoch": 0.2777777777777778, "grad_norm": 3.581228093909178, "learning_rate": 1e-06, "logits/chosen": -3.383765935897827, "logits/rejected": -3.3352606296539307, "logps/chosen": -130.89303588867188, "logps/rejected": -140.389404296875, "loss": 0.4996, "rewards/accuracies": 0.375, "rewards/chosen": 0.0005142114823684096, "rewards/margins": 0.00024154642596840858, "rewards/rejected": 0.00027266499819234014, "step": 10 }, { "debug/policy_chosen_logits": -3.3894593715667725, "debug/policy_chosen_logps": -143.8419189453125, "debug/policy_rejected_logits": -3.4514429569244385, "debug/policy_rejected_logps": -137.72750854492188, "debug/reference_chosen_logps": -143.99107360839844, "debug/reference_rejected_logps": -137.8389129638672, "epoch": 0.3055555555555556, "grad_norm": 3.662837343230959, "learning_rate": 1e-06, "logits/chosen": -3.3894593715667725, "logits/rejected": -3.4514429569244385, "logps/chosen": -143.8419189453125, "logps/rejected": -137.72750854492188, "loss": 0.4999, "rewards/accuracies": 0.5, "rewards/chosen": 0.0014914990169927478, "rewards/margins": 0.0003772352356463671, "rewards/rejected": 0.0011142636649310589, "step": 11 }, { "debug/policy_chosen_logits": -3.2939815521240234, "debug/policy_chosen_logps": -130.4807586669922, "debug/policy_rejected_logits": -3.31062388420105, "debug/policy_rejected_logps": -134.68899536132812, "debug/reference_chosen_logps": -130.83193969726562, "debug/reference_rejected_logps": -134.80657958984375, "epoch": 0.3333333333333333, "grad_norm": 3.6582335225875857, "learning_rate": 1e-06, "logits/chosen": -3.2939815521240234, "logits/rejected": -3.31062388420105, "logps/chosen": -130.4807586669922, "logps/rejected": -134.68899536132812, "loss": 0.4999, "rewards/accuracies": 0.75, "rewards/chosen": 0.0035119247622787952, "rewards/margins": 0.002335900906473398, "rewards/rejected": 0.0011760236229747534, "step": 12 }, { "debug/policy_chosen_logits": -3.1823947429656982, "debug/policy_chosen_logps": -142.1556396484375, "debug/policy_rejected_logits": -3.234529972076416, "debug/policy_rejected_logps": -146.97329711914062, "debug/reference_chosen_logps": -142.38873291015625, "debug/reference_rejected_logps": -147.51141357421875, "epoch": 0.3611111111111111, "grad_norm": 3.7187934784438883, "learning_rate": 1e-06, "logits/chosen": -3.1823947429656982, "logits/rejected": -3.234529972076416, "logps/chosen": -142.1556396484375, "logps/rejected": -146.97329711914062, "loss": 0.4992, "rewards/accuracies": 0.5, "rewards/chosen": 0.002330951625481248, "rewards/margins": -0.003050069557502866, "rewards/rejected": 0.005381021182984114, "step": 13 }, { "debug/policy_chosen_logits": -3.241863489151001, "debug/policy_chosen_logps": -149.54800415039062, "debug/policy_rejected_logits": -3.2308874130249023, "debug/policy_rejected_logps": -149.73306274414062, "debug/reference_chosen_logps": -149.528564453125, "debug/reference_rejected_logps": -150.3438720703125, "epoch": 0.3888888888888889, "grad_norm": 3.6082585398441998, "learning_rate": 1e-06, "logits/chosen": -3.241863489151001, "logits/rejected": -3.2308874130249023, "logps/chosen": -149.54800415039062, "logps/rejected": -149.73306274414062, "loss": 0.5008, "rewards/accuracies": 0.125, "rewards/chosen": -0.00019451143452897668, "rewards/margins": -0.006302604451775551, "rewards/rejected": 0.006108093075454235, "step": 14 }, { "debug/policy_chosen_logits": -3.611445426940918, "debug/policy_chosen_logps": -149.59336853027344, "debug/policy_rejected_logits": -3.399775505065918, "debug/policy_rejected_logps": -153.10552978515625, "debug/reference_chosen_logps": -149.2114715576172, "debug/reference_rejected_logps": -152.98480224609375, "epoch": 0.4166666666666667, "grad_norm": 3.650913491374871, "learning_rate": 1e-06, "logits/chosen": -3.611445426940918, "logits/rejected": -3.399775505065918, "logps/chosen": -149.59336853027344, "logps/rejected": -153.10552978515625, "loss": 0.5007, "rewards/accuracies": 0.625, "rewards/chosen": -0.003819055622443557, "rewards/margins": -0.0026117325760424137, "rewards/rejected": -0.001207323046401143, "step": 15 }, { "debug/policy_chosen_logits": -3.3950932025909424, "debug/policy_chosen_logps": -140.8656768798828, "debug/policy_rejected_logits": -3.303450584411621, "debug/policy_rejected_logps": -139.17660522460938, "debug/reference_chosen_logps": -141.45046997070312, "debug/reference_rejected_logps": -139.87338256835938, "epoch": 0.4444444444444444, "grad_norm": 3.5564513010845387, "learning_rate": 1e-06, "logits/chosen": -3.3950932025909424, "logits/rejected": -3.303450584411621, "logps/chosen": -140.8656768798828, "logps/rejected": -139.17660522460938, "loss": 0.5003, "rewards/accuracies": 0.5, "rewards/chosen": 0.005848026368767023, "rewards/margins": -0.0011198711581528187, "rewards/rejected": 0.006967897526919842, "step": 16 }, { "debug/policy_chosen_logits": -3.3803534507751465, "debug/policy_chosen_logps": -149.49050903320312, "debug/policy_rejected_logits": -3.347896099090576, "debug/policy_rejected_logps": -148.02999877929688, "debug/reference_chosen_logps": -149.98651123046875, "debug/reference_rejected_logps": -148.20993041992188, "epoch": 0.4722222222222222, "grad_norm": 3.5270452835142394, "learning_rate": 1e-06, "logits/chosen": -3.3803534507751465, "logits/rejected": -3.347896099090576, "logps/chosen": -149.49050903320312, "logps/rejected": -148.02999877929688, "loss": 0.4973, "rewards/accuracies": 0.625, "rewards/chosen": 0.0049600983038544655, "rewards/margins": 0.0031606964766979218, "rewards/rejected": 0.001799402292817831, "step": 17 }, { "debug/policy_chosen_logits": -3.4158170223236084, "debug/policy_chosen_logps": -150.7939453125, "debug/policy_rejected_logits": -3.4141695499420166, "debug/policy_rejected_logps": -128.3897705078125, "debug/reference_chosen_logps": -150.65750122070312, "debug/reference_rejected_logps": -128.62059020996094, "epoch": 0.5, "grad_norm": 3.8891703304483185, "learning_rate": 1e-06, "logits/chosen": -3.4158170223236084, "logits/rejected": -3.4141695499420166, "logps/chosen": -150.7939453125, "logps/rejected": -128.3897705078125, "loss": 0.5002, "rewards/accuracies": 0.25, "rewards/chosen": -0.0013643745332956314, "rewards/margins": -0.003672752296552062, "rewards/rejected": 0.002308378228917718, "step": 18 }, { "debug/policy_chosen_logits": -3.2540664672851562, "debug/policy_chosen_logps": -144.9429931640625, "debug/policy_rejected_logits": -3.344304323196411, "debug/policy_rejected_logps": -140.64501953125, "debug/reference_chosen_logps": -144.70391845703125, "debug/reference_rejected_logps": -140.39666748046875, "epoch": 0.5277777777777778, "grad_norm": 3.7659009000969075, "learning_rate": 1e-06, "logits/chosen": -3.2540664672851562, "logits/rejected": -3.344304323196411, "logps/chosen": -144.9429931640625, "logps/rejected": -140.64501953125, "loss": 0.4998, "rewards/accuracies": 0.375, "rewards/chosen": -0.0023905562702566385, "rewards/margins": 9.290699381381273e-05, "rewards/rejected": -0.002483463380485773, "step": 19 }, { "debug/policy_chosen_logits": -3.201190948486328, "debug/policy_chosen_logps": -135.35888671875, "debug/policy_rejected_logits": -3.371107339859009, "debug/policy_rejected_logps": -129.51800537109375, "debug/reference_chosen_logps": -135.95867919921875, "debug/reference_rejected_logps": -129.5604248046875, "epoch": 0.5555555555555556, "grad_norm": 3.455506692117544, "learning_rate": 1e-06, "logits/chosen": -3.201190948486328, "logits/rejected": -3.371107339859009, "logps/chosen": -135.35888671875, "logps/rejected": -129.51800537109375, "loss": 0.4991, "rewards/accuracies": 0.625, "rewards/chosen": 0.005997886415570974, "rewards/margins": 0.005573768634349108, "rewards/rejected": 0.00042411801405251026, "step": 20 }, { "debug/policy_chosen_logits": -3.3029751777648926, "debug/policy_chosen_logps": -140.86770629882812, "debug/policy_rejected_logits": -3.268817186355591, "debug/policy_rejected_logps": -142.11985778808594, "debug/reference_chosen_logps": -140.67758178710938, "debug/reference_rejected_logps": -141.81129455566406, "epoch": 0.5833333333333334, "grad_norm": 3.714998480314685, "learning_rate": 1e-06, "logits/chosen": -3.3029751777648926, "logits/rejected": -3.268817186355591, "logps/chosen": -140.86770629882812, "logps/rejected": -142.11985778808594, "loss": 0.4971, "rewards/accuracies": 0.375, "rewards/chosen": -0.0019013977143913507, "rewards/margins": 0.0011841959785670042, "rewards/rejected": -0.0030855941586196423, "step": 21 }, { "debug/policy_chosen_logits": -3.3189659118652344, "debug/policy_chosen_logps": -147.112060546875, "debug/policy_rejected_logits": -3.3244478702545166, "debug/policy_rejected_logps": -142.3300323486328, "debug/reference_chosen_logps": -147.44517517089844, "debug/reference_rejected_logps": -142.57394409179688, "epoch": 0.6111111111111112, "grad_norm": 3.6822242911178518, "learning_rate": 1e-06, "logits/chosen": -3.3189659118652344, "logits/rejected": -3.3244478702545166, "logps/chosen": -147.112060546875, "logps/rejected": -142.3300323486328, "loss": 0.5, "rewards/accuracies": 0.5, "rewards/chosen": 0.0033312130253762007, "rewards/margins": 0.000892105046659708, "rewards/rejected": 0.002439107745885849, "step": 22 }, { "debug/policy_chosen_logits": -3.4213130474090576, "debug/policy_chosen_logps": -140.08522033691406, "debug/policy_rejected_logits": -3.4246339797973633, "debug/policy_rejected_logps": -130.4640350341797, "debug/reference_chosen_logps": -139.59059143066406, "debug/reference_rejected_logps": -130.9028778076172, "epoch": 0.6388888888888888, "grad_norm": 3.5695109096490345, "learning_rate": 1e-06, "logits/chosen": -3.4213130474090576, "logits/rejected": -3.4246339797973633, "logps/chosen": -140.08522033691406, "logps/rejected": -130.4640350341797, "loss": 0.5017, "rewards/accuracies": 0.25, "rewards/chosen": -0.0049462029710412025, "rewards/margins": -0.009334669448435307, "rewards/rejected": 0.004388465546071529, "step": 23 }, { "debug/policy_chosen_logits": -3.3711600303649902, "debug/policy_chosen_logps": -156.96621704101562, "debug/policy_rejected_logits": -3.343695640563965, "debug/policy_rejected_logps": -139.0091552734375, "debug/reference_chosen_logps": -156.67721557617188, "debug/reference_rejected_logps": -138.96543884277344, "epoch": 0.6666666666666666, "grad_norm": 3.867512978649109, "learning_rate": 1e-06, "logits/chosen": -3.3711600303649902, "logits/rejected": -3.343695640563965, "logps/chosen": -156.96621704101562, "logps/rejected": -139.0091552734375, "loss": 0.4959, "rewards/accuracies": 0.375, "rewards/chosen": -0.0028899763710796833, "rewards/margins": -0.0024528217036277056, "rewards/rejected": -0.00043715466745197773, "step": 24 }, { "debug/policy_chosen_logits": -3.2905821800231934, "debug/policy_chosen_logps": -148.3179931640625, "debug/policy_rejected_logits": -3.450080156326294, "debug/policy_rejected_logps": -134.548095703125, "debug/reference_chosen_logps": -148.64956665039062, "debug/reference_rejected_logps": -134.7545928955078, "epoch": 0.6944444444444444, "grad_norm": 3.5528494656871943, "learning_rate": 1e-06, "logits/chosen": -3.2905821800231934, "logits/rejected": -3.450080156326294, "logps/chosen": -148.3179931640625, "logps/rejected": -134.548095703125, "loss": 0.4976, "rewards/accuracies": 0.5, "rewards/chosen": 0.0033157728612422943, "rewards/margins": 0.0012508960207924247, "rewards/rejected": 0.002064876491203904, "step": 25 }, { "debug/policy_chosen_logits": -3.3846447467803955, "debug/policy_chosen_logps": -140.20948791503906, "debug/policy_rejected_logits": -3.3473317623138428, "debug/policy_rejected_logps": -144.134033203125, "debug/reference_chosen_logps": -139.72286987304688, "debug/reference_rejected_logps": -143.17819213867188, "epoch": 0.7222222222222222, "grad_norm": 3.522835837758366, "learning_rate": 1e-06, "logits/chosen": -3.3846447467803955, "logits/rejected": -3.3473317623138428, "logps/chosen": -140.20948791503906, "logps/rejected": -144.134033203125, "loss": 0.4968, "rewards/accuracies": 0.625, "rewards/chosen": -0.004866065923124552, "rewards/margins": 0.00469239242374897, "rewards/rejected": -0.009558457881212234, "step": 26 }, { "debug/policy_chosen_logits": -3.2885804176330566, "debug/policy_chosen_logps": -131.42832946777344, "debug/policy_rejected_logits": -3.199730157852173, "debug/policy_rejected_logps": -167.41575622558594, "debug/reference_chosen_logps": -131.87686157226562, "debug/reference_rejected_logps": -166.8900146484375, "epoch": 0.75, "grad_norm": 3.776580476774495, "learning_rate": 1e-06, "logits/chosen": -3.2885804176330566, "logits/rejected": -3.199730157852173, "logps/chosen": -131.42832946777344, "logps/rejected": -167.41575622558594, "loss": 0.4968, "rewards/accuracies": 0.625, "rewards/chosen": 0.004485292360186577, "rewards/margins": 0.009742669761180878, "rewards/rejected": -0.005257377866655588, "step": 27 }, { "debug/policy_chosen_logits": -3.3472468852996826, "debug/policy_chosen_logps": -133.6158905029297, "debug/policy_rejected_logits": -3.290668249130249, "debug/policy_rejected_logps": -146.1497039794922, "debug/reference_chosen_logps": -133.69766235351562, "debug/reference_rejected_logps": -145.79873657226562, "epoch": 0.7777777777777778, "grad_norm": 3.5272238549750985, "learning_rate": 1e-06, "logits/chosen": -3.3472468852996826, "logits/rejected": -3.290668249130249, "logps/chosen": -133.6158905029297, "logps/rejected": -146.1497039794922, "loss": 0.5009, "rewards/accuracies": 0.5, "rewards/chosen": 0.00081774708814919, "rewards/margins": 0.004327363334596157, "rewards/rejected": -0.0035096164792776108, "step": 28 }, { "debug/policy_chosen_logits": -3.4250340461730957, "debug/policy_chosen_logps": -133.65365600585938, "debug/policy_rejected_logits": -3.298884153366089, "debug/policy_rejected_logps": -138.59500122070312, "debug/reference_chosen_logps": -134.072509765625, "debug/reference_rejected_logps": -138.65289306640625, "epoch": 0.8055555555555556, "grad_norm": 3.602823944098358, "learning_rate": 1e-06, "logits/chosen": -3.4250340461730957, "logits/rejected": -3.298884153366089, "logps/chosen": -133.65365600585938, "logps/rejected": -138.59500122070312, "loss": 0.4982, "rewards/accuracies": 0.625, "rewards/chosen": 0.004188622813671827, "rewards/margins": 0.0036097620613873005, "rewards/rejected": 0.0005788612179458141, "step": 29 }, { "debug/policy_chosen_logits": -3.268261671066284, "debug/policy_chosen_logps": -136.84478759765625, "debug/policy_rejected_logits": -3.257566213607788, "debug/policy_rejected_logps": -162.07492065429688, "debug/reference_chosen_logps": -136.37957763671875, "debug/reference_rejected_logps": -161.6750946044922, "epoch": 0.8333333333333334, "grad_norm": 3.7759260044534106, "learning_rate": 1e-06, "logits/chosen": -3.268261671066284, "logits/rejected": -3.257566213607788, "logps/chosen": -136.84478759765625, "logps/rejected": -162.07492065429688, "loss": 0.4982, "rewards/accuracies": 0.625, "rewards/chosen": -0.004652070812880993, "rewards/margins": -0.000653810566291213, "rewards/rejected": -0.0039982604794204235, "step": 30 }, { "debug/policy_chosen_logits": -3.2410950660705566, "debug/policy_chosen_logps": -132.0355682373047, "debug/policy_rejected_logits": -3.2467472553253174, "debug/policy_rejected_logps": -156.85931396484375, "debug/reference_chosen_logps": -131.93606567382812, "debug/reference_rejected_logps": -156.878173828125, "epoch": 0.8611111111111112, "grad_norm": 3.714880901694917, "learning_rate": 1e-06, "logits/chosen": -3.2410950660705566, "logits/rejected": -3.2467472553253174, "logps/chosen": -132.0355682373047, "logps/rejected": -156.85931396484375, "loss": 0.5043, "rewards/accuracies": 0.5, "rewards/chosen": -0.0009950445964932442, "rewards/margins": -0.0011836052872240543, "rewards/rejected": 0.00018856054521165788, "step": 31 }, { "debug/policy_chosen_logits": -3.2965383529663086, "debug/policy_chosen_logps": -151.22242736816406, "debug/policy_rejected_logits": -3.3531861305236816, "debug/policy_rejected_logps": -158.19801330566406, "debug/reference_chosen_logps": -150.66336059570312, "debug/reference_rejected_logps": -154.187255859375, "epoch": 0.8888888888888888, "grad_norm": 3.625657005605096, "learning_rate": 1e-06, "logits/chosen": -3.2965383529663086, "logits/rejected": -3.3531861305236816, "logps/chosen": -151.22242736816406, "logps/rejected": -158.19801330566406, "loss": 0.4982, "rewards/accuracies": 0.75, "rewards/chosen": -0.0055907731875777245, "rewards/margins": 0.034516897052526474, "rewards/rejected": -0.040107667446136475, "step": 32 }, { "debug/policy_chosen_logits": -3.3763067722320557, "debug/policy_chosen_logps": -144.869140625, "debug/policy_rejected_logits": -3.462132692337036, "debug/policy_rejected_logps": -145.60043334960938, "debug/reference_chosen_logps": -145.45584106445312, "debug/reference_rejected_logps": -144.76223754882812, "epoch": 0.9166666666666666, "grad_norm": 3.5803165187465273, "learning_rate": 1e-06, "logits/chosen": -3.3763067722320557, "logits/rejected": -3.462132692337036, "logps/chosen": -144.869140625, "logps/rejected": -145.60043334960938, "loss": 0.4903, "rewards/accuracies": 0.875, "rewards/chosen": 0.005866975523531437, "rewards/margins": 0.01424897089600563, "rewards/rejected": -0.008381996303796768, "step": 33 }, { "debug/policy_chosen_logits": -3.220027208328247, "debug/policy_chosen_logps": -136.84854125976562, "debug/policy_rejected_logits": -3.232970952987671, "debug/policy_rejected_logps": -153.30503845214844, "debug/reference_chosen_logps": -137.6507568359375, "debug/reference_rejected_logps": -153.16461181640625, "epoch": 0.9444444444444444, "grad_norm": 3.6129260590369485, "learning_rate": 1e-06, "logits/chosen": -3.220027208328247, "logits/rejected": -3.232970952987671, "logps/chosen": -136.84854125976562, "logps/rejected": -153.30503845214844, "loss": 0.4956, "rewards/accuracies": 0.875, "rewards/chosen": 0.008022289723157883, "rewards/margins": 0.009426584467291832, "rewards/rejected": -0.0014042950933799148, "step": 34 }, { "debug/policy_chosen_logits": -3.0886175632476807, "debug/policy_chosen_logps": -187.7407684326172, "debug/policy_rejected_logits": -3.212488889694214, "debug/policy_rejected_logps": -144.3076171875, "debug/reference_chosen_logps": -187.63613891601562, "debug/reference_rejected_logps": -143.5675048828125, "epoch": 0.9722222222222222, "grad_norm": 3.722674992897622, "learning_rate": 1e-06, "logits/chosen": -3.0886175632476807, "logits/rejected": -3.212488889694214, "logps/chosen": -187.7407684326172, "logps/rejected": -144.3076171875, "loss": 0.4981, "rewards/accuracies": 0.5, "rewards/chosen": -0.0010462282225489616, "rewards/margins": 0.006354827433824539, "rewards/rejected": -0.007401055656373501, "step": 35 }, { "debug/policy_chosen_logits": -3.3802547454833984, "debug/policy_chosen_logps": -126.92862701416016, "debug/policy_rejected_logits": -3.3321433067321777, "debug/policy_rejected_logps": -138.41415405273438, "debug/reference_chosen_logps": -128.0789794921875, "debug/reference_rejected_logps": -137.37353515625, "epoch": 1.0, "grad_norm": 3.429417526274531, "learning_rate": 1e-06, "logits/chosen": -3.3802547454833984, "logits/rejected": -3.3321433067321777, "logps/chosen": -126.92862701416016, "logps/rejected": -138.41415405273438, "loss": 0.4619, "rewards/accuracies": 1.0, "rewards/chosen": 0.011503458023071289, "rewards/margins": 0.021909702569246292, "rewards/rejected": -0.010406245477497578, "step": 36 }, { "epoch": 1.0, "step": 36, "total_flos": 0.0, "train_loss": 0.49796508583757615, "train_runtime": 140.0049, "train_samples_per_second": 16.021, "train_steps_per_second": 0.257 } ], "logging_steps": 1, "max_steps": 36, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }