0.0001_idpo_same_6iters_iter_2 / trainer_state.json
ShenaoZ's picture
Model save
3d6bd8d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 80,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 6.25e-08,
"logits/chosen": -2.897486925125122,
"logits/rejected": -2.9312877655029297,
"logps/chosen": -227.6373291015625,
"logps/pi_response": -109.72225952148438,
"logps/ref_response": -109.72225952148438,
"logps/rejected": -190.0130615234375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.12,
"learning_rate": 4.990486745229364e-07,
"logits/chosen": -2.757150411605835,
"logits/rejected": -2.7352075576782227,
"logps/chosen": -220.00762939453125,
"logps/pi_response": -118.12198638916016,
"logps/ref_response": -117.70235443115234,
"logps/rejected": -177.66741943359375,
"loss": 0.6921,
"rewards/accuracies": 0.4722222089767456,
"rewards/chosen": 0.0035162584390491247,
"rewards/margins": 0.005680772475898266,
"rewards/rejected": -0.0021645138040184975,
"step": 10
},
{
"epoch": 0.25,
"learning_rate": 4.6650635094610966e-07,
"logits/chosen": -2.7867398262023926,
"logits/rejected": -2.737657308578491,
"logps/chosen": -214.31796264648438,
"logps/pi_response": -134.75608825683594,
"logps/ref_response": -125.65106201171875,
"logps/rejected": -227.33175659179688,
"loss": 0.6816,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.046200741082429886,
"rewards/margins": 0.019284024834632874,
"rewards/rejected": -0.06548477709293365,
"step": 20
},
{
"epoch": 0.38,
"learning_rate": 3.933941090877615e-07,
"logits/chosen": -2.6279125213623047,
"logits/rejected": -2.613940715789795,
"logps/chosen": -222.9117431640625,
"logps/pi_response": -128.2218780517578,
"logps/ref_response": -110.140380859375,
"logps/rejected": -215.9916229248047,
"loss": 0.663,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.08697148412466049,
"rewards/margins": 0.11129184067249298,
"rewards/rejected": -0.19826331734657288,
"step": 30
},
{
"epoch": 0.5,
"learning_rate": 2.934120444167326e-07,
"logits/chosen": -2.6441240310668945,
"logits/rejected": -2.65305757522583,
"logps/chosen": -238.2937469482422,
"logps/pi_response": -141.17239379882812,
"logps/ref_response": -110.28767395019531,
"logps/rejected": -214.75643920898438,
"loss": 0.646,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.17858782410621643,
"rewards/margins": 0.15444216132164001,
"rewards/rejected": -0.33302998542785645,
"step": 40
},
{
"epoch": 0.62,
"learning_rate": 1.8529523872436977e-07,
"logits/chosen": -2.576162099838257,
"logits/rejected": -2.580824375152588,
"logps/chosen": -219.0809783935547,
"logps/pi_response": -139.94015502929688,
"logps/ref_response": -102.89493560791016,
"logps/rejected": -216.08120727539062,
"loss": 0.6323,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.18389426171779633,
"rewards/margins": 0.17146775126457214,
"rewards/rejected": -0.3553619980812073,
"step": 50
},
{
"epoch": 0.75,
"learning_rate": 8.930309757836516e-08,
"logits/chosen": -2.601134777069092,
"logits/rejected": -2.5447657108306885,
"logps/chosen": -250.886962890625,
"logps/pi_response": -163.75860595703125,
"logps/ref_response": -112.12379455566406,
"logps/rejected": -244.28335571289062,
"loss": 0.6301,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.3884199261665344,
"rewards/margins": 0.17365404963493347,
"rewards/rejected": -0.5620739459991455,
"step": 60
},
{
"epoch": 0.88,
"learning_rate": 2.3423053240837514e-08,
"logits/chosen": -2.5715625286102295,
"logits/rejected": -2.552263021469116,
"logps/chosen": -254.3754425048828,
"logps/pi_response": -162.65504455566406,
"logps/ref_response": -97.93321228027344,
"logps/rejected": -240.8832550048828,
"loss": 0.6143,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.44511038064956665,
"rewards/margins": 0.2850010395050049,
"rewards/rejected": -0.7301114201545715,
"step": 70
},
{
"epoch": 1.0,
"learning_rate": 0.0,
"logits/chosen": -2.6229751110076904,
"logits/rejected": -2.600135564804077,
"logps/chosen": -251.10275268554688,
"logps/pi_response": -162.8291778564453,
"logps/ref_response": -112.6815185546875,
"logps/rejected": -245.9313507080078,
"loss": 0.6164,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.35662582516670227,
"rewards/margins": 0.25557661056518555,
"rewards/rejected": -0.6122024655342102,
"step": 80
},
{
"epoch": 1.0,
"step": 80,
"total_flos": 0.0,
"train_loss": 0.6470192432403564,
"train_runtime": 2118.6618,
"train_samples_per_second": 4.809,
"train_steps_per_second": 0.038
}
],
"logging_steps": 10,
"max_steps": 80,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}