{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 309, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003236245954692557, "grad_norm": 1658.8809088511882, "learning_rate": 1.6129032258064514e-08, "logits/chosen": -0.20905712246894836, "logits/rejected": -0.22190234065055847, "logps/chosen": -51.62083435058594, "logps/rejected": -51.69921112060547, "loss": 113.3679, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.032362459546925564, "grad_norm": 1515.9865821069318, "learning_rate": 1.6129032258064515e-07, "logits/chosen": -0.5064975619316101, "logits/rejected": -0.4768737554550171, "logps/chosen": -117.30918884277344, "logps/rejected": -114.356689453125, "loss": 109.6363, "rewards/accuracies": 0.2777777910232544, "rewards/chosen": 0.0006128688692115247, "rewards/margins": -0.00019755105313379318, "rewards/rejected": 0.0008104199077934027, "step": 10 }, { "epoch": 0.06472491909385113, "grad_norm": 969.4522935755164, "learning_rate": 3.225806451612903e-07, "logits/chosen": -0.3630278706550598, "logits/rejected": -0.34703055024147034, "logps/chosen": -77.77701568603516, "logps/rejected": -79.4086685180664, "loss": 90.5388, "rewards/accuracies": 0.625, "rewards/chosen": 0.01180888619273901, "rewards/margins": 0.0002235680294688791, "rewards/rejected": 0.01158531941473484, "step": 20 }, { "epoch": 0.0970873786407767, "grad_norm": 888.3989822885039, "learning_rate": 4.838709677419355e-07, "logits/chosen": -0.4171529710292816, "logits/rejected": -0.4250712990760803, "logps/chosen": -73.54609680175781, "logps/rejected": -102.91902923583984, "loss": 83.8566, "rewards/accuracies": 0.375, "rewards/chosen": 0.018597453832626343, "rewards/margins": 0.00040426477789878845, "rewards/rejected": 0.018193189054727554, "step": 30 }, { "epoch": 0.12944983818770225, "grad_norm": 947.9076339924575, "learning_rate": 4.838129496402878e-07, "logits/chosen": -0.5706937313079834, "logits/rejected": -0.5749413371086121, "logps/chosen": -90.37564086914062, "logps/rejected": -89.90583801269531, "loss": 81.1653, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.023226192221045494, "rewards/margins": -0.000476902408991009, "rewards/rejected": 0.023703094571828842, "step": 40 }, { "epoch": 0.16181229773462782, "grad_norm": 946.9060895360059, "learning_rate": 4.6582733812949637e-07, "logits/chosen": -0.5844239592552185, "logits/rejected": -0.5668785572052002, "logps/chosen": -86.814453125, "logps/rejected": -86.94580841064453, "loss": 79.2398, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.024742115288972855, "rewards/margins": 0.0008400626247748733, "rewards/rejected": 0.023902051150798798, "step": 50 }, { "epoch": 0.1941747572815534, "grad_norm": 882.6897235193384, "learning_rate": 4.4784172661870503e-07, "logits/chosen": -0.578288733959198, "logits/rejected": -0.5263481140136719, "logps/chosen": -90.2419662475586, "logps/rejected": -82.88394165039062, "loss": 78.6072, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.023136131465435028, "rewards/margins": -0.0008002830436453223, "rewards/rejected": 0.02393641509115696, "step": 60 }, { "epoch": 0.22653721682847897, "grad_norm": 961.7649999475635, "learning_rate": 4.2985611510791364e-07, "logits/chosen": -0.5004868507385254, "logits/rejected": -0.5238932371139526, "logps/chosen": -86.54154968261719, "logps/rejected": -95.54032135009766, "loss": 78.1544, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.02475227229297161, "rewards/margins": -0.0008166898041963577, "rewards/rejected": 0.02556896209716797, "step": 70 }, { "epoch": 0.2588996763754045, "grad_norm": 883.6111764382059, "learning_rate": 4.118705035971223e-07, "logits/chosen": -0.4584523141384125, "logits/rejected": -0.4544026851654053, "logps/chosen": -57.0123405456543, "logps/rejected": -61.774627685546875, "loss": 77.6898, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.02323099412024021, "rewards/margins": -8.148546476149932e-05, "rewards/rejected": 0.02331247739493847, "step": 80 }, { "epoch": 0.2912621359223301, "grad_norm": 829.9995588751361, "learning_rate": 3.938848920863309e-07, "logits/chosen": -0.3944535553455353, "logits/rejected": -0.4474177360534668, "logps/chosen": -72.95917510986328, "logps/rejected": -78.4412841796875, "loss": 74.775, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.023325806483626366, "rewards/margins": -2.6454519684193656e-05, "rewards/rejected": 0.023352261632680893, "step": 90 }, { "epoch": 0.32362459546925565, "grad_norm": 932.2260025071632, "learning_rate": 3.7589928057553957e-07, "logits/chosen": -0.4648945927619934, "logits/rejected": -0.5219349265098572, "logps/chosen": -92.59781646728516, "logps/rejected": -99.55081939697266, "loss": 74.8136, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.02784586511552334, "rewards/margins": -0.001028593396767974, "rewards/rejected": 0.028874456882476807, "step": 100 }, { "epoch": 0.3559870550161812, "grad_norm": 896.6229924522859, "learning_rate": 3.579136690647482e-07, "logits/chosen": -0.49588069319725037, "logits/rejected": -0.47765594720840454, "logps/chosen": -84.73075866699219, "logps/rejected": -86.83433532714844, "loss": 81.4245, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.023242127150297165, "rewards/margins": -0.0016486679669469595, "rewards/rejected": 0.02489079348742962, "step": 110 }, { "epoch": 0.3883495145631068, "grad_norm": 891.651805206827, "learning_rate": 3.3992805755395684e-07, "logits/chosen": -0.5411235094070435, "logits/rejected": -0.5059757828712463, "logps/chosen": -70.73728942871094, "logps/rejected": -72.28491973876953, "loss": 71.0374, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.025572773069143295, "rewards/margins": -0.0004695397801697254, "rewards/rejected": 0.026042312383651733, "step": 120 }, { "epoch": 0.42071197411003236, "grad_norm": 866.1129148935271, "learning_rate": 3.2194244604316545e-07, "logits/chosen": -0.4196953773498535, "logits/rejected": -0.3725748658180237, "logps/chosen": -72.61355590820312, "logps/rejected": -75.1019287109375, "loss": 76.2665, "rewards/accuracies": 0.5, "rewards/chosen": 0.02426721155643463, "rewards/margins": 0.00010678176477085799, "rewards/rejected": 0.02416042983531952, "step": 130 }, { "epoch": 0.45307443365695793, "grad_norm": 1010.9753753149187, "learning_rate": 3.039568345323741e-07, "logits/chosen": -0.5107399225234985, "logits/rejected": -0.5314233899116516, "logps/chosen": -93.92290496826172, "logps/rejected": -93.65010070800781, "loss": 83.5214, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.025826340541243553, "rewards/margins": 0.00021239747002255172, "rewards/rejected": 0.025613943114876747, "step": 140 }, { "epoch": 0.4854368932038835, "grad_norm": 913.7418029983949, "learning_rate": 2.859712230215827e-07, "logits/chosen": -0.5381096601486206, "logits/rejected": -0.5203205943107605, "logps/chosen": -65.59220886230469, "logps/rejected": -66.00724029541016, "loss": 76.5186, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.024304382503032684, "rewards/margins": -0.0004876487364526838, "rewards/rejected": 0.024792032316327095, "step": 150 }, { "epoch": 0.517799352750809, "grad_norm": 972.5973319895395, "learning_rate": 2.679856115107914e-07, "logits/chosen": -0.39734119176864624, "logits/rejected": -0.38979026675224304, "logps/chosen": -61.9793586730957, "logps/rejected": -64.6052017211914, "loss": 76.9767, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.024198293685913086, "rewards/margins": 0.001675128354690969, "rewards/rejected": 0.022523164749145508, "step": 160 }, { "epoch": 0.5501618122977346, "grad_norm": 949.8191332999405, "learning_rate": 2.5e-07, "logits/chosen": -0.4918611943721771, "logits/rejected": -0.45585107803344727, "logps/chosen": -69.14030456542969, "logps/rejected": -74.96630096435547, "loss": 76.1, "rewards/accuracies": 0.5, "rewards/chosen": 0.023405950516462326, "rewards/margins": -0.00011506080772960559, "rewards/rejected": 0.02352100983262062, "step": 170 }, { "epoch": 0.5825242718446602, "grad_norm": 1013.6715799407693, "learning_rate": 2.3201438848920862e-07, "logits/chosen": -0.48608237504959106, "logits/rejected": -0.5281080007553101, "logps/chosen": -104.78086853027344, "logps/rejected": -108.03865814208984, "loss": 83.236, "rewards/accuracies": 0.375, "rewards/chosen": 0.027134334668517113, "rewards/margins": -0.0010873143328353763, "rewards/rejected": 0.028221651911735535, "step": 180 }, { "epoch": 0.6148867313915858, "grad_norm": 847.9053556478235, "learning_rate": 2.1402877697841726e-07, "logits/chosen": -0.5092284679412842, "logits/rejected": -0.48264461755752563, "logps/chosen": -73.16789245605469, "logps/rejected": -64.76610565185547, "loss": 77.9783, "rewards/accuracies": 0.375, "rewards/chosen": 0.024086542427539825, "rewards/margins": -0.0006825210293754935, "rewards/rejected": 0.02476906217634678, "step": 190 }, { "epoch": 0.6472491909385113, "grad_norm": 884.8782203906196, "learning_rate": 1.960431654676259e-07, "logits/chosen": -0.5042704343795776, "logits/rejected": -0.5013135671615601, "logps/chosen": -69.30252075195312, "logps/rejected": -73.18415832519531, "loss": 78.8926, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.025373632088303566, "rewards/margins": 0.0003596575988922268, "rewards/rejected": 0.02501397207379341, "step": 200 }, { "epoch": 0.6796116504854369, "grad_norm": 881.5033348535335, "learning_rate": 1.7805755395683453e-07, "logits/chosen": -0.4601220190525055, "logits/rejected": -0.46207714080810547, "logps/chosen": -84.7288589477539, "logps/rejected": -81.23218536376953, "loss": 75.1407, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.02544797584414482, "rewards/margins": 0.0003221259394194931, "rewards/rejected": 0.02512585185468197, "step": 210 }, { "epoch": 0.7119741100323624, "grad_norm": 1004.8101705908582, "learning_rate": 1.6007194244604316e-07, "logits/chosen": -0.5395989418029785, "logits/rejected": -0.5014286041259766, "logps/chosen": -61.23677444458008, "logps/rejected": -58.251853942871094, "loss": 72.5765, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.02392033487558365, "rewards/margins": 0.0002870003809221089, "rewards/rejected": 0.023633334785699844, "step": 220 }, { "epoch": 0.7443365695792881, "grad_norm": 794.2835932368788, "learning_rate": 1.420863309352518e-07, "logits/chosen": -0.5492295026779175, "logits/rejected": -0.5480334162712097, "logps/chosen": -76.67291259765625, "logps/rejected": -78.2052993774414, "loss": 76.2023, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.02420797012746334, "rewards/margins": -0.0005196809652261436, "rewards/rejected": 0.024727651849389076, "step": 230 }, { "epoch": 0.7766990291262136, "grad_norm": 917.7791062574787, "learning_rate": 1.2410071942446043e-07, "logits/chosen": -0.5878959894180298, "logits/rejected": -0.5924805998802185, "logps/chosen": -90.03998565673828, "logps/rejected": -90.48416137695312, "loss": 78.2183, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.02639937959611416, "rewards/margins": -0.0005113029619678855, "rewards/rejected": 0.026910683140158653, "step": 240 }, { "epoch": 0.8090614886731392, "grad_norm": 929.1351901669688, "learning_rate": 1.0611510791366907e-07, "logits/chosen": -0.45039838552474976, "logits/rejected": -0.4581735134124756, "logps/chosen": -78.8072280883789, "logps/rejected": -81.91495513916016, "loss": 80.8378, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.02478354051709175, "rewards/margins": -0.00047748201177455485, "rewards/rejected": 0.02526102401316166, "step": 250 }, { "epoch": 0.8414239482200647, "grad_norm": 979.736057923442, "learning_rate": 8.812949640287769e-08, "logits/chosen": -0.5115951299667358, "logits/rejected": -0.49285611510276794, "logps/chosen": -64.18290710449219, "logps/rejected": -75.04850769042969, "loss": 79.4801, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.024892836809158325, "rewards/margins": 0.0003677388303913176, "rewards/rejected": 0.024525096639990807, "step": 260 }, { "epoch": 0.8737864077669902, "grad_norm": 937.1301078278518, "learning_rate": 7.014388489208632e-08, "logits/chosen": -0.49609702825546265, "logits/rejected": -0.4680093824863434, "logps/chosen": -77.85667419433594, "logps/rejected": -77.64161682128906, "loss": 76.3001, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.023356296122074127, "rewards/margins": -0.0011736620217561722, "rewards/rejected": 0.02452996000647545, "step": 270 }, { "epoch": 0.9061488673139159, "grad_norm": 962.3034050479715, "learning_rate": 5.2158273381294966e-08, "logits/chosen": -0.4928700029850006, "logits/rejected": -0.5252888798713684, "logps/chosen": -83.03263092041016, "logps/rejected": -85.7882080078125, "loss": 75.3971, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.024604439735412598, "rewards/margins": -0.0005611368687823415, "rewards/rejected": 0.025165576487779617, "step": 280 }, { "epoch": 0.9385113268608414, "grad_norm": 988.3844505003867, "learning_rate": 3.41726618705036e-08, "logits/chosen": -0.5471370220184326, "logits/rejected": -0.5181233286857605, "logps/chosen": -116.2409896850586, "logps/rejected": -98.54341125488281, "loss": 79.4163, "rewards/accuracies": 0.5, "rewards/chosen": 0.028873136267066002, "rewards/margins": 0.0007068775594234467, "rewards/rejected": 0.028166258707642555, "step": 290 }, { "epoch": 0.970873786407767, "grad_norm": 941.0104794351889, "learning_rate": 1.618705035971223e-08, "logits/chosen": -0.5063565373420715, "logits/rejected": -0.5347974300384521, "logps/chosen": -87.50187683105469, "logps/rejected": -85.3526382446289, "loss": 77.0531, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.026520222425460815, "rewards/margins": 0.0010460775811225176, "rewards/rejected": 0.025474146008491516, "step": 300 }, { "epoch": 1.0, "step": 309, "total_flos": 0.0, "train_loss": 79.41712339333345, "train_runtime": 2752.7811, "train_samples_per_second": 7.18, "train_steps_per_second": 0.112 } ], "logging_steps": 10, "max_steps": 309, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }