{ "best_metric": 0.07559170573949814, "best_model_checkpoint": "./trained_models/Mistral-7B-Instruct-v0.2_-1pos_-1neg_perNE_top-1NEs_TrueDef-IT/checkpoint-420", "epoch": 0.7190241814680077, "eval_steps": 20, "global_step": 420, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 2.4999999999999998e-05, "loss": 6.4943, "step": 5 }, { "epoch": 0.02, "learning_rate": 4.9999999999999996e-05, "loss": 3.9486, "step": 10 }, { "epoch": 0.03, "learning_rate": 7.5e-05, "loss": 0.4566, "step": 15 }, { "epoch": 0.03, "learning_rate": 9.999999999999999e-05, "loss": 0.3512, "step": 20 }, { "epoch": 0.03, "eval_loss": 0.3139978051185608, "eval_runtime": 222.18, "eval_samples_per_second": 7.84, "eval_steps_per_second": 7.84, "step": 20 }, { "epoch": 0.04, "learning_rate": 0.000125, "loss": 0.1879, "step": 25 }, { "epoch": 0.05, "learning_rate": 0.00015, "loss": 0.1468, "step": 30 }, { "epoch": 0.06, "learning_rate": 0.000175, "loss": 0.152, "step": 35 }, { "epoch": 0.07, "learning_rate": 0.00019999999999999998, "loss": 0.1357, "step": 40 }, { "epoch": 0.07, "eval_loss": 0.20455242693424225, "eval_runtime": 222.1053, "eval_samples_per_second": 7.843, "eval_steps_per_second": 7.843, "step": 40 }, { "epoch": 0.08, "learning_rate": 0.000225, "loss": 0.1162, "step": 45 }, { "epoch": 0.09, "learning_rate": 0.00025, "loss": 0.1099, "step": 50 }, { "epoch": 0.09, "learning_rate": 0.00027499999999999996, "loss": 0.3812, "step": 55 }, { "epoch": 0.1, "learning_rate": 0.0003, "loss": 0.2093, "step": 60 }, { "epoch": 0.1, "eval_loss": 0.12376075983047485, "eval_runtime": 222.1103, "eval_samples_per_second": 7.843, "eval_steps_per_second": 7.843, "step": 60 }, { "epoch": 0.11, "learning_rate": 0.0002999994460825163, "loss": 0.1396, "step": 65 }, { "epoch": 0.12, "learning_rate": 0.0002999977843341562, "loss": 0.1036, "step": 70 }, { "epoch": 0.13, "learning_rate": 0.00029999501476719257, "loss": 0.0634, "step": 75 }, { "epoch": 0.14, "learning_rate": 0.0002999911374020804, "loss": 0.0933, "step": 80 }, { "epoch": 0.14, "eval_loss": 0.1939181536436081, "eval_runtime": 222.0528, "eval_samples_per_second": 7.845, "eval_steps_per_second": 7.845, "step": 80 }, { "epoch": 0.15, "learning_rate": 0.00029998615226745605, "loss": 0.0656, "step": 85 }, { "epoch": 0.15, "learning_rate": 0.0002999800594001376, "loss": 0.0519, "step": 90 }, { "epoch": 0.16, "learning_rate": 0.0002999728588451245, "loss": 0.0476, "step": 95 }, { "epoch": 0.17, "learning_rate": 0.0002999645506555967, "loss": 0.0098, "step": 100 }, { "epoch": 0.17, "eval_loss": 0.24784570932388306, "eval_runtime": 222.2602, "eval_samples_per_second": 7.838, "eval_steps_per_second": 7.838, "step": 100 }, { "epoch": 0.18, "learning_rate": 0.00029995513489291506, "loss": 0.356, "step": 105 }, { "epoch": 0.19, "learning_rate": 0.00029994461162662024, "loss": 0.1972, "step": 110 }, { "epoch": 0.2, "learning_rate": 0.00029993298093443246, "loss": 0.1255, "step": 115 }, { "epoch": 0.21, "learning_rate": 0.000299920242902251, "loss": 0.0923, "step": 120 }, { "epoch": 0.21, "eval_loss": 0.12546367943286896, "eval_runtime": 222.0682, "eval_samples_per_second": 7.844, "eval_steps_per_second": 7.844, "step": 120 }, { "epoch": 0.21, "learning_rate": 0.0002999063976241536, "loss": 0.0504, "step": 125 }, { "epoch": 0.22, "learning_rate": 0.0002998914452023953, "loss": 0.1582, "step": 130 }, { "epoch": 0.23, "learning_rate": 0.00029987538574740826, "loss": 0.0452, "step": 135 }, { "epoch": 0.24, "learning_rate": 0.0002998582193778006, "loss": 0.0258, "step": 140 }, { "epoch": 0.24, "eval_loss": 0.21805395185947418, "eval_runtime": 224.118, "eval_samples_per_second": 7.773, "eval_steps_per_second": 7.773, "step": 140 }, { "epoch": 0.25, "learning_rate": 0.00029983994622035585, "loss": 0.032, "step": 145 }, { "epoch": 0.26, "learning_rate": 0.00029982056641003147, "loss": 0.0224, "step": 150 }, { "epoch": 0.27, "learning_rate": 0.00029980008008995834, "loss": 0.2454, "step": 155 }, { "epoch": 0.27, "learning_rate": 0.00029977848741143966, "loss": 0.1583, "step": 160 }, { "epoch": 0.27, "eval_loss": 0.10049739480018616, "eval_runtime": 224.1576, "eval_samples_per_second": 7.771, "eval_steps_per_second": 7.771, "step": 160 }, { "epoch": 0.28, "learning_rate": 0.0002997557885339494, "loss": 0.1388, "step": 165 }, { "epoch": 0.29, "learning_rate": 0.0002997319836251319, "loss": 0.0962, "step": 170 }, { "epoch": 0.3, "learning_rate": 0.00029970707286079966, "loss": 0.0602, "step": 175 }, { "epoch": 0.31, "learning_rate": 0.00029968105642493286, "loss": 0.0642, "step": 180 }, { "epoch": 0.31, "eval_loss": 0.10171639919281006, "eval_runtime": 223.682, "eval_samples_per_second": 7.788, "eval_steps_per_second": 7.788, "step": 180 }, { "epoch": 0.32, "learning_rate": 0.0002996539345096776, "loss": 0.0499, "step": 185 }, { "epoch": 0.33, "learning_rate": 0.0002996257073153446, "loss": 0.0486, "step": 190 }, { "epoch": 0.33, "learning_rate": 0.00029959637505040773, "loss": 0.0198, "step": 195 }, { "epoch": 0.34, "learning_rate": 0.00029956593793150233, "loss": 0.0359, "step": 200 }, { "epoch": 0.34, "eval_loss": 0.1516859233379364, "eval_runtime": 228.4243, "eval_samples_per_second": 7.626, "eval_steps_per_second": 7.626, "step": 200 }, { "epoch": 0.35, "learning_rate": 0.0002995343961834238, "loss": 0.2612, "step": 205 }, { "epoch": 0.36, "learning_rate": 0.00029950175003912573, "loss": 0.1232, "step": 210 }, { "epoch": 0.37, "learning_rate": 0.0002994679997397185, "loss": 0.0861, "step": 215 }, { "epoch": 0.38, "learning_rate": 0.00029943314553446706, "loss": 0.0867, "step": 220 }, { "epoch": 0.38, "eval_loss": 0.08929727226495743, "eval_runtime": 229.1014, "eval_samples_per_second": 7.604, "eval_steps_per_second": 7.604, "step": 220 }, { "epoch": 0.39, "learning_rate": 0.0002993971876807896, "loss": 0.0485, "step": 225 }, { "epoch": 0.39, "learning_rate": 0.00029936012644425517, "loss": 0.0516, "step": 230 }, { "epoch": 0.4, "learning_rate": 0.00029932196209858197, "loss": 0.0583, "step": 235 }, { "epoch": 0.41, "learning_rate": 0.00029928269492563537, "loss": 0.0271, "step": 240 }, { "epoch": 0.41, "eval_loss": 0.13832463324069977, "eval_runtime": 227.2299, "eval_samples_per_second": 7.666, "eval_steps_per_second": 7.666, "step": 240 }, { "epoch": 0.42, "learning_rate": 0.00029924232521542557, "loss": 0.0213, "step": 245 }, { "epoch": 0.43, "learning_rate": 0.00029920085326610595, "loss": 0.0388, "step": 250 }, { "epoch": 0.44, "learning_rate": 0.00029915827938397017, "loss": 0.3212, "step": 255 }, { "epoch": 0.45, "learning_rate": 0.0002991146038834505, "loss": 0.1213, "step": 260 }, { "epoch": 0.45, "eval_loss": 0.09067531675100327, "eval_runtime": 228.6975, "eval_samples_per_second": 7.617, "eval_steps_per_second": 7.617, "step": 260 }, { "epoch": 0.45, "learning_rate": 0.00029906982708711533, "loss": 0.0905, "step": 265 }, { "epoch": 0.46, "learning_rate": 0.00029902394932566657, "loss": 0.0397, "step": 270 }, { "epoch": 0.47, "learning_rate": 0.00029897697093793753, "loss": 0.0497, "step": 275 }, { "epoch": 0.48, "learning_rate": 0.0002989288922708902, "loss": 0.0549, "step": 280 }, { "epoch": 0.48, "eval_loss": 0.10685621201992035, "eval_runtime": 227.369, "eval_samples_per_second": 7.662, "eval_steps_per_second": 7.662, "step": 280 }, { "epoch": 0.49, "learning_rate": 0.0002988797136796128, "loss": 0.0281, "step": 285 }, { "epoch": 0.5, "learning_rate": 0.00029882943552731703, "loss": 0.0586, "step": 290 }, { "epoch": 0.51, "learning_rate": 0.0002987780581853355, "loss": 0.0284, "step": 295 }, { "epoch": 0.51, "learning_rate": 0.00029872558203311914, "loss": 0.0436, "step": 300 }, { "epoch": 0.51, "eval_loss": 0.1119319349527359, "eval_runtime": 221.5402, "eval_samples_per_second": 7.863, "eval_steps_per_second": 7.863, "step": 300 }, { "epoch": 0.52, "learning_rate": 0.00029867200745823384, "loss": 0.1798, "step": 305 }, { "epoch": 0.53, "learning_rate": 0.00029861733485635834, "loss": 0.1191, "step": 310 }, { "epoch": 0.54, "learning_rate": 0.0002985615646312807, "loss": 0.0841, "step": 315 }, { "epoch": 0.55, "learning_rate": 0.00029850469719489573, "loss": 0.1025, "step": 320 }, { "epoch": 0.55, "eval_loss": 0.08742260932922363, "eval_runtime": 221.4467, "eval_samples_per_second": 7.866, "eval_steps_per_second": 7.866, "step": 320 }, { "epoch": 0.56, "learning_rate": 0.00029844673296720154, "loss": 0.0457, "step": 325 }, { "epoch": 0.56, "learning_rate": 0.00029838767237629684, "loss": 0.0465, "step": 330 }, { "epoch": 0.57, "learning_rate": 0.0002983275158583775, "loss": 0.063, "step": 335 }, { "epoch": 0.58, "learning_rate": 0.0002982662638577335, "loss": 0.0238, "step": 340 }, { "epoch": 0.58, "eval_loss": 0.14248833060264587, "eval_runtime": 225.6871, "eval_samples_per_second": 7.719, "eval_steps_per_second": 7.719, "step": 340 }, { "epoch": 0.59, "learning_rate": 0.00029820391682674563, "loss": 0.0205, "step": 345 }, { "epoch": 0.6, "learning_rate": 0.00029814047522588194, "loss": 0.0218, "step": 350 }, { "epoch": 0.61, "learning_rate": 0.00029807593952369465, "loss": 0.2612, "step": 355 }, { "epoch": 0.62, "learning_rate": 0.00029801031019681645, "loss": 0.1889, "step": 360 }, { "epoch": 0.62, "eval_loss": 0.09468888491392136, "eval_runtime": 228.9494, "eval_samples_per_second": 7.609, "eval_steps_per_second": 7.609, "step": 360 }, { "epoch": 0.62, "learning_rate": 0.0002979435877299571, "loss": 0.0905, "step": 365 }, { "epoch": 0.63, "learning_rate": 0.0002978757726158998, "loss": 0.0619, "step": 370 }, { "epoch": 0.64, "learning_rate": 0.00029780686535549756, "loss": 0.0625, "step": 375 }, { "epoch": 0.65, "learning_rate": 0.0002977368664576696, "loss": 0.0585, "step": 380 }, { "epoch": 0.65, "eval_loss": 0.08509568870067596, "eval_runtime": 227.8786, "eval_samples_per_second": 7.644, "eval_steps_per_second": 7.644, "step": 380 }, { "epoch": 0.66, "learning_rate": 0.00029766577643939744, "loss": 0.0327, "step": 385 }, { "epoch": 0.67, "learning_rate": 0.00029759359582572103, "loss": 0.043, "step": 390 }, { "epoch": 0.68, "learning_rate": 0.00029752032514973516, "loss": 0.0234, "step": 395 }, { "epoch": 0.68, "learning_rate": 0.00029744596495258525, "loss": 0.0119, "step": 400 }, { "epoch": 0.68, "eval_loss": 0.09636981785297394, "eval_runtime": 228.541, "eval_samples_per_second": 7.622, "eval_steps_per_second": 7.622, "step": 400 }, { "epoch": 0.69, "learning_rate": 0.00029737051578346345, "loss": 0.1557, "step": 405 }, { "epoch": 0.7, "learning_rate": 0.0002972939781996047, "loss": 0.1116, "step": 410 }, { "epoch": 0.71, "learning_rate": 0.0002972163527662824, "loss": 0.0638, "step": 415 }, { "epoch": 0.72, "learning_rate": 0.00029713764005680427, "loss": 0.0726, "step": 420 }, { "epoch": 0.72, "eval_loss": 0.07559170573949814, "eval_runtime": 228.1262, "eval_samples_per_second": 7.636, "eval_steps_per_second": 7.636, "step": 420 } ], "logging_steps": 5, "max_steps": 5840, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 20, "total_flos": 2.1261537057551155e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }