{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.753086419753085, "eval_steps": 100, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.24691358024691357, "grad_norm": 15.961835861206055, "learning_rate": 1.0000000000000002e-06, "loss": 0.6418, "step": 10 }, { "epoch": 0.49382716049382713, "grad_norm": 8.321958541870117, "learning_rate": 2.0000000000000003e-06, "loss": 0.6123, "step": 20 }, { "epoch": 0.7407407407407407, "grad_norm": 10.797492980957031, "learning_rate": 3e-06, "loss": 0.5325, "step": 30 }, { "epoch": 0.9876543209876543, "grad_norm": 8.990424156188965, "learning_rate": 4.000000000000001e-06, "loss": 0.3937, "step": 40 }, { "epoch": 1.2345679012345678, "grad_norm": 5.069626331329346, "learning_rate": 5e-06, "loss": 0.2157, "step": 50 }, { "epoch": 1.4814814814814814, "grad_norm": 2.54313063621521, "learning_rate": 6e-06, "loss": 0.1184, "step": 60 }, { "epoch": 1.7283950617283952, "grad_norm": 1.0293046236038208, "learning_rate": 7e-06, "loss": 0.0469, "step": 70 }, { "epoch": 1.9753086419753085, "grad_norm": 0.32547664642333984, "learning_rate": 8.000000000000001e-06, "loss": 0.0176, "step": 80 }, { "epoch": 2.2222222222222223, "grad_norm": 0.20642875134944916, "learning_rate": 9e-06, "loss": 0.0088, "step": 90 }, { "epoch": 2.4691358024691357, "grad_norm": 0.16824281215667725, "learning_rate": 1e-05, "loss": 0.0056, "step": 100 }, { "epoch": 2.4691358024691357, "eval_accuracy": 1.0, "eval_accuracy_label_GD622:Null": 1.0, "eval_accuracy_label_GD622:YES": 1.0, "eval_f1": 1.0, "eval_loss": 0.004177506547421217, "eval_precision": 1.0, "eval_recall": 1.0, "eval_runtime": 0.7278, "eval_samples_per_second": 133.281, "eval_steps_per_second": 9.618, "step": 100 }, { "epoch": 2.7160493827160495, "grad_norm": 0.10963490605354309, "learning_rate": 1.1000000000000001e-05, "loss": 0.0038, "step": 110 }, { "epoch": 2.962962962962963, "grad_norm": 0.07338671386241913, "learning_rate": 1.2e-05, "loss": 0.0029, "step": 120 }, { "epoch": 3.2098765432098766, "grad_norm": 0.0706721693277359, "learning_rate": 1.3000000000000001e-05, "loss": 0.0024, "step": 130 }, { "epoch": 3.45679012345679, "grad_norm": 0.06313496083021164, "learning_rate": 1.4e-05, "loss": 0.002, "step": 140 }, { "epoch": 3.7037037037037037, "grad_norm": 0.063168965280056, "learning_rate": 1.5000000000000002e-05, "loss": 0.0018, "step": 150 }, { "epoch": 3.950617283950617, "grad_norm": 0.05419662222266197, "learning_rate": 1.6000000000000003e-05, "loss": 0.0016, "step": 160 }, { "epoch": 4.197530864197531, "grad_norm": 0.047441959381103516, "learning_rate": 1.7e-05, "loss": 0.0014, "step": 170 }, { "epoch": 4.444444444444445, "grad_norm": 0.04058253392577171, "learning_rate": 1.8e-05, "loss": 0.0012, "step": 180 }, { "epoch": 4.6913580246913575, "grad_norm": 0.029308538883924484, "learning_rate": 1.9e-05, "loss": 0.0011, "step": 190 }, { "epoch": 4.938271604938271, "grad_norm": 0.027978356927633286, "learning_rate": 2e-05, "loss": 0.001, "step": 200 }, { "epoch": 4.938271604938271, "eval_accuracy": 1.0, "eval_accuracy_label_GD622:Null": 1.0, "eval_accuracy_label_GD622:YES": 1.0, "eval_f1": 1.0, "eval_loss": 0.0008954937802627683, "eval_precision": 1.0, "eval_recall": 1.0, "eval_runtime": 0.521, "eval_samples_per_second": 186.189, "eval_steps_per_second": 13.436, "step": 200 }, { "epoch": 5.185185185185185, "grad_norm": 0.023471660912036896, "learning_rate": 1.9666666666666666e-05, "loss": 0.0009, "step": 210 }, { "epoch": 5.432098765432099, "grad_norm": 0.021547624841332436, "learning_rate": 1.9333333333333333e-05, "loss": 0.0008, "step": 220 }, { "epoch": 5.679012345679013, "grad_norm": 0.02687031961977482, "learning_rate": 1.9e-05, "loss": 0.0007, "step": 230 }, { "epoch": 5.925925925925926, "grad_norm": 0.021016767248511314, "learning_rate": 1.866666666666667e-05, "loss": 0.0006, "step": 240 }, { "epoch": 6.172839506172839, "grad_norm": 0.017553668469190598, "learning_rate": 1.8333333333333333e-05, "loss": 0.0006, "step": 250 }, { "epoch": 6.419753086419753, "grad_norm": 0.016819961369037628, "learning_rate": 1.8e-05, "loss": 0.0006, "step": 260 }, { "epoch": 6.666666666666667, "grad_norm": 0.018349776044487953, "learning_rate": 1.7666666666666668e-05, "loss": 0.0005, "step": 270 }, { "epoch": 6.91358024691358, "grad_norm": 0.01844148337841034, "learning_rate": 1.7333333333333336e-05, "loss": 0.0005, "step": 280 }, { "epoch": 7.160493827160494, "grad_norm": 0.016825733706355095, "learning_rate": 1.7e-05, "loss": 0.0005, "step": 290 }, { "epoch": 7.407407407407407, "grad_norm": 0.013994095847010612, "learning_rate": 1.6666666666666667e-05, "loss": 0.0005, "step": 300 }, { "epoch": 7.407407407407407, "eval_accuracy": 1.0, "eval_accuracy_label_GD622:Null": 1.0, "eval_accuracy_label_GD622:YES": 1.0, "eval_f1": 1.0, "eval_loss": 0.00043683411786332726, "eval_precision": 1.0, "eval_recall": 1.0, "eval_runtime": 0.5186, "eval_samples_per_second": 187.027, "eval_steps_per_second": 13.497, "step": 300 }, { "epoch": 7.654320987654321, "grad_norm": 0.015141790732741356, "learning_rate": 1.6333333333333335e-05, "loss": 0.0004, "step": 310 }, { "epoch": 7.901234567901234, "grad_norm": 0.013697362504899502, "learning_rate": 1.6000000000000003e-05, "loss": 0.0004, "step": 320 }, { "epoch": 8.148148148148149, "grad_norm": 0.01174458209425211, "learning_rate": 1.5666666666666667e-05, "loss": 0.0004, "step": 330 }, { "epoch": 8.395061728395062, "grad_norm": 0.013013974763453007, "learning_rate": 1.5333333333333334e-05, "loss": 0.0004, "step": 340 }, { "epoch": 8.641975308641975, "grad_norm": 0.011320522986352444, "learning_rate": 1.5000000000000002e-05, "loss": 0.0004, "step": 350 }, { "epoch": 8.88888888888889, "grad_norm": 0.010261823423206806, "learning_rate": 1.4666666666666666e-05, "loss": 0.0003, "step": 360 }, { "epoch": 9.135802469135802, "grad_norm": 0.009858865290880203, "learning_rate": 1.4333333333333334e-05, "loss": 0.0003, "step": 370 }, { "epoch": 9.382716049382717, "grad_norm": 0.009954158216714859, "learning_rate": 1.4e-05, "loss": 0.0003, "step": 380 }, { "epoch": 9.62962962962963, "grad_norm": 0.010688001289963722, "learning_rate": 1.3666666666666667e-05, "loss": 0.0003, "step": 390 }, { "epoch": 9.876543209876543, "grad_norm": 0.009526471607387066, "learning_rate": 1.3333333333333333e-05, "loss": 0.0003, "step": 400 }, { "epoch": 9.876543209876543, "eval_accuracy": 1.0, "eval_accuracy_label_GD622:Null": 1.0, "eval_accuracy_label_GD622:YES": 1.0, "eval_f1": 1.0, "eval_loss": 0.0002707206876948476, "eval_precision": 1.0, "eval_recall": 1.0, "eval_runtime": 0.5189, "eval_samples_per_second": 186.945, "eval_steps_per_second": 13.491, "step": 400 }, { "epoch": 10.123456790123457, "grad_norm": 0.00999557226896286, "learning_rate": 1.3000000000000001e-05, "loss": 0.0003, "step": 410 }, { "epoch": 10.37037037037037, "grad_norm": 0.009755443781614304, "learning_rate": 1.2666666666666667e-05, "loss": 0.0003, "step": 420 }, { "epoch": 10.617283950617283, "grad_norm": 0.008844558149576187, "learning_rate": 1.2333333333333334e-05, "loss": 0.0003, "step": 430 }, { "epoch": 10.864197530864198, "grad_norm": 0.00740455137565732, "learning_rate": 1.2e-05, "loss": 0.0003, "step": 440 }, { "epoch": 11.11111111111111, "grad_norm": 0.007182607427239418, "learning_rate": 1.1666666666666668e-05, "loss": 0.0002, "step": 450 }, { "epoch": 11.358024691358025, "grad_norm": 0.007493776269257069, "learning_rate": 1.1333333333333334e-05, "loss": 0.0003, "step": 460 }, { "epoch": 11.604938271604938, "grad_norm": 0.008535212837159634, "learning_rate": 1.1000000000000001e-05, "loss": 0.0002, "step": 470 }, { "epoch": 11.851851851851851, "grad_norm": 0.007039290387183428, "learning_rate": 1.0666666666666667e-05, "loss": 0.0002, "step": 480 }, { "epoch": 12.098765432098766, "grad_norm": 0.007746797055006027, "learning_rate": 1.0333333333333335e-05, "loss": 0.0002, "step": 490 }, { "epoch": 12.345679012345679, "grad_norm": 0.008360541425645351, "learning_rate": 1e-05, "loss": 0.0002, "step": 500 }, { "epoch": 12.345679012345679, "eval_accuracy": 1.0, "eval_accuracy_label_GD622:Null": 1.0, "eval_accuracy_label_GD622:YES": 1.0, "eval_f1": 1.0, "eval_loss": 0.00022185646230354905, "eval_precision": 1.0, "eval_recall": 1.0, "eval_runtime": 0.5185, "eval_samples_per_second": 187.093, "eval_steps_per_second": 13.502, "step": 500 }, { "epoch": 12.592592592592592, "grad_norm": 0.006791314110159874, "learning_rate": 9.666666666666667e-06, "loss": 0.0002, "step": 510 }, { "epoch": 12.839506172839506, "grad_norm": 0.007602753583341837, "learning_rate": 9.333333333333334e-06, "loss": 0.0002, "step": 520 }, { "epoch": 13.08641975308642, "grad_norm": 0.0071947514079511166, "learning_rate": 9e-06, "loss": 0.0002, "step": 530 }, { "epoch": 13.333333333333334, "grad_norm": 0.007956212386488914, "learning_rate": 8.666666666666668e-06, "loss": 0.0002, "step": 540 }, { "epoch": 13.580246913580247, "grad_norm": 0.007944190874695778, "learning_rate": 8.333333333333334e-06, "loss": 0.0002, "step": 550 }, { "epoch": 13.82716049382716, "grad_norm": 0.007252030540257692, "learning_rate": 8.000000000000001e-06, "loss": 0.0002, "step": 560 }, { "epoch": 14.074074074074074, "grad_norm": 0.007420521695166826, "learning_rate": 7.666666666666667e-06, "loss": 0.0002, "step": 570 }, { "epoch": 14.320987654320987, "grad_norm": 0.007615529000759125, "learning_rate": 7.333333333333333e-06, "loss": 0.0002, "step": 580 }, { "epoch": 14.567901234567902, "grad_norm": 0.007768448442220688, "learning_rate": 7e-06, "loss": 0.0002, "step": 590 }, { "epoch": 14.814814814814815, "grad_norm": 0.005428287200629711, "learning_rate": 6.666666666666667e-06, "loss": 0.0002, "step": 600 }, { "epoch": 14.814814814814815, "eval_accuracy": 1.0, "eval_accuracy_label_GD622:Null": 1.0, "eval_accuracy_label_GD622:YES": 1.0, "eval_f1": 1.0, "eval_loss": 0.00019342350424267352, "eval_precision": 1.0, "eval_recall": 1.0, "eval_runtime": 0.5188, "eval_samples_per_second": 186.984, "eval_steps_per_second": 13.494, "step": 600 }, { "epoch": 15.061728395061728, "grad_norm": 0.005406714044511318, "learning_rate": 6.333333333333333e-06, "loss": 0.0002, "step": 610 }, { "epoch": 15.308641975308642, "grad_norm": 0.007229079958051443, "learning_rate": 6e-06, "loss": 0.0002, "step": 620 }, { "epoch": 15.555555555555555, "grad_norm": 0.006031244061887264, "learning_rate": 5.666666666666667e-06, "loss": 0.0002, "step": 630 }, { "epoch": 15.802469135802468, "grad_norm": 0.0075646815821528435, "learning_rate": 5.333333333333334e-06, "loss": 0.0002, "step": 640 }, { "epoch": 16.049382716049383, "grad_norm": 0.006907324306666851, "learning_rate": 5e-06, "loss": 0.0002, "step": 650 }, { "epoch": 16.296296296296298, "grad_norm": 0.005848431494086981, "learning_rate": 4.666666666666667e-06, "loss": 0.0002, "step": 660 }, { "epoch": 16.54320987654321, "grad_norm": 0.007128111552447081, "learning_rate": 4.333333333333334e-06, "loss": 0.0002, "step": 670 }, { "epoch": 16.790123456790123, "grad_norm": 0.00657699815928936, "learning_rate": 4.000000000000001e-06, "loss": 0.0002, "step": 680 }, { "epoch": 17.037037037037038, "grad_norm": 0.005719279404729605, "learning_rate": 3.6666666666666666e-06, "loss": 0.0002, "step": 690 }, { "epoch": 17.28395061728395, "grad_norm": 0.0061570280231535435, "learning_rate": 3.3333333333333333e-06, "loss": 0.0002, "step": 700 }, { "epoch": 17.28395061728395, "eval_accuracy": 1.0, "eval_accuracy_label_GD622:Null": 1.0, "eval_accuracy_label_GD622:YES": 1.0, "eval_f1": 1.0, "eval_loss": 0.0001789480447769165, "eval_precision": 1.0, "eval_recall": 1.0, "eval_runtime": 0.5201, "eval_samples_per_second": 186.493, "eval_steps_per_second": 13.458, "step": 700 }, { "epoch": 17.530864197530864, "grad_norm": 0.006241227500140667, "learning_rate": 3e-06, "loss": 0.0002, "step": 710 }, { "epoch": 17.77777777777778, "grad_norm": 0.006561820395290852, "learning_rate": 2.666666666666667e-06, "loss": 0.0002, "step": 720 }, { "epoch": 18.02469135802469, "grad_norm": 0.00643093092367053, "learning_rate": 2.3333333333333336e-06, "loss": 0.0002, "step": 730 }, { "epoch": 18.271604938271604, "grad_norm": 0.005693737417459488, "learning_rate": 2.0000000000000003e-06, "loss": 0.0002, "step": 740 }, { "epoch": 18.51851851851852, "grad_norm": 0.0065653519704937935, "learning_rate": 1.6666666666666667e-06, "loss": 0.0002, "step": 750 }, { "epoch": 18.765432098765434, "grad_norm": 0.004837734624743462, "learning_rate": 1.3333333333333334e-06, "loss": 0.0002, "step": 760 }, { "epoch": 19.012345679012345, "grad_norm": 0.005498081911355257, "learning_rate": 1.0000000000000002e-06, "loss": 0.0002, "step": 770 }, { "epoch": 19.25925925925926, "grad_norm": 0.0063169412314891815, "learning_rate": 6.666666666666667e-07, "loss": 0.0002, "step": 780 }, { "epoch": 19.506172839506174, "grad_norm": 0.00681178318336606, "learning_rate": 3.3333333333333335e-07, "loss": 0.0002, "step": 790 }, { "epoch": 19.753086419753085, "grad_norm": 0.006289825774729252, "learning_rate": 0.0, "loss": 0.0002, "step": 800 }, { "epoch": 19.753086419753085, "eval_accuracy": 1.0, "eval_accuracy_label_GD622:Null": 1.0, "eval_accuracy_label_GD622:YES": 1.0, "eval_f1": 1.0, "eval_loss": 0.0001743907341733575, "eval_precision": 1.0, "eval_recall": 1.0, "eval_runtime": 0.5213, "eval_samples_per_second": 186.078, "eval_steps_per_second": 13.428, "step": 800 }, { "epoch": 19.753086419753085, "step": 800, "total_flos": 93696895492200.0, "train_loss": 0.032879929275804895, "train_runtime": 407.7795, "train_samples_per_second": 63.073, "train_steps_per_second": 1.962 }, { "epoch": 19.753086419753085, "eval_accuracy": 1.0, "eval_accuracy_label_GD622:Null": 1.0, "eval_accuracy_label_GD622:YES": 1.0, "eval_f1": 1.0, "eval_loss": 0.0001743907341733575, "eval_precision": 1.0, "eval_recall": 1.0, "eval_runtime": 0.5104, "eval_samples_per_second": 190.041, "eval_steps_per_second": 13.714, "step": 800 } ], "logging_steps": 10, "max_steps": 800, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 93696895492200.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }