{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.6923076923076925, "eval_steps": 100, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15, "learning_rate": 0.0002, "loss": 0.6472, "step": 10 }, { "epoch": 0.31, "learning_rate": 0.0002, "loss": 0.3259, "step": 20 }, { "epoch": 0.46, "learning_rate": 0.0002, "loss": 0.224, "step": 30 }, { "epoch": 0.62, "learning_rate": 0.0002, "loss": 0.1295, "step": 40 }, { "epoch": 0.77, "learning_rate": 0.0002, "loss": 0.046, "step": 50 }, { "epoch": 0.92, "learning_rate": 0.0002, "loss": 0.0327, "step": 60 }, { "epoch": 1.08, "learning_rate": 0.0002, "loss": 0.0207, "step": 70 }, { "epoch": 1.23, "learning_rate": 0.0002, "loss": 0.0176, "step": 80 }, { "epoch": 1.38, "learning_rate": 0.0002, "loss": 0.015, "step": 90 }, { "epoch": 1.54, "learning_rate": 0.0002, "loss": 0.0139, "step": 100 }, { "epoch": 1.54, "eval_loss": 0.01566016674041748, "eval_runtime": 4.9917, "eval_samples_per_second": 13.022, "eval_steps_per_second": 1.803, "step": 100 }, { "epoch": 1.69, "learning_rate": 0.0002, "loss": 0.0117, "step": 110 }, { "epoch": 1.85, "learning_rate": 0.0002, "loss": 0.0112, "step": 120 }, { "epoch": 2.0, "learning_rate": 0.0002, "loss": 0.0105, "step": 130 }, { "epoch": 2.15, "learning_rate": 0.0002, "loss": 0.0097, "step": 140 }, { "epoch": 2.31, "learning_rate": 0.0002, "loss": 0.0104, "step": 150 }, { "epoch": 2.46, "learning_rate": 0.0002, "loss": 0.0118, "step": 160 }, { "epoch": 2.62, "learning_rate": 0.0002, "loss": 0.01, "step": 170 }, { "epoch": 2.77, "learning_rate": 0.0002, "loss": 0.0091, "step": 180 }, { "epoch": 2.92, "learning_rate": 0.0002, "loss": 0.0072, "step": 190 }, { "epoch": 3.08, "learning_rate": 0.0002, "loss": 0.0079, "step": 200 }, { "epoch": 3.08, "eval_loss": 0.012308033183217049, "eval_runtime": 4.9748, "eval_samples_per_second": 13.066, "eval_steps_per_second": 1.809, "step": 200 }, { "epoch": 3.23, "learning_rate": 0.0002, "loss": 0.0077, "step": 210 }, { "epoch": 3.38, "learning_rate": 0.0002, "loss": 0.0081, "step": 220 }, { "epoch": 3.54, "learning_rate": 0.0002, "loss": 0.0072, "step": 230 }, { "epoch": 3.69, "learning_rate": 0.0002, "loss": 0.0073, "step": 240 }, { "epoch": 3.85, "learning_rate": 0.0002, "loss": 0.0068, "step": 250 }, { "epoch": 4.0, "learning_rate": 0.0002, "loss": 0.007, "step": 260 }, { "epoch": 4.15, "learning_rate": 0.0002, "loss": 0.0057, "step": 270 }, { "epoch": 4.31, "learning_rate": 0.0002, "loss": 0.0072, "step": 280 }, { "epoch": 4.46, "learning_rate": 0.0002, "loss": 0.006, "step": 290 }, { "epoch": 4.62, "learning_rate": 0.0002, "loss": 0.0062, "step": 300 }, { "epoch": 4.62, "eval_loss": 0.011374865658581257, "eval_runtime": 4.9747, "eval_samples_per_second": 13.066, "eval_steps_per_second": 1.809, "step": 300 }, { "epoch": 4.77, "learning_rate": 0.0002, "loss": 0.0063, "step": 310 }, { "epoch": 4.92, "learning_rate": 0.0002, "loss": 0.0067, "step": 320 }, { "epoch": 5.08, "learning_rate": 0.0002, "loss": 0.0052, "step": 330 }, { "epoch": 5.23, "learning_rate": 0.0002, "loss": 0.0069, "step": 340 }, { "epoch": 5.38, "learning_rate": 0.0002, "loss": 0.0054, "step": 350 }, { "epoch": 5.54, "learning_rate": 0.0002, "loss": 0.0061, "step": 360 }, { "epoch": 5.69, "learning_rate": 0.0002, "loss": 0.006, "step": 370 }, { "epoch": 5.85, "learning_rate": 0.0002, "loss": 0.0061, "step": 380 }, { "epoch": 6.0, "learning_rate": 0.0002, "loss": 0.0064, "step": 390 }, { "epoch": 6.15, "learning_rate": 0.0002, "loss": 0.0047, "step": 400 }, { "epoch": 6.15, "eval_loss": 0.009761717170476913, "eval_runtime": 4.974, "eval_samples_per_second": 13.068, "eval_steps_per_second": 1.809, "step": 400 }, { "epoch": 6.31, "learning_rate": 0.0002, "loss": 0.0051, "step": 410 }, { "epoch": 6.46, "learning_rate": 0.0002, "loss": 0.0053, "step": 420 }, { "epoch": 6.62, "learning_rate": 0.0002, "loss": 0.0051, "step": 430 }, { "epoch": 6.77, "learning_rate": 0.0002, "loss": 0.0052, "step": 440 }, { "epoch": 6.92, "learning_rate": 0.0002, "loss": 0.005, "step": 450 }, { "epoch": 7.08, "learning_rate": 0.0002, "loss": 0.0052, "step": 460 }, { "epoch": 7.23, "learning_rate": 0.0002, "loss": 0.0042, "step": 470 }, { "epoch": 7.38, "learning_rate": 0.0002, "loss": 0.0051, "step": 480 }, { "epoch": 7.54, "learning_rate": 0.0002, "loss": 0.005, "step": 490 }, { "epoch": 7.69, "learning_rate": 0.0002, "loss": 0.0049, "step": 500 }, { "epoch": 7.69, "eval_loss": 0.01100065279752016, "eval_runtime": 4.9709, "eval_samples_per_second": 13.076, "eval_steps_per_second": 1.811, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "total_flos": 8.7710798708736e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }