{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 3342, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18, "grad_norm": 6.054772853851318, "learning_rate": 1.9820466786355476e-05, "loss": 0.5721, "step": 100 }, { "epoch": 0.36, "grad_norm": 9.810357093811035, "learning_rate": 1.9640933572710953e-05, "loss": 0.3846, "step": 200 }, { "epoch": 0.54, "grad_norm": 9.026122093200684, "learning_rate": 1.9461400359066428e-05, "loss": 0.3019, "step": 300 }, { "epoch": 0.72, "grad_norm": 11.95788288116455, "learning_rate": 1.9281867145421905e-05, "loss": 0.3071, "step": 400 }, { "epoch": 0.9, "grad_norm": 15.329608917236328, "learning_rate": 1.910233393177738e-05, "loss": 0.2907, "step": 500 }, { "epoch": 1.08, "grad_norm": 12.364314079284668, "learning_rate": 1.8922800718132857e-05, "loss": 0.2646, "step": 600 }, { "epoch": 1.26, "grad_norm": 14.555986404418945, "learning_rate": 1.874326750448833e-05, "loss": 0.2097, "step": 700 }, { "epoch": 1.44, "grad_norm": 4.199421405792236, "learning_rate": 1.8563734290843805e-05, "loss": 0.2389, "step": 800 }, { "epoch": 1.62, "grad_norm": 22.485984802246094, "learning_rate": 1.8384201077199283e-05, "loss": 0.2019, "step": 900 }, { "epoch": 1.8, "grad_norm": 9.688520431518555, "learning_rate": 1.820466786355476e-05, "loss": 0.2594, "step": 1000 }, { "epoch": 1.97, "grad_norm": 18.052719116210938, "learning_rate": 1.8025134649910235e-05, "loss": 0.2044, "step": 1100 }, { "epoch": 2.15, "grad_norm": 0.7371789216995239, "learning_rate": 1.7845601436265712e-05, "loss": 0.1551, "step": 1200 }, { "epoch": 2.33, "grad_norm": 20.938648223876953, "learning_rate": 1.7666068222621186e-05, "loss": 0.1463, "step": 1300 }, { "epoch": 2.51, "grad_norm": 0.25227147340774536, "learning_rate": 1.748653500897666e-05, "loss": 0.1493, "step": 1400 }, { "epoch": 2.69, "grad_norm": 0.27634137868881226, "learning_rate": 1.7307001795332138e-05, "loss": 0.1649, "step": 1500 }, { "epoch": 2.87, "grad_norm": 0.2588340938091278, "learning_rate": 1.7127468581687616e-05, "loss": 0.1521, "step": 1600 }, { "epoch": 3.05, "grad_norm": 0.05350634083151817, "learning_rate": 1.694793536804309e-05, "loss": 0.1343, "step": 1700 }, { "epoch": 3.23, "grad_norm": 0.02972230687737465, "learning_rate": 1.6768402154398564e-05, "loss": 0.1068, "step": 1800 }, { "epoch": 3.41, "grad_norm": 0.09572970867156982, "learning_rate": 1.658886894075404e-05, "loss": 0.1151, "step": 1900 }, { "epoch": 3.59, "grad_norm": 21.431325912475586, "learning_rate": 1.6409335727109516e-05, "loss": 0.1073, "step": 2000 }, { "epoch": 3.77, "grad_norm": 1.4688669443130493, "learning_rate": 1.6229802513464993e-05, "loss": 0.1098, "step": 2100 }, { "epoch": 3.95, "grad_norm": 19.461355209350586, "learning_rate": 1.6050269299820467e-05, "loss": 0.1238, "step": 2200 }, { "epoch": 4.13, "grad_norm": 6.33543586730957, "learning_rate": 1.5870736086175945e-05, "loss": 0.0934, "step": 2300 }, { "epoch": 4.31, "grad_norm": 10.25698184967041, "learning_rate": 1.569120287253142e-05, "loss": 0.068, "step": 2400 }, { "epoch": 4.49, "grad_norm": 0.05421575903892517, "learning_rate": 1.5511669658886893e-05, "loss": 0.0767, "step": 2500 }, { "epoch": 4.67, "grad_norm": 0.04917303845286369, "learning_rate": 1.533213644524237e-05, "loss": 0.1053, "step": 2600 }, { "epoch": 4.85, "grad_norm": 1.722901463508606, "learning_rate": 1.5152603231597847e-05, "loss": 0.0513, "step": 2700 }, { "epoch": 5.03, "grad_norm": 28.548158645629883, "learning_rate": 1.4973070017953321e-05, "loss": 0.0611, "step": 2800 }, { "epoch": 5.21, "grad_norm": 1.0562294721603394, "learning_rate": 1.4793536804308799e-05, "loss": 0.0543, "step": 2900 }, { "epoch": 5.39, "grad_norm": 0.011326675303280354, "learning_rate": 1.4614003590664274e-05, "loss": 0.0491, "step": 3000 }, { "epoch": 5.57, "grad_norm": 0.009987740777432919, "learning_rate": 1.4434470377019749e-05, "loss": 0.0567, "step": 3100 }, { "epoch": 5.75, "grad_norm": 26.81354331970215, "learning_rate": 1.4254937163375226e-05, "loss": 0.0641, "step": 3200 }, { "epoch": 5.92, "grad_norm": 0.042816389352083206, "learning_rate": 1.4075403949730702e-05, "loss": 0.0488, "step": 3300 } ], "logging_steps": 100, "max_steps": 11140, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 3.4115676633458784e+16, "train_batch_size": 14, "trial_name": null, "trial_params": null }