{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 3171,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0946073793755913,
      "grad_norm": 1.295289158821106,
      "learning_rate": 0.0009700409965310628,
      "loss": 2.3794,
      "step": 100
    },
    {
      "epoch": 0.1892147587511826,
      "grad_norm": 0.7961627244949341,
      "learning_rate": 0.0009385052034058657,
      "loss": 0.5368,
      "step": 200
    },
    {
      "epoch": 0.28382213812677387,
      "grad_norm": 0.8890175223350525,
      "learning_rate": 0.0009069694102806686,
      "loss": 0.3488,
      "step": 300
    },
    {
      "epoch": 0.3784295175023652,
      "grad_norm": 0.5166388154029846,
      "learning_rate": 0.0008754336171554715,
      "loss": 0.2712,
      "step": 400
    },
    {
      "epoch": 0.47303689687795647,
      "grad_norm": 0.3027492165565491,
      "learning_rate": 0.0008438978240302744,
      "loss": 0.2219,
      "step": 500
    },
    {
      "epoch": 0.5676442762535477,
      "grad_norm": 0.8666072487831116,
      "learning_rate": 0.0008123620309050773,
      "loss": 0.1906,
      "step": 600
    },
    {
      "epoch": 0.6622516556291391,
      "grad_norm": 0.5293228030204773,
      "learning_rate": 0.0007808262377798802,
      "loss": 0.17,
      "step": 700
    },
    {
      "epoch": 0.7568590350047304,
      "grad_norm": 0.33755800127983093,
      "learning_rate": 0.0007492904446546831,
      "loss": 0.1506,
      "step": 800
    },
    {
      "epoch": 0.8514664143803217,
      "grad_norm": 0.3760841190814972,
      "learning_rate": 0.000717754651529486,
      "loss": 0.142,
      "step": 900
    },
    {
      "epoch": 0.9460737937559129,
      "grad_norm": 0.3145512044429779,
      "learning_rate": 0.0006862188584042889,
      "loss": 0.1256,
      "step": 1000
    },
    {
      "epoch": 1.0406811731315042,
      "grad_norm": 0.47477108240127563,
      "learning_rate": 0.0006546830652790918,
      "loss": 0.1225,
      "step": 1100
    },
    {
      "epoch": 1.1352885525070955,
      "grad_norm": 0.3582330644130707,
      "learning_rate": 0.0006231472721538947,
      "loss": 0.1148,
      "step": 1200
    },
    {
      "epoch": 1.2298959318826868,
      "grad_norm": 0.4500308036804199,
      "learning_rate": 0.0005916114790286976,
      "loss": 0.1148,
      "step": 1300
    },
    {
      "epoch": 1.3245033112582782,
      "grad_norm": 0.2415657341480255,
      "learning_rate": 0.0005600756859035005,
      "loss": 0.0934,
      "step": 1400
    },
    {
      "epoch": 1.4191106906338695,
      "grad_norm": 0.49272701144218445,
      "learning_rate": 0.0005285398927783034,
      "loss": 0.098,
      "step": 1500
    },
    {
      "epoch": 1.5137180700094608,
      "grad_norm": 0.28604432940483093,
      "learning_rate": 0.0004970040996531063,
      "loss": 0.1056,
      "step": 1600
    },
    {
      "epoch": 1.608325449385052,
      "grad_norm": 0.4883616864681244,
      "learning_rate": 0.00046546830652790914,
      "loss": 0.0992,
      "step": 1700
    },
    {
      "epoch": 1.7029328287606433,
      "grad_norm": 0.42010796070098877,
      "learning_rate": 0.0004339325134027121,
      "loss": 0.0879,
      "step": 1800
    },
    {
      "epoch": 1.7975402081362346,
      "grad_norm": 0.23443974554538727,
      "learning_rate": 0.000402396720277515,
      "loss": 0.0906,
      "step": 1900
    },
    {
      "epoch": 1.8921475875118259,
      "grad_norm": 0.6164644956588745,
      "learning_rate": 0.0003708609271523179,
      "loss": 0.0834,
      "step": 2000
    },
    {
      "epoch": 1.9867549668874172,
      "grad_norm": 0.36332041025161743,
      "learning_rate": 0.0003393251340271208,
      "loss": 0.0798,
      "step": 2100
    },
    {
      "epoch": 2.0813623462630084,
      "grad_norm": 0.3371862769126892,
      "learning_rate": 0.00030778934090192365,
      "loss": 0.0893,
      "step": 2200
    },
    {
      "epoch": 2.1759697256385997,
      "grad_norm": 0.2492402046918869,
      "learning_rate": 0.00027625354777672655,
      "loss": 0.0768,
      "step": 2300
    },
    {
      "epoch": 2.270577105014191,
      "grad_norm": 0.34287121891975403,
      "learning_rate": 0.00024503311258278145,
      "loss": 0.0776,
      "step": 2400
    },
    {
      "epoch": 2.3651844843897822,
      "grad_norm": 0.4034591615200043,
      "learning_rate": 0.00021349731945758435,
      "loss": 0.0717,
      "step": 2500
    },
    {
      "epoch": 2.4597918637653735,
      "grad_norm": 0.3876320719718933,
      "learning_rate": 0.00018196152633238728,
      "loss": 0.0762,
      "step": 2600
    },
    {
      "epoch": 2.5543992431409652,
      "grad_norm": 0.16697722673416138,
      "learning_rate": 0.00015042573320719015,
      "loss": 0.0688,
      "step": 2700
    },
    {
      "epoch": 2.6490066225165565,
      "grad_norm": 0.32368209958076477,
      "learning_rate": 0.00011888994008199306,
      "loss": 0.0728,
      "step": 2800
    },
    {
      "epoch": 2.7436140018921478,
      "grad_norm": 0.4008019268512726,
      "learning_rate": 8.735414695679597e-05,
      "loss": 0.0756,
      "step": 2900
    },
    {
      "epoch": 2.838221381267739,
      "grad_norm": 0.25073108077049255,
      "learning_rate": 5.581835383159887e-05,
      "loss": 0.0766,
      "step": 3000
    },
    {
      "epoch": 2.9328287606433303,
      "grad_norm": 0.3861249089241028,
      "learning_rate": 2.4282560706401765e-05,
      "loss": 0.0717,
      "step": 3100
    }
  ],
  "logging_steps": 100,
  "max_steps": 3171,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 202375839744000.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}