|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 50, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004, |
|
"eval_loss": 1.9942963123321533, |
|
"eval_runtime": 2.9922, |
|
"eval_samples_per_second": 52.136, |
|
"eval_steps_per_second": 2.674, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.944833517074585, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 1.4744, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 0.8111104965209961, |
|
"eval_runtime": 2.9551, |
|
"eval_samples_per_second": 52.79, |
|
"eval_steps_per_second": 2.707, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.267634868621826, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.8438, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.7407088875770569, |
|
"eval_runtime": 3.0287, |
|
"eval_samples_per_second": 51.507, |
|
"eval_steps_per_second": 2.641, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 2.4580960273742676, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7954, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 0.7344938516616821, |
|
"eval_runtime": 3.0361, |
|
"eval_samples_per_second": 51.381, |
|
"eval_steps_per_second": 2.635, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.8795665502548218, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.8062, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.7369781732559204, |
|
"eval_runtime": 2.9655, |
|
"eval_samples_per_second": 52.604, |
|
"eval_steps_per_second": 2.698, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.9113938808441162, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.8094, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 0.7419852614402771, |
|
"eval_runtime": 2.9722, |
|
"eval_samples_per_second": 52.487, |
|
"eval_steps_per_second": 2.692, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.6403286457061768, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7765, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 0.7483660578727722, |
|
"eval_runtime": 2.9618, |
|
"eval_samples_per_second": 52.671, |
|
"eval_steps_per_second": 2.701, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.6574498414993286, |
|
"learning_rate": 1.9983081582712684e-05, |
|
"loss": 0.8277, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 0.7558473348617554, |
|
"eval_runtime": 2.9758, |
|
"eval_samples_per_second": 52.423, |
|
"eval_steps_per_second": 2.688, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.647865653038025, |
|
"learning_rate": 1.9932383577419432e-05, |
|
"loss": 0.8109, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.7527696490287781, |
|
"eval_runtime": 2.9606, |
|
"eval_samples_per_second": 52.692, |
|
"eval_steps_per_second": 2.702, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 6.825328350067139, |
|
"learning_rate": 1.9848077530122083e-05, |
|
"loss": 0.8396, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 0.7513787150382996, |
|
"eval_runtime": 2.9613, |
|
"eval_samples_per_second": 52.679, |
|
"eval_steps_per_second": 2.701, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.86250901222229, |
|
"learning_rate": 1.973044870579824e-05, |
|
"loss": 0.814, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.7477762699127197, |
|
"eval_runtime": 2.9731, |
|
"eval_samples_per_second": 52.471, |
|
"eval_steps_per_second": 2.691, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 3.209193229675293, |
|
"learning_rate": 1.957989512315489e-05, |
|
"loss": 0.512, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"eval_loss": 0.792364239692688, |
|
"eval_runtime": 2.9678, |
|
"eval_samples_per_second": 52.564, |
|
"eval_steps_per_second": 2.696, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 3.9676766395568848, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.4991, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.7819795608520508, |
|
"eval_runtime": 2.9632, |
|
"eval_samples_per_second": 52.645, |
|
"eval_steps_per_second": 2.7, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 1.8103879690170288, |
|
"learning_rate": 1.9182161068802742e-05, |
|
"loss": 0.5022, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"eval_loss": 0.7795585989952087, |
|
"eval_runtime": 2.9661, |
|
"eval_samples_per_second": 52.594, |
|
"eval_steps_per_second": 2.697, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 1.4793493747711182, |
|
"learning_rate": 1.8936326403234125e-05, |
|
"loss": 0.4964, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"eval_loss": 0.7873778343200684, |
|
"eval_runtime": 2.9886, |
|
"eval_samples_per_second": 52.199, |
|
"eval_steps_per_second": 2.677, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.360481858253479, |
|
"learning_rate": 1.866025403784439e-05, |
|
"loss": 0.519, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 0.779276430606842, |
|
"eval_runtime": 3.0307, |
|
"eval_samples_per_second": 51.473, |
|
"eval_steps_per_second": 2.64, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.756487250328064, |
|
"learning_rate": 1.8354878114129368e-05, |
|
"loss": 0.5117, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.7745152115821838, |
|
"eval_runtime": 2.9728, |
|
"eval_samples_per_second": 52.475, |
|
"eval_steps_per_second": 2.691, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 1.5152206420898438, |
|
"learning_rate": 1.802123192755044e-05, |
|
"loss": 0.5411, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"eval_loss": 0.771414577960968, |
|
"eval_runtime": 2.9972, |
|
"eval_samples_per_second": 52.048, |
|
"eval_steps_per_second": 2.669, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.860755681991577, |
|
"learning_rate": 1.766044443118978e-05, |
|
"loss": 0.5345, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"eval_loss": 0.7689093351364136, |
|
"eval_runtime": 2.9564, |
|
"eval_samples_per_second": 52.766, |
|
"eval_steps_per_second": 2.706, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 1.5143054723739624, |
|
"learning_rate": 1.7273736415730488e-05, |
|
"loss": 0.534, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"eval_loss": 0.7705763578414917, |
|
"eval_runtime": 2.9633, |
|
"eval_samples_per_second": 52.644, |
|
"eval_steps_per_second": 2.7, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.869483470916748, |
|
"learning_rate": 1.686241637868734e-05, |
|
"loss": 0.5292, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.769786536693573, |
|
"eval_runtime": 2.961, |
|
"eval_samples_per_second": 52.686, |
|
"eval_steps_per_second": 2.702, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 3000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 500, |
|
"total_flos": 1.1397181534424269e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|