|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0139372822299653, |
|
"eval_steps": 9, |
|
"global_step": 36, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.027874564459930314, |
|
"grad_norm": 1.528730034828186, |
|
"learning_rate": 5e-05, |
|
"loss": 5.1122, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.027874564459930314, |
|
"eval_loss": 4.7186970710754395, |
|
"eval_runtime": 1.266, |
|
"eval_samples_per_second": 191.16, |
|
"eval_steps_per_second": 48.185, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.05574912891986063, |
|
"grad_norm": 1.4118348360061646, |
|
"learning_rate": 0.0001, |
|
"loss": 4.8444, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.08362369337979095, |
|
"grad_norm": 1.4177184104919434, |
|
"learning_rate": 9.978670881475172e-05, |
|
"loss": 5.0323, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.11149825783972125, |
|
"grad_norm": 1.3121063709259033, |
|
"learning_rate": 9.91486549841951e-05, |
|
"loss": 4.9379, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.13937282229965156, |
|
"grad_norm": 1.1417440176010132, |
|
"learning_rate": 9.809128215864097e-05, |
|
"loss": 4.9118, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1672473867595819, |
|
"grad_norm": 1.0766890048980713, |
|
"learning_rate": 9.662361147021779e-05, |
|
"loss": 4.8126, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.1951219512195122, |
|
"grad_norm": 1.0330865383148193, |
|
"learning_rate": 9.475816456775313e-05, |
|
"loss": 5.0199, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.2229965156794425, |
|
"grad_norm": 0.8478246331214905, |
|
"learning_rate": 9.251085678648072e-05, |
|
"loss": 4.7868, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.2508710801393728, |
|
"grad_norm": 0.7461872100830078, |
|
"learning_rate": 8.9900861364012e-05, |
|
"loss": 4.7503, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.2508710801393728, |
|
"eval_loss": 4.568607330322266, |
|
"eval_runtime": 1.2588, |
|
"eval_samples_per_second": 192.254, |
|
"eval_steps_per_second": 48.461, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.2787456445993031, |
|
"grad_norm": 0.7740032076835632, |
|
"learning_rate": 8.695044586103296e-05, |
|
"loss": 4.6227, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.30662020905923343, |
|
"grad_norm": 0.7942211627960205, |
|
"learning_rate": 8.368478218232787e-05, |
|
"loss": 4.6837, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.3344947735191638, |
|
"grad_norm": 0.8654100894927979, |
|
"learning_rate": 8.013173181896283e-05, |
|
"loss": 4.6198, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.3623693379790941, |
|
"grad_norm": 0.9776397943496704, |
|
"learning_rate": 7.63216081438678e-05, |
|
"loss": 4.7598, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.3902439024390244, |
|
"grad_norm": 0.828058123588562, |
|
"learning_rate": 7.228691778882693e-05, |
|
"loss": 4.6796, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.4181184668989547, |
|
"grad_norm": 0.8717594146728516, |
|
"learning_rate": 6.806208330935766e-05, |
|
"loss": 4.5456, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.445993031358885, |
|
"grad_norm": 0.9016168117523193, |
|
"learning_rate": 6.368314950360415e-05, |
|
"loss": 4.7329, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.4738675958188153, |
|
"grad_norm": 0.9021468162536621, |
|
"learning_rate": 5.918747589082853e-05, |
|
"loss": 4.6807, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.5017421602787456, |
|
"grad_norm": 0.9048068523406982, |
|
"learning_rate": 5.4613417973165106e-05, |
|
"loss": 4.6001, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.5017421602787456, |
|
"eval_loss": 4.461349010467529, |
|
"eval_runtime": 1.2645, |
|
"eval_samples_per_second": 191.38, |
|
"eval_steps_per_second": 48.24, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.5296167247386759, |
|
"grad_norm": 0.9348726868629456, |
|
"learning_rate": 5e-05, |
|
"loss": 4.7054, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.5574912891986062, |
|
"grad_norm": 0.8965240120887756, |
|
"learning_rate": 4.5386582026834906e-05, |
|
"loss": 4.7152, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.5853658536585366, |
|
"grad_norm": 0.8576871752738953, |
|
"learning_rate": 4.0812524109171476e-05, |
|
"loss": 4.629, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.6132404181184669, |
|
"grad_norm": 0.8105357885360718, |
|
"learning_rate": 3.631685049639586e-05, |
|
"loss": 4.6084, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.6411149825783972, |
|
"grad_norm": 0.7387930750846863, |
|
"learning_rate": 3.1937916690642356e-05, |
|
"loss": 4.5363, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.6689895470383276, |
|
"grad_norm": 0.8787197470664978, |
|
"learning_rate": 2.771308221117309e-05, |
|
"loss": 4.639, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.6968641114982579, |
|
"grad_norm": 0.8041818141937256, |
|
"learning_rate": 2.3678391856132204e-05, |
|
"loss": 4.583, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.7247386759581882, |
|
"grad_norm": 0.7042474746704102, |
|
"learning_rate": 1.9868268181037185e-05, |
|
"loss": 4.5931, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.7526132404181185, |
|
"grad_norm": 0.6868075132369995, |
|
"learning_rate": 1.631521781767214e-05, |
|
"loss": 4.5914, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.7526132404181185, |
|
"eval_loss": 4.399294376373291, |
|
"eval_runtime": 1.2619, |
|
"eval_samples_per_second": 191.767, |
|
"eval_steps_per_second": 48.338, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.7804878048780488, |
|
"grad_norm": 0.6679636836051941, |
|
"learning_rate": 1.3049554138967051e-05, |
|
"loss": 4.5055, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.8083623693379791, |
|
"grad_norm": 0.7194696068763733, |
|
"learning_rate": 1.0099138635988026e-05, |
|
"loss": 4.5841, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.8362369337979094, |
|
"grad_norm": 0.7062932848930359, |
|
"learning_rate": 7.489143213519301e-06, |
|
"loss": 4.502, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.8641114982578397, |
|
"grad_norm": 0.7180935740470886, |
|
"learning_rate": 5.241835432246889e-06, |
|
"loss": 4.4559, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.89198606271777, |
|
"grad_norm": 0.702731192111969, |
|
"learning_rate": 3.376388529782215e-06, |
|
"loss": 4.6249, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.9198606271777003, |
|
"grad_norm": 0.7095968127250671, |
|
"learning_rate": 1.908717841359048e-06, |
|
"loss": 4.4653, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.9477351916376306, |
|
"grad_norm": 0.7153111696243286, |
|
"learning_rate": 8.513450158049108e-07, |
|
"loss": 4.542, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.975609756097561, |
|
"grad_norm": 0.7444437146186829, |
|
"learning_rate": 2.1329118524827662e-07, |
|
"loss": 4.5022, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.0139372822299653, |
|
"grad_norm": 1.0557011365890503, |
|
"learning_rate": 0.0, |
|
"loss": 6.8891, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.0139372822299653, |
|
"eval_loss": 4.388754367828369, |
|
"eval_runtime": 1.2674, |
|
"eval_samples_per_second": 190.94, |
|
"eval_steps_per_second": 48.13, |
|
"step": 36 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 36, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 9, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6149077112717312.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|