|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.00040705304197946504, |
|
"eval_steps": 3, |
|
"global_step": 48, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 8.48027170790552e-06, |
|
"eval_loss": 8.904634475708008, |
|
"eval_runtime": 395.5428, |
|
"eval_samples_per_second": 125.526, |
|
"eval_steps_per_second": 62.764, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 2.5440815123716565e-05, |
|
"grad_norm": 3.2092926502227783, |
|
"learning_rate": 6e-05, |
|
"loss": 9.4435, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 2.5440815123716565e-05, |
|
"eval_loss": 8.903043746948242, |
|
"eval_runtime": 394.4845, |
|
"eval_samples_per_second": 125.863, |
|
"eval_steps_per_second": 62.933, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 5.088163024743313e-05, |
|
"grad_norm": 2.6875100135803223, |
|
"learning_rate": 0.00012, |
|
"loss": 8.8735, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 5.088163024743313e-05, |
|
"eval_loss": 8.875833511352539, |
|
"eval_runtime": 393.8802, |
|
"eval_samples_per_second": 126.056, |
|
"eval_steps_per_second": 63.029, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 7.632244537114969e-05, |
|
"grad_norm": 2.1503734588623047, |
|
"learning_rate": 0.00018, |
|
"loss": 8.243, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 7.632244537114969e-05, |
|
"eval_loss": 8.804418563842773, |
|
"eval_runtime": 393.1137, |
|
"eval_samples_per_second": 126.302, |
|
"eval_steps_per_second": 63.152, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.00010176326049486626, |
|
"grad_norm": 2.0084388256073, |
|
"learning_rate": 0.00019863613034027224, |
|
"loss": 9.4, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.00010176326049486626, |
|
"eval_loss": 8.705506324768066, |
|
"eval_runtime": 393.3705, |
|
"eval_samples_per_second": 126.219, |
|
"eval_steps_per_second": 63.111, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.00012720407561858283, |
|
"grad_norm": 2.128262758255005, |
|
"learning_rate": 0.00019157733266550575, |
|
"loss": 8.7753, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.00012720407561858283, |
|
"eval_loss": 8.609003067016602, |
|
"eval_runtime": 393.4243, |
|
"eval_samples_per_second": 126.202, |
|
"eval_steps_per_second": 63.102, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.00015264489074229938, |
|
"grad_norm": 2.6665070056915283, |
|
"learning_rate": 0.00017891405093963938, |
|
"loss": 8.7284, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.00015264489074229938, |
|
"eval_loss": 8.514460563659668, |
|
"eval_runtime": 393.1462, |
|
"eval_samples_per_second": 126.291, |
|
"eval_steps_per_second": 63.147, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.00017808570586601595, |
|
"grad_norm": 1.9979753494262695, |
|
"learning_rate": 0.0001614212712689668, |
|
"loss": 8.4017, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.00017808570586601595, |
|
"eval_loss": 8.427648544311523, |
|
"eval_runtime": 393.6383, |
|
"eval_samples_per_second": 126.134, |
|
"eval_steps_per_second": 63.068, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.00020352652098973252, |
|
"grad_norm": 1.9800701141357422, |
|
"learning_rate": 0.00014016954246529696, |
|
"loss": 8.6508, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.00020352652098973252, |
|
"eval_loss": 8.347582817077637, |
|
"eval_runtime": 393.648, |
|
"eval_samples_per_second": 126.13, |
|
"eval_steps_per_second": 63.066, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.00022896733611344906, |
|
"grad_norm": 1.8114262819290161, |
|
"learning_rate": 0.00011645945902807341, |
|
"loss": 7.9676, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.00022896733611344906, |
|
"eval_loss": 8.279464721679688, |
|
"eval_runtime": 394.6493, |
|
"eval_samples_per_second": 125.81, |
|
"eval_steps_per_second": 62.906, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.00025440815123716566, |
|
"grad_norm": 3.259230852127075, |
|
"learning_rate": 9.174206545276677e-05, |
|
"loss": 7.7184, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.00025440815123716566, |
|
"eval_loss": 8.224762916564941, |
|
"eval_runtime": 393.2672, |
|
"eval_samples_per_second": 126.253, |
|
"eval_steps_per_second": 63.128, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0002798489663608822, |
|
"grad_norm": 2.8306350708007812, |
|
"learning_rate": 6.753005307953167e-05, |
|
"loss": 9.016, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.0002798489663608822, |
|
"eval_loss": 8.18464469909668, |
|
"eval_runtime": 394.6805, |
|
"eval_samples_per_second": 125.8, |
|
"eval_steps_per_second": 62.902, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.00030528978148459875, |
|
"grad_norm": 2.145679235458374, |
|
"learning_rate": 4.530518418775733e-05, |
|
"loss": 8.1484, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.00030528978148459875, |
|
"eval_loss": 8.153741836547852, |
|
"eval_runtime": 395.8325, |
|
"eval_samples_per_second": 125.434, |
|
"eval_steps_per_second": 62.718, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0003307305966083153, |
|
"grad_norm": 1.9868985414505005, |
|
"learning_rate": 2.6427608932686843e-05, |
|
"loss": 8.136, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0003307305966083153, |
|
"eval_loss": 8.135424613952637, |
|
"eval_runtime": 396.6057, |
|
"eval_samples_per_second": 125.19, |
|
"eval_steps_per_second": 62.596, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0003561714117320319, |
|
"grad_norm": 1.6023919582366943, |
|
"learning_rate": 1.2052624879351104e-05, |
|
"loss": 7.4801, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.0003561714117320319, |
|
"eval_loss": 8.125348091125488, |
|
"eval_runtime": 393.4071, |
|
"eval_samples_per_second": 126.208, |
|
"eval_steps_per_second": 63.105, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.00038161222685574847, |
|
"grad_norm": 2.529416561126709, |
|
"learning_rate": 3.059973406066963e-06, |
|
"loss": 8.0106, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.00038161222685574847, |
|
"eval_loss": 8.12152099609375, |
|
"eval_runtime": 393.2608, |
|
"eval_samples_per_second": 126.255, |
|
"eval_steps_per_second": 63.129, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.00040705304197946504, |
|
"grad_norm": 1.6257084608078003, |
|
"learning_rate": 0.0, |
|
"loss": 7.7403, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.00040705304197946504, |
|
"eval_loss": 8.12084674835205, |
|
"eval_runtime": 392.7186, |
|
"eval_samples_per_second": 126.429, |
|
"eval_steps_per_second": 63.216, |
|
"step": 48 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 48, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 70, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6359178805248.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|