{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.273224043715847,
  "eval_steps": 9,
  "global_step": 75,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0036429872495446266, "grad_norm": 0.22306445240974426, "learning_rate": 1e-05, "loss": 2.9329, "step": 1},
    {"epoch": 0.0036429872495446266, "eval_loss": 1.4070076942443848, "eval_runtime": 36.0987, "eval_samples_per_second": 6.399, "eval_steps_per_second": 0.803, "step": 1},
    {"epoch": 0.007285974499089253, "grad_norm": 0.3016580641269684, "learning_rate": 2e-05, "loss": 3.0567, "step": 2},
    {"epoch": 0.01092896174863388, "grad_norm": 0.21999327838420868, "learning_rate": 3e-05, "loss": 2.6965, "step": 3},
    {"epoch": 0.014571948998178506, "grad_norm": 0.24598363041877747, "learning_rate": 4e-05, "loss": 3.2788, "step": 4},
    {"epoch": 0.018214936247723135, "grad_norm": 0.328949898481369, "learning_rate": 5e-05, "loss": 3.1415, "step": 5},
    {"epoch": 0.02185792349726776, "grad_norm": 0.3219306468963623, "learning_rate": 6e-05, "loss": 3.211, "step": 6},
    {"epoch": 0.025500910746812388, "grad_norm": 0.30114132165908813, "learning_rate": 7e-05, "loss": 2.6, "step": 7},
    {"epoch": 0.029143897996357013, "grad_norm": 0.2939751446247101, "learning_rate": 8e-05, "loss": 2.7389, "step": 8},
    {"epoch": 0.03278688524590164, "grad_norm": 0.31080543994903564, "learning_rate": 9e-05, "loss": 2.8584, "step": 9},
    {"epoch": 0.03278688524590164, "eval_loss": 1.3745203018188477, "eval_runtime": 36.5458, "eval_samples_per_second": 6.321, "eval_steps_per_second": 0.794, "step": 9},
    {"epoch": 0.03642987249544627, "grad_norm": 0.3594372868537903, "learning_rate": 0.0001, "loss": 2.5839, "step": 10},
    {"epoch": 0.04007285974499089, "grad_norm": 0.3138103485107422, "learning_rate": 9.99695413509548e-05, "loss": 2.5284, "step": 11},
    {"epoch": 0.04371584699453552, "grad_norm": 0.40280646085739136, "learning_rate": 9.987820251299122e-05, "loss": 3.1069, "step": 12},
    {"epoch": 0.04735883424408015, "grad_norm": 0.4165895879268646, "learning_rate": 9.972609476841367e-05, "loss": 2.2823, "step": 13},
    {"epoch": 0.051001821493624776, "grad_norm": 0.46014565229415894, "learning_rate": 9.951340343707852e-05, "loss": 2.5827, "step": 14},
    {"epoch": 0.0546448087431694, "grad_norm": 0.416968435049057, "learning_rate": 9.924038765061042e-05, "loss": 2.9045, "step": 15},
    {"epoch": 0.058287795992714025, "grad_norm": 0.44254550337791443, "learning_rate": 9.890738003669029e-05, "loss": 2.578, "step": 16},
    {"epoch": 0.061930783242258654, "grad_norm": 0.6331019997596741, "learning_rate": 9.851478631379982e-05, "loss": 2.513, "step": 17},
    {"epoch": 0.06557377049180328, "grad_norm": 0.3989606201648712, "learning_rate": 9.806308479691595e-05, "loss": 2.4668, "step": 18},
    {"epoch": 0.06557377049180328, "eval_loss": 1.2810860872268677, "eval_runtime": 36.4865, "eval_samples_per_second": 6.331, "eval_steps_per_second": 0.795, "step": 18},
    {"epoch": 0.0692167577413479, "grad_norm": 0.4963116943836212, "learning_rate": 9.755282581475769e-05, "loss": 2.5629, "step": 19},
    {"epoch": 0.07285974499089254, "grad_norm": 0.39597874879837036, "learning_rate": 9.698463103929542e-05, "loss": 2.8487, "step": 20},
    {"epoch": 0.07650273224043716, "grad_norm": 0.49734658002853394, "learning_rate": 9.635919272833938e-05, "loss": 2.7251, "step": 21},
    {"epoch": 0.08014571948998178, "grad_norm": 0.5086115002632141, "learning_rate": 9.567727288213005e-05, "loss": 2.6009, "step": 22},
    {"epoch": 0.08378870673952642, "grad_norm": 0.5011266469955444, "learning_rate": 9.493970231495835e-05, "loss": 2.6599, "step": 23},
    {"epoch": 0.08743169398907104, "grad_norm": 0.44377079606056213, "learning_rate": 9.414737964294636e-05, "loss": 2.3917, "step": 24},
    {"epoch": 0.09107468123861566, "grad_norm": 0.4632920026779175, "learning_rate": 9.330127018922194e-05, "loss": 2.5564, "step": 25},
    {"epoch": 0.0947176684881603, "grad_norm": 0.5033605098724365, "learning_rate": 9.24024048078213e-05, "loss": 2.5636, "step": 26},
    {"epoch": 0.09836065573770492, "grad_norm": 0.3843136429786682, "learning_rate": 9.145187862775209e-05, "loss": 2.7431, "step": 27},
    {"epoch": 0.09836065573770492, "eval_loss": 1.2466179132461548, "eval_runtime": 36.1779, "eval_samples_per_second": 6.385, "eval_steps_per_second": 0.802, "step": 27},
    {"epoch": 0.10200364298724955, "grad_norm": 0.4826091229915619, "learning_rate": 9.045084971874738e-05, "loss": 2.3946, "step": 28},
    {"epoch": 0.10564663023679417, "grad_norm": 0.48955684900283813, "learning_rate": 8.940053768033609e-05, "loss": 2.5544, "step": 29},
    {"epoch": 0.1092896174863388, "grad_norm": 0.5087964534759521, "learning_rate": 8.83022221559489e-05, "loss": 2.7766, "step": 30},
    {"epoch": 0.11293260473588343, "grad_norm": 0.463750422000885, "learning_rate": 8.715724127386972e-05, "loss": 2.5906, "step": 31},
    {"epoch": 0.11657559198542805, "grad_norm": 0.3833313286304474, "learning_rate": 8.596699001693255e-05, "loss": 2.4696, "step": 32},
    {"epoch": 0.12021857923497267, "grad_norm": 0.4325249493122101, "learning_rate": 8.473291852294987e-05, "loss": 2.4887, "step": 33},
    {"epoch": 0.12386156648451731, "grad_norm": 1.51544988155365, "learning_rate": 8.345653031794292e-05, "loss": 2.427, "step": 34},
    {"epoch": 0.12750455373406194, "grad_norm": 0.47455018758773804, "learning_rate": 8.213938048432697e-05, "loss": 2.3536, "step": 35},
    {"epoch": 0.13114754098360656, "grad_norm": 0.45129895210266113, "learning_rate": 8.07830737662829e-05, "loss": 2.3617, "step": 36},
    {"epoch": 0.13114754098360656, "eval_loss": 1.231567144393921, "eval_runtime": 36.1382, "eval_samples_per_second": 6.392, "eval_steps_per_second": 0.802, "step": 36},
    {"epoch": 0.13479052823315119, "grad_norm": 0.47395437955856323, "learning_rate": 7.938926261462366e-05, "loss": 2.424, "step": 37},
    {"epoch": 0.1384335154826958, "grad_norm": 0.4069020748138428, "learning_rate": 7.795964517353735e-05, "loss": 2.3032, "step": 38},
    {"epoch": 0.14207650273224043, "grad_norm": 0.47907042503356934, "learning_rate": 7.649596321166024e-05, "loss": 2.6569, "step": 39},
    {"epoch": 0.14571948998178508, "grad_norm": 0.5535211563110352, "learning_rate": 7.500000000000001e-05, "loss": 2.442, "step": 40},
    {"epoch": 0.1493624772313297, "grad_norm": 0.4240165650844574, "learning_rate": 7.347357813929454e-05, "loss": 2.474, "step": 41},
    {"epoch": 0.15300546448087432, "grad_norm": 0.48536279797554016, "learning_rate": 7.191855733945387e-05, "loss": 2.7711, "step": 42},
    {"epoch": 0.15664845173041894, "grad_norm": 0.44794994592666626, "learning_rate": 7.033683215379002e-05, "loss": 2.4307, "step": 43},
    {"epoch": 0.16029143897996356, "grad_norm": 0.38578876852989197, "learning_rate": 6.873032967079561e-05, "loss": 2.5097, "step": 44},
    {"epoch": 0.16393442622950818, "grad_norm": 0.4519205093383789, "learning_rate": 6.710100716628344e-05, "loss": 2.4849, "step": 45},
    {"epoch": 0.16393442622950818, "eval_loss": 1.223783016204834, "eval_runtime": 36.2336, "eval_samples_per_second": 6.375, "eval_steps_per_second": 0.8, "step": 45},
    {"epoch": 0.16757741347905283, "grad_norm": 0.5709835290908813, "learning_rate": 6.545084971874738e-05, "loss": 2.6203, "step": 46},
    {"epoch": 0.17122040072859745, "grad_norm": 0.5105118155479431, "learning_rate": 6.378186779084995e-05, "loss": 2.4076, "step": 47},
    {"epoch": 0.17486338797814208, "grad_norm": 0.4190482199192047, "learning_rate": 6.209609477998338e-05, "loss": 2.6536, "step": 48},
    {"epoch": 0.1785063752276867, "grad_norm": 0.5616876482963562, "learning_rate": 6.0395584540887963e-05, "loss": 2.171, "step": 49},
    {"epoch": 0.18214936247723132, "grad_norm": 0.47931408882141113, "learning_rate": 5.868240888334653e-05, "loss": 2.3936, "step": 50},
    {"epoch": 0.18579234972677597, "grad_norm": 0.4086602032184601, "learning_rate": 5.695865504800327e-05, "loss": 2.0796, "step": 51},
    {"epoch": 0.1894353369763206, "grad_norm": 0.4565342962741852, "learning_rate": 5.522642316338268e-05, "loss": 2.4687, "step": 52},
    {"epoch": 0.1930783242258652, "grad_norm": 0.44736742973327637, "learning_rate": 5.348782368720626e-05, "loss": 2.6026, "step": 53},
    {"epoch": 0.19672131147540983, "grad_norm": 0.49989748001098633, "learning_rate": 5.174497483512506e-05, "loss": 2.8232, "step": 54},
    {"epoch": 0.19672131147540983, "eval_loss": 1.2164099216461182, "eval_runtime": 36.1482, "eval_samples_per_second": 6.39, "eval_steps_per_second": 0.802, "step": 54},
    {"epoch": 0.20036429872495445, "grad_norm": 0.5327097773551941, "learning_rate": 5e-05, "loss": 2.1804, "step": 55},
    {"epoch": 0.2040072859744991, "grad_norm": 0.46552613377571106, "learning_rate": 4.825502516487497e-05, "loss": 2.9111, "step": 56},
    {"epoch": 0.20765027322404372, "grad_norm": 0.47907155752182007, "learning_rate": 4.6512176312793736e-05, "loss": 2.5331, "step": 57},
    {"epoch": 0.21129326047358835, "grad_norm": 0.5192755460739136, "learning_rate": 4.477357683661734e-05, "loss": 2.452, "step": 58},
    {"epoch": 0.21493624772313297, "grad_norm": 0.5046710968017578, "learning_rate": 4.3041344951996746e-05, "loss": 2.2555, "step": 59},
    {"epoch": 0.2185792349726776, "grad_norm": 0.4994800388813019, "learning_rate": 4.131759111665349e-05, "loss": 2.4397, "step": 60},
    {"epoch": 0.2222222222222222, "grad_norm": 0.5281521081924438, "learning_rate": 3.960441545911204e-05, "loss": 2.9933, "step": 61},
    {"epoch": 0.22586520947176686, "grad_norm": 0.45675769448280334, "learning_rate": 3.790390522001662e-05, "loss": 2.4611, "step": 62},
    {"epoch": 0.22950819672131148, "grad_norm": 0.4723397493362427, "learning_rate": 3.6218132209150045e-05, "loss": 2.2367, "step": 63},
    {"epoch": 0.22950819672131148, "eval_loss": 1.2124544382095337, "eval_runtime": 36.1593, "eval_samples_per_second": 6.388, "eval_steps_per_second": 0.802, "step": 63},
    {"epoch": 0.2331511839708561, "grad_norm": 0.4308946132659912, "learning_rate": 3.4549150281252636e-05, "loss": 2.298, "step": 64},
    {"epoch": 0.23679417122040072, "grad_norm": 0.5040618181228638, "learning_rate": 3.289899283371657e-05, "loss": 2.4357, "step": 65},
    {"epoch": 0.24043715846994534, "grad_norm": 0.46097809076309204, "learning_rate": 3.12696703292044e-05, "loss": 2.6279, "step": 66},
    {"epoch": 0.24408014571949, "grad_norm": 0.4577733278274536, "learning_rate": 2.9663167846209998e-05, "loss": 2.6152, "step": 67},
    {"epoch": 0.24772313296903462, "grad_norm": 0.46525999903678894, "learning_rate": 2.8081442660546125e-05, "loss": 2.3304, "step": 68},
    {"epoch": 0.25136612021857924, "grad_norm": 0.49677371978759766, "learning_rate": 2.6526421860705473e-05, "loss": 2.5507, "step": 69},
    {"epoch": 0.2550091074681239, "grad_norm": 0.5101350545883179, "learning_rate": 2.500000000000001e-05, "loss": 2.5659, "step": 70},
    {"epoch": 0.2586520947176685, "grad_norm": 0.4523501992225647, "learning_rate": 2.350403678833976e-05, "loss": 2.2424, "step": 71},
    {"epoch": 0.26229508196721313, "grad_norm": 0.5566951036453247, "learning_rate": 2.2040354826462668e-05, "loss": 2.3174, "step": 72},
    {"epoch": 0.26229508196721313, "eval_loss": 1.209464430809021, "eval_runtime": 36.1745, "eval_samples_per_second": 6.386, "eval_steps_per_second": 0.802, "step": 72},
    {"epoch": 0.2659380692167577, "grad_norm": 0.4237559139728546, "learning_rate": 2.061073738537635e-05, "loss": 2.4986, "step": 73},
    {"epoch": 0.26958105646630237, "grad_norm": 0.5177941918373108, "learning_rate": 1.9216926233717085e-05, "loss": 2.2564, "step": 74},
    {"epoch": 0.273224043715847, "grad_norm": 0.503273606300354, "learning_rate": 1.7860619515673033e-05, "loss": 2.3239, "step": 75}
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 9.5217449435136e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}