{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.273224043715847, "eval_steps": 9, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0036429872495446266, "grad_norm": 0.22306445240974426, "learning_rate": 1e-05, "loss": 2.9329, "step": 1 }, { "epoch": 0.0036429872495446266, "eval_loss": 1.4070076942443848, "eval_runtime": 36.0987, "eval_samples_per_second": 6.399, "eval_steps_per_second": 0.803, "step": 1 }, { "epoch": 0.007285974499089253, "grad_norm": 0.3016580641269684, "learning_rate": 2e-05, "loss": 3.0567, "step": 2 }, { "epoch": 0.01092896174863388, "grad_norm": 0.21999327838420868, "learning_rate": 3e-05, "loss": 2.6965, "step": 3 }, { "epoch": 0.014571948998178506, "grad_norm": 0.24598363041877747, "learning_rate": 4e-05, "loss": 3.2788, "step": 4 }, { "epoch": 0.018214936247723135, "grad_norm": 0.328949898481369, "learning_rate": 5e-05, "loss": 3.1415, "step": 5 }, { "epoch": 0.02185792349726776, "grad_norm": 0.3219306468963623, "learning_rate": 6e-05, "loss": 3.211, "step": 6 }, { "epoch": 0.025500910746812388, "grad_norm": 0.30114132165908813, "learning_rate": 7e-05, "loss": 2.6, "step": 7 }, { "epoch": 0.029143897996357013, "grad_norm": 0.2939751446247101, "learning_rate": 8e-05, "loss": 2.7389, "step": 8 }, { "epoch": 0.03278688524590164, "grad_norm": 0.31080543994903564, "learning_rate": 9e-05, "loss": 2.8584, "step": 9 }, { "epoch": 0.03278688524590164, "eval_loss": 1.3745203018188477, "eval_runtime": 36.5458, "eval_samples_per_second": 6.321, "eval_steps_per_second": 0.794, "step": 9 }, { "epoch": 0.03642987249544627, "grad_norm": 0.3594372868537903, "learning_rate": 0.0001, "loss": 2.5839, "step": 10 }, { "epoch": 0.04007285974499089, "grad_norm": 0.3138103485107422, "learning_rate": 9.99695413509548e-05, "loss": 2.5284, "step": 11 }, { "epoch": 0.04371584699453552, "grad_norm": 0.40280646085739136, "learning_rate": 9.987820251299122e-05, "loss": 3.1069, "step": 12 }, { "epoch": 0.04735883424408015, "grad_norm": 0.4165895879268646, "learning_rate": 9.972609476841367e-05, "loss": 2.2823, "step": 13 }, { "epoch": 0.051001821493624776, "grad_norm": 0.46014565229415894, "learning_rate": 9.951340343707852e-05, "loss": 2.5827, "step": 14 }, { "epoch": 0.0546448087431694, "grad_norm": 0.416968435049057, "learning_rate": 9.924038765061042e-05, "loss": 2.9045, "step": 15 }, { "epoch": 0.058287795992714025, "grad_norm": 0.44254550337791443, "learning_rate": 9.890738003669029e-05, "loss": 2.578, "step": 16 }, { "epoch": 0.061930783242258654, "grad_norm": 0.6331019997596741, "learning_rate": 9.851478631379982e-05, "loss": 2.513, "step": 17 }, { "epoch": 0.06557377049180328, "grad_norm": 0.3989606201648712, "learning_rate": 9.806308479691595e-05, "loss": 2.4668, "step": 18 }, { "epoch": 0.06557377049180328, "eval_loss": 1.2810860872268677, "eval_runtime": 36.4865, "eval_samples_per_second": 6.331, "eval_steps_per_second": 0.795, "step": 18 }, { "epoch": 0.0692167577413479, "grad_norm": 0.4963116943836212, "learning_rate": 9.755282581475769e-05, "loss": 2.5629, "step": 19 }, { "epoch": 0.07285974499089254, "grad_norm": 0.39597874879837036, "learning_rate": 9.698463103929542e-05, "loss": 2.8487, "step": 20 }, { "epoch": 0.07650273224043716, "grad_norm": 0.49734658002853394, "learning_rate": 9.635919272833938e-05, "loss": 2.7251, "step": 21 }, { "epoch": 0.08014571948998178, "grad_norm": 0.5086115002632141, "learning_rate": 9.567727288213005e-05, "loss": 2.6009, "step": 22 }, { 
"epoch": 0.08378870673952642, "grad_norm": 0.5011266469955444, "learning_rate": 9.493970231495835e-05, "loss": 2.6599, "step": 23 }, { "epoch": 0.08743169398907104, "grad_norm": 0.44377079606056213, "learning_rate": 9.414737964294636e-05, "loss": 2.3917, "step": 24 }, { "epoch": 0.09107468123861566, "grad_norm": 0.4632920026779175, "learning_rate": 9.330127018922194e-05, "loss": 2.5564, "step": 25 }, { "epoch": 0.0947176684881603, "grad_norm": 0.5033605098724365, "learning_rate": 9.24024048078213e-05, "loss": 2.5636, "step": 26 }, { "epoch": 0.09836065573770492, "grad_norm": 0.3843136429786682, "learning_rate": 9.145187862775209e-05, "loss": 2.7431, "step": 27 }, { "epoch": 0.09836065573770492, "eval_loss": 1.2466179132461548, "eval_runtime": 36.1779, "eval_samples_per_second": 6.385, "eval_steps_per_second": 0.802, "step": 27 }, { "epoch": 0.10200364298724955, "grad_norm": 0.4826091229915619, "learning_rate": 9.045084971874738e-05, "loss": 2.3946, "step": 28 }, { "epoch": 0.10564663023679417, "grad_norm": 0.48955684900283813, "learning_rate": 8.940053768033609e-05, "loss": 2.5544, "step": 29 }, { "epoch": 0.1092896174863388, "grad_norm": 0.5087964534759521, "learning_rate": 8.83022221559489e-05, "loss": 2.7766, "step": 30 }, { "epoch": 0.11293260473588343, "grad_norm": 0.463750422000885, "learning_rate": 8.715724127386972e-05, "loss": 2.5906, "step": 31 }, { "epoch": 0.11657559198542805, "grad_norm": 0.3833313286304474, "learning_rate": 8.596699001693255e-05, "loss": 2.4696, "step": 32 }, { "epoch": 0.12021857923497267, "grad_norm": 0.4325249493122101, "learning_rate": 8.473291852294987e-05, "loss": 2.4887, "step": 33 }, { "epoch": 0.12386156648451731, "grad_norm": 1.51544988155365, "learning_rate": 8.345653031794292e-05, "loss": 2.427, "step": 34 }, { "epoch": 0.12750455373406194, "grad_norm": 0.47455018758773804, "learning_rate": 8.213938048432697e-05, "loss": 2.3536, "step": 35 }, { "epoch": 0.13114754098360656, "grad_norm": 0.45129895210266113, "learning_rate": 8.07830737662829e-05, "loss": 2.3617, "step": 36 }, { "epoch": 0.13114754098360656, "eval_loss": 1.231567144393921, "eval_runtime": 36.1382, "eval_samples_per_second": 6.392, "eval_steps_per_second": 0.802, "step": 36 }, { "epoch": 0.13479052823315119, "grad_norm": 0.47395437955856323, "learning_rate": 7.938926261462366e-05, "loss": 2.424, "step": 37 }, { "epoch": 0.1384335154826958, "grad_norm": 0.4069020748138428, "learning_rate": 7.795964517353735e-05, "loss": 2.3032, "step": 38 }, { "epoch": 0.14207650273224043, "grad_norm": 0.47907042503356934, "learning_rate": 7.649596321166024e-05, "loss": 2.6569, "step": 39 }, { "epoch": 0.14571948998178508, "grad_norm": 0.5535211563110352, "learning_rate": 7.500000000000001e-05, "loss": 2.442, "step": 40 }, { "epoch": 0.1493624772313297, "grad_norm": 0.4240165650844574, "learning_rate": 7.347357813929454e-05, "loss": 2.474, "step": 41 }, { "epoch": 0.15300546448087432, "grad_norm": 0.48536279797554016, "learning_rate": 7.191855733945387e-05, "loss": 2.7711, "step": 42 }, { "epoch": 0.15664845173041894, "grad_norm": 0.44794994592666626, "learning_rate": 7.033683215379002e-05, "loss": 2.4307, "step": 43 }, { "epoch": 0.16029143897996356, "grad_norm": 0.38578876852989197, "learning_rate": 6.873032967079561e-05, "loss": 2.5097, "step": 44 }, { "epoch": 0.16393442622950818, "grad_norm": 0.4519205093383789, "learning_rate": 6.710100716628344e-05, "loss": 2.4849, "step": 45 }, { "epoch": 0.16393442622950818, "eval_loss": 1.223783016204834, "eval_runtime": 36.2336, "eval_samples_per_second": 
6.375, "eval_steps_per_second": 0.8, "step": 45 }, { "epoch": 0.16757741347905283, "grad_norm": 0.5709835290908813, "learning_rate": 6.545084971874738e-05, "loss": 2.6203, "step": 46 }, { "epoch": 0.17122040072859745, "grad_norm": 0.5105118155479431, "learning_rate": 6.378186779084995e-05, "loss": 2.4076, "step": 47 }, { "epoch": 0.17486338797814208, "grad_norm": 0.4190482199192047, "learning_rate": 6.209609477998338e-05, "loss": 2.6536, "step": 48 }, { "epoch": 0.1785063752276867, "grad_norm": 0.5616876482963562, "learning_rate": 6.0395584540887963e-05, "loss": 2.171, "step": 49 }, { "epoch": 0.18214936247723132, "grad_norm": 0.47931408882141113, "learning_rate": 5.868240888334653e-05, "loss": 2.3936, "step": 50 }, { "epoch": 0.18579234972677597, "grad_norm": 0.4086602032184601, "learning_rate": 5.695865504800327e-05, "loss": 2.0796, "step": 51 }, { "epoch": 0.1894353369763206, "grad_norm": 0.4565342962741852, "learning_rate": 5.522642316338268e-05, "loss": 2.4687, "step": 52 }, { "epoch": 0.1930783242258652, "grad_norm": 0.44736742973327637, "learning_rate": 5.348782368720626e-05, "loss": 2.6026, "step": 53 }, { "epoch": 0.19672131147540983, "grad_norm": 0.49989748001098633, "learning_rate": 5.174497483512506e-05, "loss": 2.8232, "step": 54 }, { "epoch": 0.19672131147540983, "eval_loss": 1.2164099216461182, "eval_runtime": 36.1482, "eval_samples_per_second": 6.39, "eval_steps_per_second": 0.802, "step": 54 }, { "epoch": 0.20036429872495445, "grad_norm": 0.5327097773551941, "learning_rate": 5e-05, "loss": 2.1804, "step": 55 }, { "epoch": 0.2040072859744991, "grad_norm": 0.46552613377571106, "learning_rate": 4.825502516487497e-05, "loss": 2.9111, "step": 56 }, { "epoch": 0.20765027322404372, "grad_norm": 0.47907155752182007, "learning_rate": 4.6512176312793736e-05, "loss": 2.5331, "step": 57 }, { "epoch": 0.21129326047358835, "grad_norm": 0.5192755460739136, "learning_rate": 4.477357683661734e-05, "loss": 2.452, "step": 58 }, { "epoch": 0.21493624772313297, "grad_norm": 0.5046710968017578, "learning_rate": 4.3041344951996746e-05, "loss": 2.2555, "step": 59 }, { "epoch": 0.2185792349726776, "grad_norm": 0.4994800388813019, "learning_rate": 4.131759111665349e-05, "loss": 2.4397, "step": 60 }, { "epoch": 0.2222222222222222, "grad_norm": 0.5281521081924438, "learning_rate": 3.960441545911204e-05, "loss": 2.9933, "step": 61 }, { "epoch": 0.22586520947176686, "grad_norm": 0.45675769448280334, "learning_rate": 3.790390522001662e-05, "loss": 2.4611, "step": 62 }, { "epoch": 0.22950819672131148, "grad_norm": 0.4723397493362427, "learning_rate": 3.6218132209150045e-05, "loss": 2.2367, "step": 63 }, { "epoch": 0.22950819672131148, "eval_loss": 1.2124544382095337, "eval_runtime": 36.1593, "eval_samples_per_second": 6.388, "eval_steps_per_second": 0.802, "step": 63 }, { "epoch": 0.2331511839708561, "grad_norm": 0.4308946132659912, "learning_rate": 3.4549150281252636e-05, "loss": 2.298, "step": 64 }, { "epoch": 0.23679417122040072, "grad_norm": 0.5040618181228638, "learning_rate": 3.289899283371657e-05, "loss": 2.4357, "step": 65 }, { "epoch": 0.24043715846994534, "grad_norm": 0.46097809076309204, "learning_rate": 3.12696703292044e-05, "loss": 2.6279, "step": 66 }, { "epoch": 0.24408014571949, "grad_norm": 0.4577733278274536, "learning_rate": 2.9663167846209998e-05, "loss": 2.6152, "step": 67 }, { "epoch": 0.24772313296903462, "grad_norm": 0.46525999903678894, "learning_rate": 2.8081442660546125e-05, "loss": 2.3304, "step": 68 }, { "epoch": 0.25136612021857924, "grad_norm": 0.49677371978759766, 
"learning_rate": 2.6526421860705473e-05, "loss": 2.5507, "step": 69 }, { "epoch": 0.2550091074681239, "grad_norm": 0.5101350545883179, "learning_rate": 2.500000000000001e-05, "loss": 2.5659, "step": 70 }, { "epoch": 0.2586520947176685, "grad_norm": 0.4523501992225647, "learning_rate": 2.350403678833976e-05, "loss": 2.2424, "step": 71 }, { "epoch": 0.26229508196721313, "grad_norm": 0.5566951036453247, "learning_rate": 2.2040354826462668e-05, "loss": 2.3174, "step": 72 }, { "epoch": 0.26229508196721313, "eval_loss": 1.209464430809021, "eval_runtime": 36.1745, "eval_samples_per_second": 6.386, "eval_steps_per_second": 0.802, "step": 72 }, { "epoch": 0.2659380692167577, "grad_norm": 0.4237559139728546, "learning_rate": 2.061073738537635e-05, "loss": 2.4986, "step": 73 }, { "epoch": 0.26958105646630237, "grad_norm": 0.5177941918373108, "learning_rate": 1.9216926233717085e-05, "loss": 2.2564, "step": 74 }, { "epoch": 0.273224043715847, "grad_norm": 0.503273606300354, "learning_rate": 1.7860619515673033e-05, "loss": 2.3239, "step": 75 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.5217449435136e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }