{ "best_metric": 1.0260274410247803, "best_model_checkpoint": "miner_id_24/checkpoint-50", "epoch": 1.0255427841634739, "eval_steps": 25, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020434227330779056, "grad_norm": 7.360929489135742, "learning_rate": 5.000000000000001e-07, "loss": 17.3749, "step": 1 }, { "epoch": 0.020434227330779056, "eval_loss": 1.138229489326477, "eval_runtime": 21.2669, "eval_samples_per_second": 7.759, "eval_steps_per_second": 1.975, "step": 1 }, { "epoch": 0.04086845466155811, "grad_norm": 6.317312717437744, "learning_rate": 1.0000000000000002e-06, "loss": 17.9715, "step": 2 }, { "epoch": 0.06130268199233716, "grad_norm": 6.0731658935546875, "learning_rate": 1.5e-06, "loss": 16.5836, "step": 3 }, { "epoch": 0.08173690932311622, "grad_norm": 5.645073413848877, "learning_rate": 2.0000000000000003e-06, "loss": 16.209, "step": 4 }, { "epoch": 0.10217113665389528, "grad_norm": 7.367918014526367, "learning_rate": 2.5e-06, "loss": 17.2826, "step": 5 }, { "epoch": 0.12260536398467432, "grad_norm": 6.8022966384887695, "learning_rate": 3e-06, "loss": 16.758, "step": 6 }, { "epoch": 0.14303959131545338, "grad_norm": 6.725194931030273, "learning_rate": 3.5000000000000004e-06, "loss": 19.0619, "step": 7 }, { "epoch": 0.16347381864623245, "grad_norm": 6.638408660888672, "learning_rate": 4.000000000000001e-06, "loss": 17.6671, "step": 8 }, { "epoch": 0.1839080459770115, "grad_norm": 5.699502468109131, "learning_rate": 4.5e-06, "loss": 17.215, "step": 9 }, { "epoch": 0.20434227330779056, "grad_norm": 6.1157097816467285, "learning_rate": 5e-06, "loss": 18.4525, "step": 10 }, { "epoch": 0.2247765006385696, "grad_norm": 5.536643981933594, "learning_rate": 5.500000000000001e-06, "loss": 15.6788, "step": 11 }, { "epoch": 0.24521072796934865, "grad_norm": 7.206900596618652, "learning_rate": 6e-06, "loss": 17.0678, "step": 12 }, { "epoch": 0.2656449553001277, "grad_norm": 5.241375923156738, "learning_rate": 6.5000000000000004e-06, "loss": 18.2783, "step": 13 }, { "epoch": 0.28607918263090676, "grad_norm": 5.224916934967041, "learning_rate": 7.000000000000001e-06, "loss": 18.9804, "step": 14 }, { "epoch": 0.3065134099616858, "grad_norm": 4.611645698547363, "learning_rate": 7.5e-06, "loss": 18.8118, "step": 15 }, { "epoch": 0.3269476372924649, "grad_norm": 5.321063995361328, "learning_rate": 8.000000000000001e-06, "loss": 16.1292, "step": 16 }, { "epoch": 0.34738186462324394, "grad_norm": 4.779561519622803, "learning_rate": 8.500000000000002e-06, "loss": 17.0707, "step": 17 }, { "epoch": 0.367816091954023, "grad_norm": 4.141415119171143, "learning_rate": 9e-06, "loss": 19.2067, "step": 18 }, { "epoch": 0.388250319284802, "grad_norm": 4.518102645874023, "learning_rate": 9.5e-06, "loss": 16.8985, "step": 19 }, { "epoch": 0.4086845466155811, "grad_norm": 4.083268165588379, "learning_rate": 1e-05, "loss": 18.5685, "step": 20 }, { "epoch": 0.42911877394636017, "grad_norm": 4.17296028137207, "learning_rate": 1.05e-05, "loss": 16.5411, "step": 21 }, { "epoch": 0.4495530012771392, "grad_norm": 3.7888073921203613, "learning_rate": 1.1000000000000001e-05, "loss": 16.498, "step": 22 }, { "epoch": 0.46998722860791825, "grad_norm": 3.7531864643096924, "learning_rate": 1.1500000000000002e-05, "loss": 15.8281, "step": 23 }, { "epoch": 0.4904214559386973, "grad_norm": 3.5403201580047607, "learning_rate": 1.2e-05, "loss": 16.5056, "step": 24 }, { "epoch": 0.5108556832694764, "grad_norm": 3.830643653869629, 
"learning_rate": 1.25e-05, "loss": 16.6906, "step": 25 }, { "epoch": 0.5108556832694764, "eval_loss": 1.084327220916748, "eval_runtime": 21.5708, "eval_samples_per_second": 7.649, "eval_steps_per_second": 1.947, "step": 25 }, { "epoch": 0.5312899106002554, "grad_norm": 3.512051582336426, "learning_rate": 1.3000000000000001e-05, "loss": 17.1562, "step": 26 }, { "epoch": 0.5517241379310345, "grad_norm": 3.581163167953491, "learning_rate": 1.3500000000000001e-05, "loss": 15.3294, "step": 27 }, { "epoch": 0.5721583652618135, "grad_norm": 3.7532386779785156, "learning_rate": 1.4000000000000001e-05, "loss": 18.2124, "step": 28 }, { "epoch": 0.5925925925925926, "grad_norm": 3.3712620735168457, "learning_rate": 1.45e-05, "loss": 15.1985, "step": 29 }, { "epoch": 0.6130268199233716, "grad_norm": 3.74042010307312, "learning_rate": 1.5e-05, "loss": 16.3185, "step": 30 }, { "epoch": 0.6334610472541508, "grad_norm": 3.747729778289795, "learning_rate": 1.55e-05, "loss": 18.091, "step": 31 }, { "epoch": 0.6538952745849298, "grad_norm": 3.3717548847198486, "learning_rate": 1.6000000000000003e-05, "loss": 15.5985, "step": 32 }, { "epoch": 0.6743295019157088, "grad_norm": 3.1136016845703125, "learning_rate": 1.65e-05, "loss": 16.1861, "step": 33 }, { "epoch": 0.6947637292464879, "grad_norm": 2.9378445148468018, "learning_rate": 1.7000000000000003e-05, "loss": 15.428, "step": 34 }, { "epoch": 0.7151979565772669, "grad_norm": 3.0630242824554443, "learning_rate": 1.75e-05, "loss": 15.6852, "step": 35 }, { "epoch": 0.735632183908046, "grad_norm": 3.5427470207214355, "learning_rate": 1.8e-05, "loss": 15.8646, "step": 36 }, { "epoch": 0.756066411238825, "grad_norm": 3.6774771213531494, "learning_rate": 1.85e-05, "loss": 15.4281, "step": 37 }, { "epoch": 0.776500638569604, "grad_norm": 3.539029836654663, "learning_rate": 1.9e-05, "loss": 16.3628, "step": 38 }, { "epoch": 0.7969348659003831, "grad_norm": 3.6181910037994385, "learning_rate": 1.9500000000000003e-05, "loss": 16.0907, "step": 39 }, { "epoch": 0.8173690932311622, "grad_norm": 3.2122557163238525, "learning_rate": 2e-05, "loss": 16.8239, "step": 40 }, { "epoch": 0.8378033205619413, "grad_norm": 3.049232006072998, "learning_rate": 2.05e-05, "loss": 17.2511, "step": 41 }, { "epoch": 0.8582375478927203, "grad_norm": 3.254066228866577, "learning_rate": 2.1e-05, "loss": 16.2736, "step": 42 }, { "epoch": 0.8786717752234994, "grad_norm": 3.0137479305267334, "learning_rate": 2.15e-05, "loss": 16.5703, "step": 43 }, { "epoch": 0.8991060025542784, "grad_norm": 3.380715847015381, "learning_rate": 2.2000000000000003e-05, "loss": 16.8219, "step": 44 }, { "epoch": 0.9195402298850575, "grad_norm": 2.935159921646118, "learning_rate": 2.25e-05, "loss": 16.5484, "step": 45 }, { "epoch": 0.9399744572158365, "grad_norm": 2.9660491943359375, "learning_rate": 2.3000000000000003e-05, "loss": 15.6773, "step": 46 }, { "epoch": 0.9604086845466155, "grad_norm": 2.7908406257629395, "learning_rate": 2.35e-05, "loss": 15.7697, "step": 47 }, { "epoch": 0.9808429118773946, "grad_norm": 3.0066637992858887, "learning_rate": 2.4e-05, "loss": 17.0586, "step": 48 }, { "epoch": 1.0051085568326947, "grad_norm": 3.090890645980835, "learning_rate": 2.45e-05, "loss": 19.5984, "step": 49 }, { "epoch": 1.0255427841634739, "grad_norm": 2.8104114532470703, "learning_rate": 2.5e-05, "loss": 16.0491, "step": 50 }, { "epoch": 1.0255427841634739, "eval_loss": 1.0260274410247803, "eval_runtime": 21.5925, "eval_samples_per_second": 7.642, "eval_steps_per_second": 1.945, "step": 50 } ], "logging_steps": 1, 
"max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 25, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.6709702668294554e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }