{ "best_metric": 0.8009337782859802, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 0.3917727717923604, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0039177277179236044, "grad_norm": 2.730484962463379, "learning_rate": 1.6666666666666668e-07, "loss": 3.6509, "step": 1 }, { "epoch": 0.0039177277179236044, "eval_loss": 1.4694706201553345, "eval_runtime": 54.6537, "eval_samples_per_second": 7.868, "eval_steps_per_second": 0.988, "step": 1 }, { "epoch": 0.007835455435847209, "grad_norm": 3.1775081157684326, "learning_rate": 3.3333333333333335e-07, "loss": 3.8131, "step": 2 }, { "epoch": 0.011753183153770812, "grad_norm": 3.3643202781677246, "learning_rate": 5.000000000000001e-07, "loss": 3.9419, "step": 3 }, { "epoch": 0.015670910871694418, "grad_norm": 3.8046364784240723, "learning_rate": 6.666666666666667e-07, "loss": 4.1392, "step": 4 }, { "epoch": 0.019588638589618023, "grad_norm": 3.8760738372802734, "learning_rate": 8.333333333333333e-07, "loss": 4.4679, "step": 5 }, { "epoch": 0.023506366307541625, "grad_norm": 4.109034061431885, "learning_rate": 1.0000000000000002e-06, "loss": 4.3329, "step": 6 }, { "epoch": 0.02742409402546523, "grad_norm": 3.9366793632507324, "learning_rate": 1.1666666666666668e-06, "loss": 4.332, "step": 7 }, { "epoch": 0.031341821743388835, "grad_norm": 4.43690299987793, "learning_rate": 1.3333333333333334e-06, "loss": 4.9059, "step": 8 }, { "epoch": 0.03525954946131244, "grad_norm": 4.912406921386719, "learning_rate": 1.5e-06, "loss": 4.6052, "step": 9 }, { "epoch": 0.039177277179236046, "grad_norm": 4.383081912994385, "learning_rate": 1.6666666666666667e-06, "loss": 4.667, "step": 10 }, { "epoch": 0.043095004897159644, "grad_norm": 4.72794771194458, "learning_rate": 1.8333333333333333e-06, "loss": 4.7332, "step": 11 }, { "epoch": 0.04701273261508325, "grad_norm": 4.649956703186035, "learning_rate": 2.0000000000000003e-06, "loss": 4.9432, "step": 12 }, { "epoch": 0.050930460333006855, "grad_norm": 4.884151458740234, "learning_rate": 2.166666666666667e-06, "loss": 4.7805, "step": 13 }, { "epoch": 0.05484818805093046, "grad_norm": 5.181488513946533, "learning_rate": 2.3333333333333336e-06, "loss": 4.755, "step": 14 }, { "epoch": 0.058765915768854066, "grad_norm": 4.893612861633301, "learning_rate": 2.5e-06, "loss": 5.0977, "step": 15 }, { "epoch": 0.06268364348677767, "grad_norm": 5.440213203430176, "learning_rate": 2.666666666666667e-06, "loss": 4.9032, "step": 16 }, { "epoch": 0.06660137120470128, "grad_norm": 4.837584972381592, "learning_rate": 2.8333333333333335e-06, "loss": 4.9545, "step": 17 }, { "epoch": 0.07051909892262488, "grad_norm": 4.980720043182373, "learning_rate": 3e-06, "loss": 5.0711, "step": 18 }, { "epoch": 0.07443682664054849, "grad_norm": 6.30215311050415, "learning_rate": 3.1666666666666667e-06, "loss": 5.0939, "step": 19 }, { "epoch": 0.07835455435847209, "grad_norm": 5.513412952423096, "learning_rate": 3.3333333333333333e-06, "loss": 5.256, "step": 20 }, { "epoch": 0.08227228207639568, "grad_norm": 5.348018169403076, "learning_rate": 3.5e-06, "loss": 5.2425, "step": 21 }, { "epoch": 0.08619000979431929, "grad_norm": 5.8349175453186035, "learning_rate": 3.6666666666666666e-06, "loss": 5.2358, "step": 22 }, { "epoch": 0.0901077375122429, "grad_norm": 5.832411766052246, "learning_rate": 3.833333333333334e-06, "loss": 5.4514, "step": 23 }, { "epoch": 0.0940254652301665, "grad_norm": 6.0586724281311035, "learning_rate": 4.000000000000001e-06, "loss": 5.2267, "step": 24 }, { "epoch": 0.0979431929480901, "grad_norm": 5.891234397888184, "learning_rate": 4.166666666666667e-06, "loss": 5.5319, "step": 25 }, { "epoch": 0.10186092066601371, "grad_norm": 6.236785411834717, "learning_rate": 4.333333333333334e-06, "loss": 5.3977, "step": 26 }, { "epoch": 0.10577864838393732, "grad_norm": 6.528311252593994, "learning_rate": 4.5e-06, "loss": 5.1924, "step": 27 }, { "epoch": 0.10969637610186092, "grad_norm": 6.024746894836426, "learning_rate": 4.666666666666667e-06, "loss": 5.3039, "step": 28 }, { "epoch": 0.11361410381978453, "grad_norm": 6.226823329925537, "learning_rate": 4.833333333333333e-06, "loss": 5.3183, "step": 29 }, { "epoch": 0.11753183153770813, "grad_norm": 5.889983654022217, "learning_rate": 5e-06, "loss": 5.2, "step": 30 }, { "epoch": 0.12144955925563174, "grad_norm": 5.979458808898926, "learning_rate": 4.997482666353287e-06, "loss": 5.4484, "step": 31 }, { "epoch": 0.12536728697355534, "grad_norm": 5.922553062438965, "learning_rate": 4.989935734988098e-06, "loss": 5.2643, "step": 32 }, { "epoch": 0.12928501469147893, "grad_norm": 5.977787971496582, "learning_rate": 4.977374404419838e-06, "loss": 5.4059, "step": 33 }, { "epoch": 0.13320274240940255, "grad_norm": 6.46279764175415, "learning_rate": 4.959823971496575e-06, "loss": 5.5486, "step": 34 }, { "epoch": 0.13712047012732614, "grad_norm": 6.590497970581055, "learning_rate": 4.937319780454559e-06, "loss": 5.3917, "step": 35 }, { "epoch": 0.14103819784524976, "grad_norm": 5.591681957244873, "learning_rate": 4.909907151739634e-06, "loss": 5.3846, "step": 36 }, { "epoch": 0.14495592556317335, "grad_norm": 5.741349697113037, "learning_rate": 4.8776412907378845e-06, "loss": 5.299, "step": 37 }, { "epoch": 0.14887365328109697, "grad_norm": 6.1233134269714355, "learning_rate": 4.8405871765993435e-06, "loss": 5.3341, "step": 38 }, { "epoch": 0.15279138099902057, "grad_norm": 5.704336166381836, "learning_rate": 4.7988194313786275e-06, "loss": 5.2046, "step": 39 }, { "epoch": 0.15670910871694418, "grad_norm": 6.782821178436279, "learning_rate": 4.752422169756048e-06, "loss": 5.3502, "step": 40 }, { "epoch": 0.16062683643486778, "grad_norm": 5.274021148681641, "learning_rate": 4.701488829641845e-06, "loss": 4.8217, "step": 41 }, { "epoch": 0.16454456415279137, "grad_norm": 5.679739475250244, "learning_rate": 4.646121984004666e-06, "loss": 5.3528, "step": 42 }, { "epoch": 0.168462291870715, "grad_norm": 5.360927581787109, "learning_rate": 4.586433134303257e-06, "loss": 4.9398, "step": 43 }, { "epoch": 0.17238001958863858, "grad_norm": 5.5803680419921875, "learning_rate": 4.522542485937369e-06, "loss": 5.1841, "step": 44 }, { "epoch": 0.1762977473065622, "grad_norm": 6.342293739318848, "learning_rate": 4.454578706170075e-06, "loss": 5.1268, "step": 45 }, { "epoch": 0.1802154750244858, "grad_norm": 5.699821949005127, "learning_rate": 4.382678665009028e-06, "loss": 5.1354, "step": 46 }, { "epoch": 0.1841332027424094, "grad_norm": 6.461126804351807, "learning_rate": 4.3069871595684795e-06, "loss": 5.2324, "step": 47 }, { "epoch": 0.188050930460333, "grad_norm": 6.610848426818848, "learning_rate": 4.227656622467162e-06, "loss": 5.4744, "step": 48 }, { "epoch": 0.19196865817825662, "grad_norm": 7.548392295837402, "learning_rate": 4.144846814849282e-06, "loss": 5.622, "step": 49 }, { "epoch": 0.1958863858961802, "grad_norm": 6.763551712036133, "learning_rate": 4.058724504646834e-06, "loss": 5.4079, "step": 50 }, { "epoch": 0.1958863858961802, "eval_loss": 0.9716836810112, "eval_runtime": 55.5212, "eval_samples_per_second": 7.745, "eval_steps_per_second": 0.973, "step": 50 }, { "epoch": 0.19980411361410383, "grad_norm": 1.9311734437942505, "learning_rate": 3.969463130731183e-06, "loss": 3.0061, "step": 51 }, { "epoch": 0.20372184133202742, "grad_norm": 2.0846590995788574, "learning_rate": 3.8772424536302565e-06, "loss": 2.7355, "step": 52 }, { "epoch": 0.20763956904995104, "grad_norm": 2.4227850437164307, "learning_rate": 3.782248193514766e-06, "loss": 3.0445, "step": 53 }, { "epoch": 0.21155729676787463, "grad_norm": 2.2137115001678467, "learning_rate": 3.684671656182497e-06, "loss": 3.1911, "step": 54 }, { "epoch": 0.21547502448579825, "grad_norm": 2.4037668704986572, "learning_rate": 3.5847093477938955e-06, "loss": 2.9154, "step": 55 }, { "epoch": 0.21939275220372184, "grad_norm": 2.4038519859313965, "learning_rate": 3.4825625791348093e-06, "loss": 3.284, "step": 56 }, { "epoch": 0.22331047992164543, "grad_norm": 2.5408387184143066, "learning_rate": 3.3784370602033572e-06, "loss": 3.3012, "step": 57 }, { "epoch": 0.22722820763956905, "grad_norm": 2.4454472064971924, "learning_rate": 3.272542485937369e-06, "loss": 3.3035, "step": 58 }, { "epoch": 0.23114593535749264, "grad_norm": 2.239074945449829, "learning_rate": 3.165092113916688e-06, "loss": 2.9207, "step": 59 }, { "epoch": 0.23506366307541626, "grad_norm": 2.6796836853027344, "learning_rate": 3.056302334890786e-06, "loss": 3.3162, "step": 60 }, { "epoch": 0.23898139079333985, "grad_norm": 2.651420831680298, "learning_rate": 2.946392236996592e-06, "loss": 3.1916, "step": 61 }, { "epoch": 0.24289911851126347, "grad_norm": 2.4591100215911865, "learning_rate": 2.835583164544139e-06, "loss": 3.2679, "step": 62 }, { "epoch": 0.24681684622918706, "grad_norm": 2.5218474864959717, "learning_rate": 2.724098272258584e-06, "loss": 3.2717, "step": 63 }, { "epoch": 0.2507345739471107, "grad_norm": 2.6746842861175537, "learning_rate": 2.6121620758762877e-06, "loss": 3.2271, "step": 64 }, { "epoch": 0.2546523016650343, "grad_norm": 2.6411406993865967, "learning_rate": 2.5e-06, "loss": 3.1717, "step": 65 }, { "epoch": 0.25857002938295787, "grad_norm": 2.8181159496307373, "learning_rate": 2.3878379241237136e-06, "loss": 3.3025, "step": 66 }, { "epoch": 0.2624877571008815, "grad_norm": 2.9681222438812256, "learning_rate": 2.2759017277414165e-06, "loss": 3.2978, "step": 67 }, { "epoch": 0.2664054848188051, "grad_norm": 2.8011727333068848, "learning_rate": 2.1644168354558623e-06, "loss": 3.3067, "step": 68 }, { "epoch": 0.2703232125367287, "grad_norm": 2.805682420730591, "learning_rate": 2.053607763003409e-06, "loss": 3.1629, "step": 69 }, { "epoch": 0.2742409402546523, "grad_norm": 2.8847367763519287, "learning_rate": 1.9436976651092143e-06, "loss": 3.5541, "step": 70 }, { "epoch": 0.2781586679725759, "grad_norm": 3.0911881923675537, "learning_rate": 1.8349078860833125e-06, "loss": 3.5225, "step": 71 }, { "epoch": 0.2820763956904995, "grad_norm": 3.2571005821228027, "learning_rate": 1.7274575140626318e-06, "loss": 3.5847, "step": 72 }, { "epoch": 0.2859941234084231, "grad_norm": 2.7532684803009033, "learning_rate": 1.6215629397966432e-06, "loss": 3.4655, "step": 73 }, { "epoch": 0.2899118511263467, "grad_norm": 2.7345879077911377, "learning_rate": 1.5174374208651913e-06, "loss": 3.3459, "step": 74 }, { "epoch": 0.2938295788442703, "grad_norm": 2.824265241622925, "learning_rate": 1.415290652206105e-06, "loss": 3.3271, "step": 75 }, { "epoch": 0.29774730656219395, "grad_norm": 3.0816643238067627, "learning_rate": 1.3153283438175036e-06, "loss": 3.4014, "step": 76 }, { "epoch": 0.30166503428011754, "grad_norm": 3.105966567993164, "learning_rate": 1.217751806485235e-06, "loss": 3.1919, "step": 77 }, { "epoch": 0.30558276199804113, "grad_norm": 3.160095691680908, "learning_rate": 1.122757546369744e-06, "loss": 3.3194, "step": 78 }, { "epoch": 0.3095004897159647, "grad_norm": 3.615074872970581, "learning_rate": 1.0305368692688175e-06, "loss": 3.4239, "step": 79 }, { "epoch": 0.31341821743388837, "grad_norm": 3.469092845916748, "learning_rate": 9.412754953531664e-07, "loss": 3.2093, "step": 80 }, { "epoch": 0.31733594515181196, "grad_norm": 3.430832624435425, "learning_rate": 8.551531851507186e-07, "loss": 3.3869, "step": 81 }, { "epoch": 0.32125367286973555, "grad_norm": 3.5903029441833496, "learning_rate": 7.723433775328385e-07, "loss": 3.3811, "step": 82 }, { "epoch": 0.32517140058765914, "grad_norm": 3.278386116027832, "learning_rate": 6.930128404315214e-07, "loss": 3.2986, "step": 83 }, { "epoch": 0.32908912830558273, "grad_norm": 3.9844226837158203, "learning_rate": 6.17321334990973e-07, "loss": 3.6662, "step": 84 }, { "epoch": 0.3330068560235064, "grad_norm": 3.422700881958008, "learning_rate": 5.454212938299256e-07, "loss": 3.1981, "step": 85 }, { "epoch": 0.33692458374143, "grad_norm": 3.988621234893799, "learning_rate": 4.774575140626317e-07, "loss": 3.4691, "step": 86 }, { "epoch": 0.34084231145935356, "grad_norm": 3.635932207107544, "learning_rate": 4.1356686569674344e-07, "loss": 3.4766, "step": 87 }, { "epoch": 0.34476003917727716, "grad_norm": 3.8308897018432617, "learning_rate": 3.538780159953348e-07, "loss": 3.6658, "step": 88 }, { "epoch": 0.3486777668952008, "grad_norm": 4.297379493713379, "learning_rate": 2.98511170358155e-07, "loss": 3.4658, "step": 89 }, { "epoch": 0.3525954946131244, "grad_norm": 4.496688365936279, "learning_rate": 2.4757783024395244e-07, "loss": 3.7584, "step": 90 }, { "epoch": 0.356513222331048, "grad_norm": 4.174125671386719, "learning_rate": 2.0118056862137358e-07, "loss": 3.5665, "step": 91 }, { "epoch": 0.3604309500489716, "grad_norm": 4.252688884735107, "learning_rate": 1.59412823400657e-07, "loss": 3.4631, "step": 92 }, { "epoch": 0.3643486777668952, "grad_norm": 4.413058757781982, "learning_rate": 1.223587092621162e-07, "loss": 3.7688, "step": 93 }, { "epoch": 0.3682664054848188, "grad_norm": 4.7277116775512695, "learning_rate": 9.00928482603669e-08, "loss": 3.6556, "step": 94 }, { "epoch": 0.3721841332027424, "grad_norm": 4.922855377197266, "learning_rate": 6.268021954544095e-08, "loss": 3.7743, "step": 95 }, { "epoch": 0.376101860920666, "grad_norm": 4.681742191314697, "learning_rate": 4.017602850342584e-08, "loss": 3.6726, "step": 96 }, { "epoch": 0.38001958863858964, "grad_norm": 5.408985137939453, "learning_rate": 2.262559558016325e-08, "loss": 3.6609, "step": 97 }, { "epoch": 0.38393731635651324, "grad_norm": 5.3041486740112305, "learning_rate": 1.006426501190233e-08, "loss": 3.873, "step": 98 }, { "epoch": 0.3878550440744368, "grad_norm": 5.634120941162109, "learning_rate": 2.5173336467135266e-09, "loss": 4.1503, "step": 99 }, { "epoch": 0.3917727717923604, "grad_norm": 7.089431285858154, "learning_rate": 0.0, "loss": 4.5872, "step": 100 }, { "epoch": 0.3917727717923604, "eval_loss": 0.8009337782859802, "eval_runtime": 55.4982, "eval_samples_per_second": 7.748, "eval_steps_per_second": 0.973, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.660991548522496e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }