{ "best_metric": 0.22028179466724396, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 0.8830022075055187, "eval_steps": 25, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008830022075055188, "grad_norm": 0.5267917513847351, "learning_rate": 2.9999999999999997e-05, "loss": 0.7436, "step": 1 }, { "epoch": 0.008830022075055188, "eval_loss": 0.7710736989974976, "eval_runtime": 8.9645, "eval_samples_per_second": 5.578, "eval_steps_per_second": 0.781, "step": 1 }, { "epoch": 0.017660044150110375, "grad_norm": 0.6040651202201843, "learning_rate": 5.9999999999999995e-05, "loss": 0.8066, "step": 2 }, { "epoch": 0.026490066225165563, "grad_norm": 0.6579259634017944, "learning_rate": 8.999999999999999e-05, "loss": 0.8391, "step": 3 }, { "epoch": 0.03532008830022075, "grad_norm": 0.6005827784538269, "learning_rate": 0.00011999999999999999, "loss": 0.7884, "step": 4 }, { "epoch": 0.04415011037527594, "grad_norm": 0.5310338735580444, "learning_rate": 0.00015, "loss": 0.5849, "step": 5 }, { "epoch": 0.052980132450331126, "grad_norm": 0.4925440847873688, "learning_rate": 0.00017999999999999998, "loss": 0.5388, "step": 6 }, { "epoch": 0.06181015452538632, "grad_norm": 0.6406821012496948, "learning_rate": 0.00020999999999999998, "loss": 0.4683, "step": 7 }, { "epoch": 0.0706401766004415, "grad_norm": 0.9326793551445007, "learning_rate": 0.00023999999999999998, "loss": 0.5396, "step": 8 }, { "epoch": 0.07947019867549669, "grad_norm": 0.5024523138999939, "learning_rate": 0.00027, "loss": 0.4557, "step": 9 }, { "epoch": 0.08830022075055188, "grad_norm": 0.5244933366775513, "learning_rate": 0.0003, "loss": 0.4242, "step": 10 }, { "epoch": 0.09713024282560706, "grad_norm": 0.5493826866149902, "learning_rate": 0.0002999794957488703, "loss": 0.3902, "step": 11 }, { "epoch": 0.10596026490066225, "grad_norm": 0.5801348686218262, "learning_rate": 0.0002999179886011389, "loss": 0.4015, "step": 12 }, { "epoch": 0.11479028697571744, "grad_norm": 0.48916712403297424, "learning_rate": 0.0002998154953722457, "loss": 0.3983, "step": 13 }, { "epoch": 0.12362030905077263, "grad_norm": 0.3941458761692047, "learning_rate": 0.00029967204408281613, "loss": 0.3628, "step": 14 }, { "epoch": 0.13245033112582782, "grad_norm": 0.4000348746776581, "learning_rate": 0.00029948767395100045, "loss": 0.3254, "step": 15 }, { "epoch": 0.141280353200883, "grad_norm": 0.39534318447113037, "learning_rate": 0.0002992624353817517, "loss": 0.3563, "step": 16 }, { "epoch": 0.15011037527593818, "grad_norm": 0.3745306134223938, "learning_rate": 0.0002989963899530457, "loss": 0.3458, "step": 17 }, { "epoch": 0.15894039735099338, "grad_norm": 0.3725316822528839, "learning_rate": 0.00029868961039904624, "loss": 0.3429, "step": 18 }, { "epoch": 0.16777041942604856, "grad_norm": 0.3427765965461731, "learning_rate": 0.00029834218059022024, "loss": 0.3101, "step": 19 }, { "epoch": 0.17660044150110377, "grad_norm": 0.3185417056083679, "learning_rate": 0.00029795419551040833, "loss": 0.2583, "step": 20 }, { "epoch": 0.18543046357615894, "grad_norm": 0.3798052668571472, "learning_rate": 0.00029752576123085736, "loss": 0.2997, "step": 21 }, { "epoch": 0.19426048565121412, "grad_norm": 0.4568968713283539, "learning_rate": 0.0002970569948812214, "loss": 0.2729, "step": 22 }, { "epoch": 0.20309050772626933, "grad_norm": 0.3697591722011566, "learning_rate": 0.0002965480246175399, "loss": 0.2551, "step": 23 }, { "epoch": 0.2119205298013245, "grad_norm": 0.36315158009529114, "learning_rate": 0.0002959989895872009, "loss": 0.2292, "step": 24 }, { "epoch": 0.22075055187637968, "grad_norm": 0.4563204050064087, "learning_rate": 0.0002954100398908995, "loss": 0.2202, "step": 25 }, { "epoch": 0.22075055187637968, "eval_loss": 0.3227197527885437, "eval_runtime": 8.5814, "eval_samples_per_second": 5.827, "eval_steps_per_second": 0.816, "step": 25 }, { "epoch": 0.22958057395143489, "grad_norm": 0.3887939155101776, "learning_rate": 0.0002947813365416023, "loss": 0.2462, "step": 26 }, { "epoch": 0.23841059602649006, "grad_norm": 0.5520837903022766, "learning_rate": 0.0002941130514205272, "loss": 0.4752, "step": 27 }, { "epoch": 0.24724061810154527, "grad_norm": 4.037907123565674, "learning_rate": 0.0002934053672301536, "loss": 1.4451, "step": 28 }, { "epoch": 0.2560706401766004, "grad_norm": 0.38813430070877075, "learning_rate": 0.00029265847744427303, "loss": 0.4314, "step": 29 }, { "epoch": 0.26490066225165565, "grad_norm": 0.34112170338630676, "learning_rate": 0.00029187258625509513, "loss": 0.4137, "step": 30 }, { "epoch": 0.2737306843267108, "grad_norm": 0.3160345256328583, "learning_rate": 0.00029104790851742417, "loss": 0.3477, "step": 31 }, { "epoch": 0.282560706401766, "grad_norm": 0.33565646409988403, "learning_rate": 0.0002901846696899191, "loss": 0.3028, "step": 32 }, { "epoch": 0.2913907284768212, "grad_norm": 0.26487964391708374, "learning_rate": 0.00028928310577345606, "loss": 0.2279, "step": 33 }, { "epoch": 0.30022075055187636, "grad_norm": 0.28182142972946167, "learning_rate": 0.0002883434632466077, "loss": 0.2752, "step": 34 }, { "epoch": 0.3090507726269316, "grad_norm": 0.3119100034236908, "learning_rate": 0.00028736599899825856, "loss": 0.2873, "step": 35 }, { "epoch": 0.31788079470198677, "grad_norm": 0.280179500579834, "learning_rate": 0.00028635098025737434, "loss": 0.2633, "step": 36 }, { "epoch": 0.32671081677704195, "grad_norm": 0.27022480964660645, "learning_rate": 0.00028529868451994384, "loss": 0.2684, "step": 37 }, { "epoch": 0.3355408388520971, "grad_norm": 0.31984391808509827, "learning_rate": 0.0002842093994731145, "loss": 0.2911, "step": 38 }, { "epoch": 0.3443708609271523, "grad_norm": 0.2874024212360382, "learning_rate": 0.00028308342291654174, "loss": 0.2726, "step": 39 }, { "epoch": 0.35320088300220753, "grad_norm": 0.2740573287010193, "learning_rate": 0.00028192106268097334, "loss": 0.2663, "step": 40 }, { "epoch": 0.3620309050772627, "grad_norm": 0.2838146686553955, "learning_rate": 0.00028072263654409154, "loss": 0.2439, "step": 41 }, { "epoch": 0.3708609271523179, "grad_norm": 0.2758471369743347, "learning_rate": 0.0002794884721436361, "loss": 0.2644, "step": 42 }, { "epoch": 0.37969094922737306, "grad_norm": 0.265119343996048, "learning_rate": 0.00027821890688783083, "loss": 0.2497, "step": 43 }, { "epoch": 0.38852097130242824, "grad_norm": 0.28861644864082336, "learning_rate": 0.0002769142878631403, "loss": 0.2751, "step": 44 }, { "epoch": 0.3973509933774834, "grad_norm": 0.2569108307361603, "learning_rate": 0.00027557497173937923, "loss": 0.2488, "step": 45 }, { "epoch": 0.40618101545253865, "grad_norm": 0.24016250669956207, "learning_rate": 0.000274201324672203, "loss": 0.2178, "step": 46 }, { "epoch": 0.41501103752759383, "grad_norm": 0.2832198143005371, "learning_rate": 0.00027279372220300385, "loss": 0.2597, "step": 47 }, { "epoch": 0.423841059602649, "grad_norm": 0.2444036304950714, "learning_rate": 0.0002713525491562421, "loss": 0.195, "step": 48 }, { "epoch": 0.4326710816777042, "grad_norm": 0.24414511024951935, "learning_rate": 0.00026987819953423867, "loss": 0.1861, "step": 49 }, { "epoch": 0.44150110375275936, "grad_norm": 0.27019017934799194, "learning_rate": 0.00026837107640945905, "loss": 0.2333, "step": 50 }, { "epoch": 0.44150110375275936, "eval_loss": 0.26030516624450684, "eval_runtime": 8.5711, "eval_samples_per_second": 5.834, "eval_steps_per_second": 0.817, "step": 50 }, { "epoch": 0.4503311258278146, "grad_norm": 0.26158371567726135, "learning_rate": 0.0002668315918143169, "loss": 0.1915, "step": 51 }, { "epoch": 0.45916114790286977, "grad_norm": 0.3106990456581116, "learning_rate": 0.00026526016662852886, "loss": 0.2188, "step": 52 }, { "epoch": 0.46799116997792495, "grad_norm": 0.35014471411705017, "learning_rate": 0.00026365723046405023, "loss": 0.2459, "step": 53 }, { "epoch": 0.4768211920529801, "grad_norm": 0.37680375576019287, "learning_rate": 0.0002620232215476231, "loss": 0.29, "step": 54 }, { "epoch": 0.4856512141280353, "grad_norm": 0.4499601423740387, "learning_rate": 0.0002603585866009697, "loss": 0.3253, "step": 55 }, { "epoch": 0.49448123620309054, "grad_norm": 1.3646451234817505, "learning_rate": 0.00025866378071866334, "loss": 0.635, "step": 56 }, { "epoch": 0.5033112582781457, "grad_norm": 0.3264712691307068, "learning_rate": 0.00025693926724370956, "loss": 0.3822, "step": 57 }, { "epoch": 0.5121412803532008, "grad_norm": 0.3070512115955353, "learning_rate": 0.00025518551764087326, "loss": 0.3222, "step": 58 }, { "epoch": 0.5209713024282561, "grad_norm": 0.27624809741973877, "learning_rate": 0.00025340301136778483, "loss": 0.3353, "step": 59 }, { "epoch": 0.5298013245033113, "grad_norm": 0.2753635048866272, "learning_rate": 0.00025159223574386114, "loss": 0.3167, "step": 60 }, { "epoch": 0.5386313465783664, "grad_norm": 0.2416149228811264, "learning_rate": 0.0002497536858170772, "loss": 0.2327, "step": 61 }, { "epoch": 0.5474613686534217, "grad_norm": 0.2498352825641632, "learning_rate": 0.00024788786422862526, "loss": 0.2401, "step": 62 }, { "epoch": 0.5562913907284768, "grad_norm": 0.24006612598896027, "learning_rate": 0.00024599528107549745, "loss": 0.2362, "step": 63 }, { "epoch": 0.565121412803532, "grad_norm": 0.23591412603855133, "learning_rate": 0.00024407645377103054, "loss": 0.2181, "step": 64 }, { "epoch": 0.5739514348785872, "grad_norm": 0.26131555438041687, "learning_rate": 0.00024213190690345018, "loss": 0.2303, "step": 65 }, { "epoch": 0.5827814569536424, "grad_norm": 0.23474238812923431, "learning_rate": 0.00024016217209245374, "loss": 0.2231, "step": 66 }, { "epoch": 0.5916114790286976, "grad_norm": 0.26084139943122864, "learning_rate": 0.00023816778784387094, "loss": 0.2424, "step": 67 }, { "epoch": 0.6004415011037527, "grad_norm": 0.28198346495628357, "learning_rate": 0.0002361492994024415, "loss": 0.2714, "step": 68 }, { "epoch": 0.609271523178808, "grad_norm": 0.26249727606773376, "learning_rate": 0.0002341072586027509, "loss": 0.2478, "step": 69 }, { "epoch": 0.6181015452538632, "grad_norm": 0.2686706781387329, "learning_rate": 0.00023204222371836405, "loss": 0.2649, "step": 70 }, { "epoch": 0.6269315673289183, "grad_norm": 0.2589472234249115, "learning_rate": 0.00022995475930919905, "loss": 0.247, "step": 71 }, { "epoch": 0.6357615894039735, "grad_norm": 0.24519923329353333, "learning_rate": 0.00022784543606718227, "loss": 0.233, "step": 72 }, { "epoch": 0.6445916114790287, "grad_norm": 0.25989893078804016, "learning_rate": 0.00022571483066022657, "loss": 0.2705, "step": 73 }, { "epoch": 0.6534216335540839, "grad_norm": 0.2571690082550049, "learning_rate": 0.0002235635255745762, "loss": 0.2456, "step": 74 }, { "epoch": 0.6622516556291391, "grad_norm": 0.253063440322876, "learning_rate": 0.00022139210895556104, "loss": 0.182, "step": 75 }, { "epoch": 0.6622516556291391, "eval_loss": 0.23765632510185242, "eval_runtime": 8.9718, "eval_samples_per_second": 5.573, "eval_steps_per_second": 0.78, "step": 75 }, { "epoch": 0.6710816777041942, "grad_norm": 0.26982638239860535, "learning_rate": 0.00021920117444680317, "loss": 0.2228, "step": 76 }, { "epoch": 0.6799116997792495, "grad_norm": 0.2407388687133789, "learning_rate": 0.00021699132102792097, "loss": 0.1981, "step": 77 }, { "epoch": 0.6887417218543046, "grad_norm": 0.2429220825433731, "learning_rate": 0.0002147631528507739, "loss": 0.2099, "step": 78 }, { "epoch": 0.6975717439293598, "grad_norm": 0.2709774672985077, "learning_rate": 0.00021251727907429355, "loss": 0.208, "step": 79 }, { "epoch": 0.7064017660044151, "grad_norm": 0.2707652747631073, "learning_rate": 0.0002102543136979454, "loss": 0.2074, "step": 80 }, { "epoch": 0.7152317880794702, "grad_norm": 0.34875017404556274, "learning_rate": 0.0002079748753938678, "loss": 0.2687, "step": 81 }, { "epoch": 0.7240618101545254, "grad_norm": 0.3415201008319855, "learning_rate": 0.0002056795873377331, "loss": 0.2631, "step": 82 }, { "epoch": 0.7328918322295805, "grad_norm": 0.40681779384613037, "learning_rate": 0.00020336907703837748, "loss": 0.2692, "step": 83 }, { "epoch": 0.7417218543046358, "grad_norm": 1.7035514116287231, "learning_rate": 0.00020104397616624645, "loss": 0.6545, "step": 84 }, { "epoch": 0.7505518763796909, "grad_norm": 0.29007866978645325, "learning_rate": 0.00019870492038070252, "loss": 0.3436, "step": 85 }, { "epoch": 0.7593818984547461, "grad_norm": 0.28065428137779236, "learning_rate": 0.0001963525491562421, "loss": 0.299, "step": 86 }, { "epoch": 0.7682119205298014, "grad_norm": 0.2473965883255005, "learning_rate": 0.0001939875056076697, "loss": 0.3076, "step": 87 }, { "epoch": 0.7770419426048565, "grad_norm": 0.2542283535003662, "learning_rate": 0.00019161043631427666, "loss": 0.2815, "step": 88 }, { "epoch": 0.7858719646799117, "grad_norm": 0.22185376286506653, "learning_rate": 0.00018922199114307294, "loss": 0.2297, "step": 89 }, { "epoch": 0.7947019867549668, "grad_norm": 0.20742599666118622, "learning_rate": 0.00018682282307111987, "loss": 0.1853, "step": 90 }, { "epoch": 0.8035320088300221, "grad_norm": 0.23171746730804443, "learning_rate": 0.00018441358800701273, "loss": 0.2454, "step": 91 }, { "epoch": 0.8123620309050773, "grad_norm": 0.2426079958677292, "learning_rate": 0.00018199494461156203, "loss": 0.1993, "step": 92 }, { "epoch": 0.8211920529801324, "grad_norm": 0.26155412197113037, "learning_rate": 0.000179567554117722, "loss": 0.2489, "step": 93 }, { "epoch": 0.8300220750551877, "grad_norm": 0.25975102186203003, "learning_rate": 0.00017713208014981648, "loss": 0.2235, "step": 94 }, { "epoch": 0.8388520971302428, "grad_norm": 0.2326008379459381, "learning_rate": 0.00017468918854211007, "loss": 0.2288, "step": 95 }, { "epoch": 0.847682119205298, "grad_norm": 0.24007712304592133, "learning_rate": 0.00017223954715677627, "loss": 0.2236, "step": 96 }, { "epoch": 0.8565121412803532, "grad_norm": 0.24881809949874878, "learning_rate": 0.00016978382570131034, "loss": 0.2407, "step": 97 }, { "epoch": 0.8653421633554084, "grad_norm": 0.25319480895996094, "learning_rate": 0.00016732269554543794, "loss": 0.2066, "step": 98 }, { "epoch": 0.8741721854304636, "grad_norm": 0.24012450873851776, "learning_rate": 0.00016485682953756942, "loss": 0.2191, "step": 99 }, { "epoch": 0.8830022075055187, "grad_norm": 0.23768042027950287, "learning_rate": 0.00016238690182084986, "loss": 0.2296, "step": 100 }, { "epoch": 0.8830022075055187, "eval_loss": 0.22028179466724396, "eval_runtime": 8.4215, "eval_samples_per_second": 5.937, "eval_steps_per_second": 0.831, "step": 100 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.2189384971740774e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }