{ "best_metric": 0.6968957781791687, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 2.5901374292643493, "eval_steps": 25, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025869037995149554, "grad_norm": 348.736083984375, "learning_rate": 2.9999999999999997e-05, "loss": 274.9102, "step": 1 }, { "epoch": 0.025869037995149554, "eval_loss": 8.283905982971191, "eval_runtime": 0.6332, "eval_samples_per_second": 78.958, "eval_steps_per_second": 3.158, "step": 1 }, { "epoch": 0.05173807599029911, "grad_norm": 348.5337829589844, "learning_rate": 5.9999999999999995e-05, "loss": 269.3359, "step": 2 }, { "epoch": 0.07760711398544867, "grad_norm": 336.2130126953125, "learning_rate": 8.999999999999999e-05, "loss": 265.1719, "step": 3 }, { "epoch": 0.10347615198059822, "grad_norm": 333.7207946777344, "learning_rate": 0.00011999999999999999, "loss": 249.0039, "step": 4 }, { "epoch": 0.1293451899757478, "grad_norm": 298.2082824707031, "learning_rate": 0.00015, "loss": 206.7969, "step": 5 }, { "epoch": 0.15521422797089734, "grad_norm": 250.3636474609375, "learning_rate": 0.00017999999999999998, "loss": 161.4229, "step": 6 }, { "epoch": 0.18108326596604687, "grad_norm": 226.34564208984375, "learning_rate": 0.00020999999999999998, "loss": 120.4238, "step": 7 }, { "epoch": 0.20695230396119643, "grad_norm": 308.28143310546875, "learning_rate": 0.00023999999999999998, "loss": 84.0972, "step": 8 }, { "epoch": 0.232821341956346, "grad_norm": 549.4783325195312, "learning_rate": 0.00027, "loss": 52.3253, "step": 9 }, { "epoch": 0.2586903799514956, "grad_norm": 572.8822021484375, "learning_rate": 0.0003, "loss": 46.6969, "step": 10 }, { "epoch": 0.28455941794664513, "grad_norm": 67.11377716064453, "learning_rate": 0.00029993412547631913, "loss": 24.6701, "step": 11 }, { "epoch": 0.3104284559417947, "grad_norm": 113.78333282470703, "learning_rate": 0.0002997365597646482, "loss": 26.2216, "step": 12 }, { "epoch": 0.33629749393694425, "grad_norm": 129.92559814453125, "learning_rate": 0.0002994074763922825, "loss": 24.956, "step": 13 }, { "epoch": 0.36216653193209375, "grad_norm": 104.27074432373047, "learning_rate": 0.0002989471644020275, "loss": 24.2584, "step": 14 }, { "epoch": 0.3880355699272433, "grad_norm": 119.985595703125, "learning_rate": 0.00029835602809832456, "loss": 25.2181, "step": 15 }, { "epoch": 0.41390460792239286, "grad_norm": 48.832435607910156, "learning_rate": 0.0002976345866921395, "loss": 22.767, "step": 16 }, { "epoch": 0.4397736459175424, "grad_norm": 81.3912353515625, "learning_rate": 0.0002967834738449256, "loss": 23.8924, "step": 17 }, { "epoch": 0.465642683912692, "grad_norm": 54.908119201660156, "learning_rate": 0.0002958034371120616, "loss": 24.4696, "step": 18 }, { "epoch": 0.49151172190784154, "grad_norm": 51.236083984375, "learning_rate": 0.00029469533728625376, "loss": 23.9197, "step": 19 }, { "epoch": 0.5173807599029911, "grad_norm": 47.18868637084961, "learning_rate": 0.00029346014764147836, "loss": 23.3075, "step": 20 }, { "epoch": 0.5432497978981407, "grad_norm": 19.380447387695312, "learning_rate": 0.0002920989530781287, "loss": 22.372, "step": 21 }, { "epoch": 0.5691188358932903, "grad_norm": 51.02498245239258, "learning_rate": 0.00029061294917011814, "loss": 23.7828, "step": 22 }, { "epoch": 0.5949878738884398, "grad_norm": 61.17385482788086, "learning_rate": 0.000289003441114775, "loss": 25.444, "step": 23 }, { "epoch": 0.6208569118835894, "grad_norm": 34.5933723449707, "learning_rate": 0.0002872718425864527, "loss": 23.0492, "step": 24 }, { "epoch": 0.6467259498787389, "grad_norm": 19.800092697143555, "learning_rate": 0.0002854196744948615, "loss": 22.5537, "step": 25 }, { "epoch": 0.6467259498787389, "eval_loss": 0.741091787815094, "eval_runtime": 0.6281, "eval_samples_per_second": 79.606, "eval_steps_per_second": 3.184, "step": 25 }, { "epoch": 0.6725949878738885, "grad_norm": 28.665796279907227, "learning_rate": 0.0002834485636492121, "loss": 23.0459, "step": 26 }, { "epoch": 0.698464025869038, "grad_norm": 54.957481384277344, "learning_rate": 0.0002813602413293455, "loss": 25.5733, "step": 27 }, { "epoch": 0.7243330638641875, "grad_norm": 28.139915466308594, "learning_rate": 0.0002791565417651033, "loss": 23.6223, "step": 28 }, { "epoch": 0.7502021018593371, "grad_norm": 29.63477897644043, "learning_rate": 0.0002768394005252739, "loss": 23.6755, "step": 29 }, { "epoch": 0.7760711398544866, "grad_norm": 24.49903678894043, "learning_rate": 0.00027441085281753024, "loss": 22.999, "step": 30 }, { "epoch": 0.8019401778496362, "grad_norm": 16.341554641723633, "learning_rate": 0.0002718730317008522, "loss": 22.6584, "step": 31 }, { "epoch": 0.8278092158447857, "grad_norm": 19.589542388916016, "learning_rate": 0.000269228166212003, "loss": 23.1001, "step": 32 }, { "epoch": 0.8536782538399353, "grad_norm": 20.623701095581055, "learning_rate": 0.00026647857940770634, "loss": 23.1374, "step": 33 }, { "epoch": 0.8795472918350848, "grad_norm": 24.688899993896484, "learning_rate": 0.000263626686324243, "loss": 23.4097, "step": 34 }, { "epoch": 0.9054163298302345, "grad_norm": 22.096830368041992, "learning_rate": 0.0002606749918562591, "loss": 23.1316, "step": 35 }, { "epoch": 0.931285367825384, "grad_norm": 27.559904098510742, "learning_rate": 0.00025762608855664965, "loss": 23.2516, "step": 36 }, { "epoch": 0.9571544058205336, "grad_norm": 15.820242881774902, "learning_rate": 0.00025448265435944954, "loss": 22.5069, "step": 37 }, { "epoch": 0.9830234438156831, "grad_norm": 18.462657928466797, "learning_rate": 0.0002512474502277316, "loss": 22.9305, "step": 38 }, { "epoch": 1.0105092966855296, "grad_norm": 16.62592887878418, "learning_rate": 0.0002479233177285782, "loss": 22.217, "step": 39 }, { "epoch": 1.036378334680679, "grad_norm": 19.09794044494629, "learning_rate": 0.0002445131765372567, "loss": 22.9307, "step": 40 }, { "epoch": 1.0622473726758286, "grad_norm": 17.65207290649414, "learning_rate": 0.000241020021872789, "loss": 22.5269, "step": 41 }, { "epoch": 1.0881164106709782, "grad_norm": 23.124086380004883, "learning_rate": 0.00023744692186717078, "loss": 22.7486, "step": 42 }, { "epoch": 1.1139854486661278, "grad_norm": 3.4357526302337646, "learning_rate": 0.00023379701487054785, "loss": 22.0516, "step": 43 }, { "epoch": 1.1398544866612772, "grad_norm": 30.857542037963867, "learning_rate": 0.00023007350669471862, "loss": 23.7532, "step": 44 }, { "epoch": 1.1657235246564268, "grad_norm": 40.20075988769531, "learning_rate": 0.00022627966779738306, "loss": 24.4793, "step": 45 }, { "epoch": 1.1915925626515764, "grad_norm": 24.608131408691406, "learning_rate": 0.00022241883040961173, "loss": 22.423, "step": 46 }, { "epoch": 1.217461600646726, "grad_norm": 11.946351051330566, "learning_rate": 0.00021849438560905693, "loss": 22.8885, "step": 47 }, { "epoch": 1.2433306386418754, "grad_norm": 11.997475624084473, "learning_rate": 0.00021450978034147806, "loss": 22.3267, "step": 48 }, { "epoch": 1.269199676637025, "grad_norm": 24.218584060668945, "learning_rate": 0.00021046851439319585, "loss": 22.9163, "step": 49 }, { "epoch": 1.2950687146321747, "grad_norm": 21.68750762939453, "learning_rate": 0.0002063741373171357, "loss": 22.702, "step": 50 }, { "epoch": 1.2950687146321747, "eval_loss": 0.7224195599555969, "eval_runtime": 0.6276, "eval_samples_per_second": 79.667, "eval_steps_per_second": 3.187, "step": 50 }, { "epoch": 1.3209377526273243, "grad_norm": 15.50403118133545, "learning_rate": 0.0002022302453151598, "loss": 22.3089, "step": 51 }, { "epoch": 1.3468067906224737, "grad_norm": 24.226417541503906, "learning_rate": 0.0001980404780794256, "loss": 23.1566, "step": 52 }, { "epoch": 1.3726758286176233, "grad_norm": 20.176647186279297, "learning_rate": 0.00019380851559554636, "loss": 22.929, "step": 53 }, { "epoch": 1.3985448666127729, "grad_norm": 15.459396362304688, "learning_rate": 0.00018953807491036011, "loss": 22.6978, "step": 54 }, { "epoch": 1.4244139046079223, "grad_norm": 14.14704418182373, "learning_rate": 0.00018523290686714756, "loss": 22.7141, "step": 55 }, { "epoch": 1.450282942603072, "grad_norm": 7.990035533905029, "learning_rate": 0.00018089679281116472, "loss": 23.1633, "step": 56 }, { "epoch": 1.4761519805982215, "grad_norm": 3.211017608642578, "learning_rate": 0.00017653354126838592, "loss": 22.3353, "step": 57 }, { "epoch": 1.502021018593371, "grad_norm": 18.740083694458008, "learning_rate": 0.00017214698460037218, "loss": 23.5309, "step": 58 }, { "epoch": 1.5278900565885207, "grad_norm": 16.11014175415039, "learning_rate": 0.00016774097563820485, "loss": 22.8019, "step": 59 }, { "epoch": 1.5537590945836701, "grad_norm": 26.232606887817383, "learning_rate": 0.00016331938429844022, "loss": 23.5109, "step": 60 }, { "epoch": 1.5796281325788197, "grad_norm": 16.256412506103516, "learning_rate": 0.00015888609418405713, "loss": 22.8009, "step": 61 }, { "epoch": 1.6054971705739693, "grad_norm": 11.629958152770996, "learning_rate": 0.00015444499917338395, "loss": 22.3203, "step": 62 }, { "epoch": 1.6313662085691187, "grad_norm": 11.147138595581055, "learning_rate": 0.00015, "loss": 22.4757, "step": 63 }, { "epoch": 1.6572352465642683, "grad_norm": 5.99025297164917, "learning_rate": 0.00014555500082661602, "loss": 22.2444, "step": 64 }, { "epoch": 1.683104284559418, "grad_norm": 11.468669891357422, "learning_rate": 0.00014111390581594284, "loss": 22.2462, "step": 65 }, { "epoch": 1.7089733225545674, "grad_norm": 14.979022979736328, "learning_rate": 0.00013668061570155978, "loss": 21.7589, "step": 66 }, { "epoch": 1.7348423605497172, "grad_norm": 12.94080924987793, "learning_rate": 0.00013225902436179513, "loss": 22.4269, "step": 67 }, { "epoch": 1.7607113985448666, "grad_norm": 11.411182403564453, "learning_rate": 0.00012785301539962782, "loss": 21.7354, "step": 68 }, { "epoch": 1.7865804365400162, "grad_norm": 27.090801239013672, "learning_rate": 0.00012346645873161408, "loss": 23.5318, "step": 69 }, { "epoch": 1.8124494745351658, "grad_norm": 17.46219825744629, "learning_rate": 0.00011910320718883525, "loss": 22.8003, "step": 70 }, { "epoch": 1.8383185125303152, "grad_norm": 17.276792526245117, "learning_rate": 0.00011476709313285244, "loss": 22.7198, "step": 71 }, { "epoch": 1.8641875505254648, "grad_norm": 13.101729393005371, "learning_rate": 0.00011046192508963989, "loss": 22.2413, "step": 72 }, { "epoch": 1.8900565885206144, "grad_norm": 10.330924987792969, "learning_rate": 0.00010619148440445364, "loss": 21.9412, "step": 73 }, { "epoch": 1.9159256265157638, "grad_norm": 16.028894424438477, "learning_rate": 0.00010195952192057438, "loss": 22.5098, "step": 74 }, { "epoch": 1.9417946645109136, "grad_norm": 8.1192626953125, "learning_rate": 9.776975468484019e-05, "loss": 22.1182, "step": 75 }, { "epoch": 1.9417946645109136, "eval_loss": 0.7175214886665344, "eval_runtime": 0.6276, "eval_samples_per_second": 79.669, "eval_steps_per_second": 3.187, "step": 75 }, { "epoch": 1.967663702506063, "grad_norm": 11.423409461975098, "learning_rate": 9.36258626828643e-05, "loss": 22.3389, "step": 76 }, { "epoch": 1.9935327405012127, "grad_norm": 12.934334754943848, "learning_rate": 8.953148560680418e-05, "loss": 22.7501, "step": 77 }, { "epoch": 2.021018593371059, "grad_norm": 22.10219383239746, "learning_rate": 8.549021965852197e-05, "loss": 23.1807, "step": 78 }, { "epoch": 2.0468876313662085, "grad_norm": 15.90378475189209, "learning_rate": 8.150561439094303e-05, "loss": 22.5372, "step": 79 }, { "epoch": 2.072756669361358, "grad_norm": 10.656487464904785, "learning_rate": 7.758116959038828e-05, "loss": 22.1827, "step": 80 }, { "epoch": 2.0986257073565078, "grad_norm": 22.766876220703125, "learning_rate": 7.372033220261696e-05, "loss": 22.6163, "step": 81 }, { "epoch": 2.124494745351657, "grad_norm": 11.259724617004395, "learning_rate": 6.992649330528145e-05, "loss": 22.0147, "step": 82 }, { "epoch": 2.1503637833468066, "grad_norm": 12.66515827178955, "learning_rate": 6.620298512945214e-05, "loss": 21.9512, "step": 83 }, { "epoch": 2.1762328213419564, "grad_norm": 5.229973793029785, "learning_rate": 6.255307813282921e-05, "loss": 22.17, "step": 84 }, { "epoch": 2.202101859337106, "grad_norm": 6.952908992767334, "learning_rate": 5.897997812721103e-05, "loss": 22.418, "step": 85 }, { "epoch": 2.2279708973322556, "grad_norm": 9.438949584960938, "learning_rate": 5.5486823462743344e-05, "loss": 22.334, "step": 86 }, { "epoch": 2.253839935327405, "grad_norm": 13.546004295349121, "learning_rate": 5.2076682271421774e-05, "loss": 22.3634, "step": 87 }, { "epoch": 2.2797089733225544, "grad_norm": 14.096308708190918, "learning_rate": 4.8752549772268444e-05, "loss": 22.6631, "step": 88 }, { "epoch": 2.3055780113177042, "grad_norm": 18.847871780395508, "learning_rate": 4.551734564055049e-05, "loss": 22.0801, "step": 89 }, { "epoch": 2.3314470493128536, "grad_norm": 7.903066635131836, "learning_rate": 4.2373911443350286e-05, "loss": 22.043, "step": 90 }, { "epoch": 2.3573160873080035, "grad_norm": 16.976978302001953, "learning_rate": 3.932500814374089e-05, "loss": 22.2002, "step": 91 }, { "epoch": 2.383185125303153, "grad_norm": 11.1248140335083, "learning_rate": 3.637331367575698e-05, "loss": 22.1329, "step": 92 }, { "epoch": 2.4090541632983022, "grad_norm": 5.761756896972656, "learning_rate": 3.352142059229365e-05, "loss": 22.0856, "step": 93 }, { "epoch": 2.434923201293452, "grad_norm": 12.847921371459961, "learning_rate": 3.077183378799699e-05, "loss": 22.0646, "step": 94 }, { "epoch": 2.4607922392886015, "grad_norm": 9.289769172668457, "learning_rate": 2.81269682991478e-05, "loss": 21.8848, "step": 95 }, { "epoch": 2.486661277283751, "grad_norm": 13.644316673278809, "learning_rate": 2.5589147182469732e-05, "loss": 23.1436, "step": 96 }, { "epoch": 2.5125303152789007, "grad_norm": 16.434682846069336, "learning_rate": 2.316059947472607e-05, "loss": 22.212, "step": 97 }, { "epoch": 2.53839935327405, "grad_norm": 6.969300270080566, "learning_rate": 2.0843458234896666e-05, "loss": 22.2793, "step": 98 }, { "epoch": 2.5642683912691995, "grad_norm": 21.42749786376953, "learning_rate": 1.8639758670654486e-05, "loss": 22.3692, "step": 99 }, { "epoch": 2.5901374292643493, "grad_norm": 13.674956321716309, "learning_rate": 1.6551436350787918e-05, "loss": 22.2481, "step": 100 }, { "epoch": 2.5901374292643493, "eval_loss": 0.6968957781791687, "eval_runtime": 0.6272, "eval_samples_per_second": 79.714, "eval_steps_per_second": 3.189, "step": 100 } ], "logging_steps": 1, "max_steps": 116, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.020403094913024e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }