{ "best_metric": 0.6065332889556885, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 0.034299434059338024, "eval_steps": 25, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003429943405933802, "grad_norm": 37.64034652709961, "learning_rate": 2e-05, "loss": 8.6205, "step": 1 }, { "epoch": 0.0003429943405933802, "eval_loss": 9.76213550567627, "eval_runtime": 454.8819, "eval_samples_per_second": 86.359, "eval_steps_per_second": 5.399, "step": 1 }, { "epoch": 0.0006859886811867604, "grad_norm": 44.59769058227539, "learning_rate": 4e-05, "loss": 9.6734, "step": 2 }, { "epoch": 0.0010289830217801406, "grad_norm": 34.71078109741211, "learning_rate": 6e-05, "loss": 8.9436, "step": 3 }, { "epoch": 0.0013719773623735209, "grad_norm": 27.079349517822266, "learning_rate": 8e-05, "loss": 8.1493, "step": 4 }, { "epoch": 0.001714971702966901, "grad_norm": 25.673320770263672, "learning_rate": 0.0001, "loss": 5.7357, "step": 5 }, { "epoch": 0.0020579660435602813, "grad_norm": 26.150300979614258, "learning_rate": 9.997266286704631e-05, "loss": 3.8443, "step": 6 }, { "epoch": 0.0024009603841536613, "grad_norm": 18.380756378173828, "learning_rate": 9.989068136093873e-05, "loss": 2.155, "step": 7 }, { "epoch": 0.0027439547247470417, "grad_norm": 16.064098358154297, "learning_rate": 9.975414512725057e-05, "loss": 1.3593, "step": 8 }, { "epoch": 0.0030869490653404217, "grad_norm": 15.563278198242188, "learning_rate": 9.956320346634876e-05, "loss": 1.1666, "step": 9 }, { "epoch": 0.003429943405933802, "grad_norm": 7.041236400604248, "learning_rate": 9.931806517013612e-05, "loss": 0.8032, "step": 10 }, { "epoch": 0.003772937746527182, "grad_norm": 7.1518940925598145, "learning_rate": 9.901899829374047e-05, "loss": 0.6811, "step": 11 }, { "epoch": 0.004115932087120563, "grad_norm": 10.00113296508789, "learning_rate": 9.86663298624003e-05, "loss": 0.6515, "step": 12 }, { "epoch": 0.004458926427713943, "grad_norm": 11.751809120178223, "learning_rate": 9.826044551386744e-05, "loss": 1.1076, "step": 13 }, { "epoch": 0.004801920768307323, "grad_norm": 8.324594497680664, "learning_rate": 9.780178907671789e-05, "loss": 0.8199, "step": 14 }, { "epoch": 0.0051449151089007035, "grad_norm": 13.078007698059082, "learning_rate": 9.729086208503174e-05, "loss": 0.6963, "step": 15 }, { "epoch": 0.0054879094494940835, "grad_norm": 9.826541900634766, "learning_rate": 9.672822322997305e-05, "loss": 0.9405, "step": 16 }, { "epoch": 0.0058309037900874635, "grad_norm": 5.5718536376953125, "learning_rate": 9.611448774886924e-05, "loss": 0.7803, "step": 17 }, { "epoch": 0.0061738981306808435, "grad_norm": 8.907158851623535, "learning_rate": 9.545032675245813e-05, "loss": 0.7238, "step": 18 }, { "epoch": 0.006516892471274224, "grad_norm": 26.508176803588867, "learning_rate": 9.473646649103818e-05, "loss": 1.2421, "step": 19 }, { "epoch": 0.006859886811867604, "grad_norm": 9.246644973754883, "learning_rate": 9.397368756032445e-05, "loss": 0.7533, "step": 20 }, { "epoch": 0.007202881152460984, "grad_norm": 4.378249168395996, "learning_rate": 9.316282404787871e-05, "loss": 0.6177, "step": 21 }, { "epoch": 0.007545875493054364, "grad_norm": 3.846181631088257, "learning_rate": 9.230476262104677e-05, "loss": 0.626, "step": 22 }, { "epoch": 0.007888869833647744, "grad_norm": 4.822765350341797, "learning_rate": 9.140044155740101e-05, "loss": 0.616, "step": 23 }, { "epoch": 0.008231864174241125, "grad_norm": 2.799402952194214, "learning_rate": 9.045084971874738e-05, "loss": 0.5661, "step": 24 }, { "epoch": 0.008574858514834506, "grad_norm": 4.079718589782715, "learning_rate": 8.945702546981969e-05, "loss": 0.6022, "step": 25 }, { "epoch": 0.008574858514834506, "eval_loss": 0.8190973401069641, "eval_runtime": 455.0157, "eval_samples_per_second": 86.333, "eval_steps_per_second": 5.398, "step": 25 }, { "epoch": 0.008917852855427885, "grad_norm": 13.959466934204102, "learning_rate": 8.842005554284296e-05, "loss": 1.3743, "step": 26 }, { "epoch": 0.009260847196021266, "grad_norm": 5.51138162612915, "learning_rate": 8.73410738492077e-05, "loss": 0.6511, "step": 27 }, { "epoch": 0.009603841536614645, "grad_norm": 5.255268096923828, "learning_rate": 8.622126023955446e-05, "loss": 0.8741, "step": 28 }, { "epoch": 0.009946835877208026, "grad_norm": 7.801476955413818, "learning_rate": 8.506183921362443e-05, "loss": 1.2424, "step": 29 }, { "epoch": 0.010289830217801407, "grad_norm": 6.918961048126221, "learning_rate": 8.386407858128706e-05, "loss": 1.0745, "step": 30 }, { "epoch": 0.010632824558394786, "grad_norm": 6.8002543449401855, "learning_rate": 8.262928807620843e-05, "loss": 0.8693, "step": 31 }, { "epoch": 0.010975818898988167, "grad_norm": 9.85213851928711, "learning_rate": 8.135881792367686e-05, "loss": 0.8527, "step": 32 }, { "epoch": 0.011318813239581546, "grad_norm": 8.137986183166504, "learning_rate": 8.005405736415126e-05, "loss": 0.7667, "step": 33 }, { "epoch": 0.011661807580174927, "grad_norm": 4.770893573760986, "learning_rate": 7.871643313414718e-05, "loss": 0.6926, "step": 34 }, { "epoch": 0.012004801920768308, "grad_norm": 3.7439723014831543, "learning_rate": 7.734740790612136e-05, "loss": 0.6625, "step": 35 }, { "epoch": 0.012347796261361687, "grad_norm": 2.6367475986480713, "learning_rate": 7.594847868906076e-05, "loss": 0.5606, "step": 36 }, { "epoch": 0.012690790601955068, "grad_norm": 2.4979803562164307, "learning_rate": 7.452117519152542e-05, "loss": 0.5968, "step": 37 }, { "epoch": 0.013033784942548449, "grad_norm": 5.410016059875488, "learning_rate": 7.30670581489344e-05, "loss": 0.8155, "step": 38 }, { "epoch": 0.013376779283141828, "grad_norm": 4.380951404571533, "learning_rate": 7.158771761692464e-05, "loss": 0.7026, "step": 39 }, { "epoch": 0.013719773623735209, "grad_norm": 3.8239080905914307, "learning_rate": 7.008477123264848e-05, "loss": 0.5371, "step": 40 }, { "epoch": 0.014062767964328588, "grad_norm": 4.943681240081787, "learning_rate": 6.855986244591104e-05, "loss": 0.8807, "step": 41 }, { "epoch": 0.014405762304921969, "grad_norm": 6.690975189208984, "learning_rate": 6.701465872208216e-05, "loss": 0.9235, "step": 42 }, { "epoch": 0.01474875664551535, "grad_norm": 5.062525272369385, "learning_rate": 6.545084971874738e-05, "loss": 0.7921, "step": 43 }, { "epoch": 0.015091750986108729, "grad_norm": 8.082966804504395, "learning_rate": 6.387014543809223e-05, "loss": 0.788, "step": 44 }, { "epoch": 0.01543474532670211, "grad_norm": 9.022645950317383, "learning_rate": 6.227427435703997e-05, "loss": 0.8071, "step": 45 }, { "epoch": 0.01577773966729549, "grad_norm": 9.152997016906738, "learning_rate": 6.066498153718735e-05, "loss": 0.8149, "step": 46 }, { "epoch": 0.01612073400788887, "grad_norm": 8.326438903808594, "learning_rate": 5.90440267166055e-05, "loss": 0.7687, "step": 47 }, { "epoch": 0.01646372834848225, "grad_norm": 6.8462958335876465, "learning_rate": 5.74131823855921e-05, "loss": 0.705, "step": 48 }, { "epoch": 0.01680672268907563, "grad_norm": 4.630781650543213, "learning_rate": 5.577423184847932e-05, "loss": 0.657, "step": 49 }, { "epoch": 0.017149717029669012, "grad_norm": 3.1709320545196533, "learning_rate": 5.4128967273616625e-05, "loss": 0.6271, "step": 50 }, { "epoch": 0.017149717029669012, "eval_loss": 0.6372910141944885, "eval_runtime": 454.4408, "eval_samples_per_second": 86.442, "eval_steps_per_second": 5.404, "step": 50 }, { "epoch": 0.01749271137026239, "grad_norm": 4.854703903198242, "learning_rate": 5.247918773366112e-05, "loss": 0.6515, "step": 51 }, { "epoch": 0.01783570571085577, "grad_norm": 3.4379847049713135, "learning_rate": 5.0826697238317935e-05, "loss": 0.5102, "step": 52 }, { "epoch": 0.01817870005144915, "grad_norm": 1.0231150388717651, "learning_rate": 4.917330276168208e-05, "loss": 0.5786, "step": 53 }, { "epoch": 0.018521694392042532, "grad_norm": 9.799490928649902, "learning_rate": 4.7520812266338885e-05, "loss": 1.1475, "step": 54 }, { "epoch": 0.01886468873263591, "grad_norm": 7.999386310577393, "learning_rate": 4.5871032726383386e-05, "loss": 1.0773, "step": 55 }, { "epoch": 0.01920768307322929, "grad_norm": 6.689034938812256, "learning_rate": 4.4225768151520694e-05, "loss": 0.9807, "step": 56 }, { "epoch": 0.019550677413822673, "grad_norm": 3.2716429233551025, "learning_rate": 4.2586817614407895e-05, "loss": 0.6374, "step": 57 }, { "epoch": 0.019893671754416052, "grad_norm": 2.5127179622650146, "learning_rate": 4.095597328339452e-05, "loss": 0.5646, "step": 58 }, { "epoch": 0.02023666609500943, "grad_norm": 2.1465413570404053, "learning_rate": 3.933501846281267e-05, "loss": 0.5376, "step": 59 }, { "epoch": 0.020579660435602814, "grad_norm": 1.16959547996521, "learning_rate": 3.772572564296005e-05, "loss": 0.5539, "step": 60 }, { "epoch": 0.020922654776196193, "grad_norm": 1.0824437141418457, "learning_rate": 3.612985456190778e-05, "loss": 0.5521, "step": 61 }, { "epoch": 0.021265649116789572, "grad_norm": 1.995509147644043, "learning_rate": 3.4549150281252636e-05, "loss": 0.5849, "step": 62 }, { "epoch": 0.021608643457382955, "grad_norm": 4.708991527557373, "learning_rate": 3.298534127791785e-05, "loss": 0.7752, "step": 63 }, { "epoch": 0.021951637797976334, "grad_norm": 4.980032444000244, "learning_rate": 3.144013755408895e-05, "loss": 0.7324, "step": 64 }, { "epoch": 0.022294632138569713, "grad_norm": 3.3115477561950684, "learning_rate": 2.991522876735154e-05, "loss": 0.5236, "step": 65 }, { "epoch": 0.022637626479163092, "grad_norm": 3.6939260959625244, "learning_rate": 2.8412282383075363e-05, "loss": 0.7728, "step": 66 }, { "epoch": 0.022980620819756475, "grad_norm": 4.9289679527282715, "learning_rate": 2.693294185106562e-05, "loss": 0.9046, "step": 67 }, { "epoch": 0.023323615160349854, "grad_norm": 4.798633098602295, "learning_rate": 2.547882480847461e-05, "loss": 0.88, "step": 68 }, { "epoch": 0.023666609500943233, "grad_norm": 3.1352736949920654, "learning_rate": 2.405152131093926e-05, "loss": 0.618, "step": 69 }, { "epoch": 0.024009603841536616, "grad_norm": 4.145514488220215, "learning_rate": 2.2652592093878666e-05, "loss": 0.6046, "step": 70 }, { "epoch": 0.024352598182129995, "grad_norm": 4.038343906402588, "learning_rate": 2.128356686585282e-05, "loss": 0.6156, "step": 71 }, { "epoch": 0.024695592522723374, "grad_norm": 3.129878044128418, "learning_rate": 1.9945942635848748e-05, "loss": 0.6141, "step": 72 }, { "epoch": 0.025038586863316756, "grad_norm": 2.775832414627075, "learning_rate": 1.8641182076323148e-05, "loss": 0.5714, "step": 73 }, { "epoch": 0.025381581203910136, "grad_norm": 2.6074538230895996, "learning_rate": 1.7370711923791567e-05, "loss": 0.568, "step": 74 }, { "epoch": 0.025724575544503515, "grad_norm": 2.4739668369293213, "learning_rate": 1.6135921418712956e-05, "loss": 0.5792, "step": 75 }, { "epoch": 0.025724575544503515, "eval_loss": 0.6100103855133057, "eval_runtime": 454.3358, "eval_samples_per_second": 86.462, "eval_steps_per_second": 5.406, "step": 75 }, { "epoch": 0.026067569885096897, "grad_norm": 3.9021899700164795, "learning_rate": 1.4938160786375572e-05, "loss": 0.6354, "step": 76 }, { "epoch": 0.026410564225690276, "grad_norm": 1.588626503944397, "learning_rate": 1.3778739760445552e-05, "loss": 0.4974, "step": 77 }, { "epoch": 0.026753558566283656, "grad_norm": 1.2203344106674194, "learning_rate": 1.2658926150792322e-05, "loss": 0.543, "step": 78 }, { "epoch": 0.027096552906877038, "grad_norm": 5.719263553619385, "learning_rate": 1.157994445715706e-05, "loss": 0.8592, "step": 79 }, { "epoch": 0.027439547247470417, "grad_norm": 6.814025402069092, "learning_rate": 1.0542974530180327e-05, "loss": 0.9175, "step": 80 }, { "epoch": 0.027782541588063796, "grad_norm": 3.1489052772521973, "learning_rate": 9.549150281252633e-06, "loss": 0.7111, "step": 81 }, { "epoch": 0.028125535928657176, "grad_norm": 1.2740695476531982, "learning_rate": 8.599558442598998e-06, "loss": 0.57, "step": 82 }, { "epoch": 0.028468530269250558, "grad_norm": 0.8978542685508728, "learning_rate": 7.695237378953223e-06, "loss": 0.5269, "step": 83 }, { "epoch": 0.028811524609843937, "grad_norm": 1.3057279586791992, "learning_rate": 6.837175952121306e-06, "loss": 0.5852, "step": 84 }, { "epoch": 0.029154518950437316, "grad_norm": 1.4580329656600952, "learning_rate": 6.026312439675552e-06, "loss": 0.5761, "step": 85 }, { "epoch": 0.0294975132910307, "grad_norm": 1.2395612001419067, "learning_rate": 5.263533508961827e-06, "loss": 0.5555, "step": 86 }, { "epoch": 0.029840507631624078, "grad_norm": 1.5107340812683105, "learning_rate": 4.549673247541875e-06, "loss": 0.5383, "step": 87 }, { "epoch": 0.030183501972217457, "grad_norm": 2.7954299449920654, "learning_rate": 3.885512251130763e-06, "loss": 0.6013, "step": 88 }, { "epoch": 0.03052649631281084, "grad_norm": 1.9643067121505737, "learning_rate": 3.271776770026963e-06, "loss": 0.5663, "step": 89 }, { "epoch": 0.03086949065340422, "grad_norm": 1.799647331237793, "learning_rate": 2.7091379149682685e-06, "loss": 0.4468, "step": 90 }, { "epoch": 0.031212484993997598, "grad_norm": 5.7880330085754395, "learning_rate": 2.1982109232821178e-06, "loss": 0.8115, "step": 91 }, { "epoch": 0.03155547933459098, "grad_norm": 6.51954984664917, "learning_rate": 1.7395544861325718e-06, "loss": 0.8979, "step": 92 }, { "epoch": 0.031898473675184356, "grad_norm": 5.1272759437561035, "learning_rate": 1.333670137599713e-06, "loss": 0.824, "step": 93 }, { "epoch": 0.03224146801577774, "grad_norm": 0.9056283235549927, "learning_rate": 9.810017062595322e-07, "loss": 0.5315, "step": 94 }, { "epoch": 0.03258446235637112, "grad_norm": 1.025938868522644, "learning_rate": 6.819348298638839e-07, "loss": 0.566, "step": 95 }, { "epoch": 0.0329274566969645, "grad_norm": 1.1732980012893677, "learning_rate": 4.367965336512403e-07, "loss": 0.5358, "step": 96 }, { "epoch": 0.03327045103755788, "grad_norm": 0.9143359661102295, "learning_rate": 2.458548727494292e-07, "loss": 0.5577, "step": 97 }, { "epoch": 0.03361344537815126, "grad_norm": 0.9600540399551392, "learning_rate": 1.0931863906127327e-07, "loss": 0.5347, "step": 98 }, { "epoch": 0.03395643971874464, "grad_norm": 0.9209562540054321, "learning_rate": 2.7337132953697554e-08, "loss": 0.5528, "step": 99 }, { "epoch": 0.034299434059338024, "grad_norm": 1.2809160947799683, "learning_rate": 0.0, "loss": 0.5496, "step": 100 }, { "epoch": 0.034299434059338024, "eval_loss": 0.6065332889556885, "eval_runtime": 454.2986, "eval_samples_per_second": 86.47, "eval_steps_per_second": 5.406, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.357855120654336e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }