{ "best_metric": 0.18359769880771637, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 0.11230388109000826, "eval_steps": 25, "global_step": 119, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009437300931933467, "grad_norm": 3.160567045211792, "learning_rate": 2e-05, "loss": 3.7567, "step": 1 }, { "epoch": 0.0009437300931933467, "eval_loss": 4.042717456817627, "eval_runtime": 0.9968, "eval_samples_per_second": 50.162, "eval_steps_per_second": 13.042, "step": 1 }, { "epoch": 0.0018874601863866935, "grad_norm": 3.04227614402771, "learning_rate": 4e-05, "loss": 4.2187, "step": 2 }, { "epoch": 0.00283119027958004, "grad_norm": 2.9732751846313477, "learning_rate": 6e-05, "loss": 3.8159, "step": 3 }, { "epoch": 0.003774920372773387, "grad_norm": 2.9514777660369873, "learning_rate": 8e-05, "loss": 3.5658, "step": 4 }, { "epoch": 0.0047186504659667335, "grad_norm": 3.053494691848755, "learning_rate": 0.0001, "loss": 3.1504, "step": 5 }, { "epoch": 0.00566238055916008, "grad_norm": 3.0063374042510986, "learning_rate": 9.998291381612281e-05, "loss": 2.7114, "step": 6 }, { "epoch": 0.006606110652353427, "grad_norm": 2.4902377128601074, "learning_rate": 9.993166823949923e-05, "loss": 1.9711, "step": 7 }, { "epoch": 0.007549840745546774, "grad_norm": 1.8327195644378662, "learning_rate": 9.984630218530014e-05, "loss": 1.4732, "step": 8 }, { "epoch": 0.00849357083874012, "grad_norm": 1.453201413154602, "learning_rate": 9.972688047930772e-05, "loss": 1.1737, "step": 9 }, { "epoch": 0.009437300931933467, "grad_norm": 1.3615171909332275, "learning_rate": 9.957349380868764e-05, "loss": 0.9125, "step": 10 }, { "epoch": 0.010381031025126814, "grad_norm": 1.2617502212524414, "learning_rate": 9.938625865312251e-05, "loss": 0.7211, "step": 11 }, { "epoch": 0.01132476111832016, "grad_norm": 1.0414522886276245, "learning_rate": 9.916531719635881e-05, "loss": 0.5865, "step": 12 }, { "epoch": 0.012268491211513508, "grad_norm": 1.51614248752594, "learning_rate": 9.891083721823461e-05, "loss": 0.9089, "step": 13 }, { "epoch": 0.013212221304706853, "grad_norm": 2.9659416675567627, "learning_rate": 9.862301196726987e-05, "loss": 1.2334, "step": 14 }, { "epoch": 0.0141559513979002, "grad_norm": 1.000846266746521, "learning_rate": 9.830206001391626e-05, "loss": 0.4075, "step": 15 }, { "epoch": 0.015099681491093548, "grad_norm": 0.8368968367576599, "learning_rate": 9.794822508457784e-05, "loss": 0.3667, "step": 16 }, { "epoch": 0.016043411584286894, "grad_norm": 0.48357707262039185, "learning_rate": 9.756177587652856e-05, "loss": 0.3149, "step": 17 }, { "epoch": 0.01698714167748024, "grad_norm": 0.528506875038147, "learning_rate": 9.714300585386747e-05, "loss": 0.3147, "step": 18 }, { "epoch": 0.01793087177067359, "grad_norm": 0.7525439858436584, "learning_rate": 9.669223302466608e-05, "loss": 0.3128, "step": 19 }, { "epoch": 0.018874601863866934, "grad_norm": 0.8638641834259033, "learning_rate": 9.620979969947759e-05, "loss": 0.317, "step": 20 }, { "epoch": 0.01981833195706028, "grad_norm": 0.410800039768219, "learning_rate": 9.5696072231391e-05, "loss": 0.2799, "step": 21 }, { "epoch": 0.02076206205025363, "grad_norm": 0.1946195811033249, "learning_rate": 9.515144073782774e-05, "loss": 0.2837, "step": 22 }, { "epoch": 0.021705792143446975, "grad_norm": 0.3468669652938843, "learning_rate": 9.4576318804292e-05, "loss": 0.2776, "step": 23 }, { "epoch": 0.02264952223664032, "grad_norm": 0.5359954833984375, "learning_rate": 9.397114317029975e-05, "loss": 0.2646, "step": 24 }, { "epoch": 0.023593252329833666, "grad_norm": 0.44181787967681885, "learning_rate": 9.333637339772472e-05, "loss": 0.2667, "step": 25 }, { "epoch": 0.023593252329833666, "eval_loss": 0.3099350035190582, "eval_runtime": 0.9945, "eval_samples_per_second": 50.276, "eval_steps_per_second": 13.072, "step": 25 }, { "epoch": 0.024536982423027015, "grad_norm": 1.1429702043533325, "learning_rate": 9.267249152181379e-05, "loss": 0.4415, "step": 26 }, { "epoch": 0.02548071251622036, "grad_norm": 0.6452208757400513, "learning_rate": 9.198000168513604e-05, "loss": 0.3879, "step": 27 }, { "epoch": 0.026424442609413706, "grad_norm": 0.3990679085254669, "learning_rate": 9.125942975474403e-05, "loss": 0.2871, "step": 28 }, { "epoch": 0.027368172702607056, "grad_norm": 0.2523767948150635, "learning_rate": 9.051132292283771e-05, "loss": 0.263, "step": 29 }, { "epoch": 0.0283119027958004, "grad_norm": 0.2475036233663559, "learning_rate": 8.973624929123445e-05, "loss": 0.2693, "step": 30 }, { "epoch": 0.029255632888993747, "grad_norm": 0.25849035382270813, "learning_rate": 8.893479743996034e-05, "loss": 0.2643, "step": 31 }, { "epoch": 0.030199362982187096, "grad_norm": 0.3408520221710205, "learning_rate": 8.810757598029093e-05, "loss": 0.267, "step": 32 }, { "epoch": 0.03114309307538044, "grad_norm": 0.4147489666938782, "learning_rate": 8.725521309258031e-05, "loss": 0.2645, "step": 33 }, { "epoch": 0.03208682316857379, "grad_norm": 0.36394861340522766, "learning_rate": 8.637835604922979e-05, "loss": 0.2596, "step": 34 }, { "epoch": 0.03303055326176713, "grad_norm": 0.20617513358592987, "learning_rate": 8.547767072315835e-05, "loss": 0.2668, "step": 35 }, { "epoch": 0.03397428335496048, "grad_norm": 0.25404950976371765, "learning_rate": 8.455384108214805e-05, "loss": 0.2651, "step": 36 }, { "epoch": 0.03491801344815383, "grad_norm": 0.3631896674633026, "learning_rate": 8.360756866944858e-05, "loss": 0.2594, "step": 37 }, { "epoch": 0.03586174354134718, "grad_norm": 0.3934739828109741, "learning_rate": 8.263957207103507e-05, "loss": 0.2608, "step": 38 }, { "epoch": 0.03680547363454052, "grad_norm": 0.41364550590515137, "learning_rate": 8.165058636992411e-05, "loss": 0.2771, "step": 39 }, { "epoch": 0.03774920372773387, "grad_norm": 1.3705769777297974, "learning_rate": 8.064136258796198e-05, "loss": 0.3437, "step": 40 }, { "epoch": 0.038692933820927214, "grad_norm": 1.066591739654541, "learning_rate": 7.961266711550922e-05, "loss": 0.3018, "step": 41 }, { "epoch": 0.03963666391412056, "grad_norm": 0.2451457381248474, "learning_rate": 7.856528112945452e-05, "loss": 0.2415, "step": 42 }, { "epoch": 0.040580394007313905, "grad_norm": 0.24031850695610046, "learning_rate": 7.75e-05, "loss": 0.246, "step": 43 }, { "epoch": 0.04152412410050726, "grad_norm": 1.0646287202835083, "learning_rate": 7.641763268666831e-05, "loss": 0.267, "step": 44 }, { "epoch": 0.042467854193700603, "grad_norm": 0.9229537844657898, "learning_rate": 7.531900112399004e-05, "loss": 0.2811, "step": 45 }, { "epoch": 0.04341158428689395, "grad_norm": 0.39775267243385315, "learning_rate": 7.420493959733816e-05, "loss": 0.2581, "step": 46 }, { "epoch": 0.044355314380087295, "grad_norm": 0.3674805164337158, "learning_rate": 7.307629410938363e-05, "loss": 0.2455, "step": 47 }, { "epoch": 0.04529904447328064, "grad_norm": 0.24181093275547028, "learning_rate": 7.193392173765261e-05, "loss": 0.2466, "step": 48 }, { "epoch": 0.046242774566473986, "grad_norm": 0.19443802535533905, "learning_rate": 7.077868998367395e-05, "loss": 0.239, "step": 49 }, { "epoch": 0.04718650465966733, "grad_norm": 0.25048816204071045, "learning_rate": 6.961147611421075e-05, "loss": 0.2369, "step": 50 }, { "epoch": 0.04718650465966733, "eval_loss": 0.2570982277393341, "eval_runtime": 1.0048, "eval_samples_per_second": 49.764, "eval_steps_per_second": 12.939, "step": 50 }, { "epoch": 0.048130234752860684, "grad_norm": 0.48732152581214905, "learning_rate": 6.843316649507626e-05, "loss": 0.2634, "step": 51 }, { "epoch": 0.04907396484605403, "grad_norm": 0.4416004717350006, "learning_rate": 6.724465591804008e-05, "loss": 0.2494, "step": 52 }, { "epoch": 0.050017694939247376, "grad_norm": 0.5777482390403748, "learning_rate": 6.604684692133597e-05, "loss": 0.2685, "step": 53 }, { "epoch": 0.05096142503244072, "grad_norm": 0.6688599586486816, "learning_rate": 6.484064910428692e-05, "loss": 0.2626, "step": 54 }, { "epoch": 0.05190515512563407, "grad_norm": 0.5525055527687073, "learning_rate": 6.362697843656823e-05, "loss": 0.2506, "step": 55 }, { "epoch": 0.05284888521882741, "grad_norm": 0.4974307119846344, "learning_rate": 6.240675656263303e-05, "loss": 0.2361, "step": 56 }, { "epoch": 0.053792615312020765, "grad_norm": 0.29403093457221985, "learning_rate": 6.118091010182837e-05, "loss": 0.2196, "step": 57 }, { "epoch": 0.05473634540521411, "grad_norm": 0.2774328589439392, "learning_rate": 5.995036994473357e-05, "loss": 0.2291, "step": 58 }, { "epoch": 0.05568007549840746, "grad_norm": 0.2612267732620239, "learning_rate": 5.8716070546254966e-05, "loss": 0.2241, "step": 59 }, { "epoch": 0.0566238055916008, "grad_norm": 0.3055221140384674, "learning_rate": 5.747894921601396e-05, "loss": 0.213, "step": 60 }, { "epoch": 0.05756753568479415, "grad_norm": 0.37357643246650696, "learning_rate": 5.62399454065673e-05, "loss": 0.2206, "step": 61 }, { "epoch": 0.058511265777987494, "grad_norm": 0.48611095547676086, "learning_rate": 5.500000000000001e-05, "loss": 0.2383, "step": 62 }, { "epoch": 0.05945499587118084, "grad_norm": 0.8126237392425537, "learning_rate": 5.376005459343272e-05, "loss": 0.3158, "step": 63 }, { "epoch": 0.06039872596437419, "grad_norm": 1.5249502658843994, "learning_rate": 5.2521050783986046e-05, "loss": 0.3773, "step": 64 }, { "epoch": 0.06134245605756754, "grad_norm": 0.4674637019634247, "learning_rate": 5.128392945374505e-05, "loss": 0.2159, "step": 65 }, { "epoch": 0.06228618615076088, "grad_norm": 0.2695397138595581, "learning_rate": 5.004963005526644e-05, "loss": 0.2065, "step": 66 }, { "epoch": 0.06322991624395423, "grad_norm": 0.19801601767539978, "learning_rate": 4.881908989817163e-05, "loss": 0.1862, "step": 67 }, { "epoch": 0.06417364633714757, "grad_norm": 0.27344810962677, "learning_rate": 4.7593243437366975e-05, "loss": 0.1887, "step": 68 }, { "epoch": 0.06511737643034092, "grad_norm": 0.2628972828388214, "learning_rate": 4.6373021563431784e-05, "loss": 0.2018, "step": 69 }, { "epoch": 0.06606110652353427, "grad_norm": 0.3281818926334381, "learning_rate": 4.515935089571309e-05, "loss": 0.2013, "step": 70 }, { "epoch": 0.06700483661672761, "grad_norm": 0.3598101735115051, "learning_rate": 4.395315307866405e-05, "loss": 0.1972, "step": 71 }, { "epoch": 0.06794856670992096, "grad_norm": 0.41488173604011536, "learning_rate": 4.275534408195991e-05, "loss": 0.198, "step": 72 }, { "epoch": 0.0688922968031143, "grad_norm": 0.41623586416244507, "learning_rate": 4.156683350492376e-05, "loss": 0.2004, "step": 73 }, { "epoch": 0.06983602689630766, "grad_norm": 0.32774344086647034, "learning_rate": 4.0388523885789256e-05, "loss": 0.2025, "step": 74 }, { "epoch": 0.07077975698950101, "grad_norm": 0.3913571238517761, "learning_rate": 3.922131001632606e-05, "loss": 0.2174, "step": 75 }, { "epoch": 0.07077975698950101, "eval_loss": 0.19621771574020386, "eval_runtime": 1.0112, "eval_samples_per_second": 49.447, "eval_steps_per_second": 12.856, "step": 75 }, { "epoch": 0.07172348708269435, "grad_norm": 0.48670119047164917, "learning_rate": 3.8066078262347406e-05, "loss": 0.248, "step": 76 }, { "epoch": 0.0726672171758877, "grad_norm": 0.34487494826316833, "learning_rate": 3.692370589061639e-05, "loss": 0.1985, "step": 77 }, { "epoch": 0.07361094726908105, "grad_norm": 0.1863425225019455, "learning_rate": 3.579506040266184e-05, "loss": 0.1804, "step": 78 }, { "epoch": 0.07455467736227439, "grad_norm": 0.19545269012451172, "learning_rate": 3.468099887600999e-05, "loss": 0.1795, "step": 79 }, { "epoch": 0.07549840745546774, "grad_norm": 0.20196500420570374, "learning_rate": 3.358236731333169e-05, "loss": 0.1766, "step": 80 }, { "epoch": 0.07644213754866108, "grad_norm": 0.2229662984609604, "learning_rate": 3.250000000000001e-05, "loss": 0.1834, "step": 81 }, { "epoch": 0.07738586764185443, "grad_norm": 0.16917945444583893, "learning_rate": 3.14347188705455e-05, "loss": 0.1745, "step": 82 }, { "epoch": 0.07832959773504777, "grad_norm": 0.18550167977809906, "learning_rate": 3.0387332884490805e-05, "loss": 0.1787, "step": 83 }, { "epoch": 0.07927332782824112, "grad_norm": 0.23968204855918884, "learning_rate": 2.9358637412038027e-05, "loss": 0.1864, "step": 84 }, { "epoch": 0.08021705792143446, "grad_norm": 0.22192811965942383, "learning_rate": 2.8349413630075906e-05, "loss": 0.1931, "step": 85 }, { "epoch": 0.08116078801462781, "grad_norm": 0.2770540118217468, "learning_rate": 2.736042792896495e-05, "loss": 0.1746, "step": 86 }, { "epoch": 0.08210451810782117, "grad_norm": 0.2459287792444229, "learning_rate": 2.639243133055145e-05, "loss": 0.1803, "step": 87 }, { "epoch": 0.08304824820101452, "grad_norm": 0.43496978282928467, "learning_rate": 2.5446158917851958e-05, "loss": 0.2465, "step": 88 }, { "epoch": 0.08399197829420786, "grad_norm": 0.5976945757865906, "learning_rate": 2.4522329276841663e-05, "loss": 0.2248, "step": 89 }, { "epoch": 0.08493570838740121, "grad_norm": 0.291526734828949, "learning_rate": 2.362164395077021e-05, "loss": 0.1671, "step": 90 }, { "epoch": 0.08587943848059455, "grad_norm": 0.2855542302131653, "learning_rate": 2.2744786907419703e-05, "loss": 0.1743, "step": 91 }, { "epoch": 0.0868231685737879, "grad_norm": 0.2278595119714737, "learning_rate": 2.189242401970908e-05, "loss": 0.164, "step": 92 }, { "epoch": 0.08776689866698124, "grad_norm": 0.1674998253583908, "learning_rate": 2.1065202560039677e-05, "loss": 0.1697, "step": 93 }, { "epoch": 0.08871062876017459, "grad_norm": 0.22201752662658691, "learning_rate": 2.026375070876556e-05, "loss": 0.1659, "step": 94 }, { "epoch": 0.08965435885336794, "grad_norm": 0.23097634315490723, "learning_rate": 1.9488677077162295e-05, "loss": 0.1779, "step": 95 }, { "epoch": 0.09059808894656128, "grad_norm": 0.31884652376174927, "learning_rate": 1.8740570245255984e-05, "loss": 0.1704, "step": 96 }, { "epoch": 0.09154181903975463, "grad_norm": 0.2808215320110321, "learning_rate": 1.8019998314863974e-05, "loss": 0.1699, "step": 97 }, { "epoch": 0.09248554913294797, "grad_norm": 0.31133246421813965, "learning_rate": 1.7327508478186218e-05, "loss": 0.173, "step": 98 }, { "epoch": 0.09342927922614132, "grad_norm": 0.3080236613750458, "learning_rate": 1.6663626602275288e-05, "loss": 0.1652, "step": 99 }, { "epoch": 0.09437300931933466, "grad_norm": 0.3504279553890228, "learning_rate": 1.602885682970026e-05, "loss": 0.2068, "step": 100 }, { "epoch": 0.09437300931933466, "eval_loss": 0.18359769880771637, "eval_runtime": 0.996, "eval_samples_per_second": 50.2, "eval_steps_per_second": 13.052, "step": 100 }, { "epoch": 0.09531673941252802, "grad_norm": 0.5977703928947449, "learning_rate": 1.5423681195707997e-05, "loss": 0.2196, "step": 101 }, { "epoch": 0.09626046950572137, "grad_norm": 0.4760293960571289, "learning_rate": 1.484855926217227e-05, "loss": 0.1857, "step": 102 }, { "epoch": 0.09720419959891471, "grad_norm": 0.2669781744480133, "learning_rate": 1.4303927768609015e-05, "loss": 0.1815, "step": 103 }, { "epoch": 0.09814792969210806, "grad_norm": 0.18762975931167603, "learning_rate": 1.3790200300522413e-05, "loss": 0.1615, "step": 104 }, { "epoch": 0.0990916597853014, "grad_norm": 0.22948940098285675, "learning_rate": 1.330776697533392e-05, "loss": 0.1608, "step": 105 }, { "epoch": 0.10003538987849475, "grad_norm": 0.17586657404899597, "learning_rate": 1.2856994146132542e-05, "loss": 0.1506, "step": 106 }, { "epoch": 0.1009791199716881, "grad_norm": 0.23077844083309174, "learning_rate": 1.2438224123471442e-05, "loss": 0.162, "step": 107 }, { "epoch": 0.10192285006488144, "grad_norm": 0.21372710168361664, "learning_rate": 1.2051774915422163e-05, "loss": 0.1605, "step": 108 }, { "epoch": 0.10286658015807479, "grad_norm": 0.16886736452579498, "learning_rate": 1.1697939986083733e-05, "loss": 0.1506, "step": 109 }, { "epoch": 0.10381031025126813, "grad_norm": 0.2670663595199585, "learning_rate": 1.1376988032730134e-05, "loss": 0.1698, "step": 110 }, { "epoch": 0.10475404034446148, "grad_norm": 0.1998496651649475, "learning_rate": 1.1089162781765398e-05, "loss": 0.1691, "step": 111 }, { "epoch": 0.10569777043765483, "grad_norm": 0.17204442620277405, "learning_rate": 1.0834682803641197e-05, "loss": 0.1822, "step": 112 }, { "epoch": 0.10664150053084817, "grad_norm": 0.3721541464328766, "learning_rate": 1.0613741346877497e-05, "loss": 0.2265, "step": 113 }, { "epoch": 0.10758523062404153, "grad_norm": 0.5235680937767029, "learning_rate": 1.0426506191312355e-05, "loss": 0.1901, "step": 114 }, { "epoch": 0.10852896071723488, "grad_norm": 0.312836229801178, "learning_rate": 1.0273119520692275e-05, "loss": 0.1713, "step": 115 }, { "epoch": 0.10947269081042822, "grad_norm": 0.18158796429634094, "learning_rate": 1.0153697814699859e-05, "loss": 0.1683, "step": 116 }, { "epoch": 0.11041642090362157, "grad_norm": 0.24202579259872437, "learning_rate": 1.0068331760500774e-05, "loss": 0.1639, "step": 117 }, { "epoch": 0.11136015099681491, "grad_norm": 0.24022828042507172, "learning_rate": 1.0017086183877188e-05, "loss": 0.1677, "step": 118 }, { "epoch": 0.11230388109000826, "grad_norm": 0.2230340838432312, "learning_rate": 1e-05, "loss": 0.1475, "step": 119 } ], "logging_steps": 1, "max_steps": 119, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 20, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3299091053509345e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }