{ "best_metric": 0.4816867411136627, "best_model_checkpoint": "paligema_cafe/checkpoint-700", "epoch": 49.840255591054316, "eval_steps": 100, "global_step": 3900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012779552715654952, "grad_norm": 0.8142021894454956, "learning_rate": 1.99948717948718e-05, "loss": 2.9034, "step": 1 }, { "epoch": 0.025559105431309903, "grad_norm": 0.8559026718139648, "learning_rate": 1.998974358974359e-05, "loss": 2.9373, "step": 2 }, { "epoch": 0.038338658146964855, "grad_norm": 0.9021930694580078, "learning_rate": 1.9984615384615387e-05, "loss": 2.925, "step": 3 }, { "epoch": 0.051118210862619806, "grad_norm": 0.8621866106987, "learning_rate": 1.997948717948718e-05, "loss": 3.0668, "step": 4 }, { "epoch": 0.06389776357827476, "grad_norm": 1.0313794612884521, "learning_rate": 1.9974358974358975e-05, "loss": 2.9666, "step": 5 }, { "epoch": 0.07667731629392971, "grad_norm": 0.7908412218093872, "learning_rate": 1.9969230769230773e-05, "loss": 3.0073, "step": 6 }, { "epoch": 0.08945686900958466, "grad_norm": 0.8272828459739685, "learning_rate": 1.9964102564102567e-05, "loss": 2.784, "step": 7 }, { "epoch": 0.10223642172523961, "grad_norm": 0.9401512742042542, "learning_rate": 1.995897435897436e-05, "loss": 2.9083, "step": 8 }, { "epoch": 0.11501597444089456, "grad_norm": 0.9094955325126648, "learning_rate": 1.9953846153846155e-05, "loss": 2.806, "step": 9 }, { "epoch": 0.12779552715654952, "grad_norm": 0.9415909647941589, "learning_rate": 1.994871794871795e-05, "loss": 2.9022, "step": 10 }, { "epoch": 0.14057507987220447, "grad_norm": 0.9576209783554077, "learning_rate": 1.9943589743589746e-05, "loss": 2.8489, "step": 11 }, { "epoch": 0.15335463258785942, "grad_norm": 0.9657884240150452, "learning_rate": 1.993846153846154e-05, "loss": 2.9371, "step": 12 }, { "epoch": 0.16613418530351437, "grad_norm": 0.9966011047363281, "learning_rate": 1.9933333333333334e-05, "loss": 2.8831, "step": 13 }, { "epoch": 0.17891373801916932, "grad_norm": 1.2204116582870483, "learning_rate": 1.992820512820513e-05, "loss": 3.0116, "step": 14 }, { "epoch": 0.19169329073482427, "grad_norm": 1.1095342636108398, "learning_rate": 1.9923076923076926e-05, "loss": 2.6358, "step": 15 }, { "epoch": 0.20447284345047922, "grad_norm": 1.0488131046295166, "learning_rate": 1.991794871794872e-05, "loss": 2.7842, "step": 16 }, { "epoch": 0.21725239616613418, "grad_norm": 1.1298291683197021, "learning_rate": 1.9912820512820514e-05, "loss": 2.6071, "step": 17 }, { "epoch": 0.23003194888178913, "grad_norm": 1.20250403881073, "learning_rate": 1.990769230769231e-05, "loss": 2.2891, "step": 18 }, { "epoch": 0.24281150159744408, "grad_norm": 1.0653283596038818, "learning_rate": 1.9902564102564102e-05, "loss": 2.694, "step": 19 }, { "epoch": 0.25559105431309903, "grad_norm": 1.0217392444610596, "learning_rate": 1.98974358974359e-05, "loss": 2.7254, "step": 20 }, { "epoch": 0.268370607028754, "grad_norm": 1.1248751878738403, "learning_rate": 1.9892307692307694e-05, "loss": 2.8926, "step": 21 }, { "epoch": 0.28115015974440893, "grad_norm": 1.0898598432540894, "learning_rate": 1.9887179487179488e-05, "loss": 2.6239, "step": 22 }, { "epoch": 0.2939297124600639, "grad_norm": 1.0956969261169434, "learning_rate": 1.9882051282051285e-05, "loss": 2.5163, "step": 23 }, { "epoch": 0.30670926517571884, "grad_norm": 1.1412856578826904, "learning_rate": 1.987692307692308e-05, "loss": 2.6186, "step": 24 }, { "epoch": 0.3194888178913738, "grad_norm": 1.1162058115005493, "learning_rate": 1.9871794871794873e-05, "loss": 2.505, "step": 25 }, { "epoch": 0.33226837060702874, "grad_norm": 1.301196575164795, "learning_rate": 1.9866666666666667e-05, "loss": 2.1748, "step": 26 }, { "epoch": 0.3450479233226837, "grad_norm": 1.2784515619277954, "learning_rate": 1.9861538461538465e-05, "loss": 2.3362, "step": 27 }, { "epoch": 0.35782747603833864, "grad_norm": 1.2827022075653076, "learning_rate": 1.985641025641026e-05, "loss": 2.403, "step": 28 }, { "epoch": 0.3706070287539936, "grad_norm": 1.1135437488555908, "learning_rate": 1.9851282051282053e-05, "loss": 2.2757, "step": 29 }, { "epoch": 0.38338658146964855, "grad_norm": 1.2342116832733154, "learning_rate": 1.9846153846153847e-05, "loss": 2.6038, "step": 30 }, { "epoch": 0.3961661341853035, "grad_norm": 1.5493252277374268, "learning_rate": 1.984102564102564e-05, "loss": 2.4121, "step": 31 }, { "epoch": 0.40894568690095845, "grad_norm": 1.2742743492126465, "learning_rate": 1.983589743589744e-05, "loss": 2.2992, "step": 32 }, { "epoch": 0.4217252396166134, "grad_norm": 1.1998932361602783, "learning_rate": 1.9830769230769232e-05, "loss": 2.2267, "step": 33 }, { "epoch": 0.43450479233226835, "grad_norm": 1.302520751953125, "learning_rate": 1.9825641025641027e-05, "loss": 2.308, "step": 34 }, { "epoch": 0.4472843450479233, "grad_norm": 1.4223449230194092, "learning_rate": 1.9820512820512824e-05, "loss": 2.045, "step": 35 }, { "epoch": 0.46006389776357826, "grad_norm": 1.3312417268753052, "learning_rate": 1.9815384615384618e-05, "loss": 2.2282, "step": 36 }, { "epoch": 0.4728434504792332, "grad_norm": 1.2069036960601807, "learning_rate": 1.9810256410256412e-05, "loss": 2.102, "step": 37 }, { "epoch": 0.48562300319488816, "grad_norm": 1.3504729270935059, "learning_rate": 1.9805128205128206e-05, "loss": 2.1236, "step": 38 }, { "epoch": 0.4984025559105431, "grad_norm": 1.258859395980835, "learning_rate": 1.98e-05, "loss": 2.3477, "step": 39 }, { "epoch": 0.5111821086261981, "grad_norm": 1.3494940996170044, "learning_rate": 1.9794871794871798e-05, "loss": 2.2073, "step": 40 }, { "epoch": 0.5239616613418531, "grad_norm": 1.217779278755188, "learning_rate": 1.978974358974359e-05, "loss": 2.2316, "step": 41 }, { "epoch": 0.536741214057508, "grad_norm": 1.3344950675964355, "learning_rate": 1.9784615384615386e-05, "loss": 2.0516, "step": 42 }, { "epoch": 0.549520766773163, "grad_norm": 1.2877260446548462, "learning_rate": 1.977948717948718e-05, "loss": 1.9832, "step": 43 }, { "epoch": 0.5623003194888179, "grad_norm": 1.225043773651123, "learning_rate": 1.9774358974358977e-05, "loss": 2.1805, "step": 44 }, { "epoch": 0.5750798722044729, "grad_norm": 1.3049123287200928, "learning_rate": 1.976923076923077e-05, "loss": 1.7603, "step": 45 }, { "epoch": 0.5878594249201278, "grad_norm": 1.2719806432724, "learning_rate": 1.9764102564102565e-05, "loss": 1.928, "step": 46 }, { "epoch": 0.6006389776357828, "grad_norm": 1.4088648557662964, "learning_rate": 1.975897435897436e-05, "loss": 1.8306, "step": 47 }, { "epoch": 0.6134185303514377, "grad_norm": 1.3367751836776733, "learning_rate": 1.9753846153846153e-05, "loss": 2.0066, "step": 48 }, { "epoch": 0.6261980830670927, "grad_norm": 1.2356927394866943, "learning_rate": 1.974871794871795e-05, "loss": 1.9573, "step": 49 }, { "epoch": 0.6389776357827476, "grad_norm": 1.4789074659347534, "learning_rate": 1.9743589743589745e-05, "loss": 1.9171, "step": 50 }, { "epoch": 0.6517571884984026, "grad_norm": 1.335368037223816, "learning_rate": 1.973846153846154e-05, "loss": 1.9178, "step": 51 }, { "epoch": 0.6645367412140575, "grad_norm": 1.4464950561523438, "learning_rate": 1.9733333333333336e-05, "loss": 1.9905, "step": 52 }, { "epoch": 0.6773162939297125, "grad_norm": 1.6089545488357544, "learning_rate": 1.972820512820513e-05, "loss": 1.8002, "step": 53 }, { "epoch": 0.6900958466453674, "grad_norm": 1.397189736366272, "learning_rate": 1.9723076923076924e-05, "loss": 1.8588, "step": 54 }, { "epoch": 0.7028753993610224, "grad_norm": 1.346503734588623, "learning_rate": 1.9717948717948722e-05, "loss": 1.6663, "step": 55 }, { "epoch": 0.7156549520766773, "grad_norm": 1.2886115312576294, "learning_rate": 1.9712820512820513e-05, "loss": 1.7527, "step": 56 }, { "epoch": 0.7284345047923323, "grad_norm": 1.4284838438034058, "learning_rate": 1.970769230769231e-05, "loss": 1.7273, "step": 57 }, { "epoch": 0.7412140575079872, "grad_norm": 1.3520196676254272, "learning_rate": 1.9702564102564104e-05, "loss": 1.6347, "step": 58 }, { "epoch": 0.7539936102236422, "grad_norm": 1.7988076210021973, "learning_rate": 1.9697435897435898e-05, "loss": 1.5542, "step": 59 }, { "epoch": 0.7667731629392971, "grad_norm": 1.6694620847702026, "learning_rate": 1.9692307692307696e-05, "loss": 1.6283, "step": 60 }, { "epoch": 0.7795527156549521, "grad_norm": 1.3663523197174072, "learning_rate": 1.968717948717949e-05, "loss": 1.73, "step": 61 }, { "epoch": 0.792332268370607, "grad_norm": 1.4919695854187012, "learning_rate": 1.9682051282051284e-05, "loss": 1.6079, "step": 62 }, { "epoch": 0.805111821086262, "grad_norm": 1.550233244895935, "learning_rate": 1.9676923076923078e-05, "loss": 1.4532, "step": 63 }, { "epoch": 0.8178913738019169, "grad_norm": 1.4926998615264893, "learning_rate": 1.9671794871794875e-05, "loss": 1.3253, "step": 64 }, { "epoch": 0.8306709265175719, "grad_norm": 1.656699538230896, "learning_rate": 1.9666666666666666e-05, "loss": 1.4525, "step": 65 }, { "epoch": 0.8434504792332268, "grad_norm": 1.774846076965332, "learning_rate": 1.9661538461538463e-05, "loss": 1.5266, "step": 66 }, { "epoch": 0.8562300319488818, "grad_norm": 1.41655695438385, "learning_rate": 1.9656410256410257e-05, "loss": 1.724, "step": 67 }, { "epoch": 0.8690095846645367, "grad_norm": 1.3376843929290771, "learning_rate": 1.965128205128205e-05, "loss": 1.62, "step": 68 }, { "epoch": 0.8817891373801917, "grad_norm": 1.439816951751709, "learning_rate": 1.964615384615385e-05, "loss": 1.4697, "step": 69 }, { "epoch": 0.8945686900958466, "grad_norm": 1.5517547130584717, "learning_rate": 1.9641025641025643e-05, "loss": 1.4436, "step": 70 }, { "epoch": 0.9073482428115016, "grad_norm": 1.4276851415634155, "learning_rate": 1.9635897435897437e-05, "loss": 1.4793, "step": 71 }, { "epoch": 0.9201277955271565, "grad_norm": 1.402268409729004, "learning_rate": 1.9630769230769234e-05, "loss": 1.3503, "step": 72 }, { "epoch": 0.9329073482428115, "grad_norm": 1.4884088039398193, "learning_rate": 1.962564102564103e-05, "loss": 1.2738, "step": 73 }, { "epoch": 0.9456869009584664, "grad_norm": 2.289491653442383, "learning_rate": 1.9620512820512822e-05, "loss": 1.4683, "step": 74 }, { "epoch": 0.9584664536741214, "grad_norm": 1.571138620376587, "learning_rate": 1.9615384615384617e-05, "loss": 1.3477, "step": 75 }, { "epoch": 0.9712460063897763, "grad_norm": 1.9306889772415161, "learning_rate": 1.961025641025641e-05, "loss": 1.3338, "step": 76 }, { "epoch": 0.9840255591054313, "grad_norm": 1.4522299766540527, "learning_rate": 1.9605128205128208e-05, "loss": 1.2846, "step": 77 }, { "epoch": 0.9968051118210862, "grad_norm": 1.491416335105896, "learning_rate": 1.9600000000000002e-05, "loss": 1.1654, "step": 78 }, { "epoch": 1.0095846645367412, "grad_norm": 1.2546143531799316, "learning_rate": 1.9594871794871796e-05, "loss": 1.0005, "step": 79 }, { "epoch": 1.0223642172523961, "grad_norm": 1.8387315273284912, "learning_rate": 1.958974358974359e-05, "loss": 1.309, "step": 80 }, { "epoch": 1.035143769968051, "grad_norm": 2.288738250732422, "learning_rate": 1.9584615384615388e-05, "loss": 1.25, "step": 81 }, { "epoch": 1.0479233226837061, "grad_norm": 1.4508757591247559, "learning_rate": 1.957948717948718e-05, "loss": 1.3196, "step": 82 }, { "epoch": 1.060702875399361, "grad_norm": 1.538571834564209, "learning_rate": 1.9574358974358976e-05, "loss": 1.0223, "step": 83 }, { "epoch": 1.073482428115016, "grad_norm": 1.5000869035720825, "learning_rate": 1.9569230769230773e-05, "loss": 1.1089, "step": 84 }, { "epoch": 1.0862619808306708, "grad_norm": 1.6408222913742065, "learning_rate": 1.9564102564102564e-05, "loss": 1.0924, "step": 85 }, { "epoch": 1.099041533546326, "grad_norm": 1.7912194728851318, "learning_rate": 1.955897435897436e-05, "loss": 1.0234, "step": 86 }, { "epoch": 1.1118210862619808, "grad_norm": 1.4508275985717773, "learning_rate": 1.9553846153846155e-05, "loss": 1.1314, "step": 87 }, { "epoch": 1.1246006389776357, "grad_norm": 1.8075443506240845, "learning_rate": 1.954871794871795e-05, "loss": 1.007, "step": 88 }, { "epoch": 1.1373801916932909, "grad_norm": 1.595278024673462, "learning_rate": 1.9543589743589747e-05, "loss": 1.1304, "step": 89 }, { "epoch": 1.1501597444089458, "grad_norm": 1.43287193775177, "learning_rate": 1.953846153846154e-05, "loss": 0.9737, "step": 90 }, { "epoch": 1.1629392971246006, "grad_norm": 1.7888563871383667, "learning_rate": 1.9533333333333335e-05, "loss": 1.0763, "step": 91 }, { "epoch": 1.1757188498402555, "grad_norm": 1.7963002920150757, "learning_rate": 1.952820512820513e-05, "loss": 1.0133, "step": 92 }, { "epoch": 1.1884984025559104, "grad_norm": 1.6066761016845703, "learning_rate": 1.9523076923076923e-05, "loss": 0.9496, "step": 93 }, { "epoch": 1.2012779552715656, "grad_norm": 1.7649800777435303, "learning_rate": 1.951794871794872e-05, "loss": 0.9979, "step": 94 }, { "epoch": 1.2140575079872205, "grad_norm": 1.8012830018997192, "learning_rate": 1.9512820512820515e-05, "loss": 1.0609, "step": 95 }, { "epoch": 1.2268370607028753, "grad_norm": 1.6527281999588013, "learning_rate": 1.950769230769231e-05, "loss": 0.8774, "step": 96 }, { "epoch": 1.2396166134185305, "grad_norm": 1.5712525844573975, "learning_rate": 1.9502564102564103e-05, "loss": 0.9831, "step": 97 }, { "epoch": 1.2523961661341854, "grad_norm": 2.5428738594055176, "learning_rate": 1.94974358974359e-05, "loss": 1.3413, "step": 98 }, { "epoch": 1.2651757188498403, "grad_norm": 1.9915602207183838, "learning_rate": 1.9492307692307694e-05, "loss": 0.8742, "step": 99 }, { "epoch": 1.2779552715654952, "grad_norm": 2.5296716690063477, "learning_rate": 1.9487179487179488e-05, "loss": 1.0443, "step": 100 }, { "epoch": 1.2779552715654952, "eval_loss": 0.992189347743988, "eval_runtime": 181.8595, "eval_samples_per_second": 0.863, "eval_steps_per_second": 0.11, "step": 100 }, { "epoch": 1.29073482428115, "grad_norm": 1.5496196746826172, "learning_rate": 1.9482051282051286e-05, "loss": 0.9173, "step": 101 }, { "epoch": 1.3035143769968052, "grad_norm": 1.8158091306686401, "learning_rate": 1.9476923076923076e-05, "loss": 1.2035, "step": 102 }, { "epoch": 1.31629392971246, "grad_norm": 2.238765239715576, "learning_rate": 1.9471794871794874e-05, "loss": 1.0669, "step": 103 }, { "epoch": 1.329073482428115, "grad_norm": 1.706765055656433, "learning_rate": 1.9466666666666668e-05, "loss": 0.7441, "step": 104 }, { "epoch": 1.34185303514377, "grad_norm": 1.669560432434082, "learning_rate": 1.9461538461538462e-05, "loss": 0.8697, "step": 105 }, { "epoch": 1.354632587859425, "grad_norm": 1.5571810007095337, "learning_rate": 1.945641025641026e-05, "loss": 0.836, "step": 106 }, { "epoch": 1.3674121405750799, "grad_norm": 2.0145392417907715, "learning_rate": 1.9451282051282053e-05, "loss": 1.0022, "step": 107 }, { "epoch": 1.3801916932907348, "grad_norm": 2.5267364978790283, "learning_rate": 1.9446153846153847e-05, "loss": 1.188, "step": 108 }, { "epoch": 1.3929712460063897, "grad_norm": 2.094905138015747, "learning_rate": 1.9441025641025645e-05, "loss": 0.9611, "step": 109 }, { "epoch": 1.4057507987220448, "grad_norm": 1.7281832695007324, "learning_rate": 1.943589743589744e-05, "loss": 0.7503, "step": 110 }, { "epoch": 1.4185303514376997, "grad_norm": 1.7954721450805664, "learning_rate": 1.9430769230769233e-05, "loss": 0.7309, "step": 111 }, { "epoch": 1.4313099041533546, "grad_norm": 1.8233072757720947, "learning_rate": 1.9425641025641027e-05, "loss": 0.9671, "step": 112 }, { "epoch": 1.4440894568690097, "grad_norm": 2.052917003631592, "learning_rate": 1.942051282051282e-05, "loss": 0.9148, "step": 113 }, { "epoch": 1.4568690095846646, "grad_norm": 1.862432599067688, "learning_rate": 1.9415384615384615e-05, "loss": 0.9417, "step": 114 }, { "epoch": 1.4696485623003195, "grad_norm": 1.6614469289779663, "learning_rate": 1.9410256410256413e-05, "loss": 0.7467, "step": 115 }, { "epoch": 1.4824281150159744, "grad_norm": 1.9247630834579468, "learning_rate": 1.9405128205128207e-05, "loss": 0.92, "step": 116 }, { "epoch": 1.4952076677316293, "grad_norm": 1.9821908473968506, "learning_rate": 1.94e-05, "loss": 0.7205, "step": 117 }, { "epoch": 1.5079872204472844, "grad_norm": 2.299802303314209, "learning_rate": 1.9394871794871798e-05, "loss": 0.7411, "step": 118 }, { "epoch": 1.5207667731629393, "grad_norm": 1.7966883182525635, "learning_rate": 1.938974358974359e-05, "loss": 0.8728, "step": 119 }, { "epoch": 1.5335463258785942, "grad_norm": 2.048807382583618, "learning_rate": 1.9384615384615386e-05, "loss": 0.8071, "step": 120 }, { "epoch": 1.5463258785942493, "grad_norm": 1.8083820343017578, "learning_rate": 1.9379487179487184e-05, "loss": 1.044, "step": 121 }, { "epoch": 1.5591054313099042, "grad_norm": 1.692712664604187, "learning_rate": 1.9374358974358974e-05, "loss": 0.7637, "step": 122 }, { "epoch": 1.571884984025559, "grad_norm": 1.9947776794433594, "learning_rate": 1.936923076923077e-05, "loss": 0.7374, "step": 123 }, { "epoch": 1.5846645367412142, "grad_norm": 1.7471729516983032, "learning_rate": 1.9364102564102566e-05, "loss": 0.72, "step": 124 }, { "epoch": 1.5974440894568689, "grad_norm": 1.6320607662200928, "learning_rate": 1.935897435897436e-05, "loss": 0.8545, "step": 125 }, { "epoch": 1.610223642172524, "grad_norm": 1.4828823804855347, "learning_rate": 1.9353846153846157e-05, "loss": 0.7754, "step": 126 }, { "epoch": 1.623003194888179, "grad_norm": 1.6667225360870361, "learning_rate": 1.934871794871795e-05, "loss": 0.7953, "step": 127 }, { "epoch": 1.6357827476038338, "grad_norm": 1.9879682064056396, "learning_rate": 1.9343589743589745e-05, "loss": 0.8459, "step": 128 }, { "epoch": 1.648562300319489, "grad_norm": 1.8809915781021118, "learning_rate": 1.933846153846154e-05, "loss": 0.7419, "step": 129 }, { "epoch": 1.6613418530351438, "grad_norm": 2.041389226913452, "learning_rate": 1.9333333333333333e-05, "loss": 0.8847, "step": 130 }, { "epoch": 1.6741214057507987, "grad_norm": 2.201057195663452, "learning_rate": 1.932820512820513e-05, "loss": 0.5984, "step": 131 }, { "epoch": 1.6869009584664538, "grad_norm": 2.1080873012542725, "learning_rate": 1.9323076923076925e-05, "loss": 0.9419, "step": 132 }, { "epoch": 1.6996805111821085, "grad_norm": 2.061424493789673, "learning_rate": 1.931794871794872e-05, "loss": 0.7766, "step": 133 }, { "epoch": 1.7124600638977636, "grad_norm": 1.7074888944625854, "learning_rate": 1.9312820512820513e-05, "loss": 0.6979, "step": 134 }, { "epoch": 1.7252396166134185, "grad_norm": 1.6944959163665771, "learning_rate": 1.930769230769231e-05, "loss": 0.8957, "step": 135 }, { "epoch": 1.7380191693290734, "grad_norm": 1.9344595670700073, "learning_rate": 1.9302564102564105e-05, "loss": 0.7251, "step": 136 }, { "epoch": 1.7507987220447285, "grad_norm": 2.4287431240081787, "learning_rate": 1.92974358974359e-05, "loss": 0.641, "step": 137 }, { "epoch": 1.7635782747603834, "grad_norm": 2.3612871170043945, "learning_rate": 1.9292307692307696e-05, "loss": 0.7049, "step": 138 }, { "epoch": 1.7763578274760383, "grad_norm": 2.3364453315734863, "learning_rate": 1.9287179487179487e-05, "loss": 0.7515, "step": 139 }, { "epoch": 1.7891373801916934, "grad_norm": 1.9627810716629028, "learning_rate": 1.9282051282051284e-05, "loss": 0.6797, "step": 140 }, { "epoch": 1.8019169329073481, "grad_norm": 2.451805830001831, "learning_rate": 1.9276923076923078e-05, "loss": 0.7781, "step": 141 }, { "epoch": 1.8146964856230032, "grad_norm": 1.8913015127182007, "learning_rate": 1.9271794871794872e-05, "loss": 0.8206, "step": 142 }, { "epoch": 1.8274760383386581, "grad_norm": 1.6889513731002808, "learning_rate": 1.926666666666667e-05, "loss": 0.5143, "step": 143 }, { "epoch": 1.840255591054313, "grad_norm": 2.1671924591064453, "learning_rate": 1.9261538461538464e-05, "loss": 1.0332, "step": 144 }, { "epoch": 1.8530351437699681, "grad_norm": 1.8647410869598389, "learning_rate": 1.9256410256410258e-05, "loss": 0.687, "step": 145 }, { "epoch": 1.865814696485623, "grad_norm": 1.7664998769760132, "learning_rate": 1.9251282051282052e-05, "loss": 0.8675, "step": 146 }, { "epoch": 1.878594249201278, "grad_norm": 2.581796407699585, "learning_rate": 1.924615384615385e-05, "loss": 1.0433, "step": 147 }, { "epoch": 1.891373801916933, "grad_norm": 2.2406113147735596, "learning_rate": 1.9241025641025643e-05, "loss": 1.2425, "step": 148 }, { "epoch": 1.9041533546325877, "grad_norm": 2.4278011322021484, "learning_rate": 1.9235897435897437e-05, "loss": 0.5769, "step": 149 }, { "epoch": 1.9169329073482428, "grad_norm": 2.2902305126190186, "learning_rate": 1.923076923076923e-05, "loss": 0.6431, "step": 150 }, { "epoch": 1.9297124600638977, "grad_norm": 1.8957128524780273, "learning_rate": 1.9225641025641025e-05, "loss": 0.8498, "step": 151 }, { "epoch": 1.9424920127795526, "grad_norm": 1.6397674083709717, "learning_rate": 1.9220512820512823e-05, "loss": 0.8161, "step": 152 }, { "epoch": 1.9552715654952078, "grad_norm": 1.9125725030899048, "learning_rate": 1.9215384615384617e-05, "loss": 0.5361, "step": 153 }, { "epoch": 1.9680511182108626, "grad_norm": 1.9646354913711548, "learning_rate": 1.921025641025641e-05, "loss": 0.8507, "step": 154 }, { "epoch": 1.9808306709265175, "grad_norm": 1.851611852645874, "learning_rate": 1.920512820512821e-05, "loss": 0.6281, "step": 155 }, { "epoch": 1.9936102236421727, "grad_norm": 1.6641134023666382, "learning_rate": 1.9200000000000003e-05, "loss": 0.6391, "step": 156 }, { "epoch": 2.0063897763578273, "grad_norm": 2.319157361984253, "learning_rate": 1.9194871794871797e-05, "loss": 1.1003, "step": 157 }, { "epoch": 2.0191693290734825, "grad_norm": 2.1189239025115967, "learning_rate": 1.9189743589743594e-05, "loss": 0.7064, "step": 158 }, { "epoch": 2.0319488817891376, "grad_norm": 2.2989776134490967, "learning_rate": 1.9184615384615385e-05, "loss": 0.7314, "step": 159 }, { "epoch": 2.0447284345047922, "grad_norm": 1.766953468322754, "learning_rate": 1.9179487179487182e-05, "loss": 0.5125, "step": 160 }, { "epoch": 2.0575079872204474, "grad_norm": 2.2920029163360596, "learning_rate": 1.9174358974358976e-05, "loss": 0.6074, "step": 161 }, { "epoch": 2.070287539936102, "grad_norm": 1.7585970163345337, "learning_rate": 1.916923076923077e-05, "loss": 0.8434, "step": 162 }, { "epoch": 2.083067092651757, "grad_norm": 1.5543984174728394, "learning_rate": 1.9164102564102564e-05, "loss": 0.5749, "step": 163 }, { "epoch": 2.0958466453674123, "grad_norm": 2.3347678184509277, "learning_rate": 1.9158974358974362e-05, "loss": 0.7675, "step": 164 }, { "epoch": 2.108626198083067, "grad_norm": 1.7411854267120361, "learning_rate": 1.9153846153846156e-05, "loss": 0.5865, "step": 165 }, { "epoch": 2.121405750798722, "grad_norm": 3.4430465698242188, "learning_rate": 1.914871794871795e-05, "loss": 0.7566, "step": 166 }, { "epoch": 2.134185303514377, "grad_norm": 1.8736684322357178, "learning_rate": 1.9143589743589744e-05, "loss": 0.6378, "step": 167 }, { "epoch": 2.146964856230032, "grad_norm": 1.6040446758270264, "learning_rate": 1.9138461538461538e-05, "loss": 0.5306, "step": 168 }, { "epoch": 2.159744408945687, "grad_norm": 2.1889142990112305, "learning_rate": 1.9133333333333335e-05, "loss": 0.5335, "step": 169 }, { "epoch": 2.1725239616613417, "grad_norm": 1.7461152076721191, "learning_rate": 1.912820512820513e-05, "loss": 0.5002, "step": 170 }, { "epoch": 2.1853035143769968, "grad_norm": 1.7458964586257935, "learning_rate": 1.9123076923076923e-05, "loss": 0.6409, "step": 171 }, { "epoch": 2.198083067092652, "grad_norm": 2.153862953186035, "learning_rate": 1.911794871794872e-05, "loss": 0.5139, "step": 172 }, { "epoch": 2.2108626198083066, "grad_norm": 1.9740614891052246, "learning_rate": 1.9112820512820515e-05, "loss": 0.7204, "step": 173 }, { "epoch": 2.2236421725239617, "grad_norm": 2.859555721282959, "learning_rate": 1.910769230769231e-05, "loss": 0.8671, "step": 174 }, { "epoch": 2.236421725239617, "grad_norm": 2.5436668395996094, "learning_rate": 1.9102564102564106e-05, "loss": 0.5918, "step": 175 }, { "epoch": 2.2492012779552715, "grad_norm": 2.3637661933898926, "learning_rate": 1.9097435897435897e-05, "loss": 0.6223, "step": 176 }, { "epoch": 2.2619808306709266, "grad_norm": 2.048809766769409, "learning_rate": 1.9092307692307695e-05, "loss": 0.5968, "step": 177 }, { "epoch": 2.2747603833865817, "grad_norm": 2.409816265106201, "learning_rate": 1.908717948717949e-05, "loss": 0.6192, "step": 178 }, { "epoch": 2.2875399361022364, "grad_norm": 2.0535144805908203, "learning_rate": 1.9082051282051283e-05, "loss": 0.5019, "step": 179 }, { "epoch": 2.3003194888178915, "grad_norm": 2.1206672191619873, "learning_rate": 1.907692307692308e-05, "loss": 0.8196, "step": 180 }, { "epoch": 2.313099041533546, "grad_norm": 1.8516058921813965, "learning_rate": 1.9071794871794874e-05, "loss": 0.6362, "step": 181 }, { "epoch": 2.3258785942492013, "grad_norm": 2.1362321376800537, "learning_rate": 1.9066666666666668e-05, "loss": 0.5798, "step": 182 }, { "epoch": 2.3386581469648564, "grad_norm": 1.775614619255066, "learning_rate": 1.9061538461538462e-05, "loss": 0.6961, "step": 183 }, { "epoch": 2.351437699680511, "grad_norm": 2.2190535068511963, "learning_rate": 1.905641025641026e-05, "loss": 0.7895, "step": 184 }, { "epoch": 2.364217252396166, "grad_norm": 1.8019541501998901, "learning_rate": 1.905128205128205e-05, "loss": 0.4996, "step": 185 }, { "epoch": 2.376996805111821, "grad_norm": 3.500237226486206, "learning_rate": 1.9046153846153848e-05, "loss": 0.8526, "step": 186 }, { "epoch": 2.389776357827476, "grad_norm": 2.187237024307251, "learning_rate": 1.9041025641025642e-05, "loss": 0.6901, "step": 187 }, { "epoch": 2.402555910543131, "grad_norm": 1.8445521593093872, "learning_rate": 1.9035897435897436e-05, "loss": 0.6152, "step": 188 }, { "epoch": 2.415335463258786, "grad_norm": 2.152160882949829, "learning_rate": 1.9030769230769233e-05, "loss": 0.6574, "step": 189 }, { "epoch": 2.428115015974441, "grad_norm": 2.056286573410034, "learning_rate": 1.9025641025641027e-05, "loss": 0.5641, "step": 190 }, { "epoch": 2.440894568690096, "grad_norm": 1.9102013111114502, "learning_rate": 1.902051282051282e-05, "loss": 0.666, "step": 191 }, { "epoch": 2.4536741214057507, "grad_norm": 2.1371514797210693, "learning_rate": 1.901538461538462e-05, "loss": 0.768, "step": 192 }, { "epoch": 2.466453674121406, "grad_norm": 2.719141721725464, "learning_rate": 1.9010256410256413e-05, "loss": 0.7495, "step": 193 }, { "epoch": 2.479233226837061, "grad_norm": 2.1122324466705322, "learning_rate": 1.9005128205128207e-05, "loss": 0.5684, "step": 194 }, { "epoch": 2.4920127795527156, "grad_norm": 1.485161542892456, "learning_rate": 1.9e-05, "loss": 0.5679, "step": 195 }, { "epoch": 2.5047923322683707, "grad_norm": 1.7819613218307495, "learning_rate": 1.8994871794871795e-05, "loss": 0.4931, "step": 196 }, { "epoch": 2.5175718849840254, "grad_norm": 1.859328269958496, "learning_rate": 1.8989743589743593e-05, "loss": 0.6118, "step": 197 }, { "epoch": 2.5303514376996805, "grad_norm": 1.929071068763733, "learning_rate": 1.8984615384615387e-05, "loss": 0.6645, "step": 198 }, { "epoch": 2.543130990415335, "grad_norm": 2.3670995235443115, "learning_rate": 1.897948717948718e-05, "loss": 0.696, "step": 199 }, { "epoch": 2.5559105431309903, "grad_norm": 2.6638221740722656, "learning_rate": 1.8974358974358975e-05, "loss": 0.8578, "step": 200 }, { "epoch": 2.5559105431309903, "eval_loss": 0.6453202366828918, "eval_runtime": 181.0992, "eval_samples_per_second": 0.867, "eval_steps_per_second": 0.11, "step": 200 }, { "epoch": 2.5686900958466454, "grad_norm": 2.1391708850860596, "learning_rate": 1.8969230769230772e-05, "loss": 0.6726, "step": 201 }, { "epoch": 2.5814696485623, "grad_norm": 2.205821990966797, "learning_rate": 1.8964102564102566e-05, "loss": 0.6483, "step": 202 }, { "epoch": 2.594249201277955, "grad_norm": 2.030141592025757, "learning_rate": 1.895897435897436e-05, "loss": 0.7137, "step": 203 }, { "epoch": 2.6070287539936103, "grad_norm": 1.7788156270980835, "learning_rate": 1.8953846153846158e-05, "loss": 0.7249, "step": 204 }, { "epoch": 2.619808306709265, "grad_norm": 1.799312710762024, "learning_rate": 1.894871794871795e-05, "loss": 0.6609, "step": 205 }, { "epoch": 2.63258785942492, "grad_norm": 1.8184287548065186, "learning_rate": 1.8943589743589746e-05, "loss": 0.5924, "step": 206 }, { "epoch": 2.6453674121405752, "grad_norm": 1.925370216369629, "learning_rate": 1.893846153846154e-05, "loss": 0.6466, "step": 207 }, { "epoch": 2.65814696485623, "grad_norm": 1.9518979787826538, "learning_rate": 1.8933333333333334e-05, "loss": 0.5184, "step": 208 }, { "epoch": 2.670926517571885, "grad_norm": 1.5550258159637451, "learning_rate": 1.892820512820513e-05, "loss": 0.5325, "step": 209 }, { "epoch": 2.68370607028754, "grad_norm": 1.674920678138733, "learning_rate": 1.8923076923076925e-05, "loss": 0.4642, "step": 210 }, { "epoch": 2.696485623003195, "grad_norm": 2.142885684967041, "learning_rate": 1.891794871794872e-05, "loss": 0.7174, "step": 211 }, { "epoch": 2.70926517571885, "grad_norm": 2.322446346282959, "learning_rate": 1.8912820512820513e-05, "loss": 0.7566, "step": 212 }, { "epoch": 2.722044728434505, "grad_norm": 1.7620545625686646, "learning_rate": 1.8907692307692308e-05, "loss": 0.5529, "step": 213 }, { "epoch": 2.7348242811501597, "grad_norm": 1.606350064277649, "learning_rate": 1.8902564102564105e-05, "loss": 0.469, "step": 214 }, { "epoch": 2.747603833865815, "grad_norm": 2.2989253997802734, "learning_rate": 1.88974358974359e-05, "loss": 0.7478, "step": 215 }, { "epoch": 2.7603833865814695, "grad_norm": 2.260450601577759, "learning_rate": 1.8892307692307693e-05, "loss": 0.5506, "step": 216 }, { "epoch": 2.7731629392971247, "grad_norm": 2.017993211746216, "learning_rate": 1.8887179487179487e-05, "loss": 0.4649, "step": 217 }, { "epoch": 2.7859424920127793, "grad_norm": 1.8710030317306519, "learning_rate": 1.8882051282051285e-05, "loss": 0.6181, "step": 218 }, { "epoch": 2.7987220447284344, "grad_norm": 2.171832799911499, "learning_rate": 1.887692307692308e-05, "loss": 0.566, "step": 219 }, { "epoch": 2.8115015974440896, "grad_norm": 2.444894552230835, "learning_rate": 1.8871794871794873e-05, "loss": 0.6019, "step": 220 }, { "epoch": 2.8242811501597442, "grad_norm": 2.516892194747925, "learning_rate": 1.886666666666667e-05, "loss": 0.609, "step": 221 }, { "epoch": 2.8370607028753994, "grad_norm": 1.8376013040542603, "learning_rate": 1.886153846153846e-05, "loss": 0.4318, "step": 222 }, { "epoch": 2.8498402555910545, "grad_norm": 2.615744113922119, "learning_rate": 1.8856410256410258e-05, "loss": 0.6561, "step": 223 }, { "epoch": 2.862619808306709, "grad_norm": 2.35602068901062, "learning_rate": 1.8851282051282052e-05, "loss": 0.7015, "step": 224 }, { "epoch": 2.8753993610223643, "grad_norm": 1.974861979484558, "learning_rate": 1.8846153846153846e-05, "loss": 0.6246, "step": 225 }, { "epoch": 2.8881789137380194, "grad_norm": 1.9995675086975098, "learning_rate": 1.8841025641025644e-05, "loss": 0.5532, "step": 226 }, { "epoch": 2.900958466453674, "grad_norm": 2.0074105262756348, "learning_rate": 1.8835897435897438e-05, "loss": 0.5717, "step": 227 }, { "epoch": 2.913738019169329, "grad_norm": 2.0054516792297363, "learning_rate": 1.8830769230769232e-05, "loss": 0.6862, "step": 228 }, { "epoch": 2.9265175718849843, "grad_norm": 2.4561843872070312, "learning_rate": 1.882564102564103e-05, "loss": 0.6273, "step": 229 }, { "epoch": 2.939297124600639, "grad_norm": 2.190880537033081, "learning_rate": 1.8820512820512823e-05, "loss": 0.618, "step": 230 }, { "epoch": 2.952076677316294, "grad_norm": 2.0371289253234863, "learning_rate": 1.8815384615384617e-05, "loss": 0.4063, "step": 231 }, { "epoch": 2.9648562300319488, "grad_norm": 2.6498465538024902, "learning_rate": 1.881025641025641e-05, "loss": 0.6254, "step": 232 }, { "epoch": 2.977635782747604, "grad_norm": 1.9762572050094604, "learning_rate": 1.8805128205128206e-05, "loss": 0.6503, "step": 233 }, { "epoch": 2.9904153354632586, "grad_norm": 1.9959864616394043, "learning_rate": 1.88e-05, "loss": 0.5692, "step": 234 }, { "epoch": 3.0031948881789137, "grad_norm": 2.349118232727051, "learning_rate": 1.8794871794871797e-05, "loss": 0.7357, "step": 235 }, { "epoch": 3.015974440894569, "grad_norm": 2.024961471557617, "learning_rate": 1.878974358974359e-05, "loss": 0.5339, "step": 236 }, { "epoch": 3.0287539936102235, "grad_norm": 2.3128573894500732, "learning_rate": 1.8784615384615385e-05, "loss": 0.6399, "step": 237 }, { "epoch": 3.0415335463258786, "grad_norm": 2.0917768478393555, "learning_rate": 1.8779487179487183e-05, "loss": 0.6905, "step": 238 }, { "epoch": 3.0543130990415337, "grad_norm": 2.443096876144409, "learning_rate": 1.8774358974358977e-05, "loss": 0.58, "step": 239 }, { "epoch": 3.0670926517571884, "grad_norm": 2.049499988555908, "learning_rate": 1.876923076923077e-05, "loss": 0.5741, "step": 240 }, { "epoch": 3.0798722044728435, "grad_norm": 2.377434730529785, "learning_rate": 1.8764102564102568e-05, "loss": 0.6907, "step": 241 }, { "epoch": 3.0926517571884986, "grad_norm": 2.2461509704589844, "learning_rate": 1.875897435897436e-05, "loss": 0.6526, "step": 242 }, { "epoch": 3.1054313099041533, "grad_norm": 2.7188053131103516, "learning_rate": 1.8753846153846156e-05, "loss": 0.603, "step": 243 }, { "epoch": 3.1182108626198084, "grad_norm": 2.164670705795288, "learning_rate": 1.874871794871795e-05, "loss": 0.5971, "step": 244 }, { "epoch": 3.130990415335463, "grad_norm": 2.7351036071777344, "learning_rate": 1.8743589743589744e-05, "loss": 0.5675, "step": 245 }, { "epoch": 3.143769968051118, "grad_norm": 1.7953331470489502, "learning_rate": 1.8738461538461542e-05, "loss": 0.4819, "step": 246 }, { "epoch": 3.1565495207667733, "grad_norm": 2.31479811668396, "learning_rate": 1.8733333333333336e-05, "loss": 0.7826, "step": 247 }, { "epoch": 3.169329073482428, "grad_norm": 1.7311149835586548, "learning_rate": 1.872820512820513e-05, "loss": 0.6296, "step": 248 }, { "epoch": 3.182108626198083, "grad_norm": 1.964176893234253, "learning_rate": 1.8723076923076924e-05, "loss": 0.5487, "step": 249 }, { "epoch": 3.194888178913738, "grad_norm": 2.2353405952453613, "learning_rate": 1.8717948717948718e-05, "loss": 0.7026, "step": 250 }, { "epoch": 3.207667731629393, "grad_norm": 2.1692943572998047, "learning_rate": 1.8712820512820515e-05, "loss": 0.4247, "step": 251 }, { "epoch": 3.220447284345048, "grad_norm": 1.894005298614502, "learning_rate": 1.870769230769231e-05, "loss": 0.577, "step": 252 }, { "epoch": 3.2332268370607027, "grad_norm": 2.4834699630737305, "learning_rate": 1.8702564102564104e-05, "loss": 0.7699, "step": 253 }, { "epoch": 3.246006389776358, "grad_norm": 2.7217936515808105, "learning_rate": 1.8697435897435898e-05, "loss": 0.5841, "step": 254 }, { "epoch": 3.258785942492013, "grad_norm": 2.007357120513916, "learning_rate": 1.8692307692307695e-05, "loss": 0.4561, "step": 255 }, { "epoch": 3.2715654952076676, "grad_norm": 2.752572536468506, "learning_rate": 1.868717948717949e-05, "loss": 0.7493, "step": 256 }, { "epoch": 3.2843450479233227, "grad_norm": 1.9526993036270142, "learning_rate": 1.8682051282051283e-05, "loss": 0.62, "step": 257 }, { "epoch": 3.297124600638978, "grad_norm": 2.3786776065826416, "learning_rate": 1.867692307692308e-05, "loss": 0.6224, "step": 258 }, { "epoch": 3.3099041533546325, "grad_norm": 2.043469190597534, "learning_rate": 1.867179487179487e-05, "loss": 0.5053, "step": 259 }, { "epoch": 3.3226837060702876, "grad_norm": 1.7549484968185425, "learning_rate": 1.866666666666667e-05, "loss": 0.4134, "step": 260 }, { "epoch": 3.3354632587859427, "grad_norm": 2.2081825733184814, "learning_rate": 1.8661538461538463e-05, "loss": 0.8591, "step": 261 }, { "epoch": 3.3482428115015974, "grad_norm": 1.8951700925827026, "learning_rate": 1.8656410256410257e-05, "loss": 0.6255, "step": 262 }, { "epoch": 3.3610223642172525, "grad_norm": 2.757758140563965, "learning_rate": 1.8651282051282054e-05, "loss": 0.4654, "step": 263 }, { "epoch": 3.373801916932907, "grad_norm": 2.7067980766296387, "learning_rate": 1.8646153846153848e-05, "loss": 0.6433, "step": 264 }, { "epoch": 3.3865814696485623, "grad_norm": 2.0272269248962402, "learning_rate": 1.8641025641025642e-05, "loss": 0.4873, "step": 265 }, { "epoch": 3.3993610223642174, "grad_norm": 2.0535693168640137, "learning_rate": 1.8635897435897436e-05, "loss": 0.6818, "step": 266 }, { "epoch": 3.412140575079872, "grad_norm": 2.127767562866211, "learning_rate": 1.8630769230769234e-05, "loss": 0.4724, "step": 267 }, { "epoch": 3.4249201277955272, "grad_norm": 1.7211772203445435, "learning_rate": 1.8625641025641028e-05, "loss": 0.4135, "step": 268 }, { "epoch": 3.437699680511182, "grad_norm": 1.9719951152801514, "learning_rate": 1.8620512820512822e-05, "loss": 0.4409, "step": 269 }, { "epoch": 3.450479233226837, "grad_norm": 1.8113923072814941, "learning_rate": 1.8615384615384616e-05, "loss": 0.6051, "step": 270 }, { "epoch": 3.463258785942492, "grad_norm": 2.094407796859741, "learning_rate": 1.861025641025641e-05, "loss": 0.4492, "step": 271 }, { "epoch": 3.476038338658147, "grad_norm": 2.2038345336914062, "learning_rate": 1.8605128205128207e-05, "loss": 0.6206, "step": 272 }, { "epoch": 3.488817891373802, "grad_norm": 2.0385520458221436, "learning_rate": 1.86e-05, "loss": 0.4314, "step": 273 }, { "epoch": 3.501597444089457, "grad_norm": 2.0348079204559326, "learning_rate": 1.8594871794871796e-05, "loss": 0.5783, "step": 274 }, { "epoch": 3.5143769968051117, "grad_norm": 1.742039680480957, "learning_rate": 1.8589743589743593e-05, "loss": 0.4207, "step": 275 }, { "epoch": 3.527156549520767, "grad_norm": 2.7231643199920654, "learning_rate": 1.8584615384615387e-05, "loss": 0.703, "step": 276 }, { "epoch": 3.539936102236422, "grad_norm": 2.291921615600586, "learning_rate": 1.857948717948718e-05, "loss": 0.5661, "step": 277 }, { "epoch": 3.5527156549520766, "grad_norm": 1.8833556175231934, "learning_rate": 1.857435897435898e-05, "loss": 0.4881, "step": 278 }, { "epoch": 3.5654952076677318, "grad_norm": 2.2225875854492188, "learning_rate": 1.856923076923077e-05, "loss": 0.4945, "step": 279 }, { "epoch": 3.5782747603833864, "grad_norm": 2.337555408477783, "learning_rate": 1.8564102564102567e-05, "loss": 0.6395, "step": 280 }, { "epoch": 3.5910543130990416, "grad_norm": 2.4982399940490723, "learning_rate": 1.855897435897436e-05, "loss": 0.6697, "step": 281 }, { "epoch": 3.6038338658146962, "grad_norm": 3.495062828063965, "learning_rate": 1.8553846153846155e-05, "loss": 0.5231, "step": 282 }, { "epoch": 3.6166134185303513, "grad_norm": 1.9106608629226685, "learning_rate": 1.854871794871795e-05, "loss": 0.4807, "step": 283 }, { "epoch": 3.6293929712460065, "grad_norm": 2.2010486125946045, "learning_rate": 1.8543589743589746e-05, "loss": 0.5052, "step": 284 }, { "epoch": 3.642172523961661, "grad_norm": 2.2390029430389404, "learning_rate": 1.853846153846154e-05, "loss": 0.4321, "step": 285 }, { "epoch": 3.6549520766773163, "grad_norm": 2.0129148960113525, "learning_rate": 1.8533333333333334e-05, "loss": 0.4684, "step": 286 }, { "epoch": 3.6677316293929714, "grad_norm": 2.1632564067840576, "learning_rate": 1.852820512820513e-05, "loss": 0.5153, "step": 287 }, { "epoch": 3.680511182108626, "grad_norm": 2.2095839977264404, "learning_rate": 1.8523076923076922e-05, "loss": 0.6905, "step": 288 }, { "epoch": 3.693290734824281, "grad_norm": 2.3573625087738037, "learning_rate": 1.851794871794872e-05, "loss": 0.7279, "step": 289 }, { "epoch": 3.7060702875399363, "grad_norm": 1.7171701192855835, "learning_rate": 1.8512820512820514e-05, "loss": 0.5147, "step": 290 }, { "epoch": 3.718849840255591, "grad_norm": 1.9186244010925293, "learning_rate": 1.8507692307692308e-05, "loss": 0.475, "step": 291 }, { "epoch": 3.731629392971246, "grad_norm": 2.364483118057251, "learning_rate": 1.8502564102564105e-05, "loss": 0.5201, "step": 292 }, { "epoch": 3.744408945686901, "grad_norm": 2.2425689697265625, "learning_rate": 1.84974358974359e-05, "loss": 0.5556, "step": 293 }, { "epoch": 3.757188498402556, "grad_norm": 2.286992311477661, "learning_rate": 1.8492307692307694e-05, "loss": 0.5985, "step": 294 }, { "epoch": 3.769968051118211, "grad_norm": 1.652174949645996, "learning_rate": 1.848717948717949e-05, "loss": 0.3692, "step": 295 }, { "epoch": 3.7827476038338657, "grad_norm": 2.004711627960205, "learning_rate": 1.848205128205128e-05, "loss": 0.6071, "step": 296 }, { "epoch": 3.7955271565495208, "grad_norm": 1.9623836278915405, "learning_rate": 1.847692307692308e-05, "loss": 0.5376, "step": 297 }, { "epoch": 3.8083067092651754, "grad_norm": 1.977774977684021, "learning_rate": 1.8471794871794873e-05, "loss": 0.4829, "step": 298 }, { "epoch": 3.8210862619808306, "grad_norm": 2.2446415424346924, "learning_rate": 1.8466666666666667e-05, "loss": 0.4534, "step": 299 }, { "epoch": 3.8338658146964857, "grad_norm": 2.3083629608154297, "learning_rate": 1.8461538461538465e-05, "loss": 0.6234, "step": 300 }, { "epoch": 3.8338658146964857, "eval_loss": 0.5684083700180054, "eval_runtime": 180.8233, "eval_samples_per_second": 0.868, "eval_steps_per_second": 0.111, "step": 300 }, { "epoch": 3.8466453674121404, "grad_norm": 2.4002017974853516, "learning_rate": 1.845641025641026e-05, "loss": 0.6042, "step": 301 }, { "epoch": 3.8594249201277955, "grad_norm": 1.574140191078186, "learning_rate": 1.8451282051282053e-05, "loss": 0.3668, "step": 302 }, { "epoch": 3.8722044728434506, "grad_norm": 1.857406497001648, "learning_rate": 1.8446153846153847e-05, "loss": 0.4021, "step": 303 }, { "epoch": 3.8849840255591053, "grad_norm": 2.3369333744049072, "learning_rate": 1.8441025641025644e-05, "loss": 0.4543, "step": 304 }, { "epoch": 3.8977635782747604, "grad_norm": 2.348994255065918, "learning_rate": 1.8435897435897435e-05, "loss": 0.5099, "step": 305 }, { "epoch": 3.9105431309904155, "grad_norm": 2.381173610687256, "learning_rate": 1.8430769230769232e-05, "loss": 0.5886, "step": 306 }, { "epoch": 3.92332268370607, "grad_norm": 2.188190460205078, "learning_rate": 1.8425641025641026e-05, "loss": 0.5159, "step": 307 }, { "epoch": 3.9361022364217253, "grad_norm": 2.331792116165161, "learning_rate": 1.842051282051282e-05, "loss": 0.5538, "step": 308 }, { "epoch": 3.9488817891373804, "grad_norm": 2.4262869358062744, "learning_rate": 1.8415384615384618e-05, "loss": 0.4675, "step": 309 }, { "epoch": 3.961661341853035, "grad_norm": 2.115399122238159, "learning_rate": 1.8410256410256412e-05, "loss": 0.4631, "step": 310 }, { "epoch": 3.97444089456869, "grad_norm": 2.0459988117218018, "learning_rate": 1.8405128205128206e-05, "loss": 0.6768, "step": 311 }, { "epoch": 3.987220447284345, "grad_norm": 3.0487661361694336, "learning_rate": 1.8400000000000003e-05, "loss": 0.6092, "step": 312 }, { "epoch": 4.0, "grad_norm": 2.4635841846466064, "learning_rate": 1.8394871794871797e-05, "loss": 0.7452, "step": 313 }, { "epoch": 4.012779552715655, "grad_norm": 3.330390214920044, "learning_rate": 1.838974358974359e-05, "loss": 0.6009, "step": 314 }, { "epoch": 4.02555910543131, "grad_norm": 2.3095197677612305, "learning_rate": 1.8384615384615386e-05, "loss": 0.4163, "step": 315 }, { "epoch": 4.038338658146965, "grad_norm": 2.4546499252319336, "learning_rate": 1.837948717948718e-05, "loss": 0.5153, "step": 316 }, { "epoch": 4.05111821086262, "grad_norm": 2.220017671585083, "learning_rate": 1.8374358974358977e-05, "loss": 0.5124, "step": 317 }, { "epoch": 4.063897763578275, "grad_norm": 2.0015223026275635, "learning_rate": 1.836923076923077e-05, "loss": 0.4947, "step": 318 }, { "epoch": 4.07667731629393, "grad_norm": 2.588055372238159, "learning_rate": 1.8364102564102565e-05, "loss": 0.6403, "step": 319 }, { "epoch": 4.0894568690095845, "grad_norm": 2.0123562812805176, "learning_rate": 1.835897435897436e-05, "loss": 0.4782, "step": 320 }, { "epoch": 4.102236421725239, "grad_norm": 2.275867462158203, "learning_rate": 1.8353846153846157e-05, "loss": 0.5304, "step": 321 }, { "epoch": 4.115015974440895, "grad_norm": 2.5205776691436768, "learning_rate": 1.834871794871795e-05, "loss": 0.5028, "step": 322 }, { "epoch": 4.127795527156549, "grad_norm": 2.3919801712036133, "learning_rate": 1.8343589743589745e-05, "loss": 0.6356, "step": 323 }, { "epoch": 4.140575079872204, "grad_norm": 2.4463231563568115, "learning_rate": 1.8338461538461542e-05, "loss": 0.6879, "step": 324 }, { "epoch": 4.15335463258786, "grad_norm": 2.269652843475342, "learning_rate": 1.8333333333333333e-05, "loss": 0.561, "step": 325 }, { "epoch": 4.166134185303514, "grad_norm": 2.2289443016052246, "learning_rate": 1.832820512820513e-05, "loss": 0.5963, "step": 326 }, { "epoch": 4.178913738019169, "grad_norm": 1.8987523317337036, "learning_rate": 1.8323076923076924e-05, "loss": 0.4023, "step": 327 }, { "epoch": 4.1916932907348246, "grad_norm": 1.9962126016616821, "learning_rate": 1.831794871794872e-05, "loss": 0.4075, "step": 328 }, { "epoch": 4.204472843450479, "grad_norm": 2.6134021282196045, "learning_rate": 1.8312820512820516e-05, "loss": 0.5599, "step": 329 }, { "epoch": 4.217252396166134, "grad_norm": 1.8641389608383179, "learning_rate": 1.830769230769231e-05, "loss": 0.4824, "step": 330 }, { "epoch": 4.2300319488817895, "grad_norm": 2.134913921356201, "learning_rate": 1.8302564102564104e-05, "loss": 0.4844, "step": 331 }, { "epoch": 4.242811501597444, "grad_norm": 2.1769747734069824, "learning_rate": 1.8297435897435898e-05, "loss": 0.429, "step": 332 }, { "epoch": 4.255591054313099, "grad_norm": 1.921721339225769, "learning_rate": 1.8292307692307692e-05, "loss": 0.3981, "step": 333 }, { "epoch": 4.268370607028754, "grad_norm": 2.113063335418701, "learning_rate": 1.828717948717949e-05, "loss": 0.499, "step": 334 }, { "epoch": 4.281150159744409, "grad_norm": 2.1482808589935303, "learning_rate": 1.8282051282051284e-05, "loss": 0.5937, "step": 335 }, { "epoch": 4.293929712460064, "grad_norm": 2.5962321758270264, "learning_rate": 1.8276923076923078e-05, "loss": 0.4059, "step": 336 }, { "epoch": 4.306709265175719, "grad_norm": 1.8513988256454468, "learning_rate": 1.827179487179487e-05, "loss": 0.4338, "step": 337 }, { "epoch": 4.319488817891374, "grad_norm": 2.2054789066314697, "learning_rate": 1.826666666666667e-05, "loss": 0.4734, "step": 338 }, { "epoch": 4.332268370607029, "grad_norm": 2.223257064819336, "learning_rate": 1.8261538461538463e-05, "loss": 0.5564, "step": 339 }, { "epoch": 4.345047923322683, "grad_norm": 2.102458953857422, "learning_rate": 1.8256410256410257e-05, "loss": 0.4475, "step": 340 }, { "epoch": 4.357827476038339, "grad_norm": 2.7025585174560547, "learning_rate": 1.8251282051282055e-05, "loss": 0.6849, "step": 341 }, { "epoch": 4.3706070287539935, "grad_norm": 2.189150094985962, "learning_rate": 1.8246153846153845e-05, "loss": 0.6558, "step": 342 }, { "epoch": 4.383386581469648, "grad_norm": 3.0373575687408447, "learning_rate": 1.8241025641025643e-05, "loss": 0.7083, "step": 343 }, { "epoch": 4.396166134185304, "grad_norm": 2.166837692260742, "learning_rate": 1.8235897435897437e-05, "loss": 0.3849, "step": 344 }, { "epoch": 4.4089456869009584, "grad_norm": 2.1708340644836426, "learning_rate": 1.823076923076923e-05, "loss": 0.4122, "step": 345 }, { "epoch": 4.421725239616613, "grad_norm": 2.3275482654571533, "learning_rate": 1.8225641025641028e-05, "loss": 0.3932, "step": 346 }, { "epoch": 4.434504792332269, "grad_norm": 3.253466844558716, "learning_rate": 1.8220512820512822e-05, "loss": 0.6906, "step": 347 }, { "epoch": 4.447284345047923, "grad_norm": 2.473674774169922, "learning_rate": 1.8215384615384616e-05, "loss": 0.6586, "step": 348 }, { "epoch": 4.460063897763578, "grad_norm": 2.164419651031494, "learning_rate": 1.8210256410256414e-05, "loss": 0.4456, "step": 349 }, { "epoch": 4.472843450479234, "grad_norm": 2.3611209392547607, "learning_rate": 1.8205128205128208e-05, "loss": 0.4352, "step": 350 }, { "epoch": 4.485623003194888, "grad_norm": 2.4800209999084473, "learning_rate": 1.8200000000000002e-05, "loss": 0.4834, "step": 351 }, { "epoch": 4.498402555910543, "grad_norm": 2.4389426708221436, "learning_rate": 1.8194871794871796e-05, "loss": 0.57, "step": 352 }, { "epoch": 4.511182108626198, "grad_norm": 2.4953975677490234, "learning_rate": 1.818974358974359e-05, "loss": 0.3799, "step": 353 }, { "epoch": 4.523961661341853, "grad_norm": 1.9845151901245117, "learning_rate": 1.8184615384615384e-05, "loss": 0.4779, "step": 354 }, { "epoch": 4.536741214057508, "grad_norm": 2.2074553966522217, "learning_rate": 1.817948717948718e-05, "loss": 0.3435, "step": 355 }, { "epoch": 4.549520766773163, "grad_norm": 2.5801925659179688, "learning_rate": 1.8174358974358976e-05, "loss": 0.4775, "step": 356 }, { "epoch": 4.562300319488818, "grad_norm": 2.800104856491089, "learning_rate": 1.816923076923077e-05, "loss": 0.6591, "step": 357 }, { "epoch": 4.575079872204473, "grad_norm": 2.1821672916412354, "learning_rate": 1.8164102564102567e-05, "loss": 0.3604, "step": 358 }, { "epoch": 4.587859424920127, "grad_norm": 2.050424575805664, "learning_rate": 1.815897435897436e-05, "loss": 0.4017, "step": 359 }, { "epoch": 4.600638977635783, "grad_norm": 2.6958281993865967, "learning_rate": 1.8153846153846155e-05, "loss": 0.613, "step": 360 }, { "epoch": 4.613418530351438, "grad_norm": 2.337655544281006, "learning_rate": 1.8148717948717953e-05, "loss": 0.6911, "step": 361 }, { "epoch": 4.626198083067092, "grad_norm": 2.8686039447784424, "learning_rate": 1.8143589743589743e-05, "loss": 0.5681, "step": 362 }, { "epoch": 4.638977635782748, "grad_norm": 2.4105000495910645, "learning_rate": 1.813846153846154e-05, "loss": 0.6666, "step": 363 }, { "epoch": 4.651757188498403, "grad_norm": 2.0860559940338135, "learning_rate": 1.8133333333333335e-05, "loss": 0.411, "step": 364 }, { "epoch": 4.664536741214057, "grad_norm": 2.3210160732269287, "learning_rate": 1.812820512820513e-05, "loss": 0.4459, "step": 365 }, { "epoch": 4.677316293929713, "grad_norm": 1.9436842203140259, "learning_rate": 1.8123076923076926e-05, "loss": 0.493, "step": 366 }, { "epoch": 4.6900958466453675, "grad_norm": 2.178853750228882, "learning_rate": 1.811794871794872e-05, "loss": 0.6059, "step": 367 }, { "epoch": 4.702875399361022, "grad_norm": 2.5311412811279297, "learning_rate": 1.8112820512820514e-05, "loss": 0.504, "step": 368 }, { "epoch": 4.715654952076678, "grad_norm": 1.6586120128631592, "learning_rate": 1.810769230769231e-05, "loss": 0.4196, "step": 369 }, { "epoch": 4.728434504792332, "grad_norm": 2.5906753540039062, "learning_rate": 1.8102564102564102e-05, "loss": 0.462, "step": 370 }, { "epoch": 4.741214057507987, "grad_norm": 2.0996625423431396, "learning_rate": 1.80974358974359e-05, "loss": 0.5456, "step": 371 }, { "epoch": 4.753993610223642, "grad_norm": 2.0776517391204834, "learning_rate": 1.8092307692307694e-05, "loss": 0.5379, "step": 372 }, { "epoch": 4.766773162939297, "grad_norm": 1.8418030738830566, "learning_rate": 1.8087179487179488e-05, "loss": 0.4525, "step": 373 }, { "epoch": 4.779552715654952, "grad_norm": 2.2306466102600098, "learning_rate": 1.8082051282051282e-05, "loss": 0.4133, "step": 374 }, { "epoch": 4.792332268370607, "grad_norm": 2.0150136947631836, "learning_rate": 1.807692307692308e-05, "loss": 0.5694, "step": 375 }, { "epoch": 4.805111821086262, "grad_norm": 2.669576644897461, "learning_rate": 1.8071794871794874e-05, "loss": 0.7247, "step": 376 }, { "epoch": 4.817891373801917, "grad_norm": 2.7284021377563477, "learning_rate": 1.8066666666666668e-05, "loss": 0.4458, "step": 377 }, { "epoch": 4.830670926517572, "grad_norm": 1.9394841194152832, "learning_rate": 1.8061538461538465e-05, "loss": 0.4246, "step": 378 }, { "epoch": 4.843450479233227, "grad_norm": 2.0008041858673096, "learning_rate": 1.8056410256410256e-05, "loss": 0.3953, "step": 379 }, { "epoch": 4.856230031948882, "grad_norm": 2.4139251708984375, "learning_rate": 1.8051282051282053e-05, "loss": 0.5843, "step": 380 }, { "epoch": 4.8690095846645365, "grad_norm": 3.531747817993164, "learning_rate": 1.8046153846153847e-05, "loss": 0.4477, "step": 381 }, { "epoch": 4.881789137380192, "grad_norm": 2.701043128967285, "learning_rate": 1.804102564102564e-05, "loss": 0.5375, "step": 382 }, { "epoch": 4.894568690095847, "grad_norm": 2.51804256439209, "learning_rate": 1.803589743589744e-05, "loss": 0.442, "step": 383 }, { "epoch": 4.907348242811501, "grad_norm": 1.9672200679779053, "learning_rate": 1.8030769230769233e-05, "loss": 0.4485, "step": 384 }, { "epoch": 4.920127795527156, "grad_norm": 2.2077980041503906, "learning_rate": 1.8025641025641027e-05, "loss": 0.5409, "step": 385 }, { "epoch": 4.932907348242812, "grad_norm": 2.3517746925354004, "learning_rate": 1.802051282051282e-05, "loss": 0.5482, "step": 386 }, { "epoch": 4.945686900958466, "grad_norm": 2.220970869064331, "learning_rate": 1.8015384615384618e-05, "loss": 0.757, "step": 387 }, { "epoch": 4.958466453674122, "grad_norm": 2.4243104457855225, "learning_rate": 1.8010256410256412e-05, "loss": 0.5577, "step": 388 }, { "epoch": 4.9712460063897765, "grad_norm": 2.6521098613739014, "learning_rate": 1.8005128205128206e-05, "loss": 0.6587, "step": 389 }, { "epoch": 4.984025559105431, "grad_norm": 2.181779384613037, "learning_rate": 1.8e-05, "loss": 0.4766, "step": 390 }, { "epoch": 4.996805111821086, "grad_norm": 2.2270212173461914, "learning_rate": 1.7994871794871795e-05, "loss": 0.528, "step": 391 }, { "epoch": 5.0095846645367414, "grad_norm": 2.2594521045684814, "learning_rate": 1.7989743589743592e-05, "loss": 0.4741, "step": 392 }, { "epoch": 5.022364217252396, "grad_norm": 2.1782121658325195, "learning_rate": 1.7984615384615386e-05, "loss": 0.599, "step": 393 }, { "epoch": 5.035143769968051, "grad_norm": 2.0388574600219727, "learning_rate": 1.797948717948718e-05, "loss": 0.7343, "step": 394 }, { "epoch": 5.047923322683706, "grad_norm": 2.135293483734131, "learning_rate": 1.7974358974358977e-05, "loss": 0.6222, "step": 395 }, { "epoch": 5.060702875399361, "grad_norm": 2.4240000247955322, "learning_rate": 1.796923076923077e-05, "loss": 0.5916, "step": 396 }, { "epoch": 5.073482428115016, "grad_norm": 2.2460944652557373, "learning_rate": 1.7964102564102566e-05, "loss": 0.571, "step": 397 }, { "epoch": 5.086261980830671, "grad_norm": 2.330200433731079, "learning_rate": 1.7958974358974363e-05, "loss": 0.4237, "step": 398 }, { "epoch": 5.099041533546326, "grad_norm": 2.227525234222412, "learning_rate": 1.7953846153846154e-05, "loss": 0.64, "step": 399 }, { "epoch": 5.111821086261981, "grad_norm": 1.9940121173858643, "learning_rate": 1.794871794871795e-05, "loss": 0.4471, "step": 400 }, { "epoch": 5.111821086261981, "eval_loss": 0.5250003337860107, "eval_runtime": 181.0045, "eval_samples_per_second": 0.867, "eval_steps_per_second": 0.11, "step": 400 }, { "epoch": 5.124600638977636, "grad_norm": 1.8769639730453491, "learning_rate": 1.7943589743589745e-05, "loss": 0.3262, "step": 401 }, { "epoch": 5.137380191693291, "grad_norm": 2.1913700103759766, "learning_rate": 1.793846153846154e-05, "loss": 0.502, "step": 402 }, { "epoch": 5.1501597444089455, "grad_norm": 1.8566724061965942, "learning_rate": 1.7933333333333333e-05, "loss": 0.4005, "step": 403 }, { "epoch": 5.1629392971246, "grad_norm": 2.3526904582977295, "learning_rate": 1.792820512820513e-05, "loss": 0.6051, "step": 404 }, { "epoch": 5.175718849840256, "grad_norm": 2.3529820442199707, "learning_rate": 1.7923076923076925e-05, "loss": 0.5727, "step": 405 }, { "epoch": 5.18849840255591, "grad_norm": 2.2235405445098877, "learning_rate": 1.791794871794872e-05, "loss": 0.3353, "step": 406 }, { "epoch": 5.201277955271565, "grad_norm": 2.3372035026550293, "learning_rate": 1.7912820512820516e-05, "loss": 0.4259, "step": 407 }, { "epoch": 5.214057507987221, "grad_norm": 2.4938113689422607, "learning_rate": 1.7907692307692307e-05, "loss": 0.4206, "step": 408 }, { "epoch": 5.226837060702875, "grad_norm": 2.129610776901245, "learning_rate": 1.7902564102564104e-05, "loss": 0.5185, "step": 409 }, { "epoch": 5.23961661341853, "grad_norm": 2.3278634548187256, "learning_rate": 1.78974358974359e-05, "loss": 0.4314, "step": 410 }, { "epoch": 5.252396166134186, "grad_norm": 2.271420478820801, "learning_rate": 1.7892307692307692e-05, "loss": 0.3916, "step": 411 }, { "epoch": 5.26517571884984, "grad_norm": 1.915278434753418, "learning_rate": 1.788717948717949e-05, "loss": 0.4356, "step": 412 }, { "epoch": 5.277955271565495, "grad_norm": 2.2529425621032715, "learning_rate": 1.7882051282051284e-05, "loss": 0.4071, "step": 413 }, { "epoch": 5.2907348242811505, "grad_norm": 2.971188545227051, "learning_rate": 1.7876923076923078e-05, "loss": 0.3759, "step": 414 }, { "epoch": 5.303514376996805, "grad_norm": 1.9062541723251343, "learning_rate": 1.7871794871794875e-05, "loss": 0.3086, "step": 415 }, { "epoch": 5.31629392971246, "grad_norm": 2.860849618911743, "learning_rate": 1.7866666666666666e-05, "loss": 0.529, "step": 416 }, { "epoch": 5.329073482428115, "grad_norm": 2.6254971027374268, "learning_rate": 1.7861538461538464e-05, "loss": 0.4456, "step": 417 }, { "epoch": 5.34185303514377, "grad_norm": 2.4462695121765137, "learning_rate": 1.7856410256410258e-05, "loss": 0.5505, "step": 418 }, { "epoch": 5.354632587859425, "grad_norm": 3.001704216003418, "learning_rate": 1.785128205128205e-05, "loss": 0.4583, "step": 419 }, { "epoch": 5.36741214057508, "grad_norm": 2.2287330627441406, "learning_rate": 1.784615384615385e-05, "loss": 0.4334, "step": 420 }, { "epoch": 5.380191693290735, "grad_norm": 2.3208580017089844, "learning_rate": 1.7841025641025643e-05, "loss": 0.4962, "step": 421 }, { "epoch": 5.39297124600639, "grad_norm": 2.5191850662231445, "learning_rate": 1.7835897435897437e-05, "loss": 0.5019, "step": 422 }, { "epoch": 5.405750798722044, "grad_norm": 2.3573968410491943, "learning_rate": 1.783076923076923e-05, "loss": 0.5416, "step": 423 }, { "epoch": 5.4185303514377, "grad_norm": 2.279006242752075, "learning_rate": 1.782564102564103e-05, "loss": 0.3585, "step": 424 }, { "epoch": 5.431309904153355, "grad_norm": 1.831701397895813, "learning_rate": 1.7820512820512823e-05, "loss": 0.4025, "step": 425 }, { "epoch": 5.444089456869009, "grad_norm": 2.1646835803985596, "learning_rate": 1.7815384615384617e-05, "loss": 0.3548, "step": 426 }, { "epoch": 5.456869009584665, "grad_norm": 2.598620653152466, "learning_rate": 1.781025641025641e-05, "loss": 0.4204, "step": 427 }, { "epoch": 5.4696485623003195, "grad_norm": 3.2102348804473877, "learning_rate": 1.7805128205128205e-05, "loss": 0.6015, "step": 428 }, { "epoch": 5.482428115015974, "grad_norm": 2.23750376701355, "learning_rate": 1.7800000000000002e-05, "loss": 0.3484, "step": 429 }, { "epoch": 5.49520766773163, "grad_norm": 2.155402421951294, "learning_rate": 1.7794871794871796e-05, "loss": 0.4067, "step": 430 }, { "epoch": 5.507987220447284, "grad_norm": 2.757122755050659, "learning_rate": 1.778974358974359e-05, "loss": 0.5165, "step": 431 }, { "epoch": 5.520766773162939, "grad_norm": 2.5486087799072266, "learning_rate": 1.7784615384615388e-05, "loss": 0.5124, "step": 432 }, { "epoch": 5.533546325878595, "grad_norm": 2.1844065189361572, "learning_rate": 1.7779487179487182e-05, "loss": 0.5247, "step": 433 }, { "epoch": 5.546325878594249, "grad_norm": 2.0668201446533203, "learning_rate": 1.7774358974358976e-05, "loss": 0.6111, "step": 434 }, { "epoch": 5.559105431309904, "grad_norm": 2.5725159645080566, "learning_rate": 1.776923076923077e-05, "loss": 0.4193, "step": 435 }, { "epoch": 5.571884984025559, "grad_norm": 2.1765971183776855, "learning_rate": 1.7764102564102564e-05, "loss": 0.3695, "step": 436 }, { "epoch": 5.584664536741214, "grad_norm": 2.1050305366516113, "learning_rate": 1.775897435897436e-05, "loss": 0.4419, "step": 437 }, { "epoch": 5.597444089456869, "grad_norm": 2.6420412063598633, "learning_rate": 1.7753846153846156e-05, "loss": 0.6949, "step": 438 }, { "epoch": 5.6102236421725244, "grad_norm": 3.008789300918579, "learning_rate": 1.774871794871795e-05, "loss": 0.4901, "step": 439 }, { "epoch": 5.623003194888179, "grad_norm": 2.481837511062622, "learning_rate": 1.7743589743589744e-05, "loss": 0.6561, "step": 440 }, { "epoch": 5.635782747603834, "grad_norm": 2.06085467338562, "learning_rate": 1.773846153846154e-05, "loss": 0.4835, "step": 441 }, { "epoch": 5.6485623003194885, "grad_norm": 1.9079381227493286, "learning_rate": 1.7733333333333335e-05, "loss": 0.3557, "step": 442 }, { "epoch": 5.661341853035144, "grad_norm": 2.1730477809906006, "learning_rate": 1.772820512820513e-05, "loss": 0.4584, "step": 443 }, { "epoch": 5.674121405750799, "grad_norm": 2.3779098987579346, "learning_rate": 1.7723076923076927e-05, "loss": 0.2891, "step": 444 }, { "epoch": 5.686900958466453, "grad_norm": 2.3261685371398926, "learning_rate": 1.7717948717948717e-05, "loss": 0.3954, "step": 445 }, { "epoch": 5.699680511182109, "grad_norm": 1.8987905979156494, "learning_rate": 1.7712820512820515e-05, "loss": 0.4915, "step": 446 }, { "epoch": 5.712460063897764, "grad_norm": 2.268406867980957, "learning_rate": 1.770769230769231e-05, "loss": 0.3933, "step": 447 }, { "epoch": 5.725239616613418, "grad_norm": 2.096902847290039, "learning_rate": 1.7702564102564103e-05, "loss": 0.3868, "step": 448 }, { "epoch": 5.738019169329074, "grad_norm": 2.9784042835235596, "learning_rate": 1.76974358974359e-05, "loss": 0.4829, "step": 449 }, { "epoch": 5.7507987220447285, "grad_norm": 2.718769073486328, "learning_rate": 1.7692307692307694e-05, "loss": 0.523, "step": 450 }, { "epoch": 5.763578274760383, "grad_norm": 1.9845243692398071, "learning_rate": 1.768717948717949e-05, "loss": 0.3861, "step": 451 }, { "epoch": 5.776357827476039, "grad_norm": 2.1359245777130127, "learning_rate": 1.7682051282051283e-05, "loss": 0.4859, "step": 452 }, { "epoch": 5.789137380191693, "grad_norm": 2.4216926097869873, "learning_rate": 1.7676923076923077e-05, "loss": 0.4118, "step": 453 }, { "epoch": 5.801916932907348, "grad_norm": 2.8170251846313477, "learning_rate": 1.7671794871794874e-05, "loss": 0.5945, "step": 454 }, { "epoch": 5.814696485623003, "grad_norm": 2.7723488807678223, "learning_rate": 1.7666666666666668e-05, "loss": 0.6911, "step": 455 }, { "epoch": 5.827476038338658, "grad_norm": 2.405487537384033, "learning_rate": 1.7661538461538462e-05, "loss": 0.5594, "step": 456 }, { "epoch": 5.840255591054313, "grad_norm": 2.5241634845733643, "learning_rate": 1.7656410256410256e-05, "loss": 0.446, "step": 457 }, { "epoch": 5.853035143769968, "grad_norm": 2.443591833114624, "learning_rate": 1.7651282051282054e-05, "loss": 0.4842, "step": 458 }, { "epoch": 5.865814696485623, "grad_norm": 2.4201200008392334, "learning_rate": 1.7646153846153848e-05, "loss": 0.5486, "step": 459 }, { "epoch": 5.878594249201278, "grad_norm": 2.012280225753784, "learning_rate": 1.7641025641025642e-05, "loss": 0.514, "step": 460 }, { "epoch": 5.891373801916933, "grad_norm": 2.587604284286499, "learning_rate": 1.763589743589744e-05, "loss": 0.522, "step": 461 }, { "epoch": 5.904153354632588, "grad_norm": 2.1839914321899414, "learning_rate": 1.763076923076923e-05, "loss": 0.4902, "step": 462 }, { "epoch": 5.916932907348243, "grad_norm": 2.1530771255493164, "learning_rate": 1.7625641025641027e-05, "loss": 0.3949, "step": 463 }, { "epoch": 5.9297124600638975, "grad_norm": 2.5098278522491455, "learning_rate": 1.762051282051282e-05, "loss": 0.5611, "step": 464 }, { "epoch": 5.942492012779553, "grad_norm": 2.27274227142334, "learning_rate": 1.7615384615384615e-05, "loss": 0.4455, "step": 465 }, { "epoch": 5.955271565495208, "grad_norm": 2.370197057723999, "learning_rate": 1.7610256410256413e-05, "loss": 0.4224, "step": 466 }, { "epoch": 5.968051118210862, "grad_norm": 1.684553623199463, "learning_rate": 1.7605128205128207e-05, "loss": 0.4083, "step": 467 }, { "epoch": 5.980830670926517, "grad_norm": 2.499558210372925, "learning_rate": 1.76e-05, "loss": 0.4626, "step": 468 }, { "epoch": 5.993610223642173, "grad_norm": 2.5789244174957275, "learning_rate": 1.75948717948718e-05, "loss": 0.5812, "step": 469 }, { "epoch": 6.006389776357827, "grad_norm": 2.08774471282959, "learning_rate": 1.7589743589743592e-05, "loss": 0.4607, "step": 470 }, { "epoch": 6.019169329073482, "grad_norm": 2.242676258087158, "learning_rate": 1.7584615384615386e-05, "loss": 0.3949, "step": 471 }, { "epoch": 6.031948881789138, "grad_norm": 2.3085381984710693, "learning_rate": 1.757948717948718e-05, "loss": 0.518, "step": 472 }, { "epoch": 6.044728434504792, "grad_norm": 2.412428855895996, "learning_rate": 1.7574358974358975e-05, "loss": 0.5979, "step": 473 }, { "epoch": 6.057507987220447, "grad_norm": 2.2813498973846436, "learning_rate": 1.7569230769230772e-05, "loss": 0.3837, "step": 474 }, { "epoch": 6.0702875399361025, "grad_norm": 2.6682474613189697, "learning_rate": 1.7564102564102566e-05, "loss": 0.4495, "step": 475 }, { "epoch": 6.083067092651757, "grad_norm": 3.1144282817840576, "learning_rate": 1.755897435897436e-05, "loss": 0.4583, "step": 476 }, { "epoch": 6.095846645367412, "grad_norm": 2.3847062587738037, "learning_rate": 1.7553846153846154e-05, "loss": 0.4137, "step": 477 }, { "epoch": 6.108626198083067, "grad_norm": 2.61763334274292, "learning_rate": 1.754871794871795e-05, "loss": 0.3974, "step": 478 }, { "epoch": 6.121405750798722, "grad_norm": 2.383402109146118, "learning_rate": 1.7543589743589746e-05, "loss": 0.4834, "step": 479 }, { "epoch": 6.134185303514377, "grad_norm": 3.521724224090576, "learning_rate": 1.753846153846154e-05, "loss": 0.3352, "step": 480 }, { "epoch": 6.146964856230032, "grad_norm": 2.1919541358947754, "learning_rate": 1.7533333333333337e-05, "loss": 0.3448, "step": 481 }, { "epoch": 6.159744408945687, "grad_norm": 2.2474188804626465, "learning_rate": 1.7528205128205128e-05, "loss": 0.4474, "step": 482 }, { "epoch": 6.172523961661342, "grad_norm": 2.5635788440704346, "learning_rate": 1.7523076923076925e-05, "loss": 0.492, "step": 483 }, { "epoch": 6.185303514376997, "grad_norm": 2.388453245162964, "learning_rate": 1.751794871794872e-05, "loss": 0.3088, "step": 484 }, { "epoch": 6.198083067092652, "grad_norm": 2.631474256515503, "learning_rate": 1.7512820512820513e-05, "loss": 0.4037, "step": 485 }, { "epoch": 6.210862619808307, "grad_norm": 2.8673110008239746, "learning_rate": 1.750769230769231e-05, "loss": 0.454, "step": 486 }, { "epoch": 6.223642172523961, "grad_norm": 3.3764970302581787, "learning_rate": 1.7502564102564105e-05, "loss": 0.5351, "step": 487 }, { "epoch": 6.236421725239617, "grad_norm": 2.687863349914551, "learning_rate": 1.74974358974359e-05, "loss": 0.4543, "step": 488 }, { "epoch": 6.2492012779552715, "grad_norm": 2.591404914855957, "learning_rate": 1.7492307692307693e-05, "loss": 0.4279, "step": 489 }, { "epoch": 6.261980830670926, "grad_norm": 2.737273693084717, "learning_rate": 1.7487179487179487e-05, "loss": 0.6408, "step": 490 }, { "epoch": 6.274760383386582, "grad_norm": 2.39886736869812, "learning_rate": 1.7482051282051284e-05, "loss": 0.3358, "step": 491 }, { "epoch": 6.287539936102236, "grad_norm": 2.9275259971618652, "learning_rate": 1.747692307692308e-05, "loss": 0.6465, "step": 492 }, { "epoch": 6.300319488817891, "grad_norm": 2.810349941253662, "learning_rate": 1.7471794871794873e-05, "loss": 0.3819, "step": 493 }, { "epoch": 6.313099041533547, "grad_norm": 2.2829222679138184, "learning_rate": 1.7466666666666667e-05, "loss": 0.5803, "step": 494 }, { "epoch": 6.325878594249201, "grad_norm": 2.397336721420288, "learning_rate": 1.7461538461538464e-05, "loss": 0.4829, "step": 495 }, { "epoch": 6.338658146964856, "grad_norm": 2.189927101135254, "learning_rate": 1.7456410256410258e-05, "loss": 0.5757, "step": 496 }, { "epoch": 6.3514376996805115, "grad_norm": 2.369382381439209, "learning_rate": 1.7451282051282052e-05, "loss": 0.2897, "step": 497 }, { "epoch": 6.364217252396166, "grad_norm": 2.5294623374938965, "learning_rate": 1.744615384615385e-05, "loss": 0.5625, "step": 498 }, { "epoch": 6.376996805111821, "grad_norm": 2.429304361343384, "learning_rate": 1.744102564102564e-05, "loss": 0.5052, "step": 499 }, { "epoch": 6.389776357827476, "grad_norm": 2.7437777519226074, "learning_rate": 1.7435897435897438e-05, "loss": 0.4608, "step": 500 }, { "epoch": 6.389776357827476, "eval_loss": 0.5022287964820862, "eval_runtime": 182.7071, "eval_samples_per_second": 0.859, "eval_steps_per_second": 0.109, "step": 500 }, { "epoch": 6.402555910543131, "grad_norm": 3.0428144931793213, "learning_rate": 1.7430769230769232e-05, "loss": 0.5481, "step": 501 }, { "epoch": 6.415335463258786, "grad_norm": 1.789218544960022, "learning_rate": 1.7425641025641026e-05, "loss": 0.379, "step": 502 }, { "epoch": 6.428115015974441, "grad_norm": 2.447986364364624, "learning_rate": 1.7420512820512823e-05, "loss": 0.36, "step": 503 }, { "epoch": 6.440894568690096, "grad_norm": 2.2227280139923096, "learning_rate": 1.7415384615384617e-05, "loss": 0.4233, "step": 504 }, { "epoch": 6.453674121405751, "grad_norm": 2.918168783187866, "learning_rate": 1.741025641025641e-05, "loss": 0.4857, "step": 505 }, { "epoch": 6.466453674121405, "grad_norm": 2.720658779144287, "learning_rate": 1.7405128205128205e-05, "loss": 0.3815, "step": 506 }, { "epoch": 6.479233226837061, "grad_norm": 2.6913065910339355, "learning_rate": 1.7400000000000003e-05, "loss": 0.4462, "step": 507 }, { "epoch": 6.492012779552716, "grad_norm": 2.7055540084838867, "learning_rate": 1.7394871794871797e-05, "loss": 0.474, "step": 508 }, { "epoch": 6.50479233226837, "grad_norm": 2.182354688644409, "learning_rate": 1.738974358974359e-05, "loss": 0.4572, "step": 509 }, { "epoch": 6.517571884984026, "grad_norm": 2.5811660289764404, "learning_rate": 1.7384615384615385e-05, "loss": 0.3676, "step": 510 }, { "epoch": 6.5303514376996805, "grad_norm": 2.9754414558410645, "learning_rate": 1.737948717948718e-05, "loss": 0.5659, "step": 511 }, { "epoch": 6.543130990415335, "grad_norm": 3.129412889480591, "learning_rate": 1.7374358974358976e-05, "loss": 0.4372, "step": 512 }, { "epoch": 6.555910543130991, "grad_norm": 3.000218391418457, "learning_rate": 1.736923076923077e-05, "loss": 0.5066, "step": 513 }, { "epoch": 6.568690095846645, "grad_norm": 3.06145977973938, "learning_rate": 1.7364102564102565e-05, "loss": 0.456, "step": 514 }, { "epoch": 6.5814696485623, "grad_norm": 2.7481515407562256, "learning_rate": 1.7358974358974362e-05, "loss": 0.3037, "step": 515 }, { "epoch": 6.594249201277956, "grad_norm": 2.6574368476867676, "learning_rate": 1.7353846153846156e-05, "loss": 0.4023, "step": 516 }, { "epoch": 6.60702875399361, "grad_norm": 2.2366273403167725, "learning_rate": 1.734871794871795e-05, "loss": 0.4253, "step": 517 }, { "epoch": 6.619808306709265, "grad_norm": 2.5262792110443115, "learning_rate": 1.7343589743589748e-05, "loss": 0.4337, "step": 518 }, { "epoch": 6.63258785942492, "grad_norm": 2.2034177780151367, "learning_rate": 1.7338461538461538e-05, "loss": 0.4043, "step": 519 }, { "epoch": 6.645367412140575, "grad_norm": 3.698709487915039, "learning_rate": 1.7333333333333336e-05, "loss": 0.6953, "step": 520 }, { "epoch": 6.65814696485623, "grad_norm": 2.3171446323394775, "learning_rate": 1.732820512820513e-05, "loss": 0.4593, "step": 521 }, { "epoch": 6.6709265175718855, "grad_norm": 2.540459156036377, "learning_rate": 1.7323076923076924e-05, "loss": 0.5598, "step": 522 }, { "epoch": 6.68370607028754, "grad_norm": 2.8060197830200195, "learning_rate": 1.731794871794872e-05, "loss": 0.3275, "step": 523 }, { "epoch": 6.696485623003195, "grad_norm": 2.904841899871826, "learning_rate": 1.7312820512820515e-05, "loss": 0.5502, "step": 524 }, { "epoch": 6.7092651757188495, "grad_norm": 2.756107807159424, "learning_rate": 1.730769230769231e-05, "loss": 0.4434, "step": 525 }, { "epoch": 6.722044728434505, "grad_norm": 2.744088888168335, "learning_rate": 1.7302564102564103e-05, "loss": 0.4104, "step": 526 }, { "epoch": 6.73482428115016, "grad_norm": 2.26206636428833, "learning_rate": 1.72974358974359e-05, "loss": 0.4231, "step": 527 }, { "epoch": 6.747603833865814, "grad_norm": 2.0125904083251953, "learning_rate": 1.729230769230769e-05, "loss": 0.4656, "step": 528 }, { "epoch": 6.76038338658147, "grad_norm": 2.1869449615478516, "learning_rate": 1.728717948717949e-05, "loss": 0.3672, "step": 529 }, { "epoch": 6.773162939297125, "grad_norm": 1.9915401935577393, "learning_rate": 1.7282051282051283e-05, "loss": 0.3877, "step": 530 }, { "epoch": 6.785942492012779, "grad_norm": 2.237344741821289, "learning_rate": 1.7276923076923077e-05, "loss": 0.3781, "step": 531 }, { "epoch": 6.798722044728435, "grad_norm": 2.5108916759490967, "learning_rate": 1.7271794871794874e-05, "loss": 0.5929, "step": 532 }, { "epoch": 6.81150159744409, "grad_norm": 2.3276259899139404, "learning_rate": 1.726666666666667e-05, "loss": 0.3589, "step": 533 }, { "epoch": 6.824281150159744, "grad_norm": 3.0176141262054443, "learning_rate": 1.7261538461538463e-05, "loss": 0.5621, "step": 534 }, { "epoch": 6.8370607028754, "grad_norm": 2.0819737911224365, "learning_rate": 1.725641025641026e-05, "loss": 0.3701, "step": 535 }, { "epoch": 6.8498402555910545, "grad_norm": 2.4570374488830566, "learning_rate": 1.725128205128205e-05, "loss": 0.463, "step": 536 }, { "epoch": 6.862619808306709, "grad_norm": 2.484651803970337, "learning_rate": 1.7246153846153848e-05, "loss": 0.4525, "step": 537 }, { "epoch": 6.875399361022364, "grad_norm": 2.1618449687957764, "learning_rate": 1.7241025641025642e-05, "loss": 0.4213, "step": 538 }, { "epoch": 6.888178913738019, "grad_norm": 2.1422417163848877, "learning_rate": 1.7235897435897436e-05, "loss": 0.4375, "step": 539 }, { "epoch": 6.900958466453674, "grad_norm": 2.6813485622406006, "learning_rate": 1.7230769230769234e-05, "loss": 0.5162, "step": 540 }, { "epoch": 6.913738019169329, "grad_norm": 2.3753178119659424, "learning_rate": 1.7225641025641028e-05, "loss": 0.4616, "step": 541 }, { "epoch": 6.926517571884984, "grad_norm": 2.3657495975494385, "learning_rate": 1.7220512820512822e-05, "loss": 0.5949, "step": 542 }, { "epoch": 6.939297124600639, "grad_norm": 2.5588560104370117, "learning_rate": 1.7215384615384616e-05, "loss": 0.5441, "step": 543 }, { "epoch": 6.952076677316294, "grad_norm": 2.5401077270507812, "learning_rate": 1.7210256410256413e-05, "loss": 0.4311, "step": 544 }, { "epoch": 6.964856230031949, "grad_norm": 2.427499532699585, "learning_rate": 1.7205128205128207e-05, "loss": 0.365, "step": 545 }, { "epoch": 6.977635782747604, "grad_norm": 2.534796714782715, "learning_rate": 1.72e-05, "loss": 0.4757, "step": 546 }, { "epoch": 6.9904153354632586, "grad_norm": 2.5496671199798584, "learning_rate": 1.7194871794871795e-05, "loss": 0.4701, "step": 547 }, { "epoch": 7.003194888178914, "grad_norm": 2.4693853855133057, "learning_rate": 1.718974358974359e-05, "loss": 0.3917, "step": 548 }, { "epoch": 7.015974440894569, "grad_norm": 2.0053486824035645, "learning_rate": 1.7184615384615387e-05, "loss": 0.356, "step": 549 }, { "epoch": 7.0287539936102235, "grad_norm": 2.299156427383423, "learning_rate": 1.717948717948718e-05, "loss": 0.3343, "step": 550 }, { "epoch": 7.041533546325879, "grad_norm": 2.448904514312744, "learning_rate": 1.7174358974358975e-05, "loss": 0.427, "step": 551 }, { "epoch": 7.054313099041534, "grad_norm": 2.3865370750427246, "learning_rate": 1.7169230769230772e-05, "loss": 0.4465, "step": 552 }, { "epoch": 7.067092651757188, "grad_norm": 2.5128276348114014, "learning_rate": 1.7164102564102566e-05, "loss": 0.4417, "step": 553 }, { "epoch": 7.079872204472843, "grad_norm": 2.5499255657196045, "learning_rate": 1.715897435897436e-05, "loss": 0.403, "step": 554 }, { "epoch": 7.092651757188499, "grad_norm": 2.176884174346924, "learning_rate": 1.7153846153846155e-05, "loss": 0.3505, "step": 555 }, { "epoch": 7.105431309904153, "grad_norm": 2.64473032951355, "learning_rate": 1.714871794871795e-05, "loss": 0.3731, "step": 556 }, { "epoch": 7.118210862619808, "grad_norm": 2.417506456375122, "learning_rate": 1.7143589743589746e-05, "loss": 0.3703, "step": 557 }, { "epoch": 7.1309904153354635, "grad_norm": 2.3878259658813477, "learning_rate": 1.713846153846154e-05, "loss": 0.3763, "step": 558 }, { "epoch": 7.143769968051118, "grad_norm": 2.473843574523926, "learning_rate": 1.7133333333333334e-05, "loss": 0.3958, "step": 559 }, { "epoch": 7.156549520766773, "grad_norm": 2.9820239543914795, "learning_rate": 1.7128205128205128e-05, "loss": 0.337, "step": 560 }, { "epoch": 7.169329073482428, "grad_norm": 2.1744096279144287, "learning_rate": 1.7123076923076926e-05, "loss": 0.4535, "step": 561 }, { "epoch": 7.182108626198083, "grad_norm": 2.5609939098358154, "learning_rate": 1.711794871794872e-05, "loss": 0.4376, "step": 562 }, { "epoch": 7.194888178913738, "grad_norm": 2.5857808589935303, "learning_rate": 1.7112820512820514e-05, "loss": 0.4187, "step": 563 }, { "epoch": 7.207667731629393, "grad_norm": 3.49450945854187, "learning_rate": 1.710769230769231e-05, "loss": 0.5547, "step": 564 }, { "epoch": 7.220447284345048, "grad_norm": 2.712728261947632, "learning_rate": 1.7102564102564102e-05, "loss": 0.3876, "step": 565 }, { "epoch": 7.233226837060703, "grad_norm": 2.9620182514190674, "learning_rate": 1.70974358974359e-05, "loss": 0.5748, "step": 566 }, { "epoch": 7.246006389776358, "grad_norm": 2.54795241355896, "learning_rate": 1.7092307692307693e-05, "loss": 0.336, "step": 567 }, { "epoch": 7.258785942492013, "grad_norm": 2.2571918964385986, "learning_rate": 1.7087179487179487e-05, "loss": 0.4416, "step": 568 }, { "epoch": 7.271565495207668, "grad_norm": 2.327420711517334, "learning_rate": 1.7082051282051285e-05, "loss": 0.3582, "step": 569 }, { "epoch": 7.284345047923322, "grad_norm": 2.4296958446502686, "learning_rate": 1.707692307692308e-05, "loss": 0.3854, "step": 570 }, { "epoch": 7.297124600638978, "grad_norm": 2.5808708667755127, "learning_rate": 1.7071794871794873e-05, "loss": 0.4507, "step": 571 }, { "epoch": 7.3099041533546325, "grad_norm": 3.1893603801727295, "learning_rate": 1.706666666666667e-05, "loss": 0.4189, "step": 572 }, { "epoch": 7.322683706070287, "grad_norm": 2.9760212898254395, "learning_rate": 1.706153846153846e-05, "loss": 0.4227, "step": 573 }, { "epoch": 7.335463258785943, "grad_norm": 1.8840742111206055, "learning_rate": 1.705641025641026e-05, "loss": 0.3461, "step": 574 }, { "epoch": 7.348242811501597, "grad_norm": 2.352691173553467, "learning_rate": 1.7051282051282053e-05, "loss": 0.4223, "step": 575 }, { "epoch": 7.361022364217252, "grad_norm": 2.5904428958892822, "learning_rate": 1.7046153846153847e-05, "loss": 0.384, "step": 576 }, { "epoch": 7.373801916932908, "grad_norm": 2.190866470336914, "learning_rate": 1.704102564102564e-05, "loss": 0.2897, "step": 577 }, { "epoch": 7.386581469648562, "grad_norm": 2.4126853942871094, "learning_rate": 1.7035897435897438e-05, "loss": 0.4496, "step": 578 }, { "epoch": 7.399361022364217, "grad_norm": 2.798590898513794, "learning_rate": 1.7030769230769232e-05, "loss": 0.432, "step": 579 }, { "epoch": 7.412140575079873, "grad_norm": 3.0337421894073486, "learning_rate": 1.7025641025641026e-05, "loss": 0.3953, "step": 580 }, { "epoch": 7.424920127795527, "grad_norm": 2.6716692447662354, "learning_rate": 1.7020512820512824e-05, "loss": 0.3673, "step": 581 }, { "epoch": 7.437699680511182, "grad_norm": 2.6297688484191895, "learning_rate": 1.7015384615384614e-05, "loss": 0.5193, "step": 582 }, { "epoch": 7.4504792332268375, "grad_norm": 3.0829811096191406, "learning_rate": 1.7010256410256412e-05, "loss": 0.5615, "step": 583 }, { "epoch": 7.463258785942492, "grad_norm": 3.34427809715271, "learning_rate": 1.7005128205128206e-05, "loss": 0.4111, "step": 584 }, { "epoch": 7.476038338658147, "grad_norm": 2.949704647064209, "learning_rate": 1.7e-05, "loss": 0.4972, "step": 585 }, { "epoch": 7.488817891373802, "grad_norm": 1.9306256771087646, "learning_rate": 1.6994871794871797e-05, "loss": 0.2485, "step": 586 }, { "epoch": 7.501597444089457, "grad_norm": 2.582099199295044, "learning_rate": 1.698974358974359e-05, "loss": 0.3855, "step": 587 }, { "epoch": 7.514376996805112, "grad_norm": 2.52205491065979, "learning_rate": 1.6984615384615385e-05, "loss": 0.3522, "step": 588 }, { "epoch": 7.527156549520766, "grad_norm": 3.256770372390747, "learning_rate": 1.6979487179487183e-05, "loss": 0.4599, "step": 589 }, { "epoch": 7.539936102236422, "grad_norm": 2.6770920753479004, "learning_rate": 1.6974358974358977e-05, "loss": 0.3897, "step": 590 }, { "epoch": 7.552715654952077, "grad_norm": 2.4803273677825928, "learning_rate": 1.696923076923077e-05, "loss": 0.4065, "step": 591 }, { "epoch": 7.565495207667731, "grad_norm": 2.484539747238159, "learning_rate": 1.6964102564102565e-05, "loss": 0.4344, "step": 592 }, { "epoch": 7.578274760383387, "grad_norm": 3.078622579574585, "learning_rate": 1.695897435897436e-05, "loss": 0.3477, "step": 593 }, { "epoch": 7.5910543130990416, "grad_norm": 2.6187045574188232, "learning_rate": 1.6953846153846156e-05, "loss": 0.4671, "step": 594 }, { "epoch": 7.603833865814696, "grad_norm": 2.429306745529175, "learning_rate": 1.694871794871795e-05, "loss": 0.4353, "step": 595 }, { "epoch": 7.616613418530352, "grad_norm": 2.7066404819488525, "learning_rate": 1.6943589743589745e-05, "loss": 0.3992, "step": 596 }, { "epoch": 7.6293929712460065, "grad_norm": 3.0803706645965576, "learning_rate": 1.693846153846154e-05, "loss": 0.5206, "step": 597 }, { "epoch": 7.642172523961661, "grad_norm": 2.8471038341522217, "learning_rate": 1.6933333333333336e-05, "loss": 0.6463, "step": 598 }, { "epoch": 7.654952076677317, "grad_norm": 2.572672128677368, "learning_rate": 1.692820512820513e-05, "loss": 0.4674, "step": 599 }, { "epoch": 7.667731629392971, "grad_norm": 2.5649611949920654, "learning_rate": 1.6923076923076924e-05, "loss": 0.4261, "step": 600 }, { "epoch": 7.667731629392971, "eval_loss": 0.4893907606601715, "eval_runtime": 180.8537, "eval_samples_per_second": 0.868, "eval_steps_per_second": 0.111, "step": 600 }, { "epoch": 7.680511182108626, "grad_norm": 2.6403310298919678, "learning_rate": 1.691794871794872e-05, "loss": 0.5065, "step": 601 }, { "epoch": 7.693290734824281, "grad_norm": 2.59029221534729, "learning_rate": 1.6912820512820512e-05, "loss": 0.4302, "step": 602 }, { "epoch": 7.706070287539936, "grad_norm": 2.2685883045196533, "learning_rate": 1.690769230769231e-05, "loss": 0.3629, "step": 603 }, { "epoch": 7.718849840255591, "grad_norm": 2.280118227005005, "learning_rate": 1.6902564102564104e-05, "loss": 0.5129, "step": 604 }, { "epoch": 7.731629392971246, "grad_norm": 2.507329225540161, "learning_rate": 1.6897435897435898e-05, "loss": 0.4688, "step": 605 }, { "epoch": 7.744408945686901, "grad_norm": 2.897050380706787, "learning_rate": 1.6892307692307695e-05, "loss": 0.5061, "step": 606 }, { "epoch": 7.757188498402556, "grad_norm": 2.5373477935791016, "learning_rate": 1.688717948717949e-05, "loss": 0.4823, "step": 607 }, { "epoch": 7.7699680511182105, "grad_norm": 3.002852201461792, "learning_rate": 1.6882051282051283e-05, "loss": 0.4908, "step": 608 }, { "epoch": 7.782747603833866, "grad_norm": 2.089543104171753, "learning_rate": 1.6876923076923077e-05, "loss": 0.4308, "step": 609 }, { "epoch": 7.795527156549521, "grad_norm": 2.6164510250091553, "learning_rate": 1.687179487179487e-05, "loss": 0.5141, "step": 610 }, { "epoch": 7.8083067092651754, "grad_norm": 2.691242218017578, "learning_rate": 1.686666666666667e-05, "loss": 0.4898, "step": 611 }, { "epoch": 7.821086261980831, "grad_norm": 2.664036273956299, "learning_rate": 1.6861538461538463e-05, "loss": 0.4248, "step": 612 }, { "epoch": 7.833865814696486, "grad_norm": 1.9409226179122925, "learning_rate": 1.6856410256410257e-05, "loss": 0.3507, "step": 613 }, { "epoch": 7.84664536741214, "grad_norm": 3.2386646270751953, "learning_rate": 1.685128205128205e-05, "loss": 0.3971, "step": 614 }, { "epoch": 7.859424920127795, "grad_norm": 2.29748797416687, "learning_rate": 1.684615384615385e-05, "loss": 0.3966, "step": 615 }, { "epoch": 7.872204472843451, "grad_norm": 2.7634127140045166, "learning_rate": 1.6841025641025643e-05, "loss": 0.4292, "step": 616 }, { "epoch": 7.884984025559105, "grad_norm": 2.2984588146209717, "learning_rate": 1.6835897435897437e-05, "loss": 0.3906, "step": 617 }, { "epoch": 7.897763578274761, "grad_norm": 2.8001909255981445, "learning_rate": 1.6830769230769234e-05, "loss": 0.6094, "step": 618 }, { "epoch": 7.9105431309904155, "grad_norm": 2.892528533935547, "learning_rate": 1.6825641025641025e-05, "loss": 0.4006, "step": 619 }, { "epoch": 7.92332268370607, "grad_norm": 3.403233051300049, "learning_rate": 1.6820512820512822e-05, "loss": 0.4464, "step": 620 }, { "epoch": 7.936102236421725, "grad_norm": 2.3043155670166016, "learning_rate": 1.6815384615384616e-05, "loss": 0.3331, "step": 621 }, { "epoch": 7.94888178913738, "grad_norm": 2.8333663940429688, "learning_rate": 1.681025641025641e-05, "loss": 0.415, "step": 622 }, { "epoch": 7.961661341853035, "grad_norm": 2.719902992248535, "learning_rate": 1.6805128205128208e-05, "loss": 0.5258, "step": 623 }, { "epoch": 7.97444089456869, "grad_norm": 3.271904468536377, "learning_rate": 1.6800000000000002e-05, "loss": 0.4089, "step": 624 }, { "epoch": 7.987220447284345, "grad_norm": 2.5888147354125977, "learning_rate": 1.6794871794871796e-05, "loss": 0.382, "step": 625 }, { "epoch": 8.0, "grad_norm": 3.209883213043213, "learning_rate": 1.678974358974359e-05, "loss": 0.6444, "step": 626 }, { "epoch": 8.012779552715655, "grad_norm": 2.370129346847534, "learning_rate": 1.6784615384615387e-05, "loss": 0.5045, "step": 627 }, { "epoch": 8.02555910543131, "grad_norm": 2.871953010559082, "learning_rate": 1.677948717948718e-05, "loss": 0.5227, "step": 628 }, { "epoch": 8.038338658146964, "grad_norm": 2.793961763381958, "learning_rate": 1.6774358974358975e-05, "loss": 0.4616, "step": 629 }, { "epoch": 8.05111821086262, "grad_norm": 2.622335910797119, "learning_rate": 1.676923076923077e-05, "loss": 0.4296, "step": 630 }, { "epoch": 8.063897763578275, "grad_norm": 2.6651382446289062, "learning_rate": 1.6764102564102564e-05, "loss": 0.391, "step": 631 }, { "epoch": 8.07667731629393, "grad_norm": 2.927123546600342, "learning_rate": 1.675897435897436e-05, "loss": 0.3775, "step": 632 }, { "epoch": 8.089456869009584, "grad_norm": 2.3731541633605957, "learning_rate": 1.6753846153846155e-05, "loss": 0.3785, "step": 633 }, { "epoch": 8.10223642172524, "grad_norm": 2.4673285484313965, "learning_rate": 1.674871794871795e-05, "loss": 0.4422, "step": 634 }, { "epoch": 8.115015974440894, "grad_norm": 2.902761459350586, "learning_rate": 1.6743589743589747e-05, "loss": 0.3814, "step": 635 }, { "epoch": 8.12779552715655, "grad_norm": 3.2571306228637695, "learning_rate": 1.673846153846154e-05, "loss": 0.443, "step": 636 }, { "epoch": 8.140575079872205, "grad_norm": 3.122844934463501, "learning_rate": 1.6733333333333335e-05, "loss": 0.4561, "step": 637 }, { "epoch": 8.15335463258786, "grad_norm": 2.018770456314087, "learning_rate": 1.6728205128205132e-05, "loss": 0.1985, "step": 638 }, { "epoch": 8.166134185303514, "grad_norm": 2.4061696529388428, "learning_rate": 1.6723076923076923e-05, "loss": 0.37, "step": 639 }, { "epoch": 8.178913738019169, "grad_norm": 2.3851146697998047, "learning_rate": 1.671794871794872e-05, "loss": 0.3915, "step": 640 }, { "epoch": 8.191693290734824, "grad_norm": 3.250897169113159, "learning_rate": 1.6712820512820514e-05, "loss": 0.4104, "step": 641 }, { "epoch": 8.204472843450478, "grad_norm": 2.8355727195739746, "learning_rate": 1.6707692307692308e-05, "loss": 0.3437, "step": 642 }, { "epoch": 8.217252396166135, "grad_norm": 2.1854217052459717, "learning_rate": 1.6702564102564106e-05, "loss": 0.3172, "step": 643 }, { "epoch": 8.23003194888179, "grad_norm": 4.091235160827637, "learning_rate": 1.66974358974359e-05, "loss": 0.3752, "step": 644 }, { "epoch": 8.242811501597444, "grad_norm": 3.4227828979492188, "learning_rate": 1.6692307692307694e-05, "loss": 0.4983, "step": 645 }, { "epoch": 8.255591054313099, "grad_norm": 2.888979911804199, "learning_rate": 1.6687179487179488e-05, "loss": 0.5062, "step": 646 }, { "epoch": 8.268370607028753, "grad_norm": 2.7345168590545654, "learning_rate": 1.6682051282051285e-05, "loss": 0.3431, "step": 647 }, { "epoch": 8.281150159744408, "grad_norm": 3.435631036758423, "learning_rate": 1.6676923076923076e-05, "loss": 0.4714, "step": 648 }, { "epoch": 8.293929712460065, "grad_norm": 2.1894478797912598, "learning_rate": 1.6671794871794873e-05, "loss": 0.2558, "step": 649 }, { "epoch": 8.30670926517572, "grad_norm": 3.0066120624542236, "learning_rate": 1.6666666666666667e-05, "loss": 0.3377, "step": 650 }, { "epoch": 8.319488817891374, "grad_norm": 2.613056182861328, "learning_rate": 1.666153846153846e-05, "loss": 0.3492, "step": 651 }, { "epoch": 8.332268370607029, "grad_norm": 2.5527093410491943, "learning_rate": 1.665641025641026e-05, "loss": 0.3209, "step": 652 }, { "epoch": 8.345047923322683, "grad_norm": 2.7275326251983643, "learning_rate": 1.6651282051282053e-05, "loss": 0.3943, "step": 653 }, { "epoch": 8.357827476038338, "grad_norm": 2.0672011375427246, "learning_rate": 1.6646153846153847e-05, "loss": 0.3674, "step": 654 }, { "epoch": 8.370607028753994, "grad_norm": 2.8069820404052734, "learning_rate": 1.6641025641025645e-05, "loss": 0.3685, "step": 655 }, { "epoch": 8.383386581469649, "grad_norm": 3.1376593112945557, "learning_rate": 1.6635897435897435e-05, "loss": 0.5658, "step": 656 }, { "epoch": 8.396166134185304, "grad_norm": 2.542144775390625, "learning_rate": 1.6630769230769233e-05, "loss": 0.3924, "step": 657 }, { "epoch": 8.408945686900958, "grad_norm": 2.6188881397247314, "learning_rate": 1.6625641025641027e-05, "loss": 0.4069, "step": 658 }, { "epoch": 8.421725239616613, "grad_norm": 2.8141064643859863, "learning_rate": 1.662051282051282e-05, "loss": 0.5237, "step": 659 }, { "epoch": 8.434504792332268, "grad_norm": 2.6490237712860107, "learning_rate": 1.6615384615384618e-05, "loss": 0.4192, "step": 660 }, { "epoch": 8.447284345047922, "grad_norm": 2.3943517208099365, "learning_rate": 1.6610256410256412e-05, "loss": 0.3519, "step": 661 }, { "epoch": 8.460063897763579, "grad_norm": 2.4590401649475098, "learning_rate": 1.6605128205128206e-05, "loss": 0.3297, "step": 662 }, { "epoch": 8.472843450479234, "grad_norm": 2.6325976848602295, "learning_rate": 1.66e-05, "loss": 0.3053, "step": 663 }, { "epoch": 8.485623003194888, "grad_norm": 3.596081495285034, "learning_rate": 1.6594871794871798e-05, "loss": 0.4897, "step": 664 }, { "epoch": 8.498402555910543, "grad_norm": 2.748485803604126, "learning_rate": 1.6589743589743592e-05, "loss": 0.4362, "step": 665 }, { "epoch": 8.511182108626198, "grad_norm": 3.381138563156128, "learning_rate": 1.6584615384615386e-05, "loss": 0.4669, "step": 666 }, { "epoch": 8.523961661341852, "grad_norm": 2.4086904525756836, "learning_rate": 1.657948717948718e-05, "loss": 0.4312, "step": 667 }, { "epoch": 8.536741214057509, "grad_norm": 2.4660682678222656, "learning_rate": 1.6574358974358974e-05, "loss": 0.4272, "step": 668 }, { "epoch": 8.549520766773163, "grad_norm": 2.776521682739258, "learning_rate": 1.656923076923077e-05, "loss": 0.4104, "step": 669 }, { "epoch": 8.562300319488818, "grad_norm": 3.0339174270629883, "learning_rate": 1.6564102564102565e-05, "loss": 0.443, "step": 670 }, { "epoch": 8.575079872204473, "grad_norm": 2.8308653831481934, "learning_rate": 1.655897435897436e-05, "loss": 0.3196, "step": 671 }, { "epoch": 8.587859424920127, "grad_norm": 2.376342296600342, "learning_rate": 1.6553846153846157e-05, "loss": 0.3005, "step": 672 }, { "epoch": 8.600638977635782, "grad_norm": 2.4938461780548096, "learning_rate": 1.654871794871795e-05, "loss": 0.3203, "step": 673 }, { "epoch": 8.613418530351439, "grad_norm": 2.8383700847625732, "learning_rate": 1.6543589743589745e-05, "loss": 0.5523, "step": 674 }, { "epoch": 8.626198083067093, "grad_norm": 2.876192331314087, "learning_rate": 1.653846153846154e-05, "loss": 0.4058, "step": 675 }, { "epoch": 8.638977635782748, "grad_norm": 2.5752999782562256, "learning_rate": 1.6533333333333333e-05, "loss": 0.5115, "step": 676 }, { "epoch": 8.651757188498403, "grad_norm": 2.703507661819458, "learning_rate": 1.652820512820513e-05, "loss": 0.4486, "step": 677 }, { "epoch": 8.664536741214057, "grad_norm": 3.2754502296447754, "learning_rate": 1.6523076923076925e-05, "loss": 0.3713, "step": 678 }, { "epoch": 8.677316293929712, "grad_norm": 3.1390693187713623, "learning_rate": 1.651794871794872e-05, "loss": 0.4578, "step": 679 }, { "epoch": 8.690095846645367, "grad_norm": 2.548980236053467, "learning_rate": 1.6512820512820513e-05, "loss": 0.38, "step": 680 }, { "epoch": 8.702875399361023, "grad_norm": 2.7647883892059326, "learning_rate": 1.650769230769231e-05, "loss": 0.4191, "step": 681 }, { "epoch": 8.715654952076678, "grad_norm": 2.330361843109131, "learning_rate": 1.6502564102564104e-05, "loss": 0.5003, "step": 682 }, { "epoch": 8.728434504792332, "grad_norm": 2.790050745010376, "learning_rate": 1.6497435897435898e-05, "loss": 0.3901, "step": 683 }, { "epoch": 8.741214057507987, "grad_norm": 3.12176775932312, "learning_rate": 1.6492307692307696e-05, "loss": 0.3778, "step": 684 }, { "epoch": 8.753993610223642, "grad_norm": 2.1713685989379883, "learning_rate": 1.6487179487179486e-05, "loss": 0.3297, "step": 685 }, { "epoch": 8.766773162939296, "grad_norm": 2.4351470470428467, "learning_rate": 1.6482051282051284e-05, "loss": 0.3086, "step": 686 }, { "epoch": 8.779552715654953, "grad_norm": 2.7927756309509277, "learning_rate": 1.6476923076923078e-05, "loss": 0.3062, "step": 687 }, { "epoch": 8.792332268370608, "grad_norm": 3.960897922515869, "learning_rate": 1.6471794871794872e-05, "loss": 0.4952, "step": 688 }, { "epoch": 8.805111821086262, "grad_norm": 2.980947971343994, "learning_rate": 1.646666666666667e-05, "loss": 0.4061, "step": 689 }, { "epoch": 8.817891373801917, "grad_norm": 3.104675769805908, "learning_rate": 1.6461538461538463e-05, "loss": 0.4165, "step": 690 }, { "epoch": 8.830670926517572, "grad_norm": 2.799107074737549, "learning_rate": 1.6456410256410257e-05, "loss": 0.3603, "step": 691 }, { "epoch": 8.843450479233226, "grad_norm": 2.9702889919281006, "learning_rate": 1.6451282051282055e-05, "loss": 0.4726, "step": 692 }, { "epoch": 8.856230031948883, "grad_norm": 3.655086040496826, "learning_rate": 1.6446153846153846e-05, "loss": 0.491, "step": 693 }, { "epoch": 8.869009584664537, "grad_norm": 3.1771860122680664, "learning_rate": 1.6441025641025643e-05, "loss": 0.3899, "step": 694 }, { "epoch": 8.881789137380192, "grad_norm": 2.3967061042785645, "learning_rate": 1.6435897435897437e-05, "loss": 0.4936, "step": 695 }, { "epoch": 8.894568690095847, "grad_norm": 3.973982095718384, "learning_rate": 1.643076923076923e-05, "loss": 0.4266, "step": 696 }, { "epoch": 8.907348242811501, "grad_norm": 2.963104248046875, "learning_rate": 1.6425641025641025e-05, "loss": 0.3856, "step": 697 }, { "epoch": 8.920127795527156, "grad_norm": 2.7379801273345947, "learning_rate": 1.6420512820512823e-05, "loss": 0.3364, "step": 698 }, { "epoch": 8.93290734824281, "grad_norm": 3.148500442504883, "learning_rate": 1.6415384615384617e-05, "loss": 0.3674, "step": 699 }, { "epoch": 8.945686900958467, "grad_norm": 2.757758140563965, "learning_rate": 1.641025641025641e-05, "loss": 0.406, "step": 700 }, { "epoch": 8.945686900958467, "eval_loss": 0.4816867411136627, "eval_runtime": 180.9578, "eval_samples_per_second": 0.868, "eval_steps_per_second": 0.111, "step": 700 }, { "epoch": 8.958466453674122, "grad_norm": 2.902454376220703, "learning_rate": 1.6405128205128208e-05, "loss": 0.4216, "step": 701 }, { "epoch": 8.971246006389777, "grad_norm": 2.8162240982055664, "learning_rate": 1.64e-05, "loss": 0.3975, "step": 702 }, { "epoch": 8.984025559105431, "grad_norm": 2.35843563079834, "learning_rate": 1.6394871794871796e-05, "loss": 0.4158, "step": 703 }, { "epoch": 8.996805111821086, "grad_norm": 3.153388023376465, "learning_rate": 1.638974358974359e-05, "loss": 0.5259, "step": 704 }, { "epoch": 9.00958466453674, "grad_norm": 2.3435404300689697, "learning_rate": 1.6384615384615384e-05, "loss": 0.285, "step": 705 }, { "epoch": 9.022364217252397, "grad_norm": 2.2724061012268066, "learning_rate": 1.6379487179487182e-05, "loss": 0.4115, "step": 706 }, { "epoch": 9.035143769968052, "grad_norm": 3.4185988903045654, "learning_rate": 1.6374358974358976e-05, "loss": 0.3614, "step": 707 }, { "epoch": 9.047923322683706, "grad_norm": 2.1561129093170166, "learning_rate": 1.636923076923077e-05, "loss": 0.314, "step": 708 }, { "epoch": 9.060702875399361, "grad_norm": 2.4209418296813965, "learning_rate": 1.6364102564102567e-05, "loss": 0.284, "step": 709 }, { "epoch": 9.073482428115016, "grad_norm": 2.662139892578125, "learning_rate": 1.635897435897436e-05, "loss": 0.5409, "step": 710 }, { "epoch": 9.08626198083067, "grad_norm": 2.8799054622650146, "learning_rate": 1.6353846153846155e-05, "loss": 0.3771, "step": 711 }, { "epoch": 9.099041533546325, "grad_norm": 2.704667806625366, "learning_rate": 1.634871794871795e-05, "loss": 0.3255, "step": 712 }, { "epoch": 9.111821086261982, "grad_norm": 2.934540033340454, "learning_rate": 1.6343589743589744e-05, "loss": 0.4394, "step": 713 }, { "epoch": 9.124600638977636, "grad_norm": 3.1625423431396484, "learning_rate": 1.633846153846154e-05, "loss": 0.3893, "step": 714 }, { "epoch": 9.13738019169329, "grad_norm": 3.0641441345214844, "learning_rate": 1.6333333333333335e-05, "loss": 0.375, "step": 715 }, { "epoch": 9.150159744408946, "grad_norm": 3.011550188064575, "learning_rate": 1.632820512820513e-05, "loss": 0.3112, "step": 716 }, { "epoch": 9.1629392971246, "grad_norm": 2.9342551231384277, "learning_rate": 1.6323076923076923e-05, "loss": 0.4093, "step": 717 }, { "epoch": 9.175718849840255, "grad_norm": 2.970946788787842, "learning_rate": 1.631794871794872e-05, "loss": 0.4874, "step": 718 }, { "epoch": 9.188498402555911, "grad_norm": 2.548109531402588, "learning_rate": 1.6312820512820515e-05, "loss": 0.4156, "step": 719 }, { "epoch": 9.201277955271566, "grad_norm": 3.7871553897857666, "learning_rate": 1.630769230769231e-05, "loss": 0.3074, "step": 720 }, { "epoch": 9.21405750798722, "grad_norm": 2.4763805866241455, "learning_rate": 1.6302564102564106e-05, "loss": 0.331, "step": 721 }, { "epoch": 9.226837060702875, "grad_norm": 3.3531510829925537, "learning_rate": 1.6297435897435897e-05, "loss": 0.498, "step": 722 }, { "epoch": 9.23961661341853, "grad_norm": 2.8672120571136475, "learning_rate": 1.6292307692307694e-05, "loss": 0.3971, "step": 723 }, { "epoch": 9.252396166134185, "grad_norm": 3.8358211517333984, "learning_rate": 1.628717948717949e-05, "loss": 0.3478, "step": 724 }, { "epoch": 9.26517571884984, "grad_norm": 3.4077041149139404, "learning_rate": 1.6282051282051282e-05, "loss": 0.3313, "step": 725 }, { "epoch": 9.277955271565496, "grad_norm": 3.170940637588501, "learning_rate": 1.627692307692308e-05, "loss": 0.5251, "step": 726 }, { "epoch": 9.29073482428115, "grad_norm": 2.9623332023620605, "learning_rate": 1.6271794871794874e-05, "loss": 0.5047, "step": 727 }, { "epoch": 9.303514376996805, "grad_norm": 3.793362855911255, "learning_rate": 1.6266666666666668e-05, "loss": 0.5395, "step": 728 }, { "epoch": 9.31629392971246, "grad_norm": 2.935246467590332, "learning_rate": 1.6261538461538462e-05, "loss": 0.4144, "step": 729 }, { "epoch": 9.329073482428115, "grad_norm": 2.9262139797210693, "learning_rate": 1.625641025641026e-05, "loss": 0.4187, "step": 730 }, { "epoch": 9.34185303514377, "grad_norm": 2.8220555782318115, "learning_rate": 1.6251282051282053e-05, "loss": 0.4186, "step": 731 }, { "epoch": 9.354632587859426, "grad_norm": 4.042216777801514, "learning_rate": 1.6246153846153848e-05, "loss": 0.3774, "step": 732 }, { "epoch": 9.36741214057508, "grad_norm": 3.7640748023986816, "learning_rate": 1.624102564102564e-05, "loss": 0.3001, "step": 733 }, { "epoch": 9.380191693290735, "grad_norm": 2.9387130737304688, "learning_rate": 1.6235897435897436e-05, "loss": 0.3889, "step": 734 }, { "epoch": 9.39297124600639, "grad_norm": 3.4702775478363037, "learning_rate": 1.6230769230769233e-05, "loss": 0.4282, "step": 735 }, { "epoch": 9.405750798722044, "grad_norm": 3.894277572631836, "learning_rate": 1.6225641025641027e-05, "loss": 0.5151, "step": 736 }, { "epoch": 9.418530351437699, "grad_norm": 2.918215751647949, "learning_rate": 1.622051282051282e-05, "loss": 0.4282, "step": 737 }, { "epoch": 9.431309904153355, "grad_norm": 3.2780513763427734, "learning_rate": 1.621538461538462e-05, "loss": 0.3878, "step": 738 }, { "epoch": 9.44408945686901, "grad_norm": 2.4160218238830566, "learning_rate": 1.621025641025641e-05, "loss": 0.3296, "step": 739 }, { "epoch": 9.456869009584665, "grad_norm": 2.724004030227661, "learning_rate": 1.6205128205128207e-05, "loss": 0.3507, "step": 740 }, { "epoch": 9.46964856230032, "grad_norm": 3.2544777393341064, "learning_rate": 1.62e-05, "loss": 0.5059, "step": 741 }, { "epoch": 9.482428115015974, "grad_norm": 2.500000238418579, "learning_rate": 1.6194871794871795e-05, "loss": 0.3442, "step": 742 }, { "epoch": 9.495207667731629, "grad_norm": 3.0941834449768066, "learning_rate": 1.6189743589743592e-05, "loss": 0.3219, "step": 743 }, { "epoch": 9.507987220447284, "grad_norm": 2.9002368450164795, "learning_rate": 1.6184615384615386e-05, "loss": 0.3519, "step": 744 }, { "epoch": 9.52076677316294, "grad_norm": 2.824436664581299, "learning_rate": 1.617948717948718e-05, "loss": 0.3762, "step": 745 }, { "epoch": 9.533546325878595, "grad_norm": 2.759906530380249, "learning_rate": 1.6174358974358974e-05, "loss": 0.4067, "step": 746 }, { "epoch": 9.54632587859425, "grad_norm": 2.4780590534210205, "learning_rate": 1.6169230769230772e-05, "loss": 0.3595, "step": 747 }, { "epoch": 9.559105431309904, "grad_norm": 2.7761125564575195, "learning_rate": 1.6164102564102566e-05, "loss": 0.3456, "step": 748 }, { "epoch": 9.571884984025559, "grad_norm": 4.0381951332092285, "learning_rate": 1.615897435897436e-05, "loss": 0.3862, "step": 749 }, { "epoch": 9.584664536741213, "grad_norm": 3.450500726699829, "learning_rate": 1.6153846153846154e-05, "loss": 0.3582, "step": 750 }, { "epoch": 9.59744408945687, "grad_norm": 2.6575820446014404, "learning_rate": 1.6148717948717948e-05, "loss": 0.3178, "step": 751 }, { "epoch": 9.610223642172524, "grad_norm": 2.5573747158050537, "learning_rate": 1.6143589743589745e-05, "loss": 0.3958, "step": 752 }, { "epoch": 9.62300319488818, "grad_norm": 2.849857807159424, "learning_rate": 1.613846153846154e-05, "loss": 0.3588, "step": 753 }, { "epoch": 9.635782747603834, "grad_norm": 2.322702646255493, "learning_rate": 1.6133333333333334e-05, "loss": 0.3252, "step": 754 }, { "epoch": 9.648562300319488, "grad_norm": 2.626924753189087, "learning_rate": 1.612820512820513e-05, "loss": 0.3818, "step": 755 }, { "epoch": 9.661341853035143, "grad_norm": 2.556973934173584, "learning_rate": 1.6123076923076925e-05, "loss": 0.2887, "step": 756 }, { "epoch": 9.6741214057508, "grad_norm": 3.3718507289886475, "learning_rate": 1.611794871794872e-05, "loss": 0.4669, "step": 757 }, { "epoch": 9.686900958466454, "grad_norm": 2.6675846576690674, "learning_rate": 1.6112820512820517e-05, "loss": 0.3147, "step": 758 }, { "epoch": 9.699680511182109, "grad_norm": 2.686455011367798, "learning_rate": 1.6107692307692307e-05, "loss": 0.3977, "step": 759 }, { "epoch": 9.712460063897764, "grad_norm": 3.091524839401245, "learning_rate": 1.6102564102564105e-05, "loss": 0.4134, "step": 760 }, { "epoch": 9.725239616613418, "grad_norm": 3.1435928344726562, "learning_rate": 1.60974358974359e-05, "loss": 0.4106, "step": 761 }, { "epoch": 9.738019169329073, "grad_norm": 3.289584159851074, "learning_rate": 1.6092307692307693e-05, "loss": 0.3564, "step": 762 }, { "epoch": 9.750798722044728, "grad_norm": 3.0257647037506104, "learning_rate": 1.608717948717949e-05, "loss": 0.405, "step": 763 }, { "epoch": 9.763578274760384, "grad_norm": 3.0910072326660156, "learning_rate": 1.6082051282051284e-05, "loss": 0.3332, "step": 764 }, { "epoch": 9.776357827476039, "grad_norm": 3.3785085678100586, "learning_rate": 1.607692307692308e-05, "loss": 0.3729, "step": 765 }, { "epoch": 9.789137380191693, "grad_norm": 3.3935513496398926, "learning_rate": 1.6071794871794872e-05, "loss": 0.5261, "step": 766 }, { "epoch": 9.801916932907348, "grad_norm": 3.9248788356781006, "learning_rate": 1.606666666666667e-05, "loss": 0.3136, "step": 767 }, { "epoch": 9.814696485623003, "grad_norm": 3.094261884689331, "learning_rate": 1.606153846153846e-05, "loss": 0.3301, "step": 768 }, { "epoch": 9.827476038338657, "grad_norm": 4.1841254234313965, "learning_rate": 1.6056410256410258e-05, "loss": 0.4053, "step": 769 }, { "epoch": 9.840255591054314, "grad_norm": 2.513289213180542, "learning_rate": 1.6051282051282052e-05, "loss": 0.3944, "step": 770 }, { "epoch": 9.853035143769969, "grad_norm": 3.5416557788848877, "learning_rate": 1.6046153846153846e-05, "loss": 0.3499, "step": 771 }, { "epoch": 9.865814696485623, "grad_norm": 2.6167125701904297, "learning_rate": 1.6041025641025643e-05, "loss": 0.3378, "step": 772 }, { "epoch": 9.878594249201278, "grad_norm": 2.5471789836883545, "learning_rate": 1.6035897435897438e-05, "loss": 0.346, "step": 773 }, { "epoch": 9.891373801916933, "grad_norm": 3.2733585834503174, "learning_rate": 1.603076923076923e-05, "loss": 0.4733, "step": 774 }, { "epoch": 9.904153354632587, "grad_norm": 3.283470392227173, "learning_rate": 1.602564102564103e-05, "loss": 0.3673, "step": 775 }, { "epoch": 9.916932907348244, "grad_norm": 2.748051643371582, "learning_rate": 1.602051282051282e-05, "loss": 0.3411, "step": 776 }, { "epoch": 9.929712460063898, "grad_norm": 3.7860305309295654, "learning_rate": 1.6015384615384617e-05, "loss": 0.3744, "step": 777 }, { "epoch": 9.942492012779553, "grad_norm": 2.9770305156707764, "learning_rate": 1.601025641025641e-05, "loss": 0.3485, "step": 778 }, { "epoch": 9.955271565495208, "grad_norm": 2.8799893856048584, "learning_rate": 1.6005128205128205e-05, "loss": 0.4349, "step": 779 }, { "epoch": 9.968051118210862, "grad_norm": 2.887939214706421, "learning_rate": 1.6000000000000003e-05, "loss": 0.3541, "step": 780 }, { "epoch": 9.980830670926517, "grad_norm": 3.629877805709839, "learning_rate": 1.5994871794871797e-05, "loss": 0.4275, "step": 781 }, { "epoch": 9.993610223642172, "grad_norm": 3.523859739303589, "learning_rate": 1.598974358974359e-05, "loss": 0.3754, "step": 782 }, { "epoch": 10.006389776357828, "grad_norm": 3.029161214828491, "learning_rate": 1.5984615384615385e-05, "loss": 0.3925, "step": 783 }, { "epoch": 10.019169329073483, "grad_norm": 2.83622670173645, "learning_rate": 1.5979487179487182e-05, "loss": 0.3667, "step": 784 }, { "epoch": 10.031948881789138, "grad_norm": 3.071251630783081, "learning_rate": 1.5974358974358976e-05, "loss": 0.4086, "step": 785 }, { "epoch": 10.044728434504792, "grad_norm": 2.993438959121704, "learning_rate": 1.596923076923077e-05, "loss": 0.4054, "step": 786 }, { "epoch": 10.057507987220447, "grad_norm": 2.8429837226867676, "learning_rate": 1.5964102564102564e-05, "loss": 0.2923, "step": 787 }, { "epoch": 10.070287539936102, "grad_norm": 2.995058059692383, "learning_rate": 1.595897435897436e-05, "loss": 0.4094, "step": 788 }, { "epoch": 10.083067092651758, "grad_norm": 2.814988374710083, "learning_rate": 1.5953846153846156e-05, "loss": 0.2915, "step": 789 }, { "epoch": 10.095846645367413, "grad_norm": 2.83194637298584, "learning_rate": 1.594871794871795e-05, "loss": 0.2686, "step": 790 }, { "epoch": 10.108626198083067, "grad_norm": 2.4209916591644287, "learning_rate": 1.5943589743589744e-05, "loss": 0.208, "step": 791 }, { "epoch": 10.121405750798722, "grad_norm": 2.3989953994750977, "learning_rate": 1.593846153846154e-05, "loss": 0.2616, "step": 792 }, { "epoch": 10.134185303514377, "grad_norm": 3.202815294265747, "learning_rate": 1.5933333333333336e-05, "loss": 0.3878, "step": 793 }, { "epoch": 10.146964856230031, "grad_norm": 3.395106077194214, "learning_rate": 1.592820512820513e-05, "loss": 0.3824, "step": 794 }, { "epoch": 10.159744408945686, "grad_norm": 2.954267740249634, "learning_rate": 1.5923076923076924e-05, "loss": 0.2722, "step": 795 }, { "epoch": 10.172523961661343, "grad_norm": 2.9634196758270264, "learning_rate": 1.5917948717948718e-05, "loss": 0.4102, "step": 796 }, { "epoch": 10.185303514376997, "grad_norm": 3.2138140201568604, "learning_rate": 1.5912820512820515e-05, "loss": 0.5894, "step": 797 }, { "epoch": 10.198083067092652, "grad_norm": 3.678295850753784, "learning_rate": 1.590769230769231e-05, "loss": 0.4693, "step": 798 }, { "epoch": 10.210862619808307, "grad_norm": 2.467360496520996, "learning_rate": 1.5902564102564103e-05, "loss": 0.3232, "step": 799 }, { "epoch": 10.223642172523961, "grad_norm": 2.8098020553588867, "learning_rate": 1.5897435897435897e-05, "loss": 0.2776, "step": 800 }, { "epoch": 10.223642172523961, "eval_loss": 0.4912850856781006, "eval_runtime": 183.0248, "eval_samples_per_second": 0.858, "eval_steps_per_second": 0.109, "step": 800 }, { "epoch": 10.236421725239616, "grad_norm": 2.853611946105957, "learning_rate": 1.5892307692307695e-05, "loss": 0.2964, "step": 801 }, { "epoch": 10.249201277955272, "grad_norm": 3.0821776390075684, "learning_rate": 1.588717948717949e-05, "loss": 0.3843, "step": 802 }, { "epoch": 10.261980830670927, "grad_norm": 3.1763129234313965, "learning_rate": 1.5882051282051283e-05, "loss": 0.4039, "step": 803 }, { "epoch": 10.274760383386582, "grad_norm": 3.0195302963256836, "learning_rate": 1.587692307692308e-05, "loss": 0.232, "step": 804 }, { "epoch": 10.287539936102236, "grad_norm": 2.823012113571167, "learning_rate": 1.587179487179487e-05, "loss": 0.2895, "step": 805 }, { "epoch": 10.300319488817891, "grad_norm": 3.0618393421173096, "learning_rate": 1.586666666666667e-05, "loss": 0.3148, "step": 806 }, { "epoch": 10.313099041533546, "grad_norm": 3.9492385387420654, "learning_rate": 1.5861538461538462e-05, "loss": 0.4268, "step": 807 }, { "epoch": 10.3258785942492, "grad_norm": 4.203477382659912, "learning_rate": 1.5856410256410256e-05, "loss": 0.395, "step": 808 }, { "epoch": 10.338658146964857, "grad_norm": 3.123945951461792, "learning_rate": 1.5851282051282054e-05, "loss": 0.3183, "step": 809 }, { "epoch": 10.351437699680512, "grad_norm": 3.6256487369537354, "learning_rate": 1.5846153846153848e-05, "loss": 0.3871, "step": 810 }, { "epoch": 10.364217252396166, "grad_norm": 3.1102445125579834, "learning_rate": 1.5841025641025642e-05, "loss": 0.2778, "step": 811 }, { "epoch": 10.37699680511182, "grad_norm": 3.4523651599884033, "learning_rate": 1.583589743589744e-05, "loss": 0.3446, "step": 812 }, { "epoch": 10.389776357827476, "grad_norm": 4.347939491271973, "learning_rate": 1.583076923076923e-05, "loss": 0.46, "step": 813 }, { "epoch": 10.40255591054313, "grad_norm": 3.2706682682037354, "learning_rate": 1.5825641025641028e-05, "loss": 0.4192, "step": 814 }, { "epoch": 10.415335463258787, "grad_norm": 2.67073392868042, "learning_rate": 1.582051282051282e-05, "loss": 0.2856, "step": 815 }, { "epoch": 10.428115015974441, "grad_norm": 3.1094117164611816, "learning_rate": 1.5815384615384616e-05, "loss": 0.2858, "step": 816 }, { "epoch": 10.440894568690096, "grad_norm": 3.1258299350738525, "learning_rate": 1.581025641025641e-05, "loss": 0.394, "step": 817 }, { "epoch": 10.45367412140575, "grad_norm": 3.4687180519104004, "learning_rate": 1.5805128205128207e-05, "loss": 0.3374, "step": 818 }, { "epoch": 10.466453674121405, "grad_norm": 3.3258817195892334, "learning_rate": 1.58e-05, "loss": 0.4092, "step": 819 }, { "epoch": 10.47923322683706, "grad_norm": 3.752936840057373, "learning_rate": 1.5794871794871795e-05, "loss": 0.3828, "step": 820 }, { "epoch": 10.492012779552716, "grad_norm": 3.322557210922241, "learning_rate": 1.5789743589743593e-05, "loss": 0.3219, "step": 821 }, { "epoch": 10.504792332268371, "grad_norm": 4.089233875274658, "learning_rate": 1.5784615384615383e-05, "loss": 0.4634, "step": 822 }, { "epoch": 10.517571884984026, "grad_norm": 3.043755292892456, "learning_rate": 1.577948717948718e-05, "loss": 0.3131, "step": 823 }, { "epoch": 10.53035143769968, "grad_norm": 2.8665387630462646, "learning_rate": 1.5774358974358975e-05, "loss": 0.4076, "step": 824 }, { "epoch": 10.543130990415335, "grad_norm": 3.6322171688079834, "learning_rate": 1.576923076923077e-05, "loss": 0.3824, "step": 825 }, { "epoch": 10.55591054313099, "grad_norm": 3.296170711517334, "learning_rate": 1.5764102564102566e-05, "loss": 0.4484, "step": 826 }, { "epoch": 10.568690095846645, "grad_norm": 2.6710081100463867, "learning_rate": 1.575897435897436e-05, "loss": 0.2255, "step": 827 }, { "epoch": 10.581469648562301, "grad_norm": 3.920431137084961, "learning_rate": 1.5753846153846154e-05, "loss": 0.4387, "step": 828 }, { "epoch": 10.594249201277956, "grad_norm": 2.7028772830963135, "learning_rate": 1.5748717948717952e-05, "loss": 0.3869, "step": 829 }, { "epoch": 10.60702875399361, "grad_norm": 3.00683331489563, "learning_rate": 1.5743589743589746e-05, "loss": 0.2282, "step": 830 }, { "epoch": 10.619808306709265, "grad_norm": 3.6137325763702393, "learning_rate": 1.573846153846154e-05, "loss": 0.3368, "step": 831 }, { "epoch": 10.63258785942492, "grad_norm": 3.2704150676727295, "learning_rate": 1.5733333333333334e-05, "loss": 0.3509, "step": 832 }, { "epoch": 10.645367412140574, "grad_norm": 3.7699923515319824, "learning_rate": 1.5728205128205128e-05, "loss": 0.3801, "step": 833 }, { "epoch": 10.65814696485623, "grad_norm": 3.1907780170440674, "learning_rate": 1.5723076923076926e-05, "loss": 0.4088, "step": 834 }, { "epoch": 10.670926517571885, "grad_norm": 3.042274236679077, "learning_rate": 1.571794871794872e-05, "loss": 0.3376, "step": 835 }, { "epoch": 10.68370607028754, "grad_norm": 2.675173282623291, "learning_rate": 1.5712820512820514e-05, "loss": 0.3057, "step": 836 }, { "epoch": 10.696485623003195, "grad_norm": 3.164384365081787, "learning_rate": 1.5707692307692308e-05, "loss": 0.361, "step": 837 }, { "epoch": 10.70926517571885, "grad_norm": 3.2448647022247314, "learning_rate": 1.5702564102564105e-05, "loss": 0.2626, "step": 838 }, { "epoch": 10.722044728434504, "grad_norm": 2.900031566619873, "learning_rate": 1.56974358974359e-05, "loss": 0.296, "step": 839 }, { "epoch": 10.73482428115016, "grad_norm": 3.1761841773986816, "learning_rate": 1.5692307692307693e-05, "loss": 0.2739, "step": 840 }, { "epoch": 10.747603833865815, "grad_norm": 3.444432497024536, "learning_rate": 1.568717948717949e-05, "loss": 0.3222, "step": 841 }, { "epoch": 10.76038338658147, "grad_norm": 3.3714373111724854, "learning_rate": 1.568205128205128e-05, "loss": 0.4064, "step": 842 }, { "epoch": 10.773162939297125, "grad_norm": 3.7057511806488037, "learning_rate": 1.567692307692308e-05, "loss": 0.5211, "step": 843 }, { "epoch": 10.78594249201278, "grad_norm": 3.079998254776001, "learning_rate": 1.5671794871794873e-05, "loss": 0.3329, "step": 844 }, { "epoch": 10.798722044728434, "grad_norm": 3.1543796062469482, "learning_rate": 1.5666666666666667e-05, "loss": 0.4407, "step": 845 }, { "epoch": 10.811501597444089, "grad_norm": 2.9210171699523926, "learning_rate": 1.5661538461538464e-05, "loss": 0.346, "step": 846 }, { "epoch": 10.824281150159745, "grad_norm": 3.3310635089874268, "learning_rate": 1.565641025641026e-05, "loss": 0.4371, "step": 847 }, { "epoch": 10.8370607028754, "grad_norm": 3.584244966506958, "learning_rate": 1.5651282051282052e-05, "loss": 0.4223, "step": 848 }, { "epoch": 10.849840255591054, "grad_norm": 2.6379106044769287, "learning_rate": 1.5646153846153846e-05, "loss": 0.3967, "step": 849 }, { "epoch": 10.86261980830671, "grad_norm": 2.972970724105835, "learning_rate": 1.5641025641025644e-05, "loss": 0.3419, "step": 850 }, { "epoch": 10.875399361022364, "grad_norm": 3.2894978523254395, "learning_rate": 1.5635897435897438e-05, "loss": 0.4133, "step": 851 }, { "epoch": 10.888178913738018, "grad_norm": 3.3246779441833496, "learning_rate": 1.5630769230769232e-05, "loss": 0.4367, "step": 852 }, { "epoch": 10.900958466453675, "grad_norm": 3.2328429222106934, "learning_rate": 1.5625641025641026e-05, "loss": 0.4688, "step": 853 }, { "epoch": 10.91373801916933, "grad_norm": 3.18841552734375, "learning_rate": 1.562051282051282e-05, "loss": 0.3355, "step": 854 }, { "epoch": 10.926517571884984, "grad_norm": 3.63415265083313, "learning_rate": 1.5615384615384618e-05, "loss": 0.3284, "step": 855 }, { "epoch": 10.939297124600639, "grad_norm": 2.851058006286621, "learning_rate": 1.561025641025641e-05, "loss": 0.2644, "step": 856 }, { "epoch": 10.952076677316294, "grad_norm": 3.3312418460845947, "learning_rate": 1.5605128205128206e-05, "loss": 0.4438, "step": 857 }, { "epoch": 10.964856230031948, "grad_norm": 4.002857685089111, "learning_rate": 1.5600000000000003e-05, "loss": 0.336, "step": 858 }, { "epoch": 10.977635782747605, "grad_norm": 3.100975275039673, "learning_rate": 1.5594871794871794e-05, "loss": 0.437, "step": 859 }, { "epoch": 10.99041533546326, "grad_norm": 4.84814977645874, "learning_rate": 1.558974358974359e-05, "loss": 0.4298, "step": 860 }, { "epoch": 11.003194888178914, "grad_norm": 4.237227439880371, "learning_rate": 1.5584615384615385e-05, "loss": 0.34, "step": 861 }, { "epoch": 11.015974440894569, "grad_norm": 3.8608264923095703, "learning_rate": 1.557948717948718e-05, "loss": 0.4767, "step": 862 }, { "epoch": 11.028753993610223, "grad_norm": 3.2381210327148438, "learning_rate": 1.5574358974358977e-05, "loss": 0.3816, "step": 863 }, { "epoch": 11.041533546325878, "grad_norm": 3.009047031402588, "learning_rate": 1.556923076923077e-05, "loss": 0.3754, "step": 864 }, { "epoch": 11.054313099041533, "grad_norm": 3.1122117042541504, "learning_rate": 1.5564102564102565e-05, "loss": 0.4406, "step": 865 }, { "epoch": 11.06709265175719, "grad_norm": 2.9538486003875732, "learning_rate": 1.555897435897436e-05, "loss": 0.3496, "step": 866 }, { "epoch": 11.079872204472844, "grad_norm": 4.3143157958984375, "learning_rate": 1.5553846153846156e-05, "loss": 0.3724, "step": 867 }, { "epoch": 11.092651757188499, "grad_norm": 3.1895341873168945, "learning_rate": 1.554871794871795e-05, "loss": 0.3627, "step": 868 }, { "epoch": 11.105431309904153, "grad_norm": 2.7092154026031494, "learning_rate": 1.5543589743589744e-05, "loss": 0.2933, "step": 869 }, { "epoch": 11.118210862619808, "grad_norm": 2.3844687938690186, "learning_rate": 1.553846153846154e-05, "loss": 0.2012, "step": 870 }, { "epoch": 11.130990415335463, "grad_norm": 2.5584909915924072, "learning_rate": 1.5533333333333333e-05, "loss": 0.2561, "step": 871 }, { "epoch": 11.143769968051119, "grad_norm": 3.2287497520446777, "learning_rate": 1.552820512820513e-05, "loss": 0.2891, "step": 872 }, { "epoch": 11.156549520766774, "grad_norm": 2.3542449474334717, "learning_rate": 1.5523076923076924e-05, "loss": 0.2908, "step": 873 }, { "epoch": 11.169329073482428, "grad_norm": 2.944882392883301, "learning_rate": 1.5517948717948718e-05, "loss": 0.2287, "step": 874 }, { "epoch": 11.182108626198083, "grad_norm": 2.9199087619781494, "learning_rate": 1.5512820512820516e-05, "loss": 0.3413, "step": 875 }, { "epoch": 11.194888178913738, "grad_norm": 2.931480646133423, "learning_rate": 1.550769230769231e-05, "loss": 0.2779, "step": 876 }, { "epoch": 11.207667731629392, "grad_norm": 3.4157614707946777, "learning_rate": 1.5502564102564104e-05, "loss": 0.3269, "step": 877 }, { "epoch": 11.220447284345047, "grad_norm": 3.1395103931427, "learning_rate": 1.54974358974359e-05, "loss": 0.2674, "step": 878 }, { "epoch": 11.233226837060704, "grad_norm": 4.254883289337158, "learning_rate": 1.5492307692307692e-05, "loss": 0.4352, "step": 879 }, { "epoch": 11.246006389776358, "grad_norm": 3.7788186073303223, "learning_rate": 1.548717948717949e-05, "loss": 0.2672, "step": 880 }, { "epoch": 11.258785942492013, "grad_norm": 4.6283087730407715, "learning_rate": 1.5482051282051283e-05, "loss": 0.3878, "step": 881 }, { "epoch": 11.271565495207668, "grad_norm": 2.980556011199951, "learning_rate": 1.5476923076923077e-05, "loss": 0.2345, "step": 882 }, { "epoch": 11.284345047923322, "grad_norm": 3.723423719406128, "learning_rate": 1.5471794871794875e-05, "loss": 0.411, "step": 883 }, { "epoch": 11.297124600638977, "grad_norm": 3.2626750469207764, "learning_rate": 1.546666666666667e-05, "loss": 0.3025, "step": 884 }, { "epoch": 11.309904153354633, "grad_norm": 2.825082302093506, "learning_rate": 1.5461538461538463e-05, "loss": 0.3917, "step": 885 }, { "epoch": 11.322683706070288, "grad_norm": 3.563110828399658, "learning_rate": 1.5456410256410257e-05, "loss": 0.4258, "step": 886 }, { "epoch": 11.335463258785943, "grad_norm": 3.154862880706787, "learning_rate": 1.5451282051282054e-05, "loss": 0.3543, "step": 887 }, { "epoch": 11.348242811501597, "grad_norm": 3.8026084899902344, "learning_rate": 1.544615384615385e-05, "loss": 0.4576, "step": 888 }, { "epoch": 11.361022364217252, "grad_norm": 3.8097572326660156, "learning_rate": 1.5441025641025642e-05, "loss": 0.3371, "step": 889 }, { "epoch": 11.373801916932907, "grad_norm": 3.6081221103668213, "learning_rate": 1.5435897435897436e-05, "loss": 0.3445, "step": 890 }, { "epoch": 11.386581469648561, "grad_norm": 3.46155047416687, "learning_rate": 1.543076923076923e-05, "loss": 0.3375, "step": 891 }, { "epoch": 11.399361022364218, "grad_norm": 3.3258464336395264, "learning_rate": 1.5425641025641028e-05, "loss": 0.295, "step": 892 }, { "epoch": 11.412140575079873, "grad_norm": 3.3153886795043945, "learning_rate": 1.5420512820512822e-05, "loss": 0.3332, "step": 893 }, { "epoch": 11.424920127795527, "grad_norm": 3.5005221366882324, "learning_rate": 1.5415384615384616e-05, "loss": 0.286, "step": 894 }, { "epoch": 11.437699680511182, "grad_norm": 3.0043089389801025, "learning_rate": 1.5410256410256414e-05, "loss": 0.3095, "step": 895 }, { "epoch": 11.450479233226837, "grad_norm": 3.113715410232544, "learning_rate": 1.5405128205128204e-05, "loss": 0.2693, "step": 896 }, { "epoch": 11.463258785942491, "grad_norm": 3.4853270053863525, "learning_rate": 1.54e-05, "loss": 0.3533, "step": 897 }, { "epoch": 11.476038338658148, "grad_norm": 3.3521034717559814, "learning_rate": 1.5394871794871796e-05, "loss": 0.3699, "step": 898 }, { "epoch": 11.488817891373802, "grad_norm": 3.983391046524048, "learning_rate": 1.538974358974359e-05, "loss": 0.303, "step": 899 }, { "epoch": 11.501597444089457, "grad_norm": 3.161827325820923, "learning_rate": 1.5384615384615387e-05, "loss": 0.2873, "step": 900 }, { "epoch": 11.501597444089457, "eval_loss": 0.49615687131881714, "eval_runtime": 183.2354, "eval_samples_per_second": 0.857, "eval_steps_per_second": 0.109, "step": 900 }, { "epoch": 11.514376996805112, "grad_norm": 3.4103128910064697, "learning_rate": 1.537948717948718e-05, "loss": 0.3428, "step": 901 }, { "epoch": 11.527156549520766, "grad_norm": 3.620884418487549, "learning_rate": 1.5374358974358975e-05, "loss": 0.4551, "step": 902 }, { "epoch": 11.539936102236421, "grad_norm": 3.1130776405334473, "learning_rate": 1.536923076923077e-05, "loss": 0.3008, "step": 903 }, { "epoch": 11.552715654952078, "grad_norm": 3.5067083835601807, "learning_rate": 1.5364102564102567e-05, "loss": 0.4184, "step": 904 }, { "epoch": 11.565495207667732, "grad_norm": 3.9358503818511963, "learning_rate": 1.535897435897436e-05, "loss": 0.4187, "step": 905 }, { "epoch": 11.578274760383387, "grad_norm": 3.875493049621582, "learning_rate": 1.5353846153846155e-05, "loss": 0.3645, "step": 906 }, { "epoch": 11.591054313099042, "grad_norm": 3.0121140480041504, "learning_rate": 1.534871794871795e-05, "loss": 0.2874, "step": 907 }, { "epoch": 11.603833865814696, "grad_norm": 3.0029237270355225, "learning_rate": 1.5343589743589743e-05, "loss": 0.3083, "step": 908 }, { "epoch": 11.616613418530351, "grad_norm": 2.714341878890991, "learning_rate": 1.533846153846154e-05, "loss": 0.22, "step": 909 }, { "epoch": 11.629392971246006, "grad_norm": 2.5919463634490967, "learning_rate": 1.5333333333333334e-05, "loss": 0.27, "step": 910 }, { "epoch": 11.642172523961662, "grad_norm": 3.5662481784820557, "learning_rate": 1.532820512820513e-05, "loss": 0.3671, "step": 911 }, { "epoch": 11.654952076677317, "grad_norm": 3.2857398986816406, "learning_rate": 1.5323076923076926e-05, "loss": 0.3629, "step": 912 }, { "epoch": 11.667731629392971, "grad_norm": 3.2995290756225586, "learning_rate": 1.531794871794872e-05, "loss": 0.345, "step": 913 }, { "epoch": 11.680511182108626, "grad_norm": 3.5005154609680176, "learning_rate": 1.5312820512820514e-05, "loss": 0.2964, "step": 914 }, { "epoch": 11.69329073482428, "grad_norm": 4.271259307861328, "learning_rate": 1.5307692307692308e-05, "loss": 0.4868, "step": 915 }, { "epoch": 11.706070287539935, "grad_norm": 3.320885181427002, "learning_rate": 1.5302564102564102e-05, "loss": 0.3535, "step": 916 }, { "epoch": 11.718849840255592, "grad_norm": 4.391448020935059, "learning_rate": 1.52974358974359e-05, "loss": 0.4242, "step": 917 }, { "epoch": 11.731629392971247, "grad_norm": 3.9817147254943848, "learning_rate": 1.5292307692307694e-05, "loss": 0.329, "step": 918 }, { "epoch": 11.744408945686901, "grad_norm": 3.0762832164764404, "learning_rate": 1.5287179487179488e-05, "loss": 0.248, "step": 919 }, { "epoch": 11.757188498402556, "grad_norm": 3.191990613937378, "learning_rate": 1.5282051282051282e-05, "loss": 0.3691, "step": 920 }, { "epoch": 11.76996805111821, "grad_norm": 3.4746310710906982, "learning_rate": 1.527692307692308e-05, "loss": 0.3916, "step": 921 }, { "epoch": 11.782747603833865, "grad_norm": 3.9543192386627197, "learning_rate": 1.5271794871794873e-05, "loss": 0.4443, "step": 922 }, { "epoch": 11.795527156549522, "grad_norm": 3.446842670440674, "learning_rate": 1.5266666666666667e-05, "loss": 0.329, "step": 923 }, { "epoch": 11.808306709265176, "grad_norm": 3.134056806564331, "learning_rate": 1.5261538461538465e-05, "loss": 0.2734, "step": 924 }, { "epoch": 11.821086261980831, "grad_norm": 3.8701589107513428, "learning_rate": 1.5256410256410257e-05, "loss": 0.4735, "step": 925 }, { "epoch": 11.833865814696486, "grad_norm": 4.0366902351379395, "learning_rate": 1.5251282051282053e-05, "loss": 0.3053, "step": 926 }, { "epoch": 11.84664536741214, "grad_norm": 2.7000772953033447, "learning_rate": 1.5246153846153849e-05, "loss": 0.2448, "step": 927 }, { "epoch": 11.859424920127795, "grad_norm": 3.872736930847168, "learning_rate": 1.5241025641025643e-05, "loss": 0.4172, "step": 928 }, { "epoch": 11.87220447284345, "grad_norm": 2.7194664478302, "learning_rate": 1.5235897435897438e-05, "loss": 0.2179, "step": 929 }, { "epoch": 11.884984025559106, "grad_norm": 2.8944573402404785, "learning_rate": 1.523076923076923e-05, "loss": 0.2808, "step": 930 }, { "epoch": 11.89776357827476, "grad_norm": 3.091066598892212, "learning_rate": 1.5225641025641027e-05, "loss": 0.3228, "step": 931 }, { "epoch": 11.910543130990416, "grad_norm": 3.6224167346954346, "learning_rate": 1.5220512820512822e-05, "loss": 0.4068, "step": 932 }, { "epoch": 11.92332268370607, "grad_norm": 3.6418344974517822, "learning_rate": 1.5215384615384616e-05, "loss": 0.3721, "step": 933 }, { "epoch": 11.936102236421725, "grad_norm": 3.031254529953003, "learning_rate": 1.5210256410256412e-05, "loss": 0.2927, "step": 934 }, { "epoch": 11.94888178913738, "grad_norm": 4.135180950164795, "learning_rate": 1.5205128205128206e-05, "loss": 0.3205, "step": 935 }, { "epoch": 11.961661341853034, "grad_norm": 3.6757776737213135, "learning_rate": 1.5200000000000002e-05, "loss": 0.3106, "step": 936 }, { "epoch": 11.97444089456869, "grad_norm": 3.8090317249298096, "learning_rate": 1.5194871794871798e-05, "loss": 0.3209, "step": 937 }, { "epoch": 11.987220447284345, "grad_norm": 2.8670780658721924, "learning_rate": 1.518974358974359e-05, "loss": 0.3085, "step": 938 }, { "epoch": 12.0, "grad_norm": 3.156921148300171, "learning_rate": 1.5184615384615386e-05, "loss": 0.3247, "step": 939 }, { "epoch": 12.012779552715655, "grad_norm": 3.98889422416687, "learning_rate": 1.517948717948718e-05, "loss": 0.2793, "step": 940 }, { "epoch": 12.02555910543131, "grad_norm": 3.3414158821105957, "learning_rate": 1.5174358974358976e-05, "loss": 0.2521, "step": 941 }, { "epoch": 12.038338658146964, "grad_norm": 3.1532747745513916, "learning_rate": 1.516923076923077e-05, "loss": 0.2792, "step": 942 }, { "epoch": 12.05111821086262, "grad_norm": 3.7512667179107666, "learning_rate": 1.5164102564102565e-05, "loss": 0.2176, "step": 943 }, { "epoch": 12.063897763578275, "grad_norm": 3.0928268432617188, "learning_rate": 1.5158974358974361e-05, "loss": 0.376, "step": 944 }, { "epoch": 12.07667731629393, "grad_norm": 2.8840041160583496, "learning_rate": 1.5153846153846155e-05, "loss": 0.2967, "step": 945 }, { "epoch": 12.089456869009584, "grad_norm": 2.912339925765991, "learning_rate": 1.514871794871795e-05, "loss": 0.2307, "step": 946 }, { "epoch": 12.10223642172524, "grad_norm": 4.07706880569458, "learning_rate": 1.5143589743589743e-05, "loss": 0.3104, "step": 947 }, { "epoch": 12.115015974440894, "grad_norm": 3.8254222869873047, "learning_rate": 1.5138461538461539e-05, "loss": 0.299, "step": 948 }, { "epoch": 12.12779552715655, "grad_norm": 3.255998134613037, "learning_rate": 1.5133333333333335e-05, "loss": 0.2724, "step": 949 }, { "epoch": 12.140575079872205, "grad_norm": 3.329000949859619, "learning_rate": 1.5128205128205129e-05, "loss": 0.3398, "step": 950 }, { "epoch": 12.15335463258786, "grad_norm": 3.314716339111328, "learning_rate": 1.5123076923076924e-05, "loss": 0.2941, "step": 951 }, { "epoch": 12.166134185303514, "grad_norm": 2.983738660812378, "learning_rate": 1.5117948717948719e-05, "loss": 0.2255, "step": 952 }, { "epoch": 12.178913738019169, "grad_norm": 3.6834664344787598, "learning_rate": 1.5112820512820514e-05, "loss": 0.2873, "step": 953 }, { "epoch": 12.191693290734824, "grad_norm": 3.5403318405151367, "learning_rate": 1.510769230769231e-05, "loss": 0.3309, "step": 954 }, { "epoch": 12.204472843450478, "grad_norm": 3.9441781044006348, "learning_rate": 1.5102564102564104e-05, "loss": 0.287, "step": 955 }, { "epoch": 12.217252396166135, "grad_norm": 2.9877588748931885, "learning_rate": 1.50974358974359e-05, "loss": 0.2715, "step": 956 }, { "epoch": 12.23003194888179, "grad_norm": 3.5199813842773438, "learning_rate": 1.5092307692307692e-05, "loss": 0.4142, "step": 957 }, { "epoch": 12.242811501597444, "grad_norm": 3.2963194847106934, "learning_rate": 1.5087179487179488e-05, "loss": 0.2854, "step": 958 }, { "epoch": 12.255591054313099, "grad_norm": 4.313882350921631, "learning_rate": 1.5082051282051284e-05, "loss": 0.4677, "step": 959 }, { "epoch": 12.268370607028753, "grad_norm": 4.474306583404541, "learning_rate": 1.5076923076923078e-05, "loss": 0.3723, "step": 960 }, { "epoch": 12.281150159744408, "grad_norm": 3.5448555946350098, "learning_rate": 1.5071794871794873e-05, "loss": 0.2871, "step": 961 }, { "epoch": 12.293929712460065, "grad_norm": 3.978175163269043, "learning_rate": 1.5066666666666668e-05, "loss": 0.3368, "step": 962 }, { "epoch": 12.30670926517572, "grad_norm": 4.108724117279053, "learning_rate": 1.5061538461538463e-05, "loss": 0.3639, "step": 963 }, { "epoch": 12.319488817891374, "grad_norm": 3.223417043685913, "learning_rate": 1.5056410256410257e-05, "loss": 0.2408, "step": 964 }, { "epoch": 12.332268370607029, "grad_norm": 3.166851043701172, "learning_rate": 1.5051282051282053e-05, "loss": 0.2552, "step": 965 }, { "epoch": 12.345047923322683, "grad_norm": 4.144366264343262, "learning_rate": 1.5046153846153849e-05, "loss": 0.344, "step": 966 }, { "epoch": 12.357827476038338, "grad_norm": 3.4670357704162598, "learning_rate": 1.5041025641025641e-05, "loss": 0.3234, "step": 967 }, { "epoch": 12.370607028753994, "grad_norm": 3.67396879196167, "learning_rate": 1.5035897435897437e-05, "loss": 0.3976, "step": 968 }, { "epoch": 12.383386581469649, "grad_norm": 7.3044867515563965, "learning_rate": 1.5030769230769231e-05, "loss": 0.2831, "step": 969 }, { "epoch": 12.396166134185304, "grad_norm": 4.498189926147461, "learning_rate": 1.5025641025641027e-05, "loss": 0.2689, "step": 970 }, { "epoch": 12.408945686900958, "grad_norm": 3.387584924697876, "learning_rate": 1.5020512820512822e-05, "loss": 0.2477, "step": 971 }, { "epoch": 12.421725239616613, "grad_norm": 4.313808917999268, "learning_rate": 1.5015384615384617e-05, "loss": 0.309, "step": 972 }, { "epoch": 12.434504792332268, "grad_norm": 3.6473467350006104, "learning_rate": 1.5010256410256412e-05, "loss": 0.2361, "step": 973 }, { "epoch": 12.447284345047922, "grad_norm": 4.041528701782227, "learning_rate": 1.5005128205128205e-05, "loss": 0.3795, "step": 974 }, { "epoch": 12.460063897763579, "grad_norm": 3.3756825923919678, "learning_rate": 1.5000000000000002e-05, "loss": 0.2511, "step": 975 }, { "epoch": 12.472843450479234, "grad_norm": 3.396024703979492, "learning_rate": 1.4994871794871798e-05, "loss": 0.3154, "step": 976 }, { "epoch": 12.485623003194888, "grad_norm": 3.4328291416168213, "learning_rate": 1.498974358974359e-05, "loss": 0.2605, "step": 977 }, { "epoch": 12.498402555910543, "grad_norm": 4.163756370544434, "learning_rate": 1.4984615384615386e-05, "loss": 0.2772, "step": 978 }, { "epoch": 12.511182108626198, "grad_norm": 4.738368034362793, "learning_rate": 1.497948717948718e-05, "loss": 0.3521, "step": 979 }, { "epoch": 12.523961661341852, "grad_norm": 3.4938671588897705, "learning_rate": 1.4974358974358976e-05, "loss": 0.363, "step": 980 }, { "epoch": 12.536741214057509, "grad_norm": 3.3207099437713623, "learning_rate": 1.4969230769230771e-05, "loss": 0.2558, "step": 981 }, { "epoch": 12.549520766773163, "grad_norm": 3.4112465381622314, "learning_rate": 1.4964102564102566e-05, "loss": 0.3083, "step": 982 }, { "epoch": 12.562300319488818, "grad_norm": 3.6576359272003174, "learning_rate": 1.4958974358974361e-05, "loss": 0.3153, "step": 983 }, { "epoch": 12.575079872204473, "grad_norm": 3.1514041423797607, "learning_rate": 1.4953846153846154e-05, "loss": 0.2925, "step": 984 }, { "epoch": 12.587859424920127, "grad_norm": 3.492708206176758, "learning_rate": 1.494871794871795e-05, "loss": 0.3153, "step": 985 }, { "epoch": 12.600638977635782, "grad_norm": 3.412132740020752, "learning_rate": 1.4943589743589745e-05, "loss": 0.3022, "step": 986 }, { "epoch": 12.613418530351439, "grad_norm": 4.253443241119385, "learning_rate": 1.493846153846154e-05, "loss": 0.2909, "step": 987 }, { "epoch": 12.626198083067093, "grad_norm": 3.0681474208831787, "learning_rate": 1.4933333333333335e-05, "loss": 0.4074, "step": 988 }, { "epoch": 12.638977635782748, "grad_norm": 3.2361674308776855, "learning_rate": 1.4928205128205129e-05, "loss": 0.4487, "step": 989 }, { "epoch": 12.651757188498403, "grad_norm": 3.1446540355682373, "learning_rate": 1.4923076923076925e-05, "loss": 0.242, "step": 990 }, { "epoch": 12.664536741214057, "grad_norm": 4.866894245147705, "learning_rate": 1.4917948717948719e-05, "loss": 0.4236, "step": 991 }, { "epoch": 12.677316293929712, "grad_norm": 3.479435920715332, "learning_rate": 1.4912820512820515e-05, "loss": 0.2779, "step": 992 }, { "epoch": 12.690095846645367, "grad_norm": 3.4308862686157227, "learning_rate": 1.490769230769231e-05, "loss": 0.312, "step": 993 }, { "epoch": 12.702875399361023, "grad_norm": 4.010195732116699, "learning_rate": 1.4902564102564103e-05, "loss": 0.2753, "step": 994 }, { "epoch": 12.715654952076678, "grad_norm": 3.1232686042785645, "learning_rate": 1.4897435897435898e-05, "loss": 0.2055, "step": 995 }, { "epoch": 12.728434504792332, "grad_norm": 4.415332317352295, "learning_rate": 1.4892307692307692e-05, "loss": 0.4439, "step": 996 }, { "epoch": 12.741214057507987, "grad_norm": 4.104088306427002, "learning_rate": 1.4887179487179488e-05, "loss": 0.3345, "step": 997 }, { "epoch": 12.753993610223642, "grad_norm": 4.060264587402344, "learning_rate": 1.4882051282051284e-05, "loss": 0.2695, "step": 998 }, { "epoch": 12.766773162939296, "grad_norm": 2.8599672317504883, "learning_rate": 1.4876923076923078e-05, "loss": 0.2159, "step": 999 }, { "epoch": 12.779552715654953, "grad_norm": 4.045029640197754, "learning_rate": 1.4871794871794874e-05, "loss": 0.4147, "step": 1000 }, { "epoch": 12.779552715654953, "eval_loss": 0.50066077709198, "eval_runtime": 183.3606, "eval_samples_per_second": 0.856, "eval_steps_per_second": 0.109, "step": 1000 }, { "epoch": 12.792332268370608, "grad_norm": 3.8444182872772217, "learning_rate": 1.4866666666666668e-05, "loss": 0.2935, "step": 1001 }, { "epoch": 12.805111821086262, "grad_norm": 3.37121844291687, "learning_rate": 1.4861538461538464e-05, "loss": 0.3359, "step": 1002 }, { "epoch": 12.817891373801917, "grad_norm": 3.2778074741363525, "learning_rate": 1.485641025641026e-05, "loss": 0.2979, "step": 1003 }, { "epoch": 12.830670926517572, "grad_norm": 4.910933017730713, "learning_rate": 1.4851282051282052e-05, "loss": 0.4186, "step": 1004 }, { "epoch": 12.843450479233226, "grad_norm": 3.9079768657684326, "learning_rate": 1.4846153846153847e-05, "loss": 0.3384, "step": 1005 }, { "epoch": 12.856230031948883, "grad_norm": 4.07132625579834, "learning_rate": 1.4841025641025641e-05, "loss": 0.2982, "step": 1006 }, { "epoch": 12.869009584664537, "grad_norm": 3.813551664352417, "learning_rate": 1.4835897435897437e-05, "loss": 0.316, "step": 1007 }, { "epoch": 12.881789137380192, "grad_norm": 3.3355159759521484, "learning_rate": 1.4830769230769233e-05, "loss": 0.3037, "step": 1008 }, { "epoch": 12.894568690095847, "grad_norm": 3.209801197052002, "learning_rate": 1.4825641025641027e-05, "loss": 0.2607, "step": 1009 }, { "epoch": 12.907348242811501, "grad_norm": 3.589578151702881, "learning_rate": 1.4820512820512823e-05, "loss": 0.2936, "step": 1010 }, { "epoch": 12.920127795527156, "grad_norm": 3.3078291416168213, "learning_rate": 1.4815384615384617e-05, "loss": 0.3175, "step": 1011 }, { "epoch": 12.93290734824281, "grad_norm": 4.202841281890869, "learning_rate": 1.4810256410256412e-05, "loss": 0.3471, "step": 1012 }, { "epoch": 12.945686900958467, "grad_norm": 3.8688015937805176, "learning_rate": 1.4805128205128205e-05, "loss": 0.2581, "step": 1013 }, { "epoch": 12.958466453674122, "grad_norm": 3.566972255706787, "learning_rate": 1.48e-05, "loss": 0.3632, "step": 1014 }, { "epoch": 12.971246006389777, "grad_norm": 4.008389472961426, "learning_rate": 1.4794871794871796e-05, "loss": 0.3105, "step": 1015 }, { "epoch": 12.984025559105431, "grad_norm": 4.361541271209717, "learning_rate": 1.478974358974359e-05, "loss": 0.4006, "step": 1016 }, { "epoch": 12.996805111821086, "grad_norm": 4.685972213745117, "learning_rate": 1.4784615384615386e-05, "loss": 0.4876, "step": 1017 }, { "epoch": 13.00958466453674, "grad_norm": 3.5489296913146973, "learning_rate": 1.477948717948718e-05, "loss": 0.2712, "step": 1018 }, { "epoch": 13.022364217252397, "grad_norm": 3.135805606842041, "learning_rate": 1.4774358974358976e-05, "loss": 0.1518, "step": 1019 }, { "epoch": 13.035143769968052, "grad_norm": 3.1784472465515137, "learning_rate": 1.4769230769230772e-05, "loss": 0.237, "step": 1020 }, { "epoch": 13.047923322683706, "grad_norm": 3.3885161876678467, "learning_rate": 1.4764102564102564e-05, "loss": 0.3225, "step": 1021 }, { "epoch": 13.060702875399361, "grad_norm": 4.256035327911377, "learning_rate": 1.475897435897436e-05, "loss": 0.3613, "step": 1022 }, { "epoch": 13.073482428115016, "grad_norm": 4.67989444732666, "learning_rate": 1.4753846153846154e-05, "loss": 0.3487, "step": 1023 }, { "epoch": 13.08626198083067, "grad_norm": 3.1670827865600586, "learning_rate": 1.474871794871795e-05, "loss": 0.2751, "step": 1024 }, { "epoch": 13.099041533546325, "grad_norm": 2.552567481994629, "learning_rate": 1.4743589743589745e-05, "loss": 0.1818, "step": 1025 }, { "epoch": 13.111821086261982, "grad_norm": 4.138055801391602, "learning_rate": 1.473846153846154e-05, "loss": 0.314, "step": 1026 }, { "epoch": 13.124600638977636, "grad_norm": 3.173854112625122, "learning_rate": 1.4733333333333335e-05, "loss": 0.2503, "step": 1027 }, { "epoch": 13.13738019169329, "grad_norm": 4.015395164489746, "learning_rate": 1.472820512820513e-05, "loss": 0.2352, "step": 1028 }, { "epoch": 13.150159744408946, "grad_norm": 4.005403995513916, "learning_rate": 1.4723076923076925e-05, "loss": 0.364, "step": 1029 }, { "epoch": 13.1629392971246, "grad_norm": 3.498249053955078, "learning_rate": 1.471794871794872e-05, "loss": 0.2862, "step": 1030 }, { "epoch": 13.175718849840255, "grad_norm": 4.659630298614502, "learning_rate": 1.4712820512820513e-05, "loss": 0.2042, "step": 1031 }, { "epoch": 13.188498402555911, "grad_norm": 3.3224682807922363, "learning_rate": 1.4707692307692309e-05, "loss": 0.2379, "step": 1032 }, { "epoch": 13.201277955271566, "grad_norm": 3.7572021484375, "learning_rate": 1.4702564102564103e-05, "loss": 0.2609, "step": 1033 }, { "epoch": 13.21405750798722, "grad_norm": 4.294779300689697, "learning_rate": 1.4697435897435899e-05, "loss": 0.2982, "step": 1034 }, { "epoch": 13.226837060702875, "grad_norm": 3.1984715461730957, "learning_rate": 1.4692307692307694e-05, "loss": 0.2467, "step": 1035 }, { "epoch": 13.23961661341853, "grad_norm": 4.2931318283081055, "learning_rate": 1.4687179487179488e-05, "loss": 0.3132, "step": 1036 }, { "epoch": 13.252396166134185, "grad_norm": 4.280733585357666, "learning_rate": 1.4682051282051284e-05, "loss": 0.3911, "step": 1037 }, { "epoch": 13.26517571884984, "grad_norm": 3.6063473224639893, "learning_rate": 1.4676923076923078e-05, "loss": 0.2817, "step": 1038 }, { "epoch": 13.277955271565496, "grad_norm": 3.960935354232788, "learning_rate": 1.4671794871794874e-05, "loss": 0.2993, "step": 1039 }, { "epoch": 13.29073482428115, "grad_norm": 3.975257396697998, "learning_rate": 1.4666666666666666e-05, "loss": 0.265, "step": 1040 }, { "epoch": 13.303514376996805, "grad_norm": 4.084619045257568, "learning_rate": 1.4661538461538462e-05, "loss": 0.2999, "step": 1041 }, { "epoch": 13.31629392971246, "grad_norm": 4.0943074226379395, "learning_rate": 1.4656410256410258e-05, "loss": 0.3043, "step": 1042 }, { "epoch": 13.329073482428115, "grad_norm": 4.909326553344727, "learning_rate": 1.4651282051282052e-05, "loss": 0.3645, "step": 1043 }, { "epoch": 13.34185303514377, "grad_norm": 3.738766670227051, "learning_rate": 1.4646153846153848e-05, "loss": 0.2897, "step": 1044 }, { "epoch": 13.354632587859426, "grad_norm": 4.270811557769775, "learning_rate": 1.4641025641025642e-05, "loss": 0.267, "step": 1045 }, { "epoch": 13.36741214057508, "grad_norm": 4.264181613922119, "learning_rate": 1.4635897435897437e-05, "loss": 0.3641, "step": 1046 }, { "epoch": 13.380191693290735, "grad_norm": 3.642995595932007, "learning_rate": 1.4630769230769233e-05, "loss": 0.2911, "step": 1047 }, { "epoch": 13.39297124600639, "grad_norm": 3.135666847229004, "learning_rate": 1.4625641025641027e-05, "loss": 0.2103, "step": 1048 }, { "epoch": 13.405750798722044, "grad_norm": 3.1206305027008057, "learning_rate": 1.4620512820512823e-05, "loss": 0.1989, "step": 1049 }, { "epoch": 13.418530351437699, "grad_norm": 3.8206746578216553, "learning_rate": 1.4615384615384615e-05, "loss": 0.3463, "step": 1050 }, { "epoch": 13.431309904153355, "grad_norm": 4.430676460266113, "learning_rate": 1.4610256410256411e-05, "loss": 0.2887, "step": 1051 }, { "epoch": 13.44408945686901, "grad_norm": 3.5906941890716553, "learning_rate": 1.4605128205128207e-05, "loss": 0.2832, "step": 1052 }, { "epoch": 13.456869009584665, "grad_norm": 3.3331151008605957, "learning_rate": 1.46e-05, "loss": 0.3086, "step": 1053 }, { "epoch": 13.46964856230032, "grad_norm": 3.582110643386841, "learning_rate": 1.4594871794871797e-05, "loss": 0.2267, "step": 1054 }, { "epoch": 13.482428115015974, "grad_norm": 3.459418535232544, "learning_rate": 1.458974358974359e-05, "loss": 0.2595, "step": 1055 }, { "epoch": 13.495207667731629, "grad_norm": 3.323029041290283, "learning_rate": 1.4584615384615386e-05, "loss": 0.287, "step": 1056 }, { "epoch": 13.507987220447284, "grad_norm": 3.314846992492676, "learning_rate": 1.4579487179487182e-05, "loss": 0.2498, "step": 1057 }, { "epoch": 13.52076677316294, "grad_norm": 3.9220614433288574, "learning_rate": 1.4574358974358974e-05, "loss": 0.2485, "step": 1058 }, { "epoch": 13.533546325878595, "grad_norm": 3.4297268390655518, "learning_rate": 1.4569230769230772e-05, "loss": 0.2627, "step": 1059 }, { "epoch": 13.54632587859425, "grad_norm": 4.226787567138672, "learning_rate": 1.4564102564102564e-05, "loss": 0.3431, "step": 1060 }, { "epoch": 13.559105431309904, "grad_norm": 4.605924606323242, "learning_rate": 1.455897435897436e-05, "loss": 0.2833, "step": 1061 }, { "epoch": 13.571884984025559, "grad_norm": 4.2034525871276855, "learning_rate": 1.4553846153846154e-05, "loss": 0.3244, "step": 1062 }, { "epoch": 13.584664536741213, "grad_norm": 3.6468238830566406, "learning_rate": 1.454871794871795e-05, "loss": 0.3597, "step": 1063 }, { "epoch": 13.59744408945687, "grad_norm": 3.9421803951263428, "learning_rate": 1.4543589743589746e-05, "loss": 0.2524, "step": 1064 }, { "epoch": 13.610223642172524, "grad_norm": 3.6966607570648193, "learning_rate": 1.453846153846154e-05, "loss": 0.255, "step": 1065 }, { "epoch": 13.62300319488818, "grad_norm": 4.093931674957275, "learning_rate": 1.4533333333333335e-05, "loss": 0.2975, "step": 1066 }, { "epoch": 13.635782747603834, "grad_norm": 3.0992209911346436, "learning_rate": 1.4528205128205128e-05, "loss": 0.3078, "step": 1067 }, { "epoch": 13.648562300319488, "grad_norm": 4.053220272064209, "learning_rate": 1.4523076923076923e-05, "loss": 0.2661, "step": 1068 }, { "epoch": 13.661341853035143, "grad_norm": 4.509278297424316, "learning_rate": 1.451794871794872e-05, "loss": 0.2897, "step": 1069 }, { "epoch": 13.6741214057508, "grad_norm": 4.291611194610596, "learning_rate": 1.4512820512820513e-05, "loss": 0.3536, "step": 1070 }, { "epoch": 13.686900958466454, "grad_norm": 4.656402111053467, "learning_rate": 1.4507692307692309e-05, "loss": 0.2744, "step": 1071 }, { "epoch": 13.699680511182109, "grad_norm": 3.606649398803711, "learning_rate": 1.4502564102564103e-05, "loss": 0.2722, "step": 1072 }, { "epoch": 13.712460063897764, "grad_norm": 3.419447422027588, "learning_rate": 1.4497435897435899e-05, "loss": 0.2498, "step": 1073 }, { "epoch": 13.725239616613418, "grad_norm": 4.817013263702393, "learning_rate": 1.4492307692307695e-05, "loss": 0.3591, "step": 1074 }, { "epoch": 13.738019169329073, "grad_norm": 3.9367573261260986, "learning_rate": 1.4487179487179489e-05, "loss": 0.4173, "step": 1075 }, { "epoch": 13.750798722044728, "grad_norm": 3.777169942855835, "learning_rate": 1.4482051282051284e-05, "loss": 0.2441, "step": 1076 }, { "epoch": 13.763578274760384, "grad_norm": 3.349952220916748, "learning_rate": 1.4476923076923077e-05, "loss": 0.2305, "step": 1077 }, { "epoch": 13.776357827476039, "grad_norm": 3.7569973468780518, "learning_rate": 1.4471794871794872e-05, "loss": 0.253, "step": 1078 }, { "epoch": 13.789137380191693, "grad_norm": 4.3344950675964355, "learning_rate": 1.4466666666666668e-05, "loss": 0.3236, "step": 1079 }, { "epoch": 13.801916932907348, "grad_norm": 4.360065937042236, "learning_rate": 1.4461538461538462e-05, "loss": 0.3244, "step": 1080 }, { "epoch": 13.814696485623003, "grad_norm": 3.8668954372406006, "learning_rate": 1.4456410256410258e-05, "loss": 0.2798, "step": 1081 }, { "epoch": 13.827476038338657, "grad_norm": 4.55317497253418, "learning_rate": 1.4451282051282052e-05, "loss": 0.322, "step": 1082 }, { "epoch": 13.840255591054314, "grad_norm": 4.994724750518799, "learning_rate": 1.4446153846153848e-05, "loss": 0.2554, "step": 1083 }, { "epoch": 13.853035143769969, "grad_norm": 3.47524094581604, "learning_rate": 1.4441025641025644e-05, "loss": 0.3137, "step": 1084 }, { "epoch": 13.865814696485623, "grad_norm": 3.828433036804199, "learning_rate": 1.4435897435897438e-05, "loss": 0.2692, "step": 1085 }, { "epoch": 13.878594249201278, "grad_norm": 3.433011054992676, "learning_rate": 1.4430769230769233e-05, "loss": 0.3363, "step": 1086 }, { "epoch": 13.891373801916933, "grad_norm": 3.745716094970703, "learning_rate": 1.4425641025641026e-05, "loss": 0.2724, "step": 1087 }, { "epoch": 13.904153354632587, "grad_norm": 4.244868755340576, "learning_rate": 1.4420512820512821e-05, "loss": 0.2699, "step": 1088 }, { "epoch": 13.916932907348244, "grad_norm": 3.151378631591797, "learning_rate": 1.4415384615384615e-05, "loss": 0.2184, "step": 1089 }, { "epoch": 13.929712460063898, "grad_norm": 3.5328688621520996, "learning_rate": 1.4410256410256411e-05, "loss": 0.3188, "step": 1090 }, { "epoch": 13.942492012779553, "grad_norm": 3.9043216705322266, "learning_rate": 1.4405128205128207e-05, "loss": 0.4269, "step": 1091 }, { "epoch": 13.955271565495208, "grad_norm": 4.665742874145508, "learning_rate": 1.4400000000000001e-05, "loss": 0.3837, "step": 1092 }, { "epoch": 13.968051118210862, "grad_norm": 3.9148755073547363, "learning_rate": 1.4394871794871797e-05, "loss": 0.274, "step": 1093 }, { "epoch": 13.980830670926517, "grad_norm": 4.3452630043029785, "learning_rate": 1.4389743589743589e-05, "loss": 0.3916, "step": 1094 }, { "epoch": 13.993610223642172, "grad_norm": 3.2570807933807373, "learning_rate": 1.4384615384615387e-05, "loss": 0.225, "step": 1095 }, { "epoch": 14.006389776357828, "grad_norm": 5.40920352935791, "learning_rate": 1.4379487179487182e-05, "loss": 0.4157, "step": 1096 }, { "epoch": 14.019169329073483, "grad_norm": 3.3941538333892822, "learning_rate": 1.4374358974358975e-05, "loss": 0.2426, "step": 1097 }, { "epoch": 14.031948881789138, "grad_norm": 3.874727964401245, "learning_rate": 1.436923076923077e-05, "loss": 0.2408, "step": 1098 }, { "epoch": 14.044728434504792, "grad_norm": 3.6664178371429443, "learning_rate": 1.4364102564102564e-05, "loss": 0.3009, "step": 1099 }, { "epoch": 14.057507987220447, "grad_norm": 4.098280429840088, "learning_rate": 1.435897435897436e-05, "loss": 0.2583, "step": 1100 }, { "epoch": 14.057507987220447, "eval_loss": 0.5211612582206726, "eval_runtime": 183.7005, "eval_samples_per_second": 0.855, "eval_steps_per_second": 0.109, "step": 1100 }, { "epoch": 14.070287539936102, "grad_norm": 3.162128210067749, "learning_rate": 1.4353846153846156e-05, "loss": 0.1974, "step": 1101 }, { "epoch": 14.083067092651758, "grad_norm": 3.343733787536621, "learning_rate": 1.434871794871795e-05, "loss": 0.2123, "step": 1102 }, { "epoch": 14.095846645367413, "grad_norm": 4.774056911468506, "learning_rate": 1.4343589743589746e-05, "loss": 0.3359, "step": 1103 }, { "epoch": 14.108626198083067, "grad_norm": 3.923760175704956, "learning_rate": 1.4338461538461538e-05, "loss": 0.2391, "step": 1104 }, { "epoch": 14.121405750798722, "grad_norm": 4.176171779632568, "learning_rate": 1.4333333333333334e-05, "loss": 0.2406, "step": 1105 }, { "epoch": 14.134185303514377, "grad_norm": 4.3658246994018555, "learning_rate": 1.432820512820513e-05, "loss": 0.2898, "step": 1106 }, { "epoch": 14.146964856230031, "grad_norm": 4.221414566040039, "learning_rate": 1.4323076923076924e-05, "loss": 0.298, "step": 1107 }, { "epoch": 14.159744408945686, "grad_norm": 4.855254173278809, "learning_rate": 1.431794871794872e-05, "loss": 0.2802, "step": 1108 }, { "epoch": 14.172523961661343, "grad_norm": 4.903701305389404, "learning_rate": 1.4312820512820513e-05, "loss": 0.3151, "step": 1109 }, { "epoch": 14.185303514376997, "grad_norm": 2.996689796447754, "learning_rate": 1.430769230769231e-05, "loss": 0.1835, "step": 1110 }, { "epoch": 14.198083067092652, "grad_norm": 4.175163745880127, "learning_rate": 1.4302564102564103e-05, "loss": 0.2826, "step": 1111 }, { "epoch": 14.210862619808307, "grad_norm": 4.538963317871094, "learning_rate": 1.4297435897435899e-05, "loss": 0.2523, "step": 1112 }, { "epoch": 14.223642172523961, "grad_norm": 4.176215648651123, "learning_rate": 1.4292307692307695e-05, "loss": 0.2728, "step": 1113 }, { "epoch": 14.236421725239616, "grad_norm": 4.221230983734131, "learning_rate": 1.4287179487179487e-05, "loss": 0.3101, "step": 1114 }, { "epoch": 14.249201277955272, "grad_norm": 5.207468509674072, "learning_rate": 1.4282051282051283e-05, "loss": 0.2379, "step": 1115 }, { "epoch": 14.261980830670927, "grad_norm": 2.8653461933135986, "learning_rate": 1.4276923076923077e-05, "loss": 0.2439, "step": 1116 }, { "epoch": 14.274760383386582, "grad_norm": 3.800055742263794, "learning_rate": 1.4271794871794873e-05, "loss": 0.2981, "step": 1117 }, { "epoch": 14.287539936102236, "grad_norm": 3.1056973934173584, "learning_rate": 1.4266666666666668e-05, "loss": 0.1567, "step": 1118 }, { "epoch": 14.300319488817891, "grad_norm": 3.552155017852783, "learning_rate": 1.4261538461538462e-05, "loss": 0.2754, "step": 1119 }, { "epoch": 14.313099041533546, "grad_norm": 4.416917324066162, "learning_rate": 1.4256410256410258e-05, "loss": 0.3312, "step": 1120 }, { "epoch": 14.3258785942492, "grad_norm": 3.600459098815918, "learning_rate": 1.4251282051282052e-05, "loss": 0.2085, "step": 1121 }, { "epoch": 14.338658146964857, "grad_norm": 3.768317699432373, "learning_rate": 1.4246153846153848e-05, "loss": 0.2848, "step": 1122 }, { "epoch": 14.351437699680512, "grad_norm": 3.480001211166382, "learning_rate": 1.4241025641025644e-05, "loss": 0.1951, "step": 1123 }, { "epoch": 14.364217252396166, "grad_norm": 4.8978495597839355, "learning_rate": 1.4235897435897436e-05, "loss": 0.3184, "step": 1124 }, { "epoch": 14.37699680511182, "grad_norm": 3.9940297603607178, "learning_rate": 1.4230769230769232e-05, "loss": 0.2288, "step": 1125 }, { "epoch": 14.389776357827476, "grad_norm": 3.5290699005126953, "learning_rate": 1.4225641025641026e-05, "loss": 0.1906, "step": 1126 }, { "epoch": 14.40255591054313, "grad_norm": 4.18889856338501, "learning_rate": 1.4220512820512822e-05, "loss": 0.1804, "step": 1127 }, { "epoch": 14.415335463258787, "grad_norm": 4.013011455535889, "learning_rate": 1.4215384615384617e-05, "loss": 0.3671, "step": 1128 }, { "epoch": 14.428115015974441, "grad_norm": 4.15484619140625, "learning_rate": 1.4210256410256411e-05, "loss": 0.2944, "step": 1129 }, { "epoch": 14.440894568690096, "grad_norm": 4.773779392242432, "learning_rate": 1.4205128205128207e-05, "loss": 0.3174, "step": 1130 }, { "epoch": 14.45367412140575, "grad_norm": 5.463952541351318, "learning_rate": 1.4200000000000001e-05, "loss": 0.2962, "step": 1131 }, { "epoch": 14.466453674121405, "grad_norm": 4.950415134429932, "learning_rate": 1.4194871794871797e-05, "loss": 0.2809, "step": 1132 }, { "epoch": 14.47923322683706, "grad_norm": 3.709376096725464, "learning_rate": 1.4189743589743593e-05, "loss": 0.2346, "step": 1133 }, { "epoch": 14.492012779552716, "grad_norm": 3.465916395187378, "learning_rate": 1.4184615384615385e-05, "loss": 0.212, "step": 1134 }, { "epoch": 14.504792332268371, "grad_norm": 4.268784046173096, "learning_rate": 1.4179487179487181e-05, "loss": 0.2537, "step": 1135 }, { "epoch": 14.517571884984026, "grad_norm": 3.6786417961120605, "learning_rate": 1.4174358974358975e-05, "loss": 0.2346, "step": 1136 }, { "epoch": 14.53035143769968, "grad_norm": 4.235080718994141, "learning_rate": 1.416923076923077e-05, "loss": 0.2716, "step": 1137 }, { "epoch": 14.543130990415335, "grad_norm": 4.019659519195557, "learning_rate": 1.4164102564102565e-05, "loss": 0.1886, "step": 1138 }, { "epoch": 14.55591054313099, "grad_norm": 3.9104185104370117, "learning_rate": 1.415897435897436e-05, "loss": 0.2084, "step": 1139 }, { "epoch": 14.568690095846645, "grad_norm": 4.124183177947998, "learning_rate": 1.4153846153846156e-05, "loss": 0.2415, "step": 1140 }, { "epoch": 14.581469648562301, "grad_norm": 3.432605504989624, "learning_rate": 1.4148717948717949e-05, "loss": 0.2125, "step": 1141 }, { "epoch": 14.594249201277956, "grad_norm": 4.036380290985107, "learning_rate": 1.4143589743589744e-05, "loss": 0.2915, "step": 1142 }, { "epoch": 14.60702875399361, "grad_norm": 4.067980766296387, "learning_rate": 1.4138461538461538e-05, "loss": 0.2747, "step": 1143 }, { "epoch": 14.619808306709265, "grad_norm": 3.7604496479034424, "learning_rate": 1.4133333333333334e-05, "loss": 0.1984, "step": 1144 }, { "epoch": 14.63258785942492, "grad_norm": 3.815758466720581, "learning_rate": 1.412820512820513e-05, "loss": 0.2544, "step": 1145 }, { "epoch": 14.645367412140574, "grad_norm": 4.104861259460449, "learning_rate": 1.4123076923076924e-05, "loss": 0.261, "step": 1146 }, { "epoch": 14.65814696485623, "grad_norm": 4.117253303527832, "learning_rate": 1.411794871794872e-05, "loss": 0.2269, "step": 1147 }, { "epoch": 14.670926517571885, "grad_norm": 4.8703837394714355, "learning_rate": 1.4112820512820514e-05, "loss": 0.3065, "step": 1148 }, { "epoch": 14.68370607028754, "grad_norm": 3.6039960384368896, "learning_rate": 1.410769230769231e-05, "loss": 0.2675, "step": 1149 }, { "epoch": 14.696485623003195, "grad_norm": 4.1222615242004395, "learning_rate": 1.4102564102564105e-05, "loss": 0.2527, "step": 1150 }, { "epoch": 14.70926517571885, "grad_norm": 4.146384239196777, "learning_rate": 1.4097435897435898e-05, "loss": 0.2932, "step": 1151 }, { "epoch": 14.722044728434504, "grad_norm": 3.5116586685180664, "learning_rate": 1.4092307692307693e-05, "loss": 0.1855, "step": 1152 }, { "epoch": 14.73482428115016, "grad_norm": 4.162652492523193, "learning_rate": 1.4087179487179487e-05, "loss": 0.3125, "step": 1153 }, { "epoch": 14.747603833865815, "grad_norm": 4.2060747146606445, "learning_rate": 1.4082051282051283e-05, "loss": 0.2546, "step": 1154 }, { "epoch": 14.76038338658147, "grad_norm": 4.2779765129089355, "learning_rate": 1.4076923076923079e-05, "loss": 0.2141, "step": 1155 }, { "epoch": 14.773162939297125, "grad_norm": 3.6725728511810303, "learning_rate": 1.4071794871794873e-05, "loss": 0.2514, "step": 1156 }, { "epoch": 14.78594249201278, "grad_norm": 4.1309356689453125, "learning_rate": 1.4066666666666669e-05, "loss": 0.2159, "step": 1157 }, { "epoch": 14.798722044728434, "grad_norm": 4.970953941345215, "learning_rate": 1.4061538461538463e-05, "loss": 0.3516, "step": 1158 }, { "epoch": 14.811501597444089, "grad_norm": 4.282907009124756, "learning_rate": 1.4056410256410258e-05, "loss": 0.2957, "step": 1159 }, { "epoch": 14.824281150159745, "grad_norm": 4.15264368057251, "learning_rate": 1.405128205128205e-05, "loss": 0.3114, "step": 1160 }, { "epoch": 14.8370607028754, "grad_norm": 6.291984558105469, "learning_rate": 1.4046153846153847e-05, "loss": 0.3656, "step": 1161 }, { "epoch": 14.849840255591054, "grad_norm": 4.7016520500183105, "learning_rate": 1.4041025641025642e-05, "loss": 0.2923, "step": 1162 }, { "epoch": 14.86261980830671, "grad_norm": 4.363107204437256, "learning_rate": 1.4035897435897436e-05, "loss": 0.2412, "step": 1163 }, { "epoch": 14.875399361022364, "grad_norm": 4.666199207305908, "learning_rate": 1.4030769230769232e-05, "loss": 0.3477, "step": 1164 }, { "epoch": 14.888178913738018, "grad_norm": 5.607475280761719, "learning_rate": 1.4025641025641026e-05, "loss": 0.4089, "step": 1165 }, { "epoch": 14.900958466453675, "grad_norm": 4.675595760345459, "learning_rate": 1.4020512820512822e-05, "loss": 0.247, "step": 1166 }, { "epoch": 14.91373801916933, "grad_norm": 4.405561447143555, "learning_rate": 1.4015384615384618e-05, "loss": 0.2161, "step": 1167 }, { "epoch": 14.926517571884984, "grad_norm": 4.1602983474731445, "learning_rate": 1.4010256410256412e-05, "loss": 0.3375, "step": 1168 }, { "epoch": 14.939297124600639, "grad_norm": 4.212582111358643, "learning_rate": 1.4005128205128207e-05, "loss": 0.2899, "step": 1169 }, { "epoch": 14.952076677316294, "grad_norm": 3.6928679943084717, "learning_rate": 1.4e-05, "loss": 0.2195, "step": 1170 }, { "epoch": 14.964856230031948, "grad_norm": 3.720360040664673, "learning_rate": 1.3994871794871796e-05, "loss": 0.2048, "step": 1171 }, { "epoch": 14.977635782747605, "grad_norm": 4.18305778503418, "learning_rate": 1.3989743589743591e-05, "loss": 0.2354, "step": 1172 }, { "epoch": 14.99041533546326, "grad_norm": 3.789397716522217, "learning_rate": 1.3984615384615385e-05, "loss": 0.2287, "step": 1173 }, { "epoch": 15.003194888178914, "grad_norm": 3.868288516998291, "learning_rate": 1.3979487179487181e-05, "loss": 0.3126, "step": 1174 }, { "epoch": 15.015974440894569, "grad_norm": 3.489731550216675, "learning_rate": 1.3974358974358975e-05, "loss": 0.2087, "step": 1175 }, { "epoch": 15.028753993610223, "grad_norm": 3.4861392974853516, "learning_rate": 1.3969230769230771e-05, "loss": 0.2221, "step": 1176 }, { "epoch": 15.041533546325878, "grad_norm": 4.025823593139648, "learning_rate": 1.3964102564102567e-05, "loss": 0.2304, "step": 1177 }, { "epoch": 15.054313099041533, "grad_norm": 3.0816750526428223, "learning_rate": 1.3958974358974359e-05, "loss": 0.2452, "step": 1178 }, { "epoch": 15.06709265175719, "grad_norm": 3.171478033065796, "learning_rate": 1.3953846153846156e-05, "loss": 0.159, "step": 1179 }, { "epoch": 15.079872204472844, "grad_norm": 3.781932830810547, "learning_rate": 1.3948717948717949e-05, "loss": 0.2649, "step": 1180 }, { "epoch": 15.092651757188499, "grad_norm": 3.238219738006592, "learning_rate": 1.3943589743589745e-05, "loss": 0.2139, "step": 1181 }, { "epoch": 15.105431309904153, "grad_norm": 4.112623691558838, "learning_rate": 1.393846153846154e-05, "loss": 0.2313, "step": 1182 }, { "epoch": 15.118210862619808, "grad_norm": 4.599632263183594, "learning_rate": 1.3933333333333334e-05, "loss": 0.2026, "step": 1183 }, { "epoch": 15.130990415335463, "grad_norm": 4.236635684967041, "learning_rate": 1.392820512820513e-05, "loss": 0.2559, "step": 1184 }, { "epoch": 15.143769968051119, "grad_norm": 3.911651134490967, "learning_rate": 1.3923076923076924e-05, "loss": 0.2067, "step": 1185 }, { "epoch": 15.156549520766774, "grad_norm": 3.7214670181274414, "learning_rate": 1.391794871794872e-05, "loss": 0.1649, "step": 1186 }, { "epoch": 15.169329073482428, "grad_norm": 4.420977592468262, "learning_rate": 1.3912820512820512e-05, "loss": 0.2361, "step": 1187 }, { "epoch": 15.182108626198083, "grad_norm": 4.661736011505127, "learning_rate": 1.3907692307692308e-05, "loss": 0.1881, "step": 1188 }, { "epoch": 15.194888178913738, "grad_norm": 4.55980110168457, "learning_rate": 1.3902564102564104e-05, "loss": 0.2297, "step": 1189 }, { "epoch": 15.207667731629392, "grad_norm": 3.6603283882141113, "learning_rate": 1.3897435897435898e-05, "loss": 0.1636, "step": 1190 }, { "epoch": 15.220447284345047, "grad_norm": 4.215869426727295, "learning_rate": 1.3892307692307694e-05, "loss": 0.2806, "step": 1191 }, { "epoch": 15.233226837060704, "grad_norm": 3.951566696166992, "learning_rate": 1.3887179487179488e-05, "loss": 0.308, "step": 1192 }, { "epoch": 15.246006389776358, "grad_norm": 4.275181770324707, "learning_rate": 1.3882051282051283e-05, "loss": 0.2921, "step": 1193 }, { "epoch": 15.258785942492013, "grad_norm": 3.9310081005096436, "learning_rate": 1.3876923076923079e-05, "loss": 0.2026, "step": 1194 }, { "epoch": 15.271565495207668, "grad_norm": 4.194605350494385, "learning_rate": 1.3871794871794873e-05, "loss": 0.2177, "step": 1195 }, { "epoch": 15.284345047923322, "grad_norm": 3.8993043899536133, "learning_rate": 1.3866666666666669e-05, "loss": 0.2069, "step": 1196 }, { "epoch": 15.297124600638977, "grad_norm": 4.589502334594727, "learning_rate": 1.3861538461538461e-05, "loss": 0.2307, "step": 1197 }, { "epoch": 15.309904153354633, "grad_norm": 3.8216519355773926, "learning_rate": 1.3856410256410257e-05, "loss": 0.2491, "step": 1198 }, { "epoch": 15.322683706070288, "grad_norm": 3.8244547843933105, "learning_rate": 1.3851282051282053e-05, "loss": 0.197, "step": 1199 }, { "epoch": 15.335463258785943, "grad_norm": 3.713712215423584, "learning_rate": 1.3846153846153847e-05, "loss": 0.2318, "step": 1200 }, { "epoch": 15.335463258785943, "eval_loss": 0.5259692668914795, "eval_runtime": 183.2489, "eval_samples_per_second": 0.857, "eval_steps_per_second": 0.109, "step": 1200 }, { "epoch": 15.348242811501597, "grad_norm": 3.2528011798858643, "learning_rate": 1.3841025641025643e-05, "loss": 0.2092, "step": 1201 }, { "epoch": 15.361022364217252, "grad_norm": 3.8089711666107178, "learning_rate": 1.3835897435897437e-05, "loss": 0.2623, "step": 1202 }, { "epoch": 15.373801916932907, "grad_norm": 3.917395830154419, "learning_rate": 1.3830769230769232e-05, "loss": 0.2547, "step": 1203 }, { "epoch": 15.386581469648561, "grad_norm": 3.8710122108459473, "learning_rate": 1.3825641025641028e-05, "loss": 0.2426, "step": 1204 }, { "epoch": 15.399361022364218, "grad_norm": 3.7556984424591064, "learning_rate": 1.3820512820512822e-05, "loss": 0.2428, "step": 1205 }, { "epoch": 15.412140575079873, "grad_norm": 3.3744430541992188, "learning_rate": 1.3815384615384618e-05, "loss": 0.2461, "step": 1206 }, { "epoch": 15.424920127795527, "grad_norm": 3.5001065731048584, "learning_rate": 1.381025641025641e-05, "loss": 0.1807, "step": 1207 }, { "epoch": 15.437699680511182, "grad_norm": 3.866056442260742, "learning_rate": 1.3805128205128206e-05, "loss": 0.1739, "step": 1208 }, { "epoch": 15.450479233226837, "grad_norm": 4.576041221618652, "learning_rate": 1.38e-05, "loss": 0.2128, "step": 1209 }, { "epoch": 15.463258785942491, "grad_norm": 4.190147876739502, "learning_rate": 1.3794871794871796e-05, "loss": 0.1893, "step": 1210 }, { "epoch": 15.476038338658148, "grad_norm": 4.4232940673828125, "learning_rate": 1.3789743589743592e-05, "loss": 0.2429, "step": 1211 }, { "epoch": 15.488817891373802, "grad_norm": 4.887746810913086, "learning_rate": 1.3784615384615386e-05, "loss": 0.193, "step": 1212 }, { "epoch": 15.501597444089457, "grad_norm": 4.072628498077393, "learning_rate": 1.3779487179487181e-05, "loss": 0.2494, "step": 1213 }, { "epoch": 15.514376996805112, "grad_norm": 4.401451110839844, "learning_rate": 1.3774358974358975e-05, "loss": 0.2221, "step": 1214 }, { "epoch": 15.527156549520766, "grad_norm": 4.5076422691345215, "learning_rate": 1.3769230769230771e-05, "loss": 0.2881, "step": 1215 }, { "epoch": 15.539936102236421, "grad_norm": 4.133837699890137, "learning_rate": 1.3764102564102567e-05, "loss": 0.1981, "step": 1216 }, { "epoch": 15.552715654952078, "grad_norm": 4.182383060455322, "learning_rate": 1.375897435897436e-05, "loss": 0.2526, "step": 1217 }, { "epoch": 15.565495207667732, "grad_norm": 4.154629707336426, "learning_rate": 1.3753846153846155e-05, "loss": 0.2917, "step": 1218 }, { "epoch": 15.578274760383387, "grad_norm": 4.1074347496032715, "learning_rate": 1.3748717948717949e-05, "loss": 0.181, "step": 1219 }, { "epoch": 15.591054313099042, "grad_norm": 4.3682403564453125, "learning_rate": 1.3743589743589745e-05, "loss": 0.1997, "step": 1220 }, { "epoch": 15.603833865814696, "grad_norm": 4.54557466506958, "learning_rate": 1.373846153846154e-05, "loss": 0.1816, "step": 1221 }, { "epoch": 15.616613418530351, "grad_norm": 5.212317943572998, "learning_rate": 1.3733333333333335e-05, "loss": 0.2666, "step": 1222 }, { "epoch": 15.629392971246006, "grad_norm": 3.5865111351013184, "learning_rate": 1.372820512820513e-05, "loss": 0.1983, "step": 1223 }, { "epoch": 15.642172523961662, "grad_norm": 4.672217845916748, "learning_rate": 1.3723076923076923e-05, "loss": 0.3151, "step": 1224 }, { "epoch": 15.654952076677317, "grad_norm": 3.933727741241455, "learning_rate": 1.3717948717948718e-05, "loss": 0.2512, "step": 1225 }, { "epoch": 15.667731629392971, "grad_norm": 3.665114164352417, "learning_rate": 1.3712820512820514e-05, "loss": 0.1774, "step": 1226 }, { "epoch": 15.680511182108626, "grad_norm": 4.567009925842285, "learning_rate": 1.3707692307692308e-05, "loss": 0.2918, "step": 1227 }, { "epoch": 15.69329073482428, "grad_norm": 3.781317949295044, "learning_rate": 1.3702564102564104e-05, "loss": 0.2409, "step": 1228 }, { "epoch": 15.706070287539935, "grad_norm": 4.618795394897461, "learning_rate": 1.3697435897435898e-05, "loss": 0.29, "step": 1229 }, { "epoch": 15.718849840255592, "grad_norm": 4.268044471740723, "learning_rate": 1.3692307692307694e-05, "loss": 0.2358, "step": 1230 }, { "epoch": 15.731629392971247, "grad_norm": 4.229768753051758, "learning_rate": 1.3687179487179488e-05, "loss": 0.221, "step": 1231 }, { "epoch": 15.744408945686901, "grad_norm": 4.791956901550293, "learning_rate": 1.3682051282051284e-05, "loss": 0.3294, "step": 1232 }, { "epoch": 15.757188498402556, "grad_norm": 4.449877738952637, "learning_rate": 1.367692307692308e-05, "loss": 0.2586, "step": 1233 }, { "epoch": 15.76996805111821, "grad_norm": 4.792293071746826, "learning_rate": 1.3671794871794872e-05, "loss": 0.2943, "step": 1234 }, { "epoch": 15.782747603833865, "grad_norm": 3.772984743118286, "learning_rate": 1.3666666666666667e-05, "loss": 0.2267, "step": 1235 }, { "epoch": 15.795527156549522, "grad_norm": 3.729710578918457, "learning_rate": 1.3661538461538461e-05, "loss": 0.223, "step": 1236 }, { "epoch": 15.808306709265176, "grad_norm": 5.369871616363525, "learning_rate": 1.3656410256410257e-05, "loss": 0.2562, "step": 1237 }, { "epoch": 15.821086261980831, "grad_norm": 4.903266906738281, "learning_rate": 1.3651282051282053e-05, "loss": 0.2558, "step": 1238 }, { "epoch": 15.833865814696486, "grad_norm": 3.3962509632110596, "learning_rate": 1.3646153846153847e-05, "loss": 0.1516, "step": 1239 }, { "epoch": 15.84664536741214, "grad_norm": 5.115278244018555, "learning_rate": 1.3641025641025643e-05, "loss": 0.1924, "step": 1240 }, { "epoch": 15.859424920127795, "grad_norm": 4.458291053771973, "learning_rate": 1.3635897435897437e-05, "loss": 0.3075, "step": 1241 }, { "epoch": 15.87220447284345, "grad_norm": 3.402921199798584, "learning_rate": 1.3630769230769233e-05, "loss": 0.1319, "step": 1242 }, { "epoch": 15.884984025559106, "grad_norm": 4.096307277679443, "learning_rate": 1.3625641025641028e-05, "loss": 0.2711, "step": 1243 }, { "epoch": 15.89776357827476, "grad_norm": 4.215847969055176, "learning_rate": 1.362051282051282e-05, "loss": 0.2353, "step": 1244 }, { "epoch": 15.910543130990416, "grad_norm": 4.571593761444092, "learning_rate": 1.3615384615384616e-05, "loss": 0.2092, "step": 1245 }, { "epoch": 15.92332268370607, "grad_norm": 4.602342128753662, "learning_rate": 1.361025641025641e-05, "loss": 0.351, "step": 1246 }, { "epoch": 15.936102236421725, "grad_norm": 4.611629009246826, "learning_rate": 1.3605128205128206e-05, "loss": 0.1752, "step": 1247 }, { "epoch": 15.94888178913738, "grad_norm": 5.004614353179932, "learning_rate": 1.3600000000000002e-05, "loss": 0.2447, "step": 1248 }, { "epoch": 15.961661341853034, "grad_norm": 4.196335315704346, "learning_rate": 1.3594871794871796e-05, "loss": 0.2436, "step": 1249 }, { "epoch": 15.97444089456869, "grad_norm": 3.1841630935668945, "learning_rate": 1.3589743589743592e-05, "loss": 0.1514, "step": 1250 }, { "epoch": 15.987220447284345, "grad_norm": 3.8978161811828613, "learning_rate": 1.3584615384615386e-05, "loss": 0.3935, "step": 1251 }, { "epoch": 16.0, "grad_norm": 4.464201927185059, "learning_rate": 1.3579487179487182e-05, "loss": 0.2584, "step": 1252 }, { "epoch": 16.012779552715656, "grad_norm": 3.4007415771484375, "learning_rate": 1.3574358974358977e-05, "loss": 0.2057, "step": 1253 }, { "epoch": 16.02555910543131, "grad_norm": 3.3608012199401855, "learning_rate": 1.356923076923077e-05, "loss": 0.2504, "step": 1254 }, { "epoch": 16.038338658146966, "grad_norm": 3.8571906089782715, "learning_rate": 1.3564102564102565e-05, "loss": 0.1662, "step": 1255 }, { "epoch": 16.05111821086262, "grad_norm": 4.310530185699463, "learning_rate": 1.355897435897436e-05, "loss": 0.2197, "step": 1256 }, { "epoch": 16.063897763578275, "grad_norm": 4.129376411437988, "learning_rate": 1.3553846153846155e-05, "loss": 0.1793, "step": 1257 }, { "epoch": 16.076677316293928, "grad_norm": 4.5880045890808105, "learning_rate": 1.354871794871795e-05, "loss": 0.1637, "step": 1258 }, { "epoch": 16.089456869009584, "grad_norm": 34.26468276977539, "learning_rate": 1.3543589743589745e-05, "loss": 0.2003, "step": 1259 }, { "epoch": 16.10223642172524, "grad_norm": 4.134389877319336, "learning_rate": 1.353846153846154e-05, "loss": 0.191, "step": 1260 }, { "epoch": 16.115015974440894, "grad_norm": 5.159793376922607, "learning_rate": 1.3533333333333333e-05, "loss": 0.1985, "step": 1261 }, { "epoch": 16.12779552715655, "grad_norm": 4.2011003494262695, "learning_rate": 1.3528205128205129e-05, "loss": 0.1624, "step": 1262 }, { "epoch": 16.140575079872203, "grad_norm": 4.127978801727295, "learning_rate": 1.3523076923076923e-05, "loss": 0.1746, "step": 1263 }, { "epoch": 16.15335463258786, "grad_norm": 4.027311325073242, "learning_rate": 1.3517948717948719e-05, "loss": 0.1958, "step": 1264 }, { "epoch": 16.166134185303516, "grad_norm": 4.948392868041992, "learning_rate": 1.3512820512820514e-05, "loss": 0.2804, "step": 1265 }, { "epoch": 16.17891373801917, "grad_norm": 3.5425753593444824, "learning_rate": 1.3507692307692308e-05, "loss": 0.132, "step": 1266 }, { "epoch": 16.191693290734825, "grad_norm": 4.664565563201904, "learning_rate": 1.3502564102564104e-05, "loss": 0.2148, "step": 1267 }, { "epoch": 16.20447284345048, "grad_norm": 4.611464977264404, "learning_rate": 1.3497435897435898e-05, "loss": 0.2382, "step": 1268 }, { "epoch": 16.217252396166135, "grad_norm": 3.5738282203674316, "learning_rate": 1.3492307692307694e-05, "loss": 0.1605, "step": 1269 }, { "epoch": 16.230031948881788, "grad_norm": 3.991009473800659, "learning_rate": 1.348717948717949e-05, "loss": 0.1839, "step": 1270 }, { "epoch": 16.242811501597444, "grad_norm": 4.211781024932861, "learning_rate": 1.3482051282051282e-05, "loss": 0.2611, "step": 1271 }, { "epoch": 16.2555910543131, "grad_norm": 3.7441012859344482, "learning_rate": 1.3476923076923078e-05, "loss": 0.1583, "step": 1272 }, { "epoch": 16.268370607028753, "grad_norm": 5.4371490478515625, "learning_rate": 1.3471794871794872e-05, "loss": 0.3684, "step": 1273 }, { "epoch": 16.28115015974441, "grad_norm": 3.0008156299591064, "learning_rate": 1.3466666666666668e-05, "loss": 0.1685, "step": 1274 }, { "epoch": 16.293929712460063, "grad_norm": 4.355956554412842, "learning_rate": 1.3461538461538463e-05, "loss": 0.1921, "step": 1275 }, { "epoch": 16.30670926517572, "grad_norm": 3.5138537883758545, "learning_rate": 1.3456410256410257e-05, "loss": 0.136, "step": 1276 }, { "epoch": 16.319488817891372, "grad_norm": 3.696995258331299, "learning_rate": 1.3451282051282053e-05, "loss": 0.1814, "step": 1277 }, { "epoch": 16.33226837060703, "grad_norm": 4.32089376449585, "learning_rate": 1.3446153846153847e-05, "loss": 0.2155, "step": 1278 }, { "epoch": 16.345047923322685, "grad_norm": 3.804948568344116, "learning_rate": 1.3441025641025643e-05, "loss": 0.1293, "step": 1279 }, { "epoch": 16.357827476038338, "grad_norm": 3.8062856197357178, "learning_rate": 1.3435897435897435e-05, "loss": 0.1458, "step": 1280 }, { "epoch": 16.370607028753994, "grad_norm": 4.169013500213623, "learning_rate": 1.3430769230769231e-05, "loss": 0.2107, "step": 1281 }, { "epoch": 16.383386581469647, "grad_norm": 5.1493306159973145, "learning_rate": 1.3425641025641027e-05, "loss": 0.1938, "step": 1282 }, { "epoch": 16.396166134185304, "grad_norm": 4.721076011657715, "learning_rate": 1.3420512820512821e-05, "loss": 0.3148, "step": 1283 }, { "epoch": 16.408945686900957, "grad_norm": 4.6146321296691895, "learning_rate": 1.3415384615384617e-05, "loss": 0.2838, "step": 1284 }, { "epoch": 16.421725239616613, "grad_norm": 3.725146770477295, "learning_rate": 1.341025641025641e-05, "loss": 0.1809, "step": 1285 }, { "epoch": 16.43450479233227, "grad_norm": 4.293447017669678, "learning_rate": 1.3405128205128206e-05, "loss": 0.2271, "step": 1286 }, { "epoch": 16.447284345047922, "grad_norm": 4.092364311218262, "learning_rate": 1.3400000000000002e-05, "loss": 0.1976, "step": 1287 }, { "epoch": 16.46006389776358, "grad_norm": 5.2461161613464355, "learning_rate": 1.3394871794871796e-05, "loss": 0.2599, "step": 1288 }, { "epoch": 16.472843450479232, "grad_norm": 3.7622294425964355, "learning_rate": 1.3389743589743592e-05, "loss": 0.193, "step": 1289 }, { "epoch": 16.48562300319489, "grad_norm": 3.409365653991699, "learning_rate": 1.3384615384615384e-05, "loss": 0.1305, "step": 1290 }, { "epoch": 16.498402555910545, "grad_norm": 3.733654737472534, "learning_rate": 1.337948717948718e-05, "loss": 0.2246, "step": 1291 }, { "epoch": 16.511182108626198, "grad_norm": 4.387030124664307, "learning_rate": 1.3374358974358976e-05, "loss": 0.2397, "step": 1292 }, { "epoch": 16.523961661341854, "grad_norm": 3.9117209911346436, "learning_rate": 1.336923076923077e-05, "loss": 0.1676, "step": 1293 }, { "epoch": 16.536741214057507, "grad_norm": 4.131397724151611, "learning_rate": 1.3364102564102566e-05, "loss": 0.2039, "step": 1294 }, { "epoch": 16.549520766773163, "grad_norm": 3.769676446914673, "learning_rate": 1.335897435897436e-05, "loss": 0.1712, "step": 1295 }, { "epoch": 16.562300319488816, "grad_norm": 4.4638848304748535, "learning_rate": 1.3353846153846155e-05, "loss": 0.1844, "step": 1296 }, { "epoch": 16.575079872204473, "grad_norm": 4.5951361656188965, "learning_rate": 1.3348717948717951e-05, "loss": 0.1891, "step": 1297 }, { "epoch": 16.58785942492013, "grad_norm": 4.4006667137146, "learning_rate": 1.3343589743589745e-05, "loss": 0.2095, "step": 1298 }, { "epoch": 16.600638977635782, "grad_norm": 5.249375820159912, "learning_rate": 1.3338461538461541e-05, "loss": 0.2312, "step": 1299 }, { "epoch": 16.61341853035144, "grad_norm": 4.6556243896484375, "learning_rate": 1.3333333333333333e-05, "loss": 0.297, "step": 1300 }, { "epoch": 16.61341853035144, "eval_loss": 0.5607043504714966, "eval_runtime": 183.5728, "eval_samples_per_second": 0.855, "eval_steps_per_second": 0.109, "step": 1300 }, { "epoch": 16.62619808306709, "grad_norm": 4.347846031188965, "learning_rate": 1.3328205128205129e-05, "loss": 0.2292, "step": 1301 }, { "epoch": 16.638977635782748, "grad_norm": 4.716831684112549, "learning_rate": 1.3323076923076925e-05, "loss": 0.1874, "step": 1302 }, { "epoch": 16.6517571884984, "grad_norm": 3.6589035987854004, "learning_rate": 1.3317948717948719e-05, "loss": 0.1601, "step": 1303 }, { "epoch": 16.664536741214057, "grad_norm": 3.7984180450439453, "learning_rate": 1.3312820512820515e-05, "loss": 0.2677, "step": 1304 }, { "epoch": 16.677316293929714, "grad_norm": 3.922208309173584, "learning_rate": 1.3307692307692309e-05, "loss": 0.1818, "step": 1305 }, { "epoch": 16.690095846645367, "grad_norm": 4.426130294799805, "learning_rate": 1.3302564102564104e-05, "loss": 0.2011, "step": 1306 }, { "epoch": 16.702875399361023, "grad_norm": 4.943461894989014, "learning_rate": 1.3297435897435897e-05, "loss": 0.3119, "step": 1307 }, { "epoch": 16.715654952076676, "grad_norm": 3.7821176052093506, "learning_rate": 1.3292307692307692e-05, "loss": 0.13, "step": 1308 }, { "epoch": 16.728434504792332, "grad_norm": 3.5825772285461426, "learning_rate": 1.3287179487179488e-05, "loss": 0.1673, "step": 1309 }, { "epoch": 16.74121405750799, "grad_norm": 5.194157600402832, "learning_rate": 1.3282051282051282e-05, "loss": 0.1959, "step": 1310 }, { "epoch": 16.75399361022364, "grad_norm": 5.931427001953125, "learning_rate": 1.3276923076923078e-05, "loss": 0.3577, "step": 1311 }, { "epoch": 16.766773162939298, "grad_norm": 4.4217071533203125, "learning_rate": 1.3271794871794872e-05, "loss": 0.2588, "step": 1312 }, { "epoch": 16.77955271565495, "grad_norm": 4.538804054260254, "learning_rate": 1.3266666666666668e-05, "loss": 0.2171, "step": 1313 }, { "epoch": 16.792332268370608, "grad_norm": 5.510791778564453, "learning_rate": 1.3261538461538464e-05, "loss": 0.345, "step": 1314 }, { "epoch": 16.80511182108626, "grad_norm": 4.038776874542236, "learning_rate": 1.3256410256410258e-05, "loss": 0.1822, "step": 1315 }, { "epoch": 16.817891373801917, "grad_norm": 4.527102947235107, "learning_rate": 1.3251282051282053e-05, "loss": 0.2724, "step": 1316 }, { "epoch": 16.830670926517573, "grad_norm": 4.719036102294922, "learning_rate": 1.3246153846153846e-05, "loss": 0.2073, "step": 1317 }, { "epoch": 16.843450479233226, "grad_norm": 5.0867791175842285, "learning_rate": 1.3241025641025641e-05, "loss": 0.2437, "step": 1318 }, { "epoch": 16.856230031948883, "grad_norm": 3.9539568424224854, "learning_rate": 1.3235897435897437e-05, "loss": 0.1762, "step": 1319 }, { "epoch": 16.869009584664536, "grad_norm": 3.6446242332458496, "learning_rate": 1.3230769230769231e-05, "loss": 0.1671, "step": 1320 }, { "epoch": 16.881789137380192, "grad_norm": 4.763880729675293, "learning_rate": 1.3225641025641027e-05, "loss": 0.2316, "step": 1321 }, { "epoch": 16.894568690095845, "grad_norm": 3.9518229961395264, "learning_rate": 1.3220512820512821e-05, "loss": 0.1966, "step": 1322 }, { "epoch": 16.9073482428115, "grad_norm": 3.724243402481079, "learning_rate": 1.3215384615384617e-05, "loss": 0.1164, "step": 1323 }, { "epoch": 16.920127795527158, "grad_norm": 3.9408881664276123, "learning_rate": 1.3210256410256413e-05, "loss": 0.1555, "step": 1324 }, { "epoch": 16.93290734824281, "grad_norm": 4.742832183837891, "learning_rate": 1.3205128205128207e-05, "loss": 0.256, "step": 1325 }, { "epoch": 16.945686900958467, "grad_norm": 4.283969879150391, "learning_rate": 1.3200000000000002e-05, "loss": 0.19, "step": 1326 }, { "epoch": 16.95846645367412, "grad_norm": 4.325873374938965, "learning_rate": 1.3194871794871795e-05, "loss": 0.2203, "step": 1327 }, { "epoch": 16.971246006389777, "grad_norm": 4.042814254760742, "learning_rate": 1.318974358974359e-05, "loss": 0.2201, "step": 1328 }, { "epoch": 16.984025559105433, "grad_norm": 4.018223285675049, "learning_rate": 1.3184615384615385e-05, "loss": 0.1951, "step": 1329 }, { "epoch": 16.996805111821086, "grad_norm": 4.505209922790527, "learning_rate": 1.317948717948718e-05, "loss": 0.234, "step": 1330 }, { "epoch": 17.009584664536742, "grad_norm": 4.061479091644287, "learning_rate": 1.3174358974358976e-05, "loss": 0.2135, "step": 1331 }, { "epoch": 17.022364217252395, "grad_norm": 4.851925373077393, "learning_rate": 1.316923076923077e-05, "loss": 0.2187, "step": 1332 }, { "epoch": 17.03514376996805, "grad_norm": 3.1499011516571045, "learning_rate": 1.3164102564102566e-05, "loss": 0.1274, "step": 1333 }, { "epoch": 17.047923322683705, "grad_norm": 4.1169233322143555, "learning_rate": 1.315897435897436e-05, "loss": 0.1484, "step": 1334 }, { "epoch": 17.06070287539936, "grad_norm": 4.023073673248291, "learning_rate": 1.3153846153846156e-05, "loss": 0.1449, "step": 1335 }, { "epoch": 17.073482428115017, "grad_norm": 3.9548263549804688, "learning_rate": 1.3148717948717951e-05, "loss": 0.11, "step": 1336 }, { "epoch": 17.08626198083067, "grad_norm": 3.6275691986083984, "learning_rate": 1.3143589743589744e-05, "loss": 0.1581, "step": 1337 }, { "epoch": 17.099041533546327, "grad_norm": 3.9273226261138916, "learning_rate": 1.313846153846154e-05, "loss": 0.137, "step": 1338 }, { "epoch": 17.11182108626198, "grad_norm": 3.3632616996765137, "learning_rate": 1.3133333333333334e-05, "loss": 0.1551, "step": 1339 }, { "epoch": 17.124600638977636, "grad_norm": 4.732468605041504, "learning_rate": 1.312820512820513e-05, "loss": 0.2677, "step": 1340 }, { "epoch": 17.13738019169329, "grad_norm": 4.406965255737305, "learning_rate": 1.3123076923076925e-05, "loss": 0.1712, "step": 1341 }, { "epoch": 17.150159744408946, "grad_norm": 4.088700294494629, "learning_rate": 1.3117948717948719e-05, "loss": 0.2075, "step": 1342 }, { "epoch": 17.162939297124602, "grad_norm": 3.2795867919921875, "learning_rate": 1.3112820512820515e-05, "loss": 0.132, "step": 1343 }, { "epoch": 17.175718849840255, "grad_norm": 4.069875240325928, "learning_rate": 1.3107692307692307e-05, "loss": 0.1838, "step": 1344 }, { "epoch": 17.18849840255591, "grad_norm": 4.887452602386475, "learning_rate": 1.3102564102564103e-05, "loss": 0.2012, "step": 1345 }, { "epoch": 17.201277955271564, "grad_norm": 5.204954147338867, "learning_rate": 1.3097435897435899e-05, "loss": 0.1768, "step": 1346 }, { "epoch": 17.21405750798722, "grad_norm": 4.781338691711426, "learning_rate": 1.3092307692307693e-05, "loss": 0.2594, "step": 1347 }, { "epoch": 17.226837060702877, "grad_norm": 4.1371870040893555, "learning_rate": 1.3087179487179488e-05, "loss": 0.1797, "step": 1348 }, { "epoch": 17.23961661341853, "grad_norm": 4.221982002258301, "learning_rate": 1.3082051282051283e-05, "loss": 0.1579, "step": 1349 }, { "epoch": 17.252396166134186, "grad_norm": 4.641051769256592, "learning_rate": 1.3076923076923078e-05, "loss": 0.2327, "step": 1350 }, { "epoch": 17.26517571884984, "grad_norm": 4.150181293487549, "learning_rate": 1.3071794871794874e-05, "loss": 0.1501, "step": 1351 }, { "epoch": 17.277955271565496, "grad_norm": 4.188239097595215, "learning_rate": 1.3066666666666668e-05, "loss": 0.1982, "step": 1352 }, { "epoch": 17.29073482428115, "grad_norm": 3.598334312438965, "learning_rate": 1.3061538461538464e-05, "loss": 0.1179, "step": 1353 }, { "epoch": 17.303514376996805, "grad_norm": 4.53875732421875, "learning_rate": 1.3056410256410256e-05, "loss": 0.1706, "step": 1354 }, { "epoch": 17.31629392971246, "grad_norm": 3.595996379852295, "learning_rate": 1.3051282051282052e-05, "loss": 0.1708, "step": 1355 }, { "epoch": 17.329073482428115, "grad_norm": 3.832343578338623, "learning_rate": 1.3046153846153846e-05, "loss": 0.1785, "step": 1356 }, { "epoch": 17.34185303514377, "grad_norm": 5.0454583168029785, "learning_rate": 1.3041025641025642e-05, "loss": 0.1427, "step": 1357 }, { "epoch": 17.354632587859424, "grad_norm": 4.708250999450684, "learning_rate": 1.3035897435897437e-05, "loss": 0.1731, "step": 1358 }, { "epoch": 17.36741214057508, "grad_norm": 3.5192267894744873, "learning_rate": 1.3030769230769231e-05, "loss": 0.1117, "step": 1359 }, { "epoch": 17.380191693290733, "grad_norm": 4.695788860321045, "learning_rate": 1.3025641025641027e-05, "loss": 0.2105, "step": 1360 }, { "epoch": 17.39297124600639, "grad_norm": 4.710606098175049, "learning_rate": 1.3020512820512821e-05, "loss": 0.1613, "step": 1361 }, { "epoch": 17.405750798722046, "grad_norm": 3.6666271686553955, "learning_rate": 1.3015384615384617e-05, "loss": 0.179, "step": 1362 }, { "epoch": 17.4185303514377, "grad_norm": 3.9461636543273926, "learning_rate": 1.3010256410256413e-05, "loss": 0.1426, "step": 1363 }, { "epoch": 17.431309904153355, "grad_norm": 3.5908493995666504, "learning_rate": 1.3005128205128205e-05, "loss": 0.1152, "step": 1364 }, { "epoch": 17.44408945686901, "grad_norm": 4.1953959465026855, "learning_rate": 1.3000000000000001e-05, "loss": 0.1717, "step": 1365 }, { "epoch": 17.456869009584665, "grad_norm": 5.178595542907715, "learning_rate": 1.2994871794871795e-05, "loss": 0.1723, "step": 1366 }, { "epoch": 17.46964856230032, "grad_norm": 4.45409631729126, "learning_rate": 1.298974358974359e-05, "loss": 0.2075, "step": 1367 }, { "epoch": 17.482428115015974, "grad_norm": 6.469843864440918, "learning_rate": 1.2984615384615386e-05, "loss": 0.2422, "step": 1368 }, { "epoch": 17.49520766773163, "grad_norm": 3.8967084884643555, "learning_rate": 1.297948717948718e-05, "loss": 0.1658, "step": 1369 }, { "epoch": 17.507987220447284, "grad_norm": 4.227680683135986, "learning_rate": 1.2974358974358976e-05, "loss": 0.1941, "step": 1370 }, { "epoch": 17.52076677316294, "grad_norm": 4.642873287200928, "learning_rate": 1.296923076923077e-05, "loss": 0.1918, "step": 1371 }, { "epoch": 17.533546325878593, "grad_norm": 4.665454864501953, "learning_rate": 1.2964102564102566e-05, "loss": 0.2758, "step": 1372 }, { "epoch": 17.54632587859425, "grad_norm": 4.068156719207764, "learning_rate": 1.2958974358974362e-05, "loss": 0.1334, "step": 1373 }, { "epoch": 17.559105431309906, "grad_norm": 4.479705810546875, "learning_rate": 1.2953846153846154e-05, "loss": 0.1793, "step": 1374 }, { "epoch": 17.57188498402556, "grad_norm": 4.446591377258301, "learning_rate": 1.294871794871795e-05, "loss": 0.1761, "step": 1375 }, { "epoch": 17.584664536741215, "grad_norm": 5.095440864562988, "learning_rate": 1.2943589743589744e-05, "loss": 0.1738, "step": 1376 }, { "epoch": 17.597444089456868, "grad_norm": 5.519388675689697, "learning_rate": 1.293846153846154e-05, "loss": 0.2111, "step": 1377 }, { "epoch": 17.610223642172524, "grad_norm": 4.9799675941467285, "learning_rate": 1.2933333333333334e-05, "loss": 0.2324, "step": 1378 }, { "epoch": 17.623003194888177, "grad_norm": 3.9601807594299316, "learning_rate": 1.292820512820513e-05, "loss": 0.1808, "step": 1379 }, { "epoch": 17.635782747603834, "grad_norm": 2.8730149269104004, "learning_rate": 1.2923076923076925e-05, "loss": 0.0883, "step": 1380 }, { "epoch": 17.64856230031949, "grad_norm": 3.890813112258911, "learning_rate": 1.2917948717948718e-05, "loss": 0.1648, "step": 1381 }, { "epoch": 17.661341853035143, "grad_norm": 4.435185432434082, "learning_rate": 1.2912820512820515e-05, "loss": 0.2343, "step": 1382 }, { "epoch": 17.6741214057508, "grad_norm": 4.9541096687316895, "learning_rate": 1.2907692307692307e-05, "loss": 0.156, "step": 1383 }, { "epoch": 17.686900958466452, "grad_norm": 4.387731552124023, "learning_rate": 1.2902564102564103e-05, "loss": 0.2392, "step": 1384 }, { "epoch": 17.69968051118211, "grad_norm": 5.2987284660339355, "learning_rate": 1.2897435897435899e-05, "loss": 0.1897, "step": 1385 }, { "epoch": 17.712460063897765, "grad_norm": 3.6852712631225586, "learning_rate": 1.2892307692307693e-05, "loss": 0.1852, "step": 1386 }, { "epoch": 17.72523961661342, "grad_norm": 4.723902702331543, "learning_rate": 1.2887179487179489e-05, "loss": 0.1622, "step": 1387 }, { "epoch": 17.738019169329075, "grad_norm": 4.313000202178955, "learning_rate": 1.2882051282051283e-05, "loss": 0.1296, "step": 1388 }, { "epoch": 17.750798722044728, "grad_norm": 4.494014263153076, "learning_rate": 1.2876923076923078e-05, "loss": 0.1799, "step": 1389 }, { "epoch": 17.763578274760384, "grad_norm": 4.625412940979004, "learning_rate": 1.2871794871794874e-05, "loss": 0.207, "step": 1390 }, { "epoch": 17.776357827476037, "grad_norm": 4.199818134307861, "learning_rate": 1.2866666666666667e-05, "loss": 0.1388, "step": 1391 }, { "epoch": 17.789137380191693, "grad_norm": 4.411600112915039, "learning_rate": 1.2861538461538462e-05, "loss": 0.2175, "step": 1392 }, { "epoch": 17.80191693290735, "grad_norm": 4.975483417510986, "learning_rate": 1.2856410256410256e-05, "loss": 0.2334, "step": 1393 }, { "epoch": 17.814696485623003, "grad_norm": 4.207812786102295, "learning_rate": 1.2851282051282052e-05, "loss": 0.1924, "step": 1394 }, { "epoch": 17.82747603833866, "grad_norm": 5.613093852996826, "learning_rate": 1.2846153846153848e-05, "loss": 0.2351, "step": 1395 }, { "epoch": 17.840255591054312, "grad_norm": 5.09583854675293, "learning_rate": 1.2841025641025642e-05, "loss": 0.2433, "step": 1396 }, { "epoch": 17.85303514376997, "grad_norm": 5.236515522003174, "learning_rate": 1.2835897435897438e-05, "loss": 0.2328, "step": 1397 }, { "epoch": 17.86581469648562, "grad_norm": 4.290408611297607, "learning_rate": 1.2830769230769232e-05, "loss": 0.1266, "step": 1398 }, { "epoch": 17.878594249201278, "grad_norm": 4.449280261993408, "learning_rate": 1.2825641025641027e-05, "loss": 0.1595, "step": 1399 }, { "epoch": 17.891373801916934, "grad_norm": 5.017588138580322, "learning_rate": 1.2820512820512823e-05, "loss": 0.1671, "step": 1400 }, { "epoch": 17.891373801916934, "eval_loss": 0.5858176946640015, "eval_runtime": 183.6009, "eval_samples_per_second": 0.855, "eval_steps_per_second": 0.109, "step": 1400 }, { "epoch": 17.904153354632587, "grad_norm": 3.8054473400115967, "learning_rate": 1.2815384615384616e-05, "loss": 0.1687, "step": 1401 }, { "epoch": 17.916932907348244, "grad_norm": 4.932668685913086, "learning_rate": 1.2810256410256411e-05, "loss": 0.1664, "step": 1402 }, { "epoch": 17.929712460063897, "grad_norm": 4.4709391593933105, "learning_rate": 1.2805128205128205e-05, "loss": 0.2496, "step": 1403 }, { "epoch": 17.942492012779553, "grad_norm": 5.316714286804199, "learning_rate": 1.2800000000000001e-05, "loss": 0.2514, "step": 1404 }, { "epoch": 17.955271565495206, "grad_norm": 6.395861625671387, "learning_rate": 1.2794871794871795e-05, "loss": 0.2261, "step": 1405 }, { "epoch": 17.968051118210862, "grad_norm": 5.766951560974121, "learning_rate": 1.2789743589743591e-05, "loss": 0.156, "step": 1406 }, { "epoch": 17.98083067092652, "grad_norm": 4.028629302978516, "learning_rate": 1.2784615384615387e-05, "loss": 0.1327, "step": 1407 }, { "epoch": 17.99361022364217, "grad_norm": 4.956418991088867, "learning_rate": 1.277948717948718e-05, "loss": 0.1746, "step": 1408 }, { "epoch": 18.00638977635783, "grad_norm": 4.1461052894592285, "learning_rate": 1.2774358974358976e-05, "loss": 0.1636, "step": 1409 }, { "epoch": 18.01916932907348, "grad_norm": 3.723752021789551, "learning_rate": 1.2769230769230769e-05, "loss": 0.2003, "step": 1410 }, { "epoch": 18.031948881789138, "grad_norm": 3.700064182281494, "learning_rate": 1.2764102564102565e-05, "loss": 0.1339, "step": 1411 }, { "epoch": 18.044728434504794, "grad_norm": 3.3118534088134766, "learning_rate": 1.275897435897436e-05, "loss": 0.1082, "step": 1412 }, { "epoch": 18.057507987220447, "grad_norm": 3.8128724098205566, "learning_rate": 1.2753846153846154e-05, "loss": 0.1564, "step": 1413 }, { "epoch": 18.070287539936103, "grad_norm": 3.9735348224639893, "learning_rate": 1.274871794871795e-05, "loss": 0.1304, "step": 1414 }, { "epoch": 18.083067092651756, "grad_norm": 4.680019855499268, "learning_rate": 1.2743589743589744e-05, "loss": 0.1849, "step": 1415 }, { "epoch": 18.095846645367413, "grad_norm": 4.512134075164795, "learning_rate": 1.273846153846154e-05, "loss": 0.2095, "step": 1416 }, { "epoch": 18.108626198083066, "grad_norm": 4.275150299072266, "learning_rate": 1.2733333333333336e-05, "loss": 0.1757, "step": 1417 }, { "epoch": 18.121405750798722, "grad_norm": 3.94869327545166, "learning_rate": 1.272820512820513e-05, "loss": 0.104, "step": 1418 }, { "epoch": 18.13418530351438, "grad_norm": 4.781008243560791, "learning_rate": 1.2723076923076925e-05, "loss": 0.1372, "step": 1419 }, { "epoch": 18.14696485623003, "grad_norm": 4.485940456390381, "learning_rate": 1.2717948717948718e-05, "loss": 0.1488, "step": 1420 }, { "epoch": 18.159744408945688, "grad_norm": 2.775332450866699, "learning_rate": 1.2712820512820514e-05, "loss": 0.1255, "step": 1421 }, { "epoch": 18.17252396166134, "grad_norm": 3.302737236022949, "learning_rate": 1.270769230769231e-05, "loss": 0.1398, "step": 1422 }, { "epoch": 18.185303514376997, "grad_norm": 5.000697612762451, "learning_rate": 1.2702564102564103e-05, "loss": 0.1503, "step": 1423 }, { "epoch": 18.19808306709265, "grad_norm": 4.334658622741699, "learning_rate": 1.2697435897435899e-05, "loss": 0.1535, "step": 1424 }, { "epoch": 18.210862619808307, "grad_norm": 4.618746280670166, "learning_rate": 1.2692307692307693e-05, "loss": 0.1605, "step": 1425 }, { "epoch": 18.223642172523963, "grad_norm": 4.05863094329834, "learning_rate": 1.2687179487179489e-05, "loss": 0.0956, "step": 1426 }, { "epoch": 18.236421725239616, "grad_norm": 4.230029106140137, "learning_rate": 1.2682051282051281e-05, "loss": 0.1435, "step": 1427 }, { "epoch": 18.249201277955272, "grad_norm": 2.6787288188934326, "learning_rate": 1.2676923076923077e-05, "loss": 0.0948, "step": 1428 }, { "epoch": 18.261980830670925, "grad_norm": 4.375114440917969, "learning_rate": 1.2671794871794873e-05, "loss": 0.1843, "step": 1429 }, { "epoch": 18.27476038338658, "grad_norm": 3.706968069076538, "learning_rate": 1.2666666666666667e-05, "loss": 0.1272, "step": 1430 }, { "epoch": 18.287539936102238, "grad_norm": 4.491248607635498, "learning_rate": 1.2661538461538463e-05, "loss": 0.3349, "step": 1431 }, { "epoch": 18.30031948881789, "grad_norm": 3.2398135662078857, "learning_rate": 1.2656410256410257e-05, "loss": 0.1115, "step": 1432 }, { "epoch": 18.313099041533548, "grad_norm": 4.5026702880859375, "learning_rate": 1.2651282051282052e-05, "loss": 0.1812, "step": 1433 }, { "epoch": 18.3258785942492, "grad_norm": 4.205916404724121, "learning_rate": 1.2646153846153848e-05, "loss": 0.1706, "step": 1434 }, { "epoch": 18.338658146964857, "grad_norm": 3.850865602493286, "learning_rate": 1.2641025641025642e-05, "loss": 0.1835, "step": 1435 }, { "epoch": 18.35143769968051, "grad_norm": 3.3740909099578857, "learning_rate": 1.2635897435897438e-05, "loss": 0.1409, "step": 1436 }, { "epoch": 18.364217252396166, "grad_norm": 3.58420467376709, "learning_rate": 1.263076923076923e-05, "loss": 0.1231, "step": 1437 }, { "epoch": 18.376996805111823, "grad_norm": 4.956149101257324, "learning_rate": 1.2625641025641026e-05, "loss": 0.1889, "step": 1438 }, { "epoch": 18.389776357827476, "grad_norm": 5.362755298614502, "learning_rate": 1.2620512820512822e-05, "loss": 0.1947, "step": 1439 }, { "epoch": 18.402555910543132, "grad_norm": 4.240985870361328, "learning_rate": 1.2615384615384616e-05, "loss": 0.1353, "step": 1440 }, { "epoch": 18.415335463258785, "grad_norm": 4.410498142242432, "learning_rate": 1.2610256410256412e-05, "loss": 0.1332, "step": 1441 }, { "epoch": 18.42811501597444, "grad_norm": 4.465317726135254, "learning_rate": 1.2605128205128206e-05, "loss": 0.1308, "step": 1442 }, { "epoch": 18.440894568690094, "grad_norm": 5.566956043243408, "learning_rate": 1.2600000000000001e-05, "loss": 0.1684, "step": 1443 }, { "epoch": 18.45367412140575, "grad_norm": 4.643564701080322, "learning_rate": 1.2594871794871797e-05, "loss": 0.133, "step": 1444 }, { "epoch": 18.466453674121407, "grad_norm": 3.7559735774993896, "learning_rate": 1.2589743589743591e-05, "loss": 0.1165, "step": 1445 }, { "epoch": 18.47923322683706, "grad_norm": 4.310507297515869, "learning_rate": 1.2584615384615387e-05, "loss": 0.1284, "step": 1446 }, { "epoch": 18.492012779552716, "grad_norm": 4.490650177001953, "learning_rate": 1.257948717948718e-05, "loss": 0.1571, "step": 1447 }, { "epoch": 18.50479233226837, "grad_norm": 3.584522008895874, "learning_rate": 1.2574358974358975e-05, "loss": 0.0975, "step": 1448 }, { "epoch": 18.517571884984026, "grad_norm": 3.844524621963501, "learning_rate": 1.256923076923077e-05, "loss": 0.1871, "step": 1449 }, { "epoch": 18.53035143769968, "grad_norm": 4.957360744476318, "learning_rate": 1.2564102564102565e-05, "loss": 0.1563, "step": 1450 }, { "epoch": 18.543130990415335, "grad_norm": 3.925318717956543, "learning_rate": 1.255897435897436e-05, "loss": 0.1406, "step": 1451 }, { "epoch": 18.55591054313099, "grad_norm": 4.986582279205322, "learning_rate": 1.2553846153846155e-05, "loss": 0.1655, "step": 1452 }, { "epoch": 18.568690095846645, "grad_norm": 4.465045928955078, "learning_rate": 1.254871794871795e-05, "loss": 0.1403, "step": 1453 }, { "epoch": 18.5814696485623, "grad_norm": 5.537022113800049, "learning_rate": 1.2543589743589744e-05, "loss": 0.1297, "step": 1454 }, { "epoch": 18.594249201277954, "grad_norm": 4.612170696258545, "learning_rate": 1.253846153846154e-05, "loss": 0.1416, "step": 1455 }, { "epoch": 18.60702875399361, "grad_norm": 3.784298896789551, "learning_rate": 1.2533333333333336e-05, "loss": 0.1546, "step": 1456 }, { "epoch": 18.619808306709267, "grad_norm": 5.120687961578369, "learning_rate": 1.2528205128205128e-05, "loss": 0.2026, "step": 1457 }, { "epoch": 18.63258785942492, "grad_norm": 3.836606979370117, "learning_rate": 1.2523076923076924e-05, "loss": 0.1175, "step": 1458 }, { "epoch": 18.645367412140576, "grad_norm": 3.785749912261963, "learning_rate": 1.2517948717948718e-05, "loss": 0.1788, "step": 1459 }, { "epoch": 18.65814696485623, "grad_norm": 4.040433883666992, "learning_rate": 1.2512820512820514e-05, "loss": 0.1345, "step": 1460 }, { "epoch": 18.670926517571885, "grad_norm": 4.360098838806152, "learning_rate": 1.250769230769231e-05, "loss": 0.1565, "step": 1461 }, { "epoch": 18.68370607028754, "grad_norm": 5.078848838806152, "learning_rate": 1.2502564102564104e-05, "loss": 0.1558, "step": 1462 }, { "epoch": 18.696485623003195, "grad_norm": 6.132460117340088, "learning_rate": 1.24974358974359e-05, "loss": 0.1972, "step": 1463 }, { "epoch": 18.70926517571885, "grad_norm": 3.8772716522216797, "learning_rate": 1.2492307692307692e-05, "loss": 0.1498, "step": 1464 }, { "epoch": 18.722044728434504, "grad_norm": 4.878615856170654, "learning_rate": 1.2487179487179487e-05, "loss": 0.161, "step": 1465 }, { "epoch": 18.73482428115016, "grad_norm": 4.384540557861328, "learning_rate": 1.2482051282051285e-05, "loss": 0.1664, "step": 1466 }, { "epoch": 18.747603833865814, "grad_norm": 4.386280536651611, "learning_rate": 1.2476923076923077e-05, "loss": 0.1628, "step": 1467 }, { "epoch": 18.76038338658147, "grad_norm": 4.095793724060059, "learning_rate": 1.2471794871794873e-05, "loss": 0.1324, "step": 1468 }, { "epoch": 18.773162939297123, "grad_norm": 4.629232883453369, "learning_rate": 1.2466666666666667e-05, "loss": 0.1747, "step": 1469 }, { "epoch": 18.78594249201278, "grad_norm": 4.9055256843566895, "learning_rate": 1.2461538461538463e-05, "loss": 0.1577, "step": 1470 }, { "epoch": 18.798722044728436, "grad_norm": 4.488932132720947, "learning_rate": 1.2456410256410259e-05, "loss": 0.1546, "step": 1471 }, { "epoch": 18.81150159744409, "grad_norm": 5.880629539489746, "learning_rate": 1.2451282051282053e-05, "loss": 0.1649, "step": 1472 }, { "epoch": 18.824281150159745, "grad_norm": 5.382693767547607, "learning_rate": 1.2446153846153848e-05, "loss": 0.1926, "step": 1473 }, { "epoch": 18.837060702875398, "grad_norm": 4.9048848152160645, "learning_rate": 1.244102564102564e-05, "loss": 0.15, "step": 1474 }, { "epoch": 18.849840255591054, "grad_norm": 4.941222190856934, "learning_rate": 1.2435897435897436e-05, "loss": 0.1193, "step": 1475 }, { "epoch": 18.86261980830671, "grad_norm": 3.5953900814056396, "learning_rate": 1.243076923076923e-05, "loss": 0.1311, "step": 1476 }, { "epoch": 18.875399361022364, "grad_norm": 3.8183746337890625, "learning_rate": 1.2425641025641026e-05, "loss": 0.116, "step": 1477 }, { "epoch": 18.88817891373802, "grad_norm": 4.224637985229492, "learning_rate": 1.2420512820512822e-05, "loss": 0.1153, "step": 1478 }, { "epoch": 18.900958466453673, "grad_norm": 4.898967742919922, "learning_rate": 1.2415384615384616e-05, "loss": 0.1783, "step": 1479 }, { "epoch": 18.91373801916933, "grad_norm": 5.724062442779541, "learning_rate": 1.2410256410256412e-05, "loss": 0.1652, "step": 1480 }, { "epoch": 18.926517571884983, "grad_norm": 4.231504917144775, "learning_rate": 1.2405128205128206e-05, "loss": 0.1838, "step": 1481 }, { "epoch": 18.93929712460064, "grad_norm": 4.956578731536865, "learning_rate": 1.2400000000000002e-05, "loss": 0.1708, "step": 1482 }, { "epoch": 18.952076677316295, "grad_norm": 4.780496120452881, "learning_rate": 1.2394871794871797e-05, "loss": 0.1335, "step": 1483 }, { "epoch": 18.96485623003195, "grad_norm": 4.9347028732299805, "learning_rate": 1.238974358974359e-05, "loss": 0.2332, "step": 1484 }, { "epoch": 18.977635782747605, "grad_norm": 5.3977837562561035, "learning_rate": 1.2384615384615385e-05, "loss": 0.2043, "step": 1485 }, { "epoch": 18.990415335463258, "grad_norm": 5.341475963592529, "learning_rate": 1.237948717948718e-05, "loss": 0.1632, "step": 1486 }, { "epoch": 19.003194888178914, "grad_norm": 4.257419586181641, "learning_rate": 1.2374358974358975e-05, "loss": 0.1374, "step": 1487 }, { "epoch": 19.015974440894567, "grad_norm": 4.08753776550293, "learning_rate": 1.2369230769230771e-05, "loss": 0.1346, "step": 1488 }, { "epoch": 19.028753993610223, "grad_norm": 3.8640146255493164, "learning_rate": 1.2364102564102565e-05, "loss": 0.107, "step": 1489 }, { "epoch": 19.04153354632588, "grad_norm": 4.158913612365723, "learning_rate": 1.235897435897436e-05, "loss": 0.1216, "step": 1490 }, { "epoch": 19.054313099041533, "grad_norm": 3.6827924251556396, "learning_rate": 1.2353846153846155e-05, "loss": 0.1169, "step": 1491 }, { "epoch": 19.06709265175719, "grad_norm": 3.9266042709350586, "learning_rate": 1.234871794871795e-05, "loss": 0.1164, "step": 1492 }, { "epoch": 19.079872204472842, "grad_norm": 4.07518196105957, "learning_rate": 1.2343589743589746e-05, "loss": 0.0927, "step": 1493 }, { "epoch": 19.0926517571885, "grad_norm": 4.363387107849121, "learning_rate": 1.2338461538461539e-05, "loss": 0.1345, "step": 1494 }, { "epoch": 19.105431309904155, "grad_norm": 4.984545707702637, "learning_rate": 1.2333333333333334e-05, "loss": 0.1205, "step": 1495 }, { "epoch": 19.118210862619808, "grad_norm": 2.710969924926758, "learning_rate": 1.2328205128205128e-05, "loss": 0.113, "step": 1496 }, { "epoch": 19.130990415335464, "grad_norm": 3.27824068069458, "learning_rate": 1.2323076923076924e-05, "loss": 0.0981, "step": 1497 }, { "epoch": 19.143769968051117, "grad_norm": 3.5957398414611816, "learning_rate": 1.231794871794872e-05, "loss": 0.1738, "step": 1498 }, { "epoch": 19.156549520766774, "grad_norm": 2.8213162422180176, "learning_rate": 1.2312820512820514e-05, "loss": 0.074, "step": 1499 }, { "epoch": 19.169329073482427, "grad_norm": 4.4678425788879395, "learning_rate": 1.230769230769231e-05, "loss": 0.1392, "step": 1500 }, { "epoch": 19.169329073482427, "eval_loss": 0.6049781441688538, "eval_runtime": 183.8411, "eval_samples_per_second": 0.854, "eval_steps_per_second": 0.109, "step": 1500 }, { "epoch": 19.182108626198083, "grad_norm": 4.500428676605225, "learning_rate": 1.2302564102564102e-05, "loss": 0.172, "step": 1501 }, { "epoch": 19.19488817891374, "grad_norm": 3.4990546703338623, "learning_rate": 1.22974358974359e-05, "loss": 0.1249, "step": 1502 }, { "epoch": 19.207667731629392, "grad_norm": 3.9662656784057617, "learning_rate": 1.2292307692307692e-05, "loss": 0.1258, "step": 1503 }, { "epoch": 19.22044728434505, "grad_norm": 5.195966720581055, "learning_rate": 1.2287179487179488e-05, "loss": 0.1837, "step": 1504 }, { "epoch": 19.233226837060702, "grad_norm": 3.6646034717559814, "learning_rate": 1.2282051282051283e-05, "loss": 0.0861, "step": 1505 }, { "epoch": 19.24600638977636, "grad_norm": 3.9439032077789307, "learning_rate": 1.2276923076923077e-05, "loss": 0.1212, "step": 1506 }, { "epoch": 19.25878594249201, "grad_norm": 4.427985191345215, "learning_rate": 1.2271794871794873e-05, "loss": 0.1297, "step": 1507 }, { "epoch": 19.271565495207668, "grad_norm": 3.02043080329895, "learning_rate": 1.2266666666666667e-05, "loss": 0.07, "step": 1508 }, { "epoch": 19.284345047923324, "grad_norm": 3.5911426544189453, "learning_rate": 1.2261538461538463e-05, "loss": 0.0998, "step": 1509 }, { "epoch": 19.297124600638977, "grad_norm": 3.7852537631988525, "learning_rate": 1.2256410256410259e-05, "loss": 0.0708, "step": 1510 }, { "epoch": 19.309904153354633, "grad_norm": 4.233508110046387, "learning_rate": 1.2251282051282051e-05, "loss": 0.1262, "step": 1511 }, { "epoch": 19.322683706070286, "grad_norm": 4.237662315368652, "learning_rate": 1.2246153846153847e-05, "loss": 0.1159, "step": 1512 }, { "epoch": 19.335463258785943, "grad_norm": 6.377729415893555, "learning_rate": 1.2241025641025641e-05, "loss": 0.2467, "step": 1513 }, { "epoch": 19.3482428115016, "grad_norm": 4.620576858520508, "learning_rate": 1.2235897435897437e-05, "loss": 0.1471, "step": 1514 }, { "epoch": 19.361022364217252, "grad_norm": 4.3901166915893555, "learning_rate": 1.2230769230769232e-05, "loss": 0.1519, "step": 1515 }, { "epoch": 19.37380191693291, "grad_norm": 6.668283462524414, "learning_rate": 1.2225641025641026e-05, "loss": 0.1385, "step": 1516 }, { "epoch": 19.38658146964856, "grad_norm": 3.3707025051116943, "learning_rate": 1.2220512820512822e-05, "loss": 0.1423, "step": 1517 }, { "epoch": 19.399361022364218, "grad_norm": 3.8963208198547363, "learning_rate": 1.2215384615384616e-05, "loss": 0.1763, "step": 1518 }, { "epoch": 19.41214057507987, "grad_norm": 3.0294902324676514, "learning_rate": 1.2210256410256412e-05, "loss": 0.0771, "step": 1519 }, { "epoch": 19.424920127795527, "grad_norm": 3.7675819396972656, "learning_rate": 1.2205128205128208e-05, "loss": 0.1174, "step": 1520 }, { "epoch": 19.437699680511184, "grad_norm": 3.797121047973633, "learning_rate": 1.22e-05, "loss": 0.1358, "step": 1521 }, { "epoch": 19.450479233226837, "grad_norm": 4.065491676330566, "learning_rate": 1.2194871794871796e-05, "loss": 0.108, "step": 1522 }, { "epoch": 19.463258785942493, "grad_norm": 3.6715574264526367, "learning_rate": 1.218974358974359e-05, "loss": 0.139, "step": 1523 }, { "epoch": 19.476038338658146, "grad_norm": 5.250418663024902, "learning_rate": 1.2184615384615386e-05, "loss": 0.1425, "step": 1524 }, { "epoch": 19.488817891373802, "grad_norm": 4.669528484344482, "learning_rate": 1.217948717948718e-05, "loss": 0.1321, "step": 1525 }, { "epoch": 19.501597444089455, "grad_norm": 4.585147380828857, "learning_rate": 1.2174358974358975e-05, "loss": 0.1232, "step": 1526 }, { "epoch": 19.51437699680511, "grad_norm": 3.8052077293395996, "learning_rate": 1.2169230769230771e-05, "loss": 0.1059, "step": 1527 }, { "epoch": 19.527156549520768, "grad_norm": 3.7040152549743652, "learning_rate": 1.2164102564102565e-05, "loss": 0.1017, "step": 1528 }, { "epoch": 19.53993610223642, "grad_norm": 3.7301838397979736, "learning_rate": 1.2158974358974361e-05, "loss": 0.0855, "step": 1529 }, { "epoch": 19.552715654952078, "grad_norm": 4.927285194396973, "learning_rate": 1.2153846153846153e-05, "loss": 0.1538, "step": 1530 }, { "epoch": 19.56549520766773, "grad_norm": 4.502018928527832, "learning_rate": 1.2148717948717949e-05, "loss": 0.097, "step": 1531 }, { "epoch": 19.578274760383387, "grad_norm": 4.707366943359375, "learning_rate": 1.2143589743589745e-05, "loss": 0.1858, "step": 1532 }, { "epoch": 19.591054313099043, "grad_norm": 5.845135688781738, "learning_rate": 1.2138461538461539e-05, "loss": 0.1434, "step": 1533 }, { "epoch": 19.603833865814696, "grad_norm": 4.260502338409424, "learning_rate": 1.2133333333333335e-05, "loss": 0.1188, "step": 1534 }, { "epoch": 19.616613418530353, "grad_norm": 4.49433708190918, "learning_rate": 1.2128205128205129e-05, "loss": 0.1205, "step": 1535 }, { "epoch": 19.629392971246006, "grad_norm": 4.624430179595947, "learning_rate": 1.2123076923076924e-05, "loss": 0.1194, "step": 1536 }, { "epoch": 19.642172523961662, "grad_norm": 4.401130676269531, "learning_rate": 1.211794871794872e-05, "loss": 0.1617, "step": 1537 }, { "epoch": 19.654952076677315, "grad_norm": 3.5289738178253174, "learning_rate": 1.2112820512820514e-05, "loss": 0.0927, "step": 1538 }, { "epoch": 19.66773162939297, "grad_norm": 6.962007522583008, "learning_rate": 1.210769230769231e-05, "loss": 0.1999, "step": 1539 }, { "epoch": 19.680511182108628, "grad_norm": 5.013849258422852, "learning_rate": 1.2102564102564102e-05, "loss": 0.1448, "step": 1540 }, { "epoch": 19.69329073482428, "grad_norm": 4.766885757446289, "learning_rate": 1.2097435897435898e-05, "loss": 0.1597, "step": 1541 }, { "epoch": 19.706070287539937, "grad_norm": 3.9394874572753906, "learning_rate": 1.2092307692307694e-05, "loss": 0.1112, "step": 1542 }, { "epoch": 19.71884984025559, "grad_norm": 5.361410140991211, "learning_rate": 1.2087179487179488e-05, "loss": 0.1157, "step": 1543 }, { "epoch": 19.731629392971247, "grad_norm": 3.999269962310791, "learning_rate": 1.2082051282051284e-05, "loss": 0.1391, "step": 1544 }, { "epoch": 19.7444089456869, "grad_norm": 5.273488998413086, "learning_rate": 1.2076923076923078e-05, "loss": 0.1272, "step": 1545 }, { "epoch": 19.757188498402556, "grad_norm": 4.8968329429626465, "learning_rate": 1.2071794871794873e-05, "loss": 0.1984, "step": 1546 }, { "epoch": 19.769968051118212, "grad_norm": 4.2163872718811035, "learning_rate": 1.206666666666667e-05, "loss": 0.1166, "step": 1547 }, { "epoch": 19.782747603833865, "grad_norm": 4.540549278259277, "learning_rate": 1.2061538461538462e-05, "loss": 0.1172, "step": 1548 }, { "epoch": 19.79552715654952, "grad_norm": 4.659392833709717, "learning_rate": 1.2056410256410257e-05, "loss": 0.1947, "step": 1549 }, { "epoch": 19.808306709265175, "grad_norm": 3.6807572841644287, "learning_rate": 1.2051282051282051e-05, "loss": 0.1054, "step": 1550 }, { "epoch": 19.82108626198083, "grad_norm": 4.467441558837891, "learning_rate": 1.2046153846153847e-05, "loss": 0.1532, "step": 1551 }, { "epoch": 19.833865814696484, "grad_norm": 3.2000980377197266, "learning_rate": 1.2041025641025641e-05, "loss": 0.0957, "step": 1552 }, { "epoch": 19.84664536741214, "grad_norm": 5.999240398406982, "learning_rate": 1.2035897435897437e-05, "loss": 0.1822, "step": 1553 }, { "epoch": 19.859424920127797, "grad_norm": 4.817787170410156, "learning_rate": 1.2030769230769233e-05, "loss": 0.1507, "step": 1554 }, { "epoch": 19.87220447284345, "grad_norm": 5.96992301940918, "learning_rate": 1.2025641025641027e-05, "loss": 0.1525, "step": 1555 }, { "epoch": 19.884984025559106, "grad_norm": 4.115090370178223, "learning_rate": 1.2020512820512822e-05, "loss": 0.1262, "step": 1556 }, { "epoch": 19.89776357827476, "grad_norm": 4.930627346038818, "learning_rate": 1.2015384615384615e-05, "loss": 0.2024, "step": 1557 }, { "epoch": 19.910543130990416, "grad_norm": 4.551126003265381, "learning_rate": 1.201025641025641e-05, "loss": 0.0929, "step": 1558 }, { "epoch": 19.923322683706072, "grad_norm": 4.8088836669921875, "learning_rate": 1.2005128205128206e-05, "loss": 0.1261, "step": 1559 }, { "epoch": 19.936102236421725, "grad_norm": 4.031205177307129, "learning_rate": 1.2e-05, "loss": 0.1601, "step": 1560 }, { "epoch": 19.94888178913738, "grad_norm": 4.078291893005371, "learning_rate": 1.1994871794871796e-05, "loss": 0.1488, "step": 1561 }, { "epoch": 19.961661341853034, "grad_norm": 3.802271842956543, "learning_rate": 1.198974358974359e-05, "loss": 0.1198, "step": 1562 }, { "epoch": 19.97444089456869, "grad_norm": 4.118734359741211, "learning_rate": 1.1984615384615386e-05, "loss": 0.1854, "step": 1563 }, { "epoch": 19.987220447284344, "grad_norm": 3.7523162364959717, "learning_rate": 1.1979487179487182e-05, "loss": 0.1037, "step": 1564 }, { "epoch": 20.0, "grad_norm": 4.222137451171875, "learning_rate": 1.1974358974358976e-05, "loss": 0.1086, "step": 1565 }, { "epoch": 20.012779552715656, "grad_norm": 4.975546836853027, "learning_rate": 1.1969230769230771e-05, "loss": 0.131, "step": 1566 }, { "epoch": 20.02555910543131, "grad_norm": 3.5944299697875977, "learning_rate": 1.1964102564102564e-05, "loss": 0.089, "step": 1567 }, { "epoch": 20.038338658146966, "grad_norm": 4.291761875152588, "learning_rate": 1.195897435897436e-05, "loss": 0.0949, "step": 1568 }, { "epoch": 20.05111821086262, "grad_norm": 4.3121337890625, "learning_rate": 1.1953846153846155e-05, "loss": 0.1194, "step": 1569 }, { "epoch": 20.063897763578275, "grad_norm": 5.17624044418335, "learning_rate": 1.194871794871795e-05, "loss": 0.1156, "step": 1570 }, { "epoch": 20.076677316293928, "grad_norm": 4.396629810333252, "learning_rate": 1.1943589743589745e-05, "loss": 0.1126, "step": 1571 }, { "epoch": 20.089456869009584, "grad_norm": 3.466047525405884, "learning_rate": 1.1938461538461539e-05, "loss": 0.1243, "step": 1572 }, { "epoch": 20.10223642172524, "grad_norm": 3.3175036907196045, "learning_rate": 1.1933333333333335e-05, "loss": 0.1041, "step": 1573 }, { "epoch": 20.115015974440894, "grad_norm": 3.193227767944336, "learning_rate": 1.1928205128205129e-05, "loss": 0.0859, "step": 1574 }, { "epoch": 20.12779552715655, "grad_norm": 3.787001132965088, "learning_rate": 1.1923076923076925e-05, "loss": 0.0922, "step": 1575 }, { "epoch": 20.140575079872203, "grad_norm": 4.286689758300781, "learning_rate": 1.191794871794872e-05, "loss": 0.0923, "step": 1576 }, { "epoch": 20.15335463258786, "grad_norm": 4.684820175170898, "learning_rate": 1.1912820512820513e-05, "loss": 0.1506, "step": 1577 }, { "epoch": 20.166134185303516, "grad_norm": 4.568631172180176, "learning_rate": 1.1907692307692308e-05, "loss": 0.0986, "step": 1578 }, { "epoch": 20.17891373801917, "grad_norm": 4.025661945343018, "learning_rate": 1.1902564102564103e-05, "loss": 0.0853, "step": 1579 }, { "epoch": 20.191693290734825, "grad_norm": 6.226049900054932, "learning_rate": 1.1897435897435898e-05, "loss": 0.1414, "step": 1580 }, { "epoch": 20.20447284345048, "grad_norm": 4.703970909118652, "learning_rate": 1.1892307692307694e-05, "loss": 0.1196, "step": 1581 }, { "epoch": 20.217252396166135, "grad_norm": 3.3595998287200928, "learning_rate": 1.1887179487179488e-05, "loss": 0.085, "step": 1582 }, { "epoch": 20.230031948881788, "grad_norm": 6.735140323638916, "learning_rate": 1.1882051282051284e-05, "loss": 0.1397, "step": 1583 }, { "epoch": 20.242811501597444, "grad_norm": 4.5568342208862305, "learning_rate": 1.1876923076923076e-05, "loss": 0.0925, "step": 1584 }, { "epoch": 20.2555910543131, "grad_norm": 4.656588554382324, "learning_rate": 1.1871794871794872e-05, "loss": 0.1483, "step": 1585 }, { "epoch": 20.268370607028753, "grad_norm": 3.1620121002197266, "learning_rate": 1.186666666666667e-05, "loss": 0.0802, "step": 1586 }, { "epoch": 20.28115015974441, "grad_norm": 3.6990063190460205, "learning_rate": 1.1861538461538462e-05, "loss": 0.1172, "step": 1587 }, { "epoch": 20.293929712460063, "grad_norm": 3.8228986263275146, "learning_rate": 1.1856410256410257e-05, "loss": 0.1211, "step": 1588 }, { "epoch": 20.30670926517572, "grad_norm": 4.515007972717285, "learning_rate": 1.1851282051282052e-05, "loss": 0.1159, "step": 1589 }, { "epoch": 20.319488817891372, "grad_norm": 4.058969974517822, "learning_rate": 1.1846153846153847e-05, "loss": 0.0675, "step": 1590 }, { "epoch": 20.33226837060703, "grad_norm": 5.006679058074951, "learning_rate": 1.1841025641025643e-05, "loss": 0.1014, "step": 1591 }, { "epoch": 20.345047923322685, "grad_norm": 4.770201206207275, "learning_rate": 1.1835897435897437e-05, "loss": 0.1304, "step": 1592 }, { "epoch": 20.357827476038338, "grad_norm": 4.045782566070557, "learning_rate": 1.1830769230769233e-05, "loss": 0.1109, "step": 1593 }, { "epoch": 20.370607028753994, "grad_norm": 4.972194194793701, "learning_rate": 1.1825641025641025e-05, "loss": 0.1055, "step": 1594 }, { "epoch": 20.383386581469647, "grad_norm": 3.656635284423828, "learning_rate": 1.1820512820512821e-05, "loss": 0.0696, "step": 1595 }, { "epoch": 20.396166134185304, "grad_norm": 4.930873394012451, "learning_rate": 1.1815384615384617e-05, "loss": 0.1229, "step": 1596 }, { "epoch": 20.408945686900957, "grad_norm": 3.699876070022583, "learning_rate": 1.181025641025641e-05, "loss": 0.0886, "step": 1597 }, { "epoch": 20.421725239616613, "grad_norm": 5.458634853363037, "learning_rate": 1.1805128205128206e-05, "loss": 0.0778, "step": 1598 }, { "epoch": 20.43450479233227, "grad_norm": 4.316697597503662, "learning_rate": 1.18e-05, "loss": 0.098, "step": 1599 }, { "epoch": 20.447284345047922, "grad_norm": 3.8660194873809814, "learning_rate": 1.1794871794871796e-05, "loss": 0.0971, "step": 1600 }, { "epoch": 20.447284345047922, "eval_loss": 0.6326173543930054, "eval_runtime": 183.9465, "eval_samples_per_second": 0.854, "eval_steps_per_second": 0.109, "step": 1600 }, { "epoch": 20.46006389776358, "grad_norm": 5.642141342163086, "learning_rate": 1.178974358974359e-05, "loss": 0.143, "step": 1601 }, { "epoch": 20.472843450479232, "grad_norm": 6.414536952972412, "learning_rate": 1.1784615384615386e-05, "loss": 0.1125, "step": 1602 }, { "epoch": 20.48562300319489, "grad_norm": 4.429561138153076, "learning_rate": 1.1779487179487182e-05, "loss": 0.1181, "step": 1603 }, { "epoch": 20.498402555910545, "grad_norm": 3.8313069343566895, "learning_rate": 1.1774358974358974e-05, "loss": 0.1016, "step": 1604 }, { "epoch": 20.511182108626198, "grad_norm": 3.912367105484009, "learning_rate": 1.176923076923077e-05, "loss": 0.1038, "step": 1605 }, { "epoch": 20.523961661341854, "grad_norm": 5.56271505355835, "learning_rate": 1.1764102564102564e-05, "loss": 0.1639, "step": 1606 }, { "epoch": 20.536741214057507, "grad_norm": 3.5914433002471924, "learning_rate": 1.175897435897436e-05, "loss": 0.0811, "step": 1607 }, { "epoch": 20.549520766773163, "grad_norm": 5.16848087310791, "learning_rate": 1.1753846153846155e-05, "loss": 0.1301, "step": 1608 }, { "epoch": 20.562300319488816, "grad_norm": 3.407752513885498, "learning_rate": 1.174871794871795e-05, "loss": 0.0736, "step": 1609 }, { "epoch": 20.575079872204473, "grad_norm": 3.251795530319214, "learning_rate": 1.1743589743589745e-05, "loss": 0.0633, "step": 1610 }, { "epoch": 20.58785942492013, "grad_norm": 5.2076029777526855, "learning_rate": 1.173846153846154e-05, "loss": 0.1263, "step": 1611 }, { "epoch": 20.600638977635782, "grad_norm": 5.330387115478516, "learning_rate": 1.1733333333333335e-05, "loss": 0.1103, "step": 1612 }, { "epoch": 20.61341853035144, "grad_norm": 4.368866920471191, "learning_rate": 1.172820512820513e-05, "loss": 0.1167, "step": 1613 }, { "epoch": 20.62619808306709, "grad_norm": 3.9925103187561035, "learning_rate": 1.1723076923076923e-05, "loss": 0.0782, "step": 1614 }, { "epoch": 20.638977635782748, "grad_norm": 3.283320426940918, "learning_rate": 1.1717948717948719e-05, "loss": 0.0994, "step": 1615 }, { "epoch": 20.6517571884984, "grad_norm": 4.990774154663086, "learning_rate": 1.1712820512820513e-05, "loss": 0.1341, "step": 1616 }, { "epoch": 20.664536741214057, "grad_norm": 3.5431063175201416, "learning_rate": 1.1707692307692309e-05, "loss": 0.0923, "step": 1617 }, { "epoch": 20.677316293929714, "grad_norm": 4.684637546539307, "learning_rate": 1.1702564102564104e-05, "loss": 0.084, "step": 1618 }, { "epoch": 20.690095846645367, "grad_norm": 4.183215618133545, "learning_rate": 1.1697435897435899e-05, "loss": 0.101, "step": 1619 }, { "epoch": 20.702875399361023, "grad_norm": 3.7057716846466064, "learning_rate": 1.1692307692307694e-05, "loss": 0.1015, "step": 1620 }, { "epoch": 20.715654952076676, "grad_norm": 5.4087324142456055, "learning_rate": 1.1687179487179488e-05, "loss": 0.1613, "step": 1621 }, { "epoch": 20.728434504792332, "grad_norm": 4.276289463043213, "learning_rate": 1.1682051282051284e-05, "loss": 0.1231, "step": 1622 }, { "epoch": 20.74121405750799, "grad_norm": 6.077336311340332, "learning_rate": 1.1676923076923076e-05, "loss": 0.1098, "step": 1623 }, { "epoch": 20.75399361022364, "grad_norm": 5.829557418823242, "learning_rate": 1.1671794871794872e-05, "loss": 0.193, "step": 1624 }, { "epoch": 20.766773162939298, "grad_norm": 4.161832809448242, "learning_rate": 1.1666666666666668e-05, "loss": 0.0767, "step": 1625 }, { "epoch": 20.77955271565495, "grad_norm": 5.5048441886901855, "learning_rate": 1.1661538461538462e-05, "loss": 0.1275, "step": 1626 }, { "epoch": 20.792332268370608, "grad_norm": 4.068666458129883, "learning_rate": 1.1656410256410258e-05, "loss": 0.1387, "step": 1627 }, { "epoch": 20.80511182108626, "grad_norm": 4.471652507781982, "learning_rate": 1.1651282051282052e-05, "loss": 0.1038, "step": 1628 }, { "epoch": 20.817891373801917, "grad_norm": 4.135310173034668, "learning_rate": 1.1646153846153848e-05, "loss": 0.0855, "step": 1629 }, { "epoch": 20.830670926517573, "grad_norm": 4.095508098602295, "learning_rate": 1.1641025641025643e-05, "loss": 0.117, "step": 1630 }, { "epoch": 20.843450479233226, "grad_norm": 3.997915029525757, "learning_rate": 1.1635897435897436e-05, "loss": 0.1411, "step": 1631 }, { "epoch": 20.856230031948883, "grad_norm": 4.30368709564209, "learning_rate": 1.1630769230769231e-05, "loss": 0.1085, "step": 1632 }, { "epoch": 20.869009584664536, "grad_norm": 4.3267107009887695, "learning_rate": 1.1625641025641025e-05, "loss": 0.1119, "step": 1633 }, { "epoch": 20.881789137380192, "grad_norm": 4.337647438049316, "learning_rate": 1.1620512820512821e-05, "loss": 0.1402, "step": 1634 }, { "epoch": 20.894568690095845, "grad_norm": 4.954765319824219, "learning_rate": 1.1615384615384617e-05, "loss": 0.1718, "step": 1635 }, { "epoch": 20.9073482428115, "grad_norm": 3.596308946609497, "learning_rate": 1.1610256410256411e-05, "loss": 0.0814, "step": 1636 }, { "epoch": 20.920127795527158, "grad_norm": 3.3369619846343994, "learning_rate": 1.1605128205128207e-05, "loss": 0.0947, "step": 1637 }, { "epoch": 20.93290734824281, "grad_norm": 4.8418755531311035, "learning_rate": 1.16e-05, "loss": 0.1547, "step": 1638 }, { "epoch": 20.945686900958467, "grad_norm": 6.589743614196777, "learning_rate": 1.1594871794871796e-05, "loss": 0.1832, "step": 1639 }, { "epoch": 20.95846645367412, "grad_norm": 3.8558123111724854, "learning_rate": 1.1589743589743592e-05, "loss": 0.0987, "step": 1640 }, { "epoch": 20.971246006389777, "grad_norm": 4.8623127937316895, "learning_rate": 1.1584615384615385e-05, "loss": 0.1301, "step": 1641 }, { "epoch": 20.984025559105433, "grad_norm": 4.397773265838623, "learning_rate": 1.157948717948718e-05, "loss": 0.0928, "step": 1642 }, { "epoch": 20.996805111821086, "grad_norm": 4.168147563934326, "learning_rate": 1.1574358974358974e-05, "loss": 0.1086, "step": 1643 }, { "epoch": 21.009584664536742, "grad_norm": 4.462241172790527, "learning_rate": 1.156923076923077e-05, "loss": 0.0797, "step": 1644 }, { "epoch": 21.022364217252395, "grad_norm": 2.952622413635254, "learning_rate": 1.1564102564102566e-05, "loss": 0.0535, "step": 1645 }, { "epoch": 21.03514376996805, "grad_norm": 2.7388765811920166, "learning_rate": 1.155897435897436e-05, "loss": 0.0567, "step": 1646 }, { "epoch": 21.047923322683705, "grad_norm": 4.662555694580078, "learning_rate": 1.1553846153846156e-05, "loss": 0.0913, "step": 1647 }, { "epoch": 21.06070287539936, "grad_norm": 4.469814777374268, "learning_rate": 1.154871794871795e-05, "loss": 0.0873, "step": 1648 }, { "epoch": 21.073482428115017, "grad_norm": 4.687971591949463, "learning_rate": 1.1543589743589745e-05, "loss": 0.1021, "step": 1649 }, { "epoch": 21.08626198083067, "grad_norm": 3.111229181289673, "learning_rate": 1.1538461538461538e-05, "loss": 0.0719, "step": 1650 }, { "epoch": 21.099041533546327, "grad_norm": 4.174816131591797, "learning_rate": 1.1533333333333334e-05, "loss": 0.0751, "step": 1651 }, { "epoch": 21.11182108626198, "grad_norm": 2.788222312927246, "learning_rate": 1.152820512820513e-05, "loss": 0.0743, "step": 1652 }, { "epoch": 21.124600638977636, "grad_norm": 3.829700231552124, "learning_rate": 1.1523076923076923e-05, "loss": 0.0927, "step": 1653 }, { "epoch": 21.13738019169329, "grad_norm": 3.773012399673462, "learning_rate": 1.1517948717948719e-05, "loss": 0.0859, "step": 1654 }, { "epoch": 21.150159744408946, "grad_norm": 3.516838788986206, "learning_rate": 1.1512820512820513e-05, "loss": 0.0993, "step": 1655 }, { "epoch": 21.162939297124602, "grad_norm": 5.086302280426025, "learning_rate": 1.1507692307692309e-05, "loss": 0.0787, "step": 1656 }, { "epoch": 21.175718849840255, "grad_norm": 3.660397529602051, "learning_rate": 1.1502564102564105e-05, "loss": 0.0716, "step": 1657 }, { "epoch": 21.18849840255591, "grad_norm": 3.944607734680176, "learning_rate": 1.1497435897435899e-05, "loss": 0.1091, "step": 1658 }, { "epoch": 21.201277955271564, "grad_norm": 4.121984004974365, "learning_rate": 1.1492307692307694e-05, "loss": 0.0729, "step": 1659 }, { "epoch": 21.21405750798722, "grad_norm": 3.834733009338379, "learning_rate": 1.1487179487179487e-05, "loss": 0.0966, "step": 1660 }, { "epoch": 21.226837060702877, "grad_norm": 4.8407883644104, "learning_rate": 1.1482051282051283e-05, "loss": 0.1234, "step": 1661 }, { "epoch": 21.23961661341853, "grad_norm": 3.637965679168701, "learning_rate": 1.1476923076923078e-05, "loss": 0.1126, "step": 1662 }, { "epoch": 21.252396166134186, "grad_norm": 3.130990743637085, "learning_rate": 1.1471794871794872e-05, "loss": 0.0941, "step": 1663 }, { "epoch": 21.26517571884984, "grad_norm": 5.102894306182861, "learning_rate": 1.1466666666666668e-05, "loss": 0.1326, "step": 1664 }, { "epoch": 21.277955271565496, "grad_norm": 3.611659049987793, "learning_rate": 1.1461538461538462e-05, "loss": 0.0885, "step": 1665 }, { "epoch": 21.29073482428115, "grad_norm": 3.9154715538024902, "learning_rate": 1.1456410256410258e-05, "loss": 0.0991, "step": 1666 }, { "epoch": 21.303514376996805, "grad_norm": 4.346821308135986, "learning_rate": 1.1451282051282054e-05, "loss": 0.1114, "step": 1667 }, { "epoch": 21.31629392971246, "grad_norm": 3.284734010696411, "learning_rate": 1.1446153846153846e-05, "loss": 0.0624, "step": 1668 }, { "epoch": 21.329073482428115, "grad_norm": 3.343594789505005, "learning_rate": 1.1441025641025642e-05, "loss": 0.1077, "step": 1669 }, { "epoch": 21.34185303514377, "grad_norm": 3.939784049987793, "learning_rate": 1.1435897435897436e-05, "loss": 0.1046, "step": 1670 }, { "epoch": 21.354632587859424, "grad_norm": 5.225682258605957, "learning_rate": 1.1430769230769232e-05, "loss": 0.1022, "step": 1671 }, { "epoch": 21.36741214057508, "grad_norm": 3.0708720684051514, "learning_rate": 1.1425641025641026e-05, "loss": 0.065, "step": 1672 }, { "epoch": 21.380191693290733, "grad_norm": 3.394704580307007, "learning_rate": 1.1420512820512821e-05, "loss": 0.0904, "step": 1673 }, { "epoch": 21.39297124600639, "grad_norm": 3.961637496948242, "learning_rate": 1.1415384615384617e-05, "loss": 0.1179, "step": 1674 }, { "epoch": 21.405750798722046, "grad_norm": 4.387579917907715, "learning_rate": 1.1410256410256411e-05, "loss": 0.0879, "step": 1675 }, { "epoch": 21.4185303514377, "grad_norm": 2.8807778358459473, "learning_rate": 1.1405128205128207e-05, "loss": 0.0582, "step": 1676 }, { "epoch": 21.431309904153355, "grad_norm": 3.1875922679901123, "learning_rate": 1.14e-05, "loss": 0.0685, "step": 1677 }, { "epoch": 21.44408945686901, "grad_norm": 3.213134527206421, "learning_rate": 1.1394871794871795e-05, "loss": 0.0933, "step": 1678 }, { "epoch": 21.456869009584665, "grad_norm": 4.480829238891602, "learning_rate": 1.138974358974359e-05, "loss": 0.0779, "step": 1679 }, { "epoch": 21.46964856230032, "grad_norm": 4.427398204803467, "learning_rate": 1.1384615384615385e-05, "loss": 0.1158, "step": 1680 }, { "epoch": 21.482428115015974, "grad_norm": 3.7674429416656494, "learning_rate": 1.137948717948718e-05, "loss": 0.0694, "step": 1681 }, { "epoch": 21.49520766773163, "grad_norm": 3.895766258239746, "learning_rate": 1.1374358974358975e-05, "loss": 0.0984, "step": 1682 }, { "epoch": 21.507987220447284, "grad_norm": 5.089962959289551, "learning_rate": 1.136923076923077e-05, "loss": 0.126, "step": 1683 }, { "epoch": 21.52076677316294, "grad_norm": 3.1773629188537598, "learning_rate": 1.1364102564102566e-05, "loss": 0.1193, "step": 1684 }, { "epoch": 21.533546325878593, "grad_norm": 4.2216949462890625, "learning_rate": 1.135897435897436e-05, "loss": 0.0929, "step": 1685 }, { "epoch": 21.54632587859425, "grad_norm": 3.353468179702759, "learning_rate": 1.1353846153846156e-05, "loss": 0.0954, "step": 1686 }, { "epoch": 21.559105431309906, "grad_norm": 4.303814888000488, "learning_rate": 1.1348717948717948e-05, "loss": 0.0674, "step": 1687 }, { "epoch": 21.57188498402556, "grad_norm": 4.027798175811768, "learning_rate": 1.1343589743589744e-05, "loss": 0.0724, "step": 1688 }, { "epoch": 21.584664536741215, "grad_norm": 2.9565556049346924, "learning_rate": 1.133846153846154e-05, "loss": 0.0624, "step": 1689 }, { "epoch": 21.597444089456868, "grad_norm": 4.148669242858887, "learning_rate": 1.1333333333333334e-05, "loss": 0.1211, "step": 1690 }, { "epoch": 21.610223642172524, "grad_norm": 5.265037536621094, "learning_rate": 1.132820512820513e-05, "loss": 0.1015, "step": 1691 }, { "epoch": 21.623003194888177, "grad_norm": 4.325460910797119, "learning_rate": 1.1323076923076924e-05, "loss": 0.1633, "step": 1692 }, { "epoch": 21.635782747603834, "grad_norm": 5.78082275390625, "learning_rate": 1.131794871794872e-05, "loss": 0.1113, "step": 1693 }, { "epoch": 21.64856230031949, "grad_norm": 4.066515922546387, "learning_rate": 1.1312820512820515e-05, "loss": 0.0874, "step": 1694 }, { "epoch": 21.661341853035143, "grad_norm": 4.018939018249512, "learning_rate": 1.1307692307692309e-05, "loss": 0.0933, "step": 1695 }, { "epoch": 21.6741214057508, "grad_norm": 2.7681217193603516, "learning_rate": 1.1302564102564105e-05, "loss": 0.0521, "step": 1696 }, { "epoch": 21.686900958466452, "grad_norm": 4.680244445800781, "learning_rate": 1.1297435897435897e-05, "loss": 0.1296, "step": 1697 }, { "epoch": 21.69968051118211, "grad_norm": 3.169238805770874, "learning_rate": 1.1292307692307693e-05, "loss": 0.0533, "step": 1698 }, { "epoch": 21.712460063897765, "grad_norm": 4.816842555999756, "learning_rate": 1.1287179487179487e-05, "loss": 0.1525, "step": 1699 }, { "epoch": 21.72523961661342, "grad_norm": 3.364724636077881, "learning_rate": 1.1282051282051283e-05, "loss": 0.0825, "step": 1700 }, { "epoch": 21.72523961661342, "eval_loss": 0.680626630783081, "eval_runtime": 184.1048, "eval_samples_per_second": 0.853, "eval_steps_per_second": 0.109, "step": 1700 }, { "epoch": 21.738019169329075, "grad_norm": 6.44060754776001, "learning_rate": 1.1276923076923079e-05, "loss": 0.1274, "step": 1701 }, { "epoch": 21.750798722044728, "grad_norm": 7.465785026550293, "learning_rate": 1.1271794871794873e-05, "loss": 0.0803, "step": 1702 }, { "epoch": 21.763578274760384, "grad_norm": 4.474442481994629, "learning_rate": 1.1266666666666668e-05, "loss": 0.0981, "step": 1703 }, { "epoch": 21.776357827476037, "grad_norm": 3.418708324432373, "learning_rate": 1.126153846153846e-05, "loss": 0.084, "step": 1704 }, { "epoch": 21.789137380191693, "grad_norm": 3.9616029262542725, "learning_rate": 1.1256410256410258e-05, "loss": 0.0726, "step": 1705 }, { "epoch": 21.80191693290735, "grad_norm": 6.344253063201904, "learning_rate": 1.1251282051282054e-05, "loss": 0.1246, "step": 1706 }, { "epoch": 21.814696485623003, "grad_norm": 3.4718990325927734, "learning_rate": 1.1246153846153846e-05, "loss": 0.0749, "step": 1707 }, { "epoch": 21.82747603833866, "grad_norm": 2.6628403663635254, "learning_rate": 1.1241025641025642e-05, "loss": 0.067, "step": 1708 }, { "epoch": 21.840255591054312, "grad_norm": 5.664834976196289, "learning_rate": 1.1235897435897436e-05, "loss": 0.1274, "step": 1709 }, { "epoch": 21.85303514376997, "grad_norm": 4.317172527313232, "learning_rate": 1.1230769230769232e-05, "loss": 0.0867, "step": 1710 }, { "epoch": 21.86581469648562, "grad_norm": 4.82378625869751, "learning_rate": 1.1225641025641028e-05, "loss": 0.0951, "step": 1711 }, { "epoch": 21.878594249201278, "grad_norm": 4.036247253417969, "learning_rate": 1.1220512820512822e-05, "loss": 0.1049, "step": 1712 }, { "epoch": 21.891373801916934, "grad_norm": 4.39935302734375, "learning_rate": 1.1215384615384617e-05, "loss": 0.1128, "step": 1713 }, { "epoch": 21.904153354632587, "grad_norm": 4.117251396179199, "learning_rate": 1.121025641025641e-05, "loss": 0.0852, "step": 1714 }, { "epoch": 21.916932907348244, "grad_norm": 4.607010841369629, "learning_rate": 1.1205128205128205e-05, "loss": 0.1085, "step": 1715 }, { "epoch": 21.929712460063897, "grad_norm": 3.569995641708374, "learning_rate": 1.1200000000000001e-05, "loss": 0.0757, "step": 1716 }, { "epoch": 21.942492012779553, "grad_norm": 3.508486747741699, "learning_rate": 1.1194871794871795e-05, "loss": 0.0843, "step": 1717 }, { "epoch": 21.955271565495206, "grad_norm": 3.5567374229431152, "learning_rate": 1.1189743589743591e-05, "loss": 0.0935, "step": 1718 }, { "epoch": 21.968051118210862, "grad_norm": 4.282688617706299, "learning_rate": 1.1184615384615385e-05, "loss": 0.1008, "step": 1719 }, { "epoch": 21.98083067092652, "grad_norm": 3.526252508163452, "learning_rate": 1.117948717948718e-05, "loss": 0.0816, "step": 1720 }, { "epoch": 21.99361022364217, "grad_norm": 3.7250707149505615, "learning_rate": 1.1174358974358975e-05, "loss": 0.0985, "step": 1721 }, { "epoch": 22.00638977635783, "grad_norm": 3.0080602169036865, "learning_rate": 1.116923076923077e-05, "loss": 0.055, "step": 1722 }, { "epoch": 22.01916932907348, "grad_norm": 3.0814597606658936, "learning_rate": 1.1164102564102566e-05, "loss": 0.0546, "step": 1723 }, { "epoch": 22.031948881789138, "grad_norm": 2.9059879779815674, "learning_rate": 1.1158974358974359e-05, "loss": 0.055, "step": 1724 }, { "epoch": 22.044728434504794, "grad_norm": 2.3464226722717285, "learning_rate": 1.1153846153846154e-05, "loss": 0.0488, "step": 1725 }, { "epoch": 22.057507987220447, "grad_norm": 3.984386444091797, "learning_rate": 1.1148717948717948e-05, "loss": 0.0724, "step": 1726 }, { "epoch": 22.070287539936103, "grad_norm": 3.7431716918945312, "learning_rate": 1.1143589743589744e-05, "loss": 0.0606, "step": 1727 }, { "epoch": 22.083067092651756, "grad_norm": 2.889275074005127, "learning_rate": 1.113846153846154e-05, "loss": 0.0478, "step": 1728 }, { "epoch": 22.095846645367413, "grad_norm": 4.641188621520996, "learning_rate": 1.1133333333333334e-05, "loss": 0.1025, "step": 1729 }, { "epoch": 22.108626198083066, "grad_norm": 3.6719138622283936, "learning_rate": 1.112820512820513e-05, "loss": 0.0824, "step": 1730 }, { "epoch": 22.121405750798722, "grad_norm": 4.5112810134887695, "learning_rate": 1.1123076923076924e-05, "loss": 0.0888, "step": 1731 }, { "epoch": 22.13418530351438, "grad_norm": 3.478308916091919, "learning_rate": 1.111794871794872e-05, "loss": 0.0689, "step": 1732 }, { "epoch": 22.14696485623003, "grad_norm": 2.2676029205322266, "learning_rate": 1.1112820512820515e-05, "loss": 0.0407, "step": 1733 }, { "epoch": 22.159744408945688, "grad_norm": 3.8547441959381104, "learning_rate": 1.1107692307692308e-05, "loss": 0.0598, "step": 1734 }, { "epoch": 22.17252396166134, "grad_norm": 3.650141716003418, "learning_rate": 1.1102564102564103e-05, "loss": 0.08, "step": 1735 }, { "epoch": 22.185303514376997, "grad_norm": 3.7249064445495605, "learning_rate": 1.1097435897435897e-05, "loss": 0.0573, "step": 1736 }, { "epoch": 22.19808306709265, "grad_norm": 2.4170174598693848, "learning_rate": 1.1092307692307693e-05, "loss": 0.0462, "step": 1737 }, { "epoch": 22.210862619808307, "grad_norm": 3.232990026473999, "learning_rate": 1.1087179487179489e-05, "loss": 0.0778, "step": 1738 }, { "epoch": 22.223642172523963, "grad_norm": 3.2713253498077393, "learning_rate": 1.1082051282051283e-05, "loss": 0.0375, "step": 1739 }, { "epoch": 22.236421725239616, "grad_norm": 2.955406427383423, "learning_rate": 1.1076923076923079e-05, "loss": 0.0592, "step": 1740 }, { "epoch": 22.249201277955272, "grad_norm": 3.5802533626556396, "learning_rate": 1.1071794871794873e-05, "loss": 0.072, "step": 1741 }, { "epoch": 22.261980830670925, "grad_norm": 4.030676364898682, "learning_rate": 1.1066666666666669e-05, "loss": 0.0934, "step": 1742 }, { "epoch": 22.27476038338658, "grad_norm": 4.585970401763916, "learning_rate": 1.1061538461538461e-05, "loss": 0.0751, "step": 1743 }, { "epoch": 22.287539936102238, "grad_norm": 3.766411781311035, "learning_rate": 1.1056410256410257e-05, "loss": 0.0828, "step": 1744 }, { "epoch": 22.30031948881789, "grad_norm": 2.6844403743743896, "learning_rate": 1.1051282051282052e-05, "loss": 0.0778, "step": 1745 }, { "epoch": 22.313099041533548, "grad_norm": 2.612366199493408, "learning_rate": 1.1046153846153846e-05, "loss": 0.0561, "step": 1746 }, { "epoch": 22.3258785942492, "grad_norm": 3.555194854736328, "learning_rate": 1.1041025641025642e-05, "loss": 0.0822, "step": 1747 }, { "epoch": 22.338658146964857, "grad_norm": 3.7457146644592285, "learning_rate": 1.1035897435897436e-05, "loss": 0.1377, "step": 1748 }, { "epoch": 22.35143769968051, "grad_norm": 3.957167625427246, "learning_rate": 1.1030769230769232e-05, "loss": 0.0838, "step": 1749 }, { "epoch": 22.364217252396166, "grad_norm": 3.65297794342041, "learning_rate": 1.1025641025641028e-05, "loss": 0.0689, "step": 1750 }, { "epoch": 22.376996805111823, "grad_norm": 7.409717082977295, "learning_rate": 1.102051282051282e-05, "loss": 0.1203, "step": 1751 }, { "epoch": 22.389776357827476, "grad_norm": 5.040288925170898, "learning_rate": 1.1015384615384616e-05, "loss": 0.0896, "step": 1752 }, { "epoch": 22.402555910543132, "grad_norm": 3.3070826530456543, "learning_rate": 1.101025641025641e-05, "loss": 0.0815, "step": 1753 }, { "epoch": 22.415335463258785, "grad_norm": 4.675379276275635, "learning_rate": 1.1005128205128206e-05, "loss": 0.0757, "step": 1754 }, { "epoch": 22.42811501597444, "grad_norm": 3.5492966175079346, "learning_rate": 1.1000000000000001e-05, "loss": 0.0759, "step": 1755 }, { "epoch": 22.440894568690094, "grad_norm": 2.287569999694824, "learning_rate": 1.0994871794871795e-05, "loss": 0.0337, "step": 1756 }, { "epoch": 22.45367412140575, "grad_norm": 4.1564764976501465, "learning_rate": 1.0989743589743591e-05, "loss": 0.0822, "step": 1757 }, { "epoch": 22.466453674121407, "grad_norm": 3.486571788787842, "learning_rate": 1.0984615384615385e-05, "loss": 0.0822, "step": 1758 }, { "epoch": 22.47923322683706, "grad_norm": 7.400767803192139, "learning_rate": 1.0979487179487181e-05, "loss": 0.1447, "step": 1759 }, { "epoch": 22.492012779552716, "grad_norm": 3.0589351654052734, "learning_rate": 1.0974358974358977e-05, "loss": 0.0506, "step": 1760 }, { "epoch": 22.50479233226837, "grad_norm": 2.8320963382720947, "learning_rate": 1.0969230769230769e-05, "loss": 0.0386, "step": 1761 }, { "epoch": 22.517571884984026, "grad_norm": 2.8496134281158447, "learning_rate": 1.0964102564102565e-05, "loss": 0.0607, "step": 1762 }, { "epoch": 22.53035143769968, "grad_norm": 3.732996702194214, "learning_rate": 1.0958974358974359e-05, "loss": 0.0981, "step": 1763 }, { "epoch": 22.543130990415335, "grad_norm": 4.311052322387695, "learning_rate": 1.0953846153846155e-05, "loss": 0.1022, "step": 1764 }, { "epoch": 22.55591054313099, "grad_norm": 3.167752504348755, "learning_rate": 1.094871794871795e-05, "loss": 0.0496, "step": 1765 }, { "epoch": 22.568690095846645, "grad_norm": 3.640716314315796, "learning_rate": 1.0943589743589744e-05, "loss": 0.0579, "step": 1766 }, { "epoch": 22.5814696485623, "grad_norm": 3.016212224960327, "learning_rate": 1.093846153846154e-05, "loss": 0.0725, "step": 1767 }, { "epoch": 22.594249201277954, "grad_norm": 3.809197187423706, "learning_rate": 1.0933333333333334e-05, "loss": 0.0548, "step": 1768 }, { "epoch": 22.60702875399361, "grad_norm": 3.489170789718628, "learning_rate": 1.092820512820513e-05, "loss": 0.0876, "step": 1769 }, { "epoch": 22.619808306709267, "grad_norm": 4.824202537536621, "learning_rate": 1.0923076923076922e-05, "loss": 0.0878, "step": 1770 }, { "epoch": 22.63258785942492, "grad_norm": 2.8183517456054688, "learning_rate": 1.0917948717948718e-05, "loss": 0.0377, "step": 1771 }, { "epoch": 22.645367412140576, "grad_norm": 3.3342647552490234, "learning_rate": 1.0912820512820514e-05, "loss": 0.0507, "step": 1772 }, { "epoch": 22.65814696485623, "grad_norm": 11.420624732971191, "learning_rate": 1.0907692307692308e-05, "loss": 0.117, "step": 1773 }, { "epoch": 22.670926517571885, "grad_norm": 5.182732582092285, "learning_rate": 1.0902564102564104e-05, "loss": 0.0992, "step": 1774 }, { "epoch": 22.68370607028754, "grad_norm": 3.9871864318847656, "learning_rate": 1.0897435897435898e-05, "loss": 0.063, "step": 1775 }, { "epoch": 22.696485623003195, "grad_norm": 3.9142467975616455, "learning_rate": 1.0892307692307693e-05, "loss": 0.057, "step": 1776 }, { "epoch": 22.70926517571885, "grad_norm": 2.7767791748046875, "learning_rate": 1.088717948717949e-05, "loss": 0.0532, "step": 1777 }, { "epoch": 22.722044728434504, "grad_norm": 9.83389949798584, "learning_rate": 1.0882051282051283e-05, "loss": 0.1355, "step": 1778 }, { "epoch": 22.73482428115016, "grad_norm": 3.4298386573791504, "learning_rate": 1.0876923076923079e-05, "loss": 0.0562, "step": 1779 }, { "epoch": 22.747603833865814, "grad_norm": 4.0902485847473145, "learning_rate": 1.0871794871794871e-05, "loss": 0.1059, "step": 1780 }, { "epoch": 22.76038338658147, "grad_norm": 2.769785165786743, "learning_rate": 1.0866666666666667e-05, "loss": 0.063, "step": 1781 }, { "epoch": 22.773162939297123, "grad_norm": 4.616344928741455, "learning_rate": 1.0861538461538463e-05, "loss": 0.0676, "step": 1782 }, { "epoch": 22.78594249201278, "grad_norm": 5.222856521606445, "learning_rate": 1.0856410256410257e-05, "loss": 0.0986, "step": 1783 }, { "epoch": 22.798722044728436, "grad_norm": 3.8730037212371826, "learning_rate": 1.0851282051282053e-05, "loss": 0.0627, "step": 1784 }, { "epoch": 22.81150159744409, "grad_norm": 4.0193681716918945, "learning_rate": 1.0846153846153847e-05, "loss": 0.0955, "step": 1785 }, { "epoch": 22.824281150159745, "grad_norm": 4.4240007400512695, "learning_rate": 1.0841025641025642e-05, "loss": 0.0901, "step": 1786 }, { "epoch": 22.837060702875398, "grad_norm": 3.280074119567871, "learning_rate": 1.0835897435897438e-05, "loss": 0.0732, "step": 1787 }, { "epoch": 22.849840255591054, "grad_norm": 3.105015993118286, "learning_rate": 1.083076923076923e-05, "loss": 0.0615, "step": 1788 }, { "epoch": 22.86261980830671, "grad_norm": 5.285585403442383, "learning_rate": 1.0825641025641028e-05, "loss": 0.0824, "step": 1789 }, { "epoch": 22.875399361022364, "grad_norm": 6.821863174438477, "learning_rate": 1.082051282051282e-05, "loss": 0.0929, "step": 1790 }, { "epoch": 22.88817891373802, "grad_norm": 7.419590473175049, "learning_rate": 1.0815384615384616e-05, "loss": 0.0764, "step": 1791 }, { "epoch": 22.900958466453673, "grad_norm": 6.400094032287598, "learning_rate": 1.081025641025641e-05, "loss": 0.1308, "step": 1792 }, { "epoch": 22.91373801916933, "grad_norm": 4.24391508102417, "learning_rate": 1.0805128205128206e-05, "loss": 0.0671, "step": 1793 }, { "epoch": 22.926517571884983, "grad_norm": 4.861712455749512, "learning_rate": 1.0800000000000002e-05, "loss": 0.0779, "step": 1794 }, { "epoch": 22.93929712460064, "grad_norm": 10.362529754638672, "learning_rate": 1.0794871794871796e-05, "loss": 0.0948, "step": 1795 }, { "epoch": 22.952076677316295, "grad_norm": 3.3236842155456543, "learning_rate": 1.0789743589743591e-05, "loss": 0.0666, "step": 1796 }, { "epoch": 22.96485623003195, "grad_norm": 4.896422863006592, "learning_rate": 1.0784615384615384e-05, "loss": 0.0983, "step": 1797 }, { "epoch": 22.977635782747605, "grad_norm": 3.114579916000366, "learning_rate": 1.077948717948718e-05, "loss": 0.0882, "step": 1798 }, { "epoch": 22.990415335463258, "grad_norm": 3.008934497833252, "learning_rate": 1.0774358974358975e-05, "loss": 0.0734, "step": 1799 }, { "epoch": 23.003194888178914, "grad_norm": 4.6161208152771, "learning_rate": 1.076923076923077e-05, "loss": 0.087, "step": 1800 }, { "epoch": 23.003194888178914, "eval_loss": 0.7027307748794556, "eval_runtime": 183.6326, "eval_samples_per_second": 0.855, "eval_steps_per_second": 0.109, "step": 1800 }, { "epoch": 23.015974440894567, "grad_norm": 2.4176926612854004, "learning_rate": 1.0764102564102565e-05, "loss": 0.0332, "step": 1801 }, { "epoch": 23.028753993610223, "grad_norm": 5.2224321365356445, "learning_rate": 1.0758974358974359e-05, "loss": 0.0607, "step": 1802 }, { "epoch": 23.04153354632588, "grad_norm": 3.5653960704803467, "learning_rate": 1.0753846153846155e-05, "loss": 0.0661, "step": 1803 }, { "epoch": 23.054313099041533, "grad_norm": 5.087371349334717, "learning_rate": 1.074871794871795e-05, "loss": 0.0714, "step": 1804 }, { "epoch": 23.06709265175719, "grad_norm": 2.5737032890319824, "learning_rate": 1.0743589743589745e-05, "loss": 0.0641, "step": 1805 }, { "epoch": 23.079872204472842, "grad_norm": 5.5952301025390625, "learning_rate": 1.073846153846154e-05, "loss": 0.0606, "step": 1806 }, { "epoch": 23.0926517571885, "grad_norm": 2.490704298019409, "learning_rate": 1.0733333333333333e-05, "loss": 0.0488, "step": 1807 }, { "epoch": 23.105431309904155, "grad_norm": 2.7343122959136963, "learning_rate": 1.0728205128205129e-05, "loss": 0.0427, "step": 1808 }, { "epoch": 23.118210862619808, "grad_norm": 2.6564457416534424, "learning_rate": 1.0723076923076924e-05, "loss": 0.0425, "step": 1809 }, { "epoch": 23.130990415335464, "grad_norm": 2.8839149475097656, "learning_rate": 1.0717948717948718e-05, "loss": 0.0586, "step": 1810 }, { "epoch": 23.143769968051117, "grad_norm": 2.411672592163086, "learning_rate": 1.0712820512820514e-05, "loss": 0.055, "step": 1811 }, { "epoch": 23.156549520766774, "grad_norm": 4.945863723754883, "learning_rate": 1.0707692307692308e-05, "loss": 0.0763, "step": 1812 }, { "epoch": 23.169329073482427, "grad_norm": 4.193752288818359, "learning_rate": 1.0702564102564104e-05, "loss": 0.0667, "step": 1813 }, { "epoch": 23.182108626198083, "grad_norm": 3.646803140640259, "learning_rate": 1.06974358974359e-05, "loss": 0.0677, "step": 1814 }, { "epoch": 23.19488817891374, "grad_norm": 2.4909188747406006, "learning_rate": 1.0692307692307694e-05, "loss": 0.0541, "step": 1815 }, { "epoch": 23.207667731629392, "grad_norm": 4.249711513519287, "learning_rate": 1.068717948717949e-05, "loss": 0.0685, "step": 1816 }, { "epoch": 23.22044728434505, "grad_norm": 3.983074188232422, "learning_rate": 1.0682051282051282e-05, "loss": 0.0587, "step": 1817 }, { "epoch": 23.233226837060702, "grad_norm": 3.1439638137817383, "learning_rate": 1.0676923076923078e-05, "loss": 0.0511, "step": 1818 }, { "epoch": 23.24600638977636, "grad_norm": 2.408771514892578, "learning_rate": 1.0671794871794872e-05, "loss": 0.0319, "step": 1819 }, { "epoch": 23.25878594249201, "grad_norm": 3.978127956390381, "learning_rate": 1.0666666666666667e-05, "loss": 0.0511, "step": 1820 }, { "epoch": 23.271565495207668, "grad_norm": 3.7737467288970947, "learning_rate": 1.0661538461538463e-05, "loss": 0.0651, "step": 1821 }, { "epoch": 23.284345047923324, "grad_norm": 2.5543150901794434, "learning_rate": 1.0656410256410257e-05, "loss": 0.0324, "step": 1822 }, { "epoch": 23.297124600638977, "grad_norm": 2.3911960124969482, "learning_rate": 1.0651282051282053e-05, "loss": 0.0446, "step": 1823 }, { "epoch": 23.309904153354633, "grad_norm": 3.72542405128479, "learning_rate": 1.0646153846153845e-05, "loss": 0.069, "step": 1824 }, { "epoch": 23.322683706070286, "grad_norm": 6.744537830352783, "learning_rate": 1.0641025641025643e-05, "loss": 0.0569, "step": 1825 }, { "epoch": 23.335463258785943, "grad_norm": 2.999110460281372, "learning_rate": 1.0635897435897438e-05, "loss": 0.0586, "step": 1826 }, { "epoch": 23.3482428115016, "grad_norm": 2.447598695755005, "learning_rate": 1.063076923076923e-05, "loss": 0.043, "step": 1827 }, { "epoch": 23.361022364217252, "grad_norm": 3.7535173892974854, "learning_rate": 1.0625641025641027e-05, "loss": 0.1043, "step": 1828 }, { "epoch": 23.37380191693291, "grad_norm": 4.816366672515869, "learning_rate": 1.062051282051282e-05, "loss": 0.0893, "step": 1829 }, { "epoch": 23.38658146964856, "grad_norm": 4.106537342071533, "learning_rate": 1.0615384615384616e-05, "loss": 0.0899, "step": 1830 }, { "epoch": 23.399361022364218, "grad_norm": 4.535812854766846, "learning_rate": 1.0610256410256412e-05, "loss": 0.0678, "step": 1831 }, { "epoch": 23.41214057507987, "grad_norm": 9.053688049316406, "learning_rate": 1.0605128205128206e-05, "loss": 0.0905, "step": 1832 }, { "epoch": 23.424920127795527, "grad_norm": 8.244091033935547, "learning_rate": 1.0600000000000002e-05, "loss": 0.0627, "step": 1833 }, { "epoch": 23.437699680511184, "grad_norm": 5.316174030303955, "learning_rate": 1.0594871794871794e-05, "loss": 0.0652, "step": 1834 }, { "epoch": 23.450479233226837, "grad_norm": 4.909565448760986, "learning_rate": 1.058974358974359e-05, "loss": 0.0836, "step": 1835 }, { "epoch": 23.463258785942493, "grad_norm": 4.151371955871582, "learning_rate": 1.0584615384615386e-05, "loss": 0.0482, "step": 1836 }, { "epoch": 23.476038338658146, "grad_norm": 3.2326231002807617, "learning_rate": 1.057948717948718e-05, "loss": 0.0522, "step": 1837 }, { "epoch": 23.488817891373802, "grad_norm": 2.452183723449707, "learning_rate": 1.0574358974358975e-05, "loss": 0.042, "step": 1838 }, { "epoch": 23.501597444089455, "grad_norm": 2.3406856060028076, "learning_rate": 1.056923076923077e-05, "loss": 0.0421, "step": 1839 }, { "epoch": 23.51437699680511, "grad_norm": 4.51849889755249, "learning_rate": 1.0564102564102565e-05, "loss": 0.0851, "step": 1840 }, { "epoch": 23.527156549520768, "grad_norm": 4.629650115966797, "learning_rate": 1.055897435897436e-05, "loss": 0.0669, "step": 1841 }, { "epoch": 23.53993610223642, "grad_norm": 7.944886207580566, "learning_rate": 1.0553846153846155e-05, "loss": 0.1098, "step": 1842 }, { "epoch": 23.552715654952078, "grad_norm": 5.454171180725098, "learning_rate": 1.054871794871795e-05, "loss": 0.1067, "step": 1843 }, { "epoch": 23.56549520766773, "grad_norm": 4.480544090270996, "learning_rate": 1.0543589743589743e-05, "loss": 0.0768, "step": 1844 }, { "epoch": 23.578274760383387, "grad_norm": 4.150940895080566, "learning_rate": 1.0538461538461539e-05, "loss": 0.0517, "step": 1845 }, { "epoch": 23.591054313099043, "grad_norm": 3.9581117630004883, "learning_rate": 1.0533333333333333e-05, "loss": 0.0553, "step": 1846 }, { "epoch": 23.603833865814696, "grad_norm": 2.3760437965393066, "learning_rate": 1.0528205128205129e-05, "loss": 0.0593, "step": 1847 }, { "epoch": 23.616613418530353, "grad_norm": 3.2939131259918213, "learning_rate": 1.0523076923076924e-05, "loss": 0.0452, "step": 1848 }, { "epoch": 23.629392971246006, "grad_norm": 3.511467218399048, "learning_rate": 1.0517948717948719e-05, "loss": 0.0528, "step": 1849 }, { "epoch": 23.642172523961662, "grad_norm": 4.303265571594238, "learning_rate": 1.0512820512820514e-05, "loss": 0.0739, "step": 1850 }, { "epoch": 23.654952076677315, "grad_norm": 4.462754726409912, "learning_rate": 1.0507692307692308e-05, "loss": 0.0698, "step": 1851 }, { "epoch": 23.66773162939297, "grad_norm": 2.4054338932037354, "learning_rate": 1.0502564102564104e-05, "loss": 0.0327, "step": 1852 }, { "epoch": 23.680511182108628, "grad_norm": 3.435103178024292, "learning_rate": 1.04974358974359e-05, "loss": 0.0523, "step": 1853 }, { "epoch": 23.69329073482428, "grad_norm": 3.345883846282959, "learning_rate": 1.0492307692307692e-05, "loss": 0.0894, "step": 1854 }, { "epoch": 23.706070287539937, "grad_norm": 3.384603500366211, "learning_rate": 1.0487179487179488e-05, "loss": 0.0569, "step": 1855 }, { "epoch": 23.71884984025559, "grad_norm": 3.4234015941619873, "learning_rate": 1.0482051282051282e-05, "loss": 0.039, "step": 1856 }, { "epoch": 23.731629392971247, "grad_norm": 3.0080175399780273, "learning_rate": 1.0476923076923078e-05, "loss": 0.0455, "step": 1857 }, { "epoch": 23.7444089456869, "grad_norm": 3.3850762844085693, "learning_rate": 1.0471794871794873e-05, "loss": 0.0846, "step": 1858 }, { "epoch": 23.757188498402556, "grad_norm": 5.632648468017578, "learning_rate": 1.0466666666666668e-05, "loss": 0.0632, "step": 1859 }, { "epoch": 23.769968051118212, "grad_norm": 4.267935276031494, "learning_rate": 1.0461538461538463e-05, "loss": 0.0674, "step": 1860 }, { "epoch": 23.782747603833865, "grad_norm": 4.575784206390381, "learning_rate": 1.0456410256410257e-05, "loss": 0.0822, "step": 1861 }, { "epoch": 23.79552715654952, "grad_norm": 7.271210193634033, "learning_rate": 1.0451282051282053e-05, "loss": 0.1074, "step": 1862 }, { "epoch": 23.808306709265175, "grad_norm": 4.030458927154541, "learning_rate": 1.0446153846153849e-05, "loss": 0.0714, "step": 1863 }, { "epoch": 23.82108626198083, "grad_norm": 5.308885097503662, "learning_rate": 1.0441025641025641e-05, "loss": 0.0888, "step": 1864 }, { "epoch": 23.833865814696484, "grad_norm": 5.200422763824463, "learning_rate": 1.0435897435897437e-05, "loss": 0.0694, "step": 1865 }, { "epoch": 23.84664536741214, "grad_norm": 2.9071249961853027, "learning_rate": 1.0430769230769231e-05, "loss": 0.0497, "step": 1866 }, { "epoch": 23.859424920127797, "grad_norm": 3.448936939239502, "learning_rate": 1.0425641025641027e-05, "loss": 0.0596, "step": 1867 }, { "epoch": 23.87220447284345, "grad_norm": 5.066628456115723, "learning_rate": 1.042051282051282e-05, "loss": 0.1144, "step": 1868 }, { "epoch": 23.884984025559106, "grad_norm": 3.3270418643951416, "learning_rate": 1.0415384615384617e-05, "loss": 0.0448, "step": 1869 }, { "epoch": 23.89776357827476, "grad_norm": 4.035781383514404, "learning_rate": 1.0410256410256412e-05, "loss": 0.036, "step": 1870 }, { "epoch": 23.910543130990416, "grad_norm": 4.189550399780273, "learning_rate": 1.0405128205128205e-05, "loss": 0.0761, "step": 1871 }, { "epoch": 23.923322683706072, "grad_norm": 4.90827751159668, "learning_rate": 1.04e-05, "loss": 0.057, "step": 1872 }, { "epoch": 23.936102236421725, "grad_norm": 6.179035186767578, "learning_rate": 1.0394871794871794e-05, "loss": 0.071, "step": 1873 }, { "epoch": 23.94888178913738, "grad_norm": 4.103394508361816, "learning_rate": 1.038974358974359e-05, "loss": 0.0509, "step": 1874 }, { "epoch": 23.961661341853034, "grad_norm": 3.149423599243164, "learning_rate": 1.0384615384615386e-05, "loss": 0.0585, "step": 1875 }, { "epoch": 23.97444089456869, "grad_norm": 7.171144485473633, "learning_rate": 1.037948717948718e-05, "loss": 0.1216, "step": 1876 }, { "epoch": 23.987220447284344, "grad_norm": 4.05762243270874, "learning_rate": 1.0374358974358976e-05, "loss": 0.0716, "step": 1877 }, { "epoch": 24.0, "grad_norm": 2.2400803565979004, "learning_rate": 1.036923076923077e-05, "loss": 0.0421, "step": 1878 }, { "epoch": 24.012779552715656, "grad_norm": 4.6594390869140625, "learning_rate": 1.0364102564102566e-05, "loss": 0.0667, "step": 1879 }, { "epoch": 24.02555910543131, "grad_norm": 2.486776351928711, "learning_rate": 1.0358974358974361e-05, "loss": 0.0367, "step": 1880 }, { "epoch": 24.038338658146966, "grad_norm": 2.264676332473755, "learning_rate": 1.0353846153846154e-05, "loss": 0.0314, "step": 1881 }, { "epoch": 24.05111821086262, "grad_norm": 3.9006893634796143, "learning_rate": 1.034871794871795e-05, "loss": 0.0685, "step": 1882 }, { "epoch": 24.063897763578275, "grad_norm": 7.912015438079834, "learning_rate": 1.0343589743589743e-05, "loss": 0.0558, "step": 1883 }, { "epoch": 24.076677316293928, "grad_norm": 3.379222869873047, "learning_rate": 1.033846153846154e-05, "loss": 0.0548, "step": 1884 }, { "epoch": 24.089456869009584, "grad_norm": 3.533841848373413, "learning_rate": 1.0333333333333335e-05, "loss": 0.0473, "step": 1885 }, { "epoch": 24.10223642172524, "grad_norm": 6.102855205535889, "learning_rate": 1.0328205128205129e-05, "loss": 0.0969, "step": 1886 }, { "epoch": 24.115015974440894, "grad_norm": 2.1626064777374268, "learning_rate": 1.0323076923076925e-05, "loss": 0.0393, "step": 1887 }, { "epoch": 24.12779552715655, "grad_norm": 2.1872377395629883, "learning_rate": 1.0317948717948719e-05, "loss": 0.0258, "step": 1888 }, { "epoch": 24.140575079872203, "grad_norm": 3.058176279067993, "learning_rate": 1.0312820512820515e-05, "loss": 0.0401, "step": 1889 }, { "epoch": 24.15335463258786, "grad_norm": 1.8977614641189575, "learning_rate": 1.0307692307692307e-05, "loss": 0.0352, "step": 1890 }, { "epoch": 24.166134185303516, "grad_norm": 3.599254846572876, "learning_rate": 1.0302564102564103e-05, "loss": 0.0491, "step": 1891 }, { "epoch": 24.17891373801917, "grad_norm": 4.911500453948975, "learning_rate": 1.0297435897435898e-05, "loss": 0.0512, "step": 1892 }, { "epoch": 24.191693290734825, "grad_norm": 3.6773841381073, "learning_rate": 1.0292307692307692e-05, "loss": 0.0503, "step": 1893 }, { "epoch": 24.20447284345048, "grad_norm": 2.7655606269836426, "learning_rate": 1.0287179487179488e-05, "loss": 0.0337, "step": 1894 }, { "epoch": 24.217252396166135, "grad_norm": 3.185929298400879, "learning_rate": 1.0282051282051282e-05, "loss": 0.0486, "step": 1895 }, { "epoch": 24.230031948881788, "grad_norm": 2.5345399379730225, "learning_rate": 1.0276923076923078e-05, "loss": 0.0418, "step": 1896 }, { "epoch": 24.242811501597444, "grad_norm": 3.7780027389526367, "learning_rate": 1.0271794871794874e-05, "loss": 0.0473, "step": 1897 }, { "epoch": 24.2555910543131, "grad_norm": 4.549624919891357, "learning_rate": 1.0266666666666668e-05, "loss": 0.0657, "step": 1898 }, { "epoch": 24.268370607028753, "grad_norm": 2.380779266357422, "learning_rate": 1.0261538461538464e-05, "loss": 0.0336, "step": 1899 }, { "epoch": 24.28115015974441, "grad_norm": 1.827032446861267, "learning_rate": 1.0256410256410256e-05, "loss": 0.0264, "step": 1900 }, { "epoch": 24.28115015974441, "eval_loss": 0.7386012077331543, "eval_runtime": 183.8505, "eval_samples_per_second": 0.854, "eval_steps_per_second": 0.109, "step": 1900 }, { "epoch": 24.293929712460063, "grad_norm": 2.8570687770843506, "learning_rate": 1.0251282051282052e-05, "loss": 0.0482, "step": 1901 }, { "epoch": 24.30670926517572, "grad_norm": 7.331059455871582, "learning_rate": 1.0246153846153847e-05, "loss": 0.0822, "step": 1902 }, { "epoch": 24.319488817891372, "grad_norm": 2.7676453590393066, "learning_rate": 1.0241025641025641e-05, "loss": 0.0311, "step": 1903 }, { "epoch": 24.33226837060703, "grad_norm": 1.8847846984863281, "learning_rate": 1.0235897435897437e-05, "loss": 0.0342, "step": 1904 }, { "epoch": 24.345047923322685, "grad_norm": 1.8493037223815918, "learning_rate": 1.0230769230769231e-05, "loss": 0.0267, "step": 1905 }, { "epoch": 24.357827476038338, "grad_norm": 1.7126322984695435, "learning_rate": 1.0225641025641027e-05, "loss": 0.0277, "step": 1906 }, { "epoch": 24.370607028753994, "grad_norm": 2.872636318206787, "learning_rate": 1.0220512820512823e-05, "loss": 0.0407, "step": 1907 }, { "epoch": 24.383386581469647, "grad_norm": 3.7814128398895264, "learning_rate": 1.0215384615384615e-05, "loss": 0.0338, "step": 1908 }, { "epoch": 24.396166134185304, "grad_norm": 8.474327087402344, "learning_rate": 1.0210256410256412e-05, "loss": 0.0839, "step": 1909 }, { "epoch": 24.408945686900957, "grad_norm": 4.64918327331543, "learning_rate": 1.0205128205128205e-05, "loss": 0.0722, "step": 1910 }, { "epoch": 24.421725239616613, "grad_norm": 3.299110174179077, "learning_rate": 1.02e-05, "loss": 0.0652, "step": 1911 }, { "epoch": 24.43450479233227, "grad_norm": 2.726750612258911, "learning_rate": 1.0194871794871796e-05, "loss": 0.0442, "step": 1912 }, { "epoch": 24.447284345047922, "grad_norm": 2.1698837280273438, "learning_rate": 1.018974358974359e-05, "loss": 0.0427, "step": 1913 }, { "epoch": 24.46006389776358, "grad_norm": 4.142160892486572, "learning_rate": 1.0184615384615386e-05, "loss": 0.0605, "step": 1914 }, { "epoch": 24.472843450479232, "grad_norm": 2.4861974716186523, "learning_rate": 1.017948717948718e-05, "loss": 0.0372, "step": 1915 }, { "epoch": 24.48562300319489, "grad_norm": 2.4246034622192383, "learning_rate": 1.0174358974358976e-05, "loss": 0.0332, "step": 1916 }, { "epoch": 24.498402555910545, "grad_norm": 4.91164493560791, "learning_rate": 1.0169230769230768e-05, "loss": 0.0329, "step": 1917 }, { "epoch": 24.511182108626198, "grad_norm": 5.635557174682617, "learning_rate": 1.0164102564102564e-05, "loss": 0.0745, "step": 1918 }, { "epoch": 24.523961661341854, "grad_norm": 6.509224891662598, "learning_rate": 1.015897435897436e-05, "loss": 0.0853, "step": 1919 }, { "epoch": 24.536741214057507, "grad_norm": 3.0304408073425293, "learning_rate": 1.0153846153846154e-05, "loss": 0.0568, "step": 1920 }, { "epoch": 24.549520766773163, "grad_norm": 4.4625959396362305, "learning_rate": 1.014871794871795e-05, "loss": 0.0489, "step": 1921 }, { "epoch": 24.562300319488816, "grad_norm": 3.8643152713775635, "learning_rate": 1.0143589743589744e-05, "loss": 0.0516, "step": 1922 }, { "epoch": 24.575079872204473, "grad_norm": 3.6063058376312256, "learning_rate": 1.013846153846154e-05, "loss": 0.0465, "step": 1923 }, { "epoch": 24.58785942492013, "grad_norm": 3.255998134613037, "learning_rate": 1.0133333333333335e-05, "loss": 0.0469, "step": 1924 }, { "epoch": 24.600638977635782, "grad_norm": 3.0367839336395264, "learning_rate": 1.012820512820513e-05, "loss": 0.0443, "step": 1925 }, { "epoch": 24.61341853035144, "grad_norm": 3.610450267791748, "learning_rate": 1.0123076923076925e-05, "loss": 0.0517, "step": 1926 }, { "epoch": 24.62619808306709, "grad_norm": 5.996733665466309, "learning_rate": 1.0117948717948717e-05, "loss": 0.0512, "step": 1927 }, { "epoch": 24.638977635782748, "grad_norm": 8.607100486755371, "learning_rate": 1.0112820512820513e-05, "loss": 0.0911, "step": 1928 }, { "epoch": 24.6517571884984, "grad_norm": 2.6721339225769043, "learning_rate": 1.0107692307692309e-05, "loss": 0.0366, "step": 1929 }, { "epoch": 24.664536741214057, "grad_norm": 2.7931532859802246, "learning_rate": 1.0102564102564103e-05, "loss": 0.0236, "step": 1930 }, { "epoch": 24.677316293929714, "grad_norm": 4.758185863494873, "learning_rate": 1.0097435897435899e-05, "loss": 0.0812, "step": 1931 }, { "epoch": 24.690095846645367, "grad_norm": 2.5296826362609863, "learning_rate": 1.0092307692307693e-05, "loss": 0.0384, "step": 1932 }, { "epoch": 24.702875399361023, "grad_norm": 4.656088829040527, "learning_rate": 1.0087179487179488e-05, "loss": 0.072, "step": 1933 }, { "epoch": 24.715654952076676, "grad_norm": 4.275123596191406, "learning_rate": 1.0082051282051284e-05, "loss": 0.0486, "step": 1934 }, { "epoch": 24.728434504792332, "grad_norm": 2.7582848072052, "learning_rate": 1.0076923076923078e-05, "loss": 0.0533, "step": 1935 }, { "epoch": 24.74121405750799, "grad_norm": 4.323748588562012, "learning_rate": 1.0071794871794874e-05, "loss": 0.069, "step": 1936 }, { "epoch": 24.75399361022364, "grad_norm": 4.799659729003906, "learning_rate": 1.0066666666666666e-05, "loss": 0.1001, "step": 1937 }, { "epoch": 24.766773162939298, "grad_norm": 4.191411018371582, "learning_rate": 1.0061538461538462e-05, "loss": 0.0508, "step": 1938 }, { "epoch": 24.77955271565495, "grad_norm": 2.4692318439483643, "learning_rate": 1.0056410256410256e-05, "loss": 0.0412, "step": 1939 }, { "epoch": 24.792332268370608, "grad_norm": 2.660069465637207, "learning_rate": 1.0051282051282052e-05, "loss": 0.0844, "step": 1940 }, { "epoch": 24.80511182108626, "grad_norm": 3.0512633323669434, "learning_rate": 1.0046153846153848e-05, "loss": 0.0372, "step": 1941 }, { "epoch": 24.817891373801917, "grad_norm": 4.098048210144043, "learning_rate": 1.0041025641025642e-05, "loss": 0.0682, "step": 1942 }, { "epoch": 24.830670926517573, "grad_norm": 2.6245460510253906, "learning_rate": 1.0035897435897437e-05, "loss": 0.0284, "step": 1943 }, { "epoch": 24.843450479233226, "grad_norm": 3.8218538761138916, "learning_rate": 1.0030769230769231e-05, "loss": 0.0786, "step": 1944 }, { "epoch": 24.856230031948883, "grad_norm": 4.563466548919678, "learning_rate": 1.0025641025641027e-05, "loss": 0.0896, "step": 1945 }, { "epoch": 24.869009584664536, "grad_norm": 2.8993422985076904, "learning_rate": 1.0020512820512823e-05, "loss": 0.0362, "step": 1946 }, { "epoch": 24.881789137380192, "grad_norm": 2.655593156814575, "learning_rate": 1.0015384615384615e-05, "loss": 0.0303, "step": 1947 }, { "epoch": 24.894568690095845, "grad_norm": 4.739352703094482, "learning_rate": 1.0010256410256411e-05, "loss": 0.0614, "step": 1948 }, { "epoch": 24.9073482428115, "grad_norm": 6.2021803855896, "learning_rate": 1.0005128205128205e-05, "loss": 0.0882, "step": 1949 }, { "epoch": 24.920127795527158, "grad_norm": 2.9262235164642334, "learning_rate": 1e-05, "loss": 0.0549, "step": 1950 }, { "epoch": 24.93290734824281, "grad_norm": 3.3608341217041016, "learning_rate": 9.994871794871795e-06, "loss": 0.0598, "step": 1951 }, { "epoch": 24.945686900958467, "grad_norm": 4.746756553649902, "learning_rate": 9.98974358974359e-06, "loss": 0.0448, "step": 1952 }, { "epoch": 24.95846645367412, "grad_norm": 5.411993026733398, "learning_rate": 9.984615384615386e-06, "loss": 0.0582, "step": 1953 }, { "epoch": 24.971246006389777, "grad_norm": 3.2365033626556396, "learning_rate": 9.97948717948718e-06, "loss": 0.07, "step": 1954 }, { "epoch": 24.984025559105433, "grad_norm": 2.531045913696289, "learning_rate": 9.974358974358974e-06, "loss": 0.0566, "step": 1955 }, { "epoch": 24.996805111821086, "grad_norm": 3.0786638259887695, "learning_rate": 9.96923076923077e-06, "loss": 0.0439, "step": 1956 }, { "epoch": 25.009584664536742, "grad_norm": 3.8696084022521973, "learning_rate": 9.964102564102564e-06, "loss": 0.0452, "step": 1957 }, { "epoch": 25.022364217252395, "grad_norm": 3.3101108074188232, "learning_rate": 9.95897435897436e-06, "loss": 0.0404, "step": 1958 }, { "epoch": 25.03514376996805, "grad_norm": 2.594020128250122, "learning_rate": 9.953846153846156e-06, "loss": 0.0346, "step": 1959 }, { "epoch": 25.047923322683705, "grad_norm": 3.4840526580810547, "learning_rate": 9.94871794871795e-06, "loss": 0.0588, "step": 1960 }, { "epoch": 25.06070287539936, "grad_norm": 1.951032042503357, "learning_rate": 9.943589743589744e-06, "loss": 0.0249, "step": 1961 }, { "epoch": 25.073482428115017, "grad_norm": 3.4109833240509033, "learning_rate": 9.93846153846154e-06, "loss": 0.0451, "step": 1962 }, { "epoch": 25.08626198083067, "grad_norm": 2.452439785003662, "learning_rate": 9.933333333333334e-06, "loss": 0.0328, "step": 1963 }, { "epoch": 25.099041533546327, "grad_norm": 3.8744537830352783, "learning_rate": 9.92820512820513e-06, "loss": 0.0558, "step": 1964 }, { "epoch": 25.11182108626198, "grad_norm": 2.6519272327423096, "learning_rate": 9.923076923076923e-06, "loss": 0.0304, "step": 1965 }, { "epoch": 25.124600638977636, "grad_norm": 4.713734149932861, "learning_rate": 9.91794871794872e-06, "loss": 0.043, "step": 1966 }, { "epoch": 25.13738019169329, "grad_norm": 1.7487976551055908, "learning_rate": 9.912820512820513e-06, "loss": 0.0248, "step": 1967 }, { "epoch": 25.150159744408946, "grad_norm": 3.6812212467193604, "learning_rate": 9.907692307692309e-06, "loss": 0.035, "step": 1968 }, { "epoch": 25.162939297124602, "grad_norm": 2.805856704711914, "learning_rate": 9.902564102564103e-06, "loss": 0.0358, "step": 1969 }, { "epoch": 25.175718849840255, "grad_norm": 2.701242446899414, "learning_rate": 9.897435897435899e-06, "loss": 0.0419, "step": 1970 }, { "epoch": 25.18849840255591, "grad_norm": 2.5276713371276855, "learning_rate": 9.892307692307693e-06, "loss": 0.0266, "step": 1971 }, { "epoch": 25.201277955271564, "grad_norm": 5.18175745010376, "learning_rate": 9.887179487179489e-06, "loss": 0.0282, "step": 1972 }, { "epoch": 25.21405750798722, "grad_norm": 8.315861701965332, "learning_rate": 9.882051282051283e-06, "loss": 0.0904, "step": 1973 }, { "epoch": 25.226837060702877, "grad_norm": 3.513111114501953, "learning_rate": 9.876923076923077e-06, "loss": 0.0458, "step": 1974 }, { "epoch": 25.23961661341853, "grad_norm": 3.5734593868255615, "learning_rate": 9.871794871794872e-06, "loss": 0.0688, "step": 1975 }, { "epoch": 25.252396166134186, "grad_norm": 2.2913684844970703, "learning_rate": 9.866666666666668e-06, "loss": 0.0305, "step": 1976 }, { "epoch": 25.26517571884984, "grad_norm": 4.16497802734375, "learning_rate": 9.861538461538462e-06, "loss": 0.0455, "step": 1977 }, { "epoch": 25.277955271565496, "grad_norm": 4.468432426452637, "learning_rate": 9.856410256410256e-06, "loss": 0.0491, "step": 1978 }, { "epoch": 25.29073482428115, "grad_norm": 2.0939643383026123, "learning_rate": 9.851282051282052e-06, "loss": 0.0303, "step": 1979 }, { "epoch": 25.303514376996805, "grad_norm": 1.9866693019866943, "learning_rate": 9.846153846153848e-06, "loss": 0.0208, "step": 1980 }, { "epoch": 25.31629392971246, "grad_norm": 2.8087332248687744, "learning_rate": 9.841025641025642e-06, "loss": 0.0328, "step": 1981 }, { "epoch": 25.329073482428115, "grad_norm": 3.123957633972168, "learning_rate": 9.835897435897438e-06, "loss": 0.034, "step": 1982 }, { "epoch": 25.34185303514377, "grad_norm": 2.317481279373169, "learning_rate": 9.830769230769232e-06, "loss": 0.0371, "step": 1983 }, { "epoch": 25.354632587859424, "grad_norm": 3.0201683044433594, "learning_rate": 9.825641025641026e-06, "loss": 0.038, "step": 1984 }, { "epoch": 25.36741214057508, "grad_norm": 3.6297476291656494, "learning_rate": 9.820512820512821e-06, "loss": 0.0508, "step": 1985 }, { "epoch": 25.380191693290733, "grad_norm": 2.2146573066711426, "learning_rate": 9.815384615384617e-06, "loss": 0.0329, "step": 1986 }, { "epoch": 25.39297124600639, "grad_norm": 2.0576748847961426, "learning_rate": 9.810256410256411e-06, "loss": 0.0383, "step": 1987 }, { "epoch": 25.405750798722046, "grad_norm": 2.456467390060425, "learning_rate": 9.805128205128205e-06, "loss": 0.0398, "step": 1988 }, { "epoch": 25.4185303514377, "grad_norm": 2.6752357482910156, "learning_rate": 9.800000000000001e-06, "loss": 0.0401, "step": 1989 }, { "epoch": 25.431309904153355, "grad_norm": 2.9409120082855225, "learning_rate": 9.794871794871795e-06, "loss": 0.0426, "step": 1990 }, { "epoch": 25.44408945686901, "grad_norm": 2.9854166507720947, "learning_rate": 9.78974358974359e-06, "loss": 0.0403, "step": 1991 }, { "epoch": 25.456869009584665, "grad_norm": 3.2052009105682373, "learning_rate": 9.784615384615387e-06, "loss": 0.0493, "step": 1992 }, { "epoch": 25.46964856230032, "grad_norm": 1.7671165466308594, "learning_rate": 9.77948717948718e-06, "loss": 0.0201, "step": 1993 }, { "epoch": 25.482428115015974, "grad_norm": 8.041123390197754, "learning_rate": 9.774358974358975e-06, "loss": 0.0452, "step": 1994 }, { "epoch": 25.49520766773163, "grad_norm": 2.9875450134277344, "learning_rate": 9.76923076923077e-06, "loss": 0.0777, "step": 1995 }, { "epoch": 25.507987220447284, "grad_norm": 3.887556552886963, "learning_rate": 9.764102564102564e-06, "loss": 0.0542, "step": 1996 }, { "epoch": 25.52076677316294, "grad_norm": 2.4877817630767822, "learning_rate": 9.75897435897436e-06, "loss": 0.0342, "step": 1997 }, { "epoch": 25.533546325878593, "grad_norm": 3.525219440460205, "learning_rate": 9.753846153846154e-06, "loss": 0.0647, "step": 1998 }, { "epoch": 25.54632587859425, "grad_norm": 3.614077568054199, "learning_rate": 9.74871794871795e-06, "loss": 0.0333, "step": 1999 }, { "epoch": 25.559105431309906, "grad_norm": 3.021026134490967, "learning_rate": 9.743589743589744e-06, "loss": 0.0458, "step": 2000 }, { "epoch": 25.559105431309906, "eval_loss": 0.7721496224403381, "eval_runtime": 183.9025, "eval_samples_per_second": 0.854, "eval_steps_per_second": 0.109, "step": 2000 }, { "epoch": 25.57188498402556, "grad_norm": 7.759271144866943, "learning_rate": 9.738461538461538e-06, "loss": 0.0931, "step": 2001 }, { "epoch": 25.584664536741215, "grad_norm": 2.6857120990753174, "learning_rate": 9.733333333333334e-06, "loss": 0.0421, "step": 2002 }, { "epoch": 25.597444089456868, "grad_norm": 2.8657867908477783, "learning_rate": 9.72820512820513e-06, "loss": 0.0413, "step": 2003 }, { "epoch": 25.610223642172524, "grad_norm": 2.1326651573181152, "learning_rate": 9.723076923076924e-06, "loss": 0.0257, "step": 2004 }, { "epoch": 25.623003194888177, "grad_norm": 3.7716381549835205, "learning_rate": 9.71794871794872e-06, "loss": 0.0297, "step": 2005 }, { "epoch": 25.635782747603834, "grad_norm": 2.4711177349090576, "learning_rate": 9.712820512820513e-06, "loss": 0.0344, "step": 2006 }, { "epoch": 25.64856230031949, "grad_norm": 2.9155871868133545, "learning_rate": 9.707692307692308e-06, "loss": 0.0447, "step": 2007 }, { "epoch": 25.661341853035143, "grad_norm": 2.907470703125, "learning_rate": 9.702564102564103e-06, "loss": 0.042, "step": 2008 }, { "epoch": 25.6741214057508, "grad_norm": 1.783984899520874, "learning_rate": 9.697435897435899e-06, "loss": 0.0172, "step": 2009 }, { "epoch": 25.686900958466452, "grad_norm": 3.024845600128174, "learning_rate": 9.692307692307693e-06, "loss": 0.0293, "step": 2010 }, { "epoch": 25.69968051118211, "grad_norm": 4.1440815925598145, "learning_rate": 9.687179487179487e-06, "loss": 0.0412, "step": 2011 }, { "epoch": 25.712460063897765, "grad_norm": 3.1864733695983887, "learning_rate": 9.682051282051283e-06, "loss": 0.0324, "step": 2012 }, { "epoch": 25.72523961661342, "grad_norm": 3.390996217727661, "learning_rate": 9.676923076923079e-06, "loss": 0.0287, "step": 2013 }, { "epoch": 25.738019169329075, "grad_norm": 5.366546630859375, "learning_rate": 9.671794871794873e-06, "loss": 0.0562, "step": 2014 }, { "epoch": 25.750798722044728, "grad_norm": 3.494955539703369, "learning_rate": 9.666666666666667e-06, "loss": 0.0489, "step": 2015 }, { "epoch": 25.763578274760384, "grad_norm": 4.710078239440918, "learning_rate": 9.661538461538462e-06, "loss": 0.0694, "step": 2016 }, { "epoch": 25.776357827476037, "grad_norm": 2.927586555480957, "learning_rate": 9.656410256410257e-06, "loss": 0.0492, "step": 2017 }, { "epoch": 25.789137380191693, "grad_norm": 2.5779459476470947, "learning_rate": 9.651282051282052e-06, "loss": 0.0371, "step": 2018 }, { "epoch": 25.80191693290735, "grad_norm": 7.566952705383301, "learning_rate": 9.646153846153848e-06, "loss": 0.0404, "step": 2019 }, { "epoch": 25.814696485623003, "grad_norm": 2.655378580093384, "learning_rate": 9.641025641025642e-06, "loss": 0.0301, "step": 2020 }, { "epoch": 25.82747603833866, "grad_norm": 9.1538724899292, "learning_rate": 9.635897435897436e-06, "loss": 0.0374, "step": 2021 }, { "epoch": 25.840255591054312, "grad_norm": 2.7713680267333984, "learning_rate": 9.630769230769232e-06, "loss": 0.0348, "step": 2022 }, { "epoch": 25.85303514376997, "grad_norm": 5.083139896392822, "learning_rate": 9.625641025641026e-06, "loss": 0.0648, "step": 2023 }, { "epoch": 25.86581469648562, "grad_norm": 3.456340789794922, "learning_rate": 9.620512820512822e-06, "loss": 0.0465, "step": 2024 }, { "epoch": 25.878594249201278, "grad_norm": 2.917619466781616, "learning_rate": 9.615384615384616e-06, "loss": 0.0486, "step": 2025 }, { "epoch": 25.891373801916934, "grad_norm": 4.159769058227539, "learning_rate": 9.610256410256411e-06, "loss": 0.0458, "step": 2026 }, { "epoch": 25.904153354632587, "grad_norm": 5.426984786987305, "learning_rate": 9.605128205128206e-06, "loss": 0.0679, "step": 2027 }, { "epoch": 25.916932907348244, "grad_norm": 2.974804639816284, "learning_rate": 9.600000000000001e-06, "loss": 0.0251, "step": 2028 }, { "epoch": 25.929712460063897, "grad_norm": 2.336801290512085, "learning_rate": 9.594871794871797e-06, "loss": 0.0337, "step": 2029 }, { "epoch": 25.942492012779553, "grad_norm": 3.529595375061035, "learning_rate": 9.589743589743591e-06, "loss": 0.0487, "step": 2030 }, { "epoch": 25.955271565495206, "grad_norm": 3.1817433834075928, "learning_rate": 9.584615384615385e-06, "loss": 0.042, "step": 2031 }, { "epoch": 25.968051118210862, "grad_norm": 4.293992042541504, "learning_rate": 9.579487179487181e-06, "loss": 0.0245, "step": 2032 }, { "epoch": 25.98083067092652, "grad_norm": 3.56746506690979, "learning_rate": 9.574358974358975e-06, "loss": 0.0403, "step": 2033 }, { "epoch": 25.99361022364217, "grad_norm": 2.8027122020721436, "learning_rate": 9.569230769230769e-06, "loss": 0.0311, "step": 2034 }, { "epoch": 26.00638977635783, "grad_norm": 3.442718267440796, "learning_rate": 9.564102564102565e-06, "loss": 0.0521, "step": 2035 }, { "epoch": 26.01916932907348, "grad_norm": 2.420042037963867, "learning_rate": 9.55897435897436e-06, "loss": 0.0343, "step": 2036 }, { "epoch": 26.031948881789138, "grad_norm": 3.9595730304718018, "learning_rate": 9.553846153846155e-06, "loss": 0.0401, "step": 2037 }, { "epoch": 26.044728434504794, "grad_norm": 3.0006279945373535, "learning_rate": 9.548717948717949e-06, "loss": 0.0319, "step": 2038 }, { "epoch": 26.057507987220447, "grad_norm": 2.6789944171905518, "learning_rate": 9.543589743589744e-06, "loss": 0.0245, "step": 2039 }, { "epoch": 26.070287539936103, "grad_norm": 1.75850510597229, "learning_rate": 9.53846153846154e-06, "loss": 0.0294, "step": 2040 }, { "epoch": 26.083067092651756, "grad_norm": 2.2789313793182373, "learning_rate": 9.533333333333334e-06, "loss": 0.0196, "step": 2041 }, { "epoch": 26.095846645367413, "grad_norm": 1.8305696249008179, "learning_rate": 9.52820512820513e-06, "loss": 0.0249, "step": 2042 }, { "epoch": 26.108626198083066, "grad_norm": 1.6572062969207764, "learning_rate": 9.523076923076924e-06, "loss": 0.0193, "step": 2043 }, { "epoch": 26.121405750798722, "grad_norm": 2.569342851638794, "learning_rate": 9.517948717948718e-06, "loss": 0.0449, "step": 2044 }, { "epoch": 26.13418530351438, "grad_norm": 4.079476356506348, "learning_rate": 9.512820512820514e-06, "loss": 0.0287, "step": 2045 }, { "epoch": 26.14696485623003, "grad_norm": 3.0456132888793945, "learning_rate": 9.50769230769231e-06, "loss": 0.0448, "step": 2046 }, { "epoch": 26.159744408945688, "grad_norm": 2.417229413986206, "learning_rate": 9.502564102564103e-06, "loss": 0.047, "step": 2047 }, { "epoch": 26.17252396166134, "grad_norm": 2.991654872894287, "learning_rate": 9.497435897435898e-06, "loss": 0.0459, "step": 2048 }, { "epoch": 26.185303514376997, "grad_norm": 2.4096415042877197, "learning_rate": 9.492307692307693e-06, "loss": 0.0259, "step": 2049 }, { "epoch": 26.19808306709265, "grad_norm": 2.3776893615722656, "learning_rate": 9.487179487179487e-06, "loss": 0.0269, "step": 2050 }, { "epoch": 26.210862619808307, "grad_norm": 3.019699811935425, "learning_rate": 9.482051282051283e-06, "loss": 0.026, "step": 2051 }, { "epoch": 26.223642172523963, "grad_norm": 3.0545332431793213, "learning_rate": 9.476923076923079e-06, "loss": 0.0377, "step": 2052 }, { "epoch": 26.236421725239616, "grad_norm": 4.2493062019348145, "learning_rate": 9.471794871794873e-06, "loss": 0.0342, "step": 2053 }, { "epoch": 26.249201277955272, "grad_norm": 3.70605206489563, "learning_rate": 9.466666666666667e-06, "loss": 0.0452, "step": 2054 }, { "epoch": 26.261980830670925, "grad_norm": 1.6265230178833008, "learning_rate": 9.461538461538463e-06, "loss": 0.0203, "step": 2055 }, { "epoch": 26.27476038338658, "grad_norm": 3.664158821105957, "learning_rate": 9.456410256410257e-06, "loss": 0.0199, "step": 2056 }, { "epoch": 26.287539936102238, "grad_norm": 2.3494155406951904, "learning_rate": 9.451282051282052e-06, "loss": 0.026, "step": 2057 }, { "epoch": 26.30031948881789, "grad_norm": 3.5722639560699463, "learning_rate": 9.446153846153847e-06, "loss": 0.0364, "step": 2058 }, { "epoch": 26.313099041533548, "grad_norm": 4.406216621398926, "learning_rate": 9.441025641025642e-06, "loss": 0.0331, "step": 2059 }, { "epoch": 26.3258785942492, "grad_norm": 1.9493281841278076, "learning_rate": 9.435897435897436e-06, "loss": 0.0352, "step": 2060 }, { "epoch": 26.338658146964857, "grad_norm": 2.5774435997009277, "learning_rate": 9.43076923076923e-06, "loss": 0.0268, "step": 2061 }, { "epoch": 26.35143769968051, "grad_norm": 2.915691375732422, "learning_rate": 9.425641025641026e-06, "loss": 0.0378, "step": 2062 }, { "epoch": 26.364217252396166, "grad_norm": 3.6906867027282715, "learning_rate": 9.420512820512822e-06, "loss": 0.0294, "step": 2063 }, { "epoch": 26.376996805111823, "grad_norm": 2.9189324378967285, "learning_rate": 9.415384615384616e-06, "loss": 0.0212, "step": 2064 }, { "epoch": 26.389776357827476, "grad_norm": 4.310835361480713, "learning_rate": 9.410256410256412e-06, "loss": 0.0604, "step": 2065 }, { "epoch": 26.402555910543132, "grad_norm": 2.1591827869415283, "learning_rate": 9.405128205128206e-06, "loss": 0.0287, "step": 2066 }, { "epoch": 26.415335463258785, "grad_norm": 4.369766712188721, "learning_rate": 9.4e-06, "loss": 0.036, "step": 2067 }, { "epoch": 26.42811501597444, "grad_norm": 4.2029876708984375, "learning_rate": 9.394871794871796e-06, "loss": 0.0441, "step": 2068 }, { "epoch": 26.440894568690094, "grad_norm": 3.996307849884033, "learning_rate": 9.389743589743591e-06, "loss": 0.0749, "step": 2069 }, { "epoch": 26.45367412140575, "grad_norm": 2.7402710914611816, "learning_rate": 9.384615384615385e-06, "loss": 0.026, "step": 2070 }, { "epoch": 26.466453674121407, "grad_norm": 1.8309948444366455, "learning_rate": 9.37948717948718e-06, "loss": 0.0223, "step": 2071 }, { "epoch": 26.47923322683706, "grad_norm": 2.682682752609253, "learning_rate": 9.374358974358975e-06, "loss": 0.0416, "step": 2072 }, { "epoch": 26.492012779552716, "grad_norm": 2.569913864135742, "learning_rate": 9.369230769230771e-06, "loss": 0.0348, "step": 2073 }, { "epoch": 26.50479233226837, "grad_norm": 2.0622801780700684, "learning_rate": 9.364102564102565e-06, "loss": 0.0552, "step": 2074 }, { "epoch": 26.517571884984026, "grad_norm": 3.3317339420318604, "learning_rate": 9.358974358974359e-06, "loss": 0.0274, "step": 2075 }, { "epoch": 26.53035143769968, "grad_norm": 2.5767219066619873, "learning_rate": 9.353846153846155e-06, "loss": 0.0391, "step": 2076 }, { "epoch": 26.543130990415335, "grad_norm": 1.9681037664413452, "learning_rate": 9.348717948717949e-06, "loss": 0.025, "step": 2077 }, { "epoch": 26.55591054313099, "grad_norm": 2.127232074737549, "learning_rate": 9.343589743589745e-06, "loss": 0.0322, "step": 2078 }, { "epoch": 26.568690095846645, "grad_norm": 4.699692249298096, "learning_rate": 9.33846153846154e-06, "loss": 0.0384, "step": 2079 }, { "epoch": 26.5814696485623, "grad_norm": 3.699765682220459, "learning_rate": 9.333333333333334e-06, "loss": 0.0491, "step": 2080 }, { "epoch": 26.594249201277954, "grad_norm": 2.842067241668701, "learning_rate": 9.328205128205128e-06, "loss": 0.0244, "step": 2081 }, { "epoch": 26.60702875399361, "grad_norm": 2.6938209533691406, "learning_rate": 9.323076923076924e-06, "loss": 0.0235, "step": 2082 }, { "epoch": 26.619808306709267, "grad_norm": 4.348531723022461, "learning_rate": 9.317948717948718e-06, "loss": 0.0521, "step": 2083 }, { "epoch": 26.63258785942492, "grad_norm": 2.8389244079589844, "learning_rate": 9.312820512820514e-06, "loss": 0.0423, "step": 2084 }, { "epoch": 26.645367412140576, "grad_norm": 1.9097580909729004, "learning_rate": 9.307692307692308e-06, "loss": 0.0278, "step": 2085 }, { "epoch": 26.65814696485623, "grad_norm": 3.038785696029663, "learning_rate": 9.302564102564104e-06, "loss": 0.0372, "step": 2086 }, { "epoch": 26.670926517571885, "grad_norm": 2.6908535957336426, "learning_rate": 9.297435897435898e-06, "loss": 0.0315, "step": 2087 }, { "epoch": 26.68370607028754, "grad_norm": 4.660581111907959, "learning_rate": 9.292307692307694e-06, "loss": 0.0456, "step": 2088 }, { "epoch": 26.696485623003195, "grad_norm": 4.069143772125244, "learning_rate": 9.28717948717949e-06, "loss": 0.0474, "step": 2089 }, { "epoch": 26.70926517571885, "grad_norm": 2.327202558517456, "learning_rate": 9.282051282051283e-06, "loss": 0.0213, "step": 2090 }, { "epoch": 26.722044728434504, "grad_norm": 2.2371463775634766, "learning_rate": 9.276923076923077e-06, "loss": 0.0355, "step": 2091 }, { "epoch": 26.73482428115016, "grad_norm": 3.8798370361328125, "learning_rate": 9.271794871794873e-06, "loss": 0.0299, "step": 2092 }, { "epoch": 26.747603833865814, "grad_norm": 2.8291265964508057, "learning_rate": 9.266666666666667e-06, "loss": 0.0333, "step": 2093 }, { "epoch": 26.76038338658147, "grad_norm": 2.7667057514190674, "learning_rate": 9.261538461538461e-06, "loss": 0.0348, "step": 2094 }, { "epoch": 26.773162939297123, "grad_norm": 3.423550844192505, "learning_rate": 9.256410256410257e-06, "loss": 0.0444, "step": 2095 }, { "epoch": 26.78594249201278, "grad_norm": 2.025509834289551, "learning_rate": 9.251282051282053e-06, "loss": 0.0254, "step": 2096 }, { "epoch": 26.798722044728436, "grad_norm": 3.826721668243408, "learning_rate": 9.246153846153847e-06, "loss": 0.0405, "step": 2097 }, { "epoch": 26.81150159744409, "grad_norm": 1.5340789556503296, "learning_rate": 9.24102564102564e-06, "loss": 0.0141, "step": 2098 }, { "epoch": 26.824281150159745, "grad_norm": 2.702284574508667, "learning_rate": 9.235897435897437e-06, "loss": 0.0344, "step": 2099 }, { "epoch": 26.837060702875398, "grad_norm": 3.0476367473602295, "learning_rate": 9.230769230769232e-06, "loss": 0.0385, "step": 2100 }, { "epoch": 26.837060702875398, "eval_loss": 0.8120326399803162, "eval_runtime": 183.4599, "eval_samples_per_second": 0.856, "eval_steps_per_second": 0.109, "step": 2100 }, { "epoch": 26.849840255591054, "grad_norm": 1.8112848997116089, "learning_rate": 9.225641025641026e-06, "loss": 0.0173, "step": 2101 }, { "epoch": 26.86261980830671, "grad_norm": 2.793485641479492, "learning_rate": 9.220512820512822e-06, "loss": 0.0228, "step": 2102 }, { "epoch": 26.875399361022364, "grad_norm": 3.2606515884399414, "learning_rate": 9.215384615384616e-06, "loss": 0.0425, "step": 2103 }, { "epoch": 26.88817891373802, "grad_norm": 3.649292469024658, "learning_rate": 9.21025641025641e-06, "loss": 0.0281, "step": 2104 }, { "epoch": 26.900958466453673, "grad_norm": 2.2313618659973145, "learning_rate": 9.205128205128206e-06, "loss": 0.0281, "step": 2105 }, { "epoch": 26.91373801916933, "grad_norm": 5.063486099243164, "learning_rate": 9.200000000000002e-06, "loss": 0.0466, "step": 2106 }, { "epoch": 26.926517571884983, "grad_norm": 3.2838034629821777, "learning_rate": 9.194871794871796e-06, "loss": 0.0365, "step": 2107 }, { "epoch": 26.93929712460064, "grad_norm": 3.712921142578125, "learning_rate": 9.18974358974359e-06, "loss": 0.0281, "step": 2108 }, { "epoch": 26.952076677316295, "grad_norm": 4.407195091247559, "learning_rate": 9.184615384615386e-06, "loss": 0.032, "step": 2109 }, { "epoch": 26.96485623003195, "grad_norm": 5.876495361328125, "learning_rate": 9.17948717948718e-06, "loss": 0.0655, "step": 2110 }, { "epoch": 26.977635782747605, "grad_norm": 1.9610528945922852, "learning_rate": 9.174358974358975e-06, "loss": 0.0204, "step": 2111 }, { "epoch": 26.990415335463258, "grad_norm": 2.3272671699523926, "learning_rate": 9.169230769230771e-06, "loss": 0.0247, "step": 2112 }, { "epoch": 27.003194888178914, "grad_norm": 2.827895402908325, "learning_rate": 9.164102564102565e-06, "loss": 0.0215, "step": 2113 }, { "epoch": 27.015974440894567, "grad_norm": 1.6048749685287476, "learning_rate": 9.15897435897436e-06, "loss": 0.0244, "step": 2114 }, { "epoch": 27.028753993610223, "grad_norm": 2.2859067916870117, "learning_rate": 9.153846153846155e-06, "loss": 0.0206, "step": 2115 }, { "epoch": 27.04153354632588, "grad_norm": 2.696887969970703, "learning_rate": 9.148717948717949e-06, "loss": 0.0251, "step": 2116 }, { "epoch": 27.054313099041533, "grad_norm": 1.4276155233383179, "learning_rate": 9.143589743589745e-06, "loss": 0.0207, "step": 2117 }, { "epoch": 27.06709265175719, "grad_norm": 5.58026123046875, "learning_rate": 9.138461538461539e-06, "loss": 0.0437, "step": 2118 }, { "epoch": 27.079872204472842, "grad_norm": 5.844724655151367, "learning_rate": 9.133333333333335e-06, "loss": 0.0436, "step": 2119 }, { "epoch": 27.0926517571885, "grad_norm": 8.150542259216309, "learning_rate": 9.128205128205129e-06, "loss": 0.0422, "step": 2120 }, { "epoch": 27.105431309904155, "grad_norm": 5.604848384857178, "learning_rate": 9.123076923076923e-06, "loss": 0.0382, "step": 2121 }, { "epoch": 27.118210862619808, "grad_norm": 3.6250290870666504, "learning_rate": 9.117948717948718e-06, "loss": 0.0324, "step": 2122 }, { "epoch": 27.130990415335464, "grad_norm": 5.809665679931641, "learning_rate": 9.112820512820514e-06, "loss": 0.0319, "step": 2123 }, { "epoch": 27.143769968051117, "grad_norm": 2.2189152240753174, "learning_rate": 9.107692307692308e-06, "loss": 0.0208, "step": 2124 }, { "epoch": 27.156549520766774, "grad_norm": 2.7135326862335205, "learning_rate": 9.102564102564104e-06, "loss": 0.0243, "step": 2125 }, { "epoch": 27.169329073482427, "grad_norm": 1.4363244771957397, "learning_rate": 9.097435897435898e-06, "loss": 0.014, "step": 2126 }, { "epoch": 27.182108626198083, "grad_norm": 4.288629055023193, "learning_rate": 9.092307692307692e-06, "loss": 0.0569, "step": 2127 }, { "epoch": 27.19488817891374, "grad_norm": 4.174808502197266, "learning_rate": 9.087179487179488e-06, "loss": 0.0383, "step": 2128 }, { "epoch": 27.207667731629392, "grad_norm": 2.7566633224487305, "learning_rate": 9.082051282051284e-06, "loss": 0.0336, "step": 2129 }, { "epoch": 27.22044728434505, "grad_norm": 1.8850542306900024, "learning_rate": 9.076923076923078e-06, "loss": 0.0168, "step": 2130 }, { "epoch": 27.233226837060702, "grad_norm": 1.9132078886032104, "learning_rate": 9.071794871794872e-06, "loss": 0.0155, "step": 2131 }, { "epoch": 27.24600638977636, "grad_norm": 19.71868324279785, "learning_rate": 9.066666666666667e-06, "loss": 0.0332, "step": 2132 }, { "epoch": 27.25878594249201, "grad_norm": 1.4613136053085327, "learning_rate": 9.061538461538463e-06, "loss": 0.0161, "step": 2133 }, { "epoch": 27.271565495207668, "grad_norm": 2.2136759757995605, "learning_rate": 9.056410256410257e-06, "loss": 0.0301, "step": 2134 }, { "epoch": 27.284345047923324, "grad_norm": 2.1000702381134033, "learning_rate": 9.051282051282051e-06, "loss": 0.032, "step": 2135 }, { "epoch": 27.297124600638977, "grad_norm": 2.5691099166870117, "learning_rate": 9.046153846153847e-06, "loss": 0.0538, "step": 2136 }, { "epoch": 27.309904153354633, "grad_norm": 1.641770839691162, "learning_rate": 9.041025641025641e-06, "loss": 0.0149, "step": 2137 }, { "epoch": 27.322683706070286, "grad_norm": 4.829441070556641, "learning_rate": 9.035897435897437e-06, "loss": 0.0441, "step": 2138 }, { "epoch": 27.335463258785943, "grad_norm": 2.505126476287842, "learning_rate": 9.030769230769233e-06, "loss": 0.0289, "step": 2139 }, { "epoch": 27.3482428115016, "grad_norm": 6.310408115386963, "learning_rate": 9.025641025641027e-06, "loss": 0.0431, "step": 2140 }, { "epoch": 27.361022364217252, "grad_norm": 2.2306759357452393, "learning_rate": 9.02051282051282e-06, "loss": 0.0153, "step": 2141 }, { "epoch": 27.37380191693291, "grad_norm": 1.9416497945785522, "learning_rate": 9.015384615384616e-06, "loss": 0.0199, "step": 2142 }, { "epoch": 27.38658146964856, "grad_norm": 4.289925575256348, "learning_rate": 9.01025641025641e-06, "loss": 0.0346, "step": 2143 }, { "epoch": 27.399361022364218, "grad_norm": 1.3997843265533447, "learning_rate": 9.005128205128206e-06, "loss": 0.0153, "step": 2144 }, { "epoch": 27.41214057507987, "grad_norm": 3.835322380065918, "learning_rate": 9e-06, "loss": 0.0603, "step": 2145 }, { "epoch": 27.424920127795527, "grad_norm": 2.464630365371704, "learning_rate": 8.994871794871796e-06, "loss": 0.0285, "step": 2146 }, { "epoch": 27.437699680511184, "grad_norm": 2.7458603382110596, "learning_rate": 8.98974358974359e-06, "loss": 0.0379, "step": 2147 }, { "epoch": 27.450479233226837, "grad_norm": 1.4156213998794556, "learning_rate": 8.984615384615386e-06, "loss": 0.0164, "step": 2148 }, { "epoch": 27.463258785942493, "grad_norm": 1.647200584411621, "learning_rate": 8.979487179487182e-06, "loss": 0.0216, "step": 2149 }, { "epoch": 27.476038338658146, "grad_norm": 2.2965891361236572, "learning_rate": 8.974358974358976e-06, "loss": 0.0231, "step": 2150 }, { "epoch": 27.488817891373802, "grad_norm": 3.635070562362671, "learning_rate": 8.96923076923077e-06, "loss": 0.0221, "step": 2151 }, { "epoch": 27.501597444089455, "grad_norm": 2.297700881958008, "learning_rate": 8.964102564102565e-06, "loss": 0.0231, "step": 2152 }, { "epoch": 27.51437699680511, "grad_norm": 1.8114467859268188, "learning_rate": 8.95897435897436e-06, "loss": 0.0185, "step": 2153 }, { "epoch": 27.527156549520768, "grad_norm": 4.470355987548828, "learning_rate": 8.953846153846153e-06, "loss": 0.0464, "step": 2154 }, { "epoch": 27.53993610223642, "grad_norm": 1.3283096551895142, "learning_rate": 8.94871794871795e-06, "loss": 0.0161, "step": 2155 }, { "epoch": 27.552715654952078, "grad_norm": 7.926822662353516, "learning_rate": 8.943589743589745e-06, "loss": 0.0662, "step": 2156 }, { "epoch": 27.56549520766773, "grad_norm": 2.1085832118988037, "learning_rate": 8.938461538461539e-06, "loss": 0.0244, "step": 2157 }, { "epoch": 27.578274760383387, "grad_norm": 7.27145528793335, "learning_rate": 8.933333333333333e-06, "loss": 0.0482, "step": 2158 }, { "epoch": 27.591054313099043, "grad_norm": 2.787456750869751, "learning_rate": 8.928205128205129e-06, "loss": 0.0379, "step": 2159 }, { "epoch": 27.603833865814696, "grad_norm": 2.238116502761841, "learning_rate": 8.923076923076925e-06, "loss": 0.024, "step": 2160 }, { "epoch": 27.616613418530353, "grad_norm": 5.366321086883545, "learning_rate": 8.917948717948719e-06, "loss": 0.0464, "step": 2161 }, { "epoch": 27.629392971246006, "grad_norm": 4.097918510437012, "learning_rate": 8.912820512820514e-06, "loss": 0.0262, "step": 2162 }, { "epoch": 27.642172523961662, "grad_norm": 4.417905807495117, "learning_rate": 8.907692307692308e-06, "loss": 0.043, "step": 2163 }, { "epoch": 27.654952076677315, "grad_norm": 2.8238251209259033, "learning_rate": 8.902564102564102e-06, "loss": 0.0376, "step": 2164 }, { "epoch": 27.66773162939297, "grad_norm": 1.3213348388671875, "learning_rate": 8.897435897435898e-06, "loss": 0.0146, "step": 2165 }, { "epoch": 27.680511182108628, "grad_norm": 3.444852828979492, "learning_rate": 8.892307692307694e-06, "loss": 0.0278, "step": 2166 }, { "epoch": 27.69329073482428, "grad_norm": 1.3029857873916626, "learning_rate": 8.887179487179488e-06, "loss": 0.0129, "step": 2167 }, { "epoch": 27.706070287539937, "grad_norm": 5.024180889129639, "learning_rate": 8.882051282051282e-06, "loss": 0.0363, "step": 2168 }, { "epoch": 27.71884984025559, "grad_norm": 1.4591314792633057, "learning_rate": 8.876923076923078e-06, "loss": 0.0145, "step": 2169 }, { "epoch": 27.731629392971247, "grad_norm": 2.011525869369507, "learning_rate": 8.871794871794872e-06, "loss": 0.0233, "step": 2170 }, { "epoch": 27.7444089456869, "grad_norm": 2.6536760330200195, "learning_rate": 8.866666666666668e-06, "loss": 0.0259, "step": 2171 }, { "epoch": 27.757188498402556, "grad_norm": 2.8977763652801514, "learning_rate": 8.861538461538463e-06, "loss": 0.0406, "step": 2172 }, { "epoch": 27.769968051118212, "grad_norm": 2.8562774658203125, "learning_rate": 8.856410256410257e-06, "loss": 0.0288, "step": 2173 }, { "epoch": 27.782747603833865, "grad_norm": 5.174256801605225, "learning_rate": 8.851282051282051e-06, "loss": 0.0222, "step": 2174 }, { "epoch": 27.79552715654952, "grad_norm": 1.915551781654358, "learning_rate": 8.846153846153847e-06, "loss": 0.035, "step": 2175 }, { "epoch": 27.808306709265175, "grad_norm": 3.622258424758911, "learning_rate": 8.841025641025641e-06, "loss": 0.0271, "step": 2176 }, { "epoch": 27.82108626198083, "grad_norm": 2.134631633758545, "learning_rate": 8.835897435897437e-06, "loss": 0.0273, "step": 2177 }, { "epoch": 27.833865814696484, "grad_norm": 7.068699359893799, "learning_rate": 8.830769230769231e-06, "loss": 0.0362, "step": 2178 }, { "epoch": 27.84664536741214, "grad_norm": 3.2578909397125244, "learning_rate": 8.825641025641027e-06, "loss": 0.0328, "step": 2179 }, { "epoch": 27.859424920127797, "grad_norm": 2.324989080429077, "learning_rate": 8.820512820512821e-06, "loss": 0.025, "step": 2180 }, { "epoch": 27.87220447284345, "grad_norm": 1.5827807188034058, "learning_rate": 8.815384615384615e-06, "loss": 0.0219, "step": 2181 }, { "epoch": 27.884984025559106, "grad_norm": 4.256125450134277, "learning_rate": 8.81025641025641e-06, "loss": 0.0267, "step": 2182 }, { "epoch": 27.89776357827476, "grad_norm": 3.0664892196655273, "learning_rate": 8.805128205128206e-06, "loss": 0.0263, "step": 2183 }, { "epoch": 27.910543130990416, "grad_norm": 4.755489826202393, "learning_rate": 8.8e-06, "loss": 0.029, "step": 2184 }, { "epoch": 27.923322683706072, "grad_norm": 3.9942352771759033, "learning_rate": 8.794871794871796e-06, "loss": 0.0378, "step": 2185 }, { "epoch": 27.936102236421725, "grad_norm": 1.4267417192459106, "learning_rate": 8.78974358974359e-06, "loss": 0.0142, "step": 2186 }, { "epoch": 27.94888178913738, "grad_norm": 3.233738899230957, "learning_rate": 8.784615384615386e-06, "loss": 0.0268, "step": 2187 }, { "epoch": 27.961661341853034, "grad_norm": 3.739074468612671, "learning_rate": 8.77948717948718e-06, "loss": 0.0438, "step": 2188 }, { "epoch": 27.97444089456869, "grad_norm": 3.132491111755371, "learning_rate": 8.774358974358976e-06, "loss": 0.0315, "step": 2189 }, { "epoch": 27.987220447284344, "grad_norm": 1.5290716886520386, "learning_rate": 8.76923076923077e-06, "loss": 0.0224, "step": 2190 }, { "epoch": 28.0, "grad_norm": 1.6961673498153687, "learning_rate": 8.764102564102564e-06, "loss": 0.0171, "step": 2191 }, { "epoch": 28.012779552715656, "grad_norm": 1.394661545753479, "learning_rate": 8.75897435897436e-06, "loss": 0.0148, "step": 2192 }, { "epoch": 28.02555910543131, "grad_norm": 1.8422940969467163, "learning_rate": 8.753846153846155e-06, "loss": 0.0188, "step": 2193 }, { "epoch": 28.038338658146966, "grad_norm": 1.0886930227279663, "learning_rate": 8.74871794871795e-06, "loss": 0.0112, "step": 2194 }, { "epoch": 28.05111821086262, "grad_norm": 1.127504587173462, "learning_rate": 8.743589743589743e-06, "loss": 0.0122, "step": 2195 }, { "epoch": 28.063897763578275, "grad_norm": 1.3174153566360474, "learning_rate": 8.73846153846154e-06, "loss": 0.0149, "step": 2196 }, { "epoch": 28.076677316293928, "grad_norm": 0.9179958701133728, "learning_rate": 8.733333333333333e-06, "loss": 0.0109, "step": 2197 }, { "epoch": 28.089456869009584, "grad_norm": 1.8915237188339233, "learning_rate": 8.728205128205129e-06, "loss": 0.0164, "step": 2198 }, { "epoch": 28.10223642172524, "grad_norm": 1.8076939582824707, "learning_rate": 8.723076923076925e-06, "loss": 0.0155, "step": 2199 }, { "epoch": 28.115015974440894, "grad_norm": 1.4523608684539795, "learning_rate": 8.717948717948719e-06, "loss": 0.0163, "step": 2200 }, { "epoch": 28.115015974440894, "eval_loss": 0.8285750150680542, "eval_runtime": 183.9379, "eval_samples_per_second": 0.854, "eval_steps_per_second": 0.109, "step": 2200 }, { "epoch": 28.12779552715655, "grad_norm": 3.155454635620117, "learning_rate": 8.712820512820513e-06, "loss": 0.0242, "step": 2201 }, { "epoch": 28.140575079872203, "grad_norm": 0.9504920244216919, "learning_rate": 8.707692307692309e-06, "loss": 0.0113, "step": 2202 }, { "epoch": 28.15335463258786, "grad_norm": 2.4801573753356934, "learning_rate": 8.702564102564103e-06, "loss": 0.0291, "step": 2203 }, { "epoch": 28.166134185303516, "grad_norm": 1.5551204681396484, "learning_rate": 8.697435897435898e-06, "loss": 0.0249, "step": 2204 }, { "epoch": 28.17891373801917, "grad_norm": 1.50502347946167, "learning_rate": 8.692307692307692e-06, "loss": 0.0129, "step": 2205 }, { "epoch": 28.191693290734825, "grad_norm": 3.653285503387451, "learning_rate": 8.687179487179488e-06, "loss": 0.0239, "step": 2206 }, { "epoch": 28.20447284345048, "grad_norm": 2.517505168914795, "learning_rate": 8.682051282051282e-06, "loss": 0.0401, "step": 2207 }, { "epoch": 28.217252396166135, "grad_norm": 3.3538625240325928, "learning_rate": 8.676923076923078e-06, "loss": 0.0457, "step": 2208 }, { "epoch": 28.230031948881788, "grad_norm": 1.3679107427597046, "learning_rate": 8.671794871794874e-06, "loss": 0.0149, "step": 2209 }, { "epoch": 28.242811501597444, "grad_norm": 5.592023849487305, "learning_rate": 8.666666666666668e-06, "loss": 0.0218, "step": 2210 }, { "epoch": 28.2555910543131, "grad_norm": 0.7429912090301514, "learning_rate": 8.661538461538462e-06, "loss": 0.0083, "step": 2211 }, { "epoch": 28.268370607028753, "grad_norm": 1.2271093130111694, "learning_rate": 8.656410256410258e-06, "loss": 0.0131, "step": 2212 }, { "epoch": 28.28115015974441, "grad_norm": 2.5322265625, "learning_rate": 8.651282051282052e-06, "loss": 0.0254, "step": 2213 }, { "epoch": 28.293929712460063, "grad_norm": 2.8084962368011475, "learning_rate": 8.646153846153846e-06, "loss": 0.0189, "step": 2214 }, { "epoch": 28.30670926517572, "grad_norm": 4.381146430969238, "learning_rate": 8.641025641025641e-06, "loss": 0.0203, "step": 2215 }, { "epoch": 28.319488817891372, "grad_norm": 3.287971258163452, "learning_rate": 8.635897435897437e-06, "loss": 0.0324, "step": 2216 }, { "epoch": 28.33226837060703, "grad_norm": 2.3260955810546875, "learning_rate": 8.630769230769231e-06, "loss": 0.0262, "step": 2217 }, { "epoch": 28.345047923322685, "grad_norm": 2.256308078765869, "learning_rate": 8.625641025641025e-06, "loss": 0.0339, "step": 2218 }, { "epoch": 28.357827476038338, "grad_norm": 9.611410140991211, "learning_rate": 8.620512820512821e-06, "loss": 0.0633, "step": 2219 }, { "epoch": 28.370607028753994, "grad_norm": 4.757236957550049, "learning_rate": 8.615384615384617e-06, "loss": 0.0282, "step": 2220 }, { "epoch": 28.383386581469647, "grad_norm": 2.538170337677002, "learning_rate": 8.610256410256411e-06, "loss": 0.0169, "step": 2221 }, { "epoch": 28.396166134185304, "grad_norm": 1.2487812042236328, "learning_rate": 8.605128205128207e-06, "loss": 0.0153, "step": 2222 }, { "epoch": 28.408945686900957, "grad_norm": 3.4212915897369385, "learning_rate": 8.6e-06, "loss": 0.0224, "step": 2223 }, { "epoch": 28.421725239616613, "grad_norm": 2.1509947776794434, "learning_rate": 8.594871794871795e-06, "loss": 0.0217, "step": 2224 }, { "epoch": 28.43450479233227, "grad_norm": 1.3148765563964844, "learning_rate": 8.58974358974359e-06, "loss": 0.0345, "step": 2225 }, { "epoch": 28.447284345047922, "grad_norm": 2.901209592819214, "learning_rate": 8.584615384615386e-06, "loss": 0.0248, "step": 2226 }, { "epoch": 28.46006389776358, "grad_norm": 2.8941667079925537, "learning_rate": 8.57948717948718e-06, "loss": 0.0244, "step": 2227 }, { "epoch": 28.472843450479232, "grad_norm": 5.521927356719971, "learning_rate": 8.574358974358974e-06, "loss": 0.0432, "step": 2228 }, { "epoch": 28.48562300319489, "grad_norm": 1.0991194248199463, "learning_rate": 8.56923076923077e-06, "loss": 0.0133, "step": 2229 }, { "epoch": 28.498402555910545, "grad_norm": 1.867282748222351, "learning_rate": 8.564102564102564e-06, "loss": 0.0163, "step": 2230 }, { "epoch": 28.511182108626198, "grad_norm": 2.533832311630249, "learning_rate": 8.55897435897436e-06, "loss": 0.03, "step": 2231 }, { "epoch": 28.523961661341854, "grad_norm": 2.8608005046844482, "learning_rate": 8.553846153846156e-06, "loss": 0.0364, "step": 2232 }, { "epoch": 28.536741214057507, "grad_norm": 1.3630461692810059, "learning_rate": 8.54871794871795e-06, "loss": 0.0164, "step": 2233 }, { "epoch": 28.549520766773163, "grad_norm": 1.4716297388076782, "learning_rate": 8.543589743589744e-06, "loss": 0.0189, "step": 2234 }, { "epoch": 28.562300319488816, "grad_norm": 3.877401113510132, "learning_rate": 8.53846153846154e-06, "loss": 0.0324, "step": 2235 }, { "epoch": 28.575079872204473, "grad_norm": 1.6375051736831665, "learning_rate": 8.533333333333335e-06, "loss": 0.0164, "step": 2236 }, { "epoch": 28.58785942492013, "grad_norm": 1.801792860031128, "learning_rate": 8.52820512820513e-06, "loss": 0.0188, "step": 2237 }, { "epoch": 28.600638977635782, "grad_norm": 3.3205134868621826, "learning_rate": 8.523076923076923e-06, "loss": 0.0266, "step": 2238 }, { "epoch": 28.61341853035144, "grad_norm": 1.5833179950714111, "learning_rate": 8.517948717948719e-06, "loss": 0.0161, "step": 2239 }, { "epoch": 28.62619808306709, "grad_norm": 2.784162998199463, "learning_rate": 8.512820512820513e-06, "loss": 0.0225, "step": 2240 }, { "epoch": 28.638977635782748, "grad_norm": 4.481935977935791, "learning_rate": 8.507692307692307e-06, "loss": 0.0506, "step": 2241 }, { "epoch": 28.6517571884984, "grad_norm": 1.981535792350769, "learning_rate": 8.502564102564103e-06, "loss": 0.0213, "step": 2242 }, { "epoch": 28.664536741214057, "grad_norm": 1.7869662046432495, "learning_rate": 8.497435897435899e-06, "loss": 0.028, "step": 2243 }, { "epoch": 28.677316293929714, "grad_norm": 2.6905384063720703, "learning_rate": 8.492307692307693e-06, "loss": 0.0259, "step": 2244 }, { "epoch": 28.690095846645367, "grad_norm": 4.806192874908447, "learning_rate": 8.487179487179488e-06, "loss": 0.0369, "step": 2245 }, { "epoch": 28.702875399361023, "grad_norm": 2.636756181716919, "learning_rate": 8.482051282051283e-06, "loss": 0.0254, "step": 2246 }, { "epoch": 28.715654952076676, "grad_norm": 2.5176260471343994, "learning_rate": 8.476923076923078e-06, "loss": 0.0205, "step": 2247 }, { "epoch": 28.728434504792332, "grad_norm": 3.5982775688171387, "learning_rate": 8.471794871794872e-06, "loss": 0.0317, "step": 2248 }, { "epoch": 28.74121405750799, "grad_norm": 2.210441827774048, "learning_rate": 8.466666666666668e-06, "loss": 0.0197, "step": 2249 }, { "epoch": 28.75399361022364, "grad_norm": 1.3656779527664185, "learning_rate": 8.461538461538462e-06, "loss": 0.0125, "step": 2250 }, { "epoch": 28.766773162939298, "grad_norm": 1.8053436279296875, "learning_rate": 8.456410256410256e-06, "loss": 0.0121, "step": 2251 }, { "epoch": 28.77955271565495, "grad_norm": 2.3068583011627197, "learning_rate": 8.451282051282052e-06, "loss": 0.017, "step": 2252 }, { "epoch": 28.792332268370608, "grad_norm": 3.386035442352295, "learning_rate": 8.446153846153848e-06, "loss": 0.0307, "step": 2253 }, { "epoch": 28.80511182108626, "grad_norm": 1.64307701587677, "learning_rate": 8.441025641025642e-06, "loss": 0.017, "step": 2254 }, { "epoch": 28.817891373801917, "grad_norm": 3.6835851669311523, "learning_rate": 8.435897435897436e-06, "loss": 0.0422, "step": 2255 }, { "epoch": 28.830670926517573, "grad_norm": 5.623953819274902, "learning_rate": 8.430769230769231e-06, "loss": 0.0452, "step": 2256 }, { "epoch": 28.843450479233226, "grad_norm": 2.691822052001953, "learning_rate": 8.425641025641026e-06, "loss": 0.017, "step": 2257 }, { "epoch": 28.856230031948883, "grad_norm": 3.718013286590576, "learning_rate": 8.420512820512821e-06, "loss": 0.028, "step": 2258 }, { "epoch": 28.869009584664536, "grad_norm": 14.377763748168945, "learning_rate": 8.415384615384617e-06, "loss": 0.0477, "step": 2259 }, { "epoch": 28.881789137380192, "grad_norm": 2.0842843055725098, "learning_rate": 8.410256410256411e-06, "loss": 0.0174, "step": 2260 }, { "epoch": 28.894568690095845, "grad_norm": 6.70881986618042, "learning_rate": 8.405128205128205e-06, "loss": 0.0365, "step": 2261 }, { "epoch": 28.9073482428115, "grad_norm": 2.3938944339752197, "learning_rate": 8.400000000000001e-06, "loss": 0.0291, "step": 2262 }, { "epoch": 28.920127795527158, "grad_norm": 10.553678512573242, "learning_rate": 8.394871794871795e-06, "loss": 0.0226, "step": 2263 }, { "epoch": 28.93290734824281, "grad_norm": 2.9902539253234863, "learning_rate": 8.38974358974359e-06, "loss": 0.0339, "step": 2264 }, { "epoch": 28.945686900958467, "grad_norm": 1.8003623485565186, "learning_rate": 8.384615384615385e-06, "loss": 0.0236, "step": 2265 }, { "epoch": 28.95846645367412, "grad_norm": 3.184990882873535, "learning_rate": 8.37948717948718e-06, "loss": 0.0317, "step": 2266 }, { "epoch": 28.971246006389777, "grad_norm": 2.8180389404296875, "learning_rate": 8.374358974358975e-06, "loss": 0.0206, "step": 2267 }, { "epoch": 28.984025559105433, "grad_norm": 2.6118054389953613, "learning_rate": 8.36923076923077e-06, "loss": 0.0248, "step": 2268 }, { "epoch": 28.996805111821086, "grad_norm": 4.395126819610596, "learning_rate": 8.364102564102566e-06, "loss": 0.0305, "step": 2269 }, { "epoch": 29.009584664536742, "grad_norm": 1.494640588760376, "learning_rate": 8.35897435897436e-06, "loss": 0.0128, "step": 2270 }, { "epoch": 29.022364217252395, "grad_norm": 2.212526321411133, "learning_rate": 8.353846153846154e-06, "loss": 0.0347, "step": 2271 }, { "epoch": 29.03514376996805, "grad_norm": 1.449722409248352, "learning_rate": 8.34871794871795e-06, "loss": 0.0134, "step": 2272 }, { "epoch": 29.047923322683705, "grad_norm": 1.4646117687225342, "learning_rate": 8.343589743589744e-06, "loss": 0.0191, "step": 2273 }, { "epoch": 29.06070287539936, "grad_norm": 2.184894561767578, "learning_rate": 8.338461538461538e-06, "loss": 0.0188, "step": 2274 }, { "epoch": 29.073482428115017, "grad_norm": 0.5592262148857117, "learning_rate": 8.333333333333334e-06, "loss": 0.0063, "step": 2275 }, { "epoch": 29.08626198083067, "grad_norm": 1.5937706232070923, "learning_rate": 8.32820512820513e-06, "loss": 0.0164, "step": 2276 }, { "epoch": 29.099041533546327, "grad_norm": 0.9277424812316895, "learning_rate": 8.323076923076924e-06, "loss": 0.0111, "step": 2277 }, { "epoch": 29.11182108626198, "grad_norm": 2.7014410495758057, "learning_rate": 8.317948717948718e-06, "loss": 0.0151, "step": 2278 }, { "epoch": 29.124600638977636, "grad_norm": 3.2854347229003906, "learning_rate": 8.312820512820513e-06, "loss": 0.0126, "step": 2279 }, { "epoch": 29.13738019169329, "grad_norm": 1.0488464832305908, "learning_rate": 8.307692307692309e-06, "loss": 0.0116, "step": 2280 }, { "epoch": 29.150159744408946, "grad_norm": 1.374603509902954, "learning_rate": 8.302564102564103e-06, "loss": 0.0144, "step": 2281 }, { "epoch": 29.162939297124602, "grad_norm": 0.8280434608459473, "learning_rate": 8.297435897435899e-06, "loss": 0.0081, "step": 2282 }, { "epoch": 29.175718849840255, "grad_norm": 1.7680490016937256, "learning_rate": 8.292307692307693e-06, "loss": 0.0194, "step": 2283 }, { "epoch": 29.18849840255591, "grad_norm": 1.6616615056991577, "learning_rate": 8.287179487179487e-06, "loss": 0.0153, "step": 2284 }, { "epoch": 29.201277955271564, "grad_norm": 1.9223436117172241, "learning_rate": 8.282051282051283e-06, "loss": 0.0189, "step": 2285 }, { "epoch": 29.21405750798722, "grad_norm": 1.9705466032028198, "learning_rate": 8.276923076923078e-06, "loss": 0.0138, "step": 2286 }, { "epoch": 29.226837060702877, "grad_norm": 1.6173869371414185, "learning_rate": 8.271794871794873e-06, "loss": 0.0175, "step": 2287 }, { "epoch": 29.23961661341853, "grad_norm": 1.4238561391830444, "learning_rate": 8.266666666666667e-06, "loss": 0.0154, "step": 2288 }, { "epoch": 29.252396166134186, "grad_norm": 9.690901756286621, "learning_rate": 8.261538461538462e-06, "loss": 0.0228, "step": 2289 }, { "epoch": 29.26517571884984, "grad_norm": 1.6259404420852661, "learning_rate": 8.256410256410256e-06, "loss": 0.0138, "step": 2290 }, { "epoch": 29.277955271565496, "grad_norm": 2.3683691024780273, "learning_rate": 8.251282051282052e-06, "loss": 0.0145, "step": 2291 }, { "epoch": 29.29073482428115, "grad_norm": 2.1553542613983154, "learning_rate": 8.246153846153848e-06, "loss": 0.0177, "step": 2292 }, { "epoch": 29.303514376996805, "grad_norm": 2.189448833465576, "learning_rate": 8.241025641025642e-06, "loss": 0.0196, "step": 2293 }, { "epoch": 29.31629392971246, "grad_norm": 1.2564376592636108, "learning_rate": 8.235897435897436e-06, "loss": 0.0138, "step": 2294 }, { "epoch": 29.329073482428115, "grad_norm": 2.2801411151885986, "learning_rate": 8.230769230769232e-06, "loss": 0.0215, "step": 2295 }, { "epoch": 29.34185303514377, "grad_norm": 1.3593823909759521, "learning_rate": 8.225641025641027e-06, "loss": 0.0139, "step": 2296 }, { "epoch": 29.354632587859424, "grad_norm": 1.1615716218948364, "learning_rate": 8.220512820512822e-06, "loss": 0.0124, "step": 2297 }, { "epoch": 29.36741214057508, "grad_norm": 4.084254264831543, "learning_rate": 8.215384615384616e-06, "loss": 0.0576, "step": 2298 }, { "epoch": 29.380191693290733, "grad_norm": 2.3387715816497803, "learning_rate": 8.210256410256411e-06, "loss": 0.0231, "step": 2299 }, { "epoch": 29.39297124600639, "grad_norm": 1.4509865045547485, "learning_rate": 8.205128205128205e-06, "loss": 0.0177, "step": 2300 }, { "epoch": 29.39297124600639, "eval_loss": 0.8588030338287354, "eval_runtime": 183.6274, "eval_samples_per_second": 0.855, "eval_steps_per_second": 0.109, "step": 2300 }, { "epoch": 29.405750798722046, "grad_norm": 0.7722755074501038, "learning_rate": 8.2e-06, "loss": 0.0082, "step": 2301 }, { "epoch": 29.4185303514377, "grad_norm": 2.2694647312164307, "learning_rate": 8.194871794871795e-06, "loss": 0.0214, "step": 2302 }, { "epoch": 29.431309904153355, "grad_norm": 3.7953007221221924, "learning_rate": 8.189743589743591e-06, "loss": 0.0396, "step": 2303 }, { "epoch": 29.44408945686901, "grad_norm": 1.1408644914627075, "learning_rate": 8.184615384615385e-06, "loss": 0.0151, "step": 2304 }, { "epoch": 29.456869009584665, "grad_norm": 1.9948118925094604, "learning_rate": 8.17948717948718e-06, "loss": 0.0157, "step": 2305 }, { "epoch": 29.46964856230032, "grad_norm": 0.918438732624054, "learning_rate": 8.174358974358975e-06, "loss": 0.0101, "step": 2306 }, { "epoch": 29.482428115015974, "grad_norm": 1.8009915351867676, "learning_rate": 8.16923076923077e-06, "loss": 0.0173, "step": 2307 }, { "epoch": 29.49520766773163, "grad_norm": 1.3889222145080566, "learning_rate": 8.164102564102565e-06, "loss": 0.0131, "step": 2308 }, { "epoch": 29.507987220447284, "grad_norm": 1.0000081062316895, "learning_rate": 8.15897435897436e-06, "loss": 0.0117, "step": 2309 }, { "epoch": 29.52076677316294, "grad_norm": 3.719639778137207, "learning_rate": 8.153846153846154e-06, "loss": 0.0305, "step": 2310 }, { "epoch": 29.533546325878593, "grad_norm": 4.300498962402344, "learning_rate": 8.148717948717948e-06, "loss": 0.0467, "step": 2311 }, { "epoch": 29.54632587859425, "grad_norm": 3.517829418182373, "learning_rate": 8.143589743589744e-06, "loss": 0.0433, "step": 2312 }, { "epoch": 29.559105431309906, "grad_norm": 1.670379877090454, "learning_rate": 8.13846153846154e-06, "loss": 0.0203, "step": 2313 }, { "epoch": 29.57188498402556, "grad_norm": 2.073993682861328, "learning_rate": 8.133333333333334e-06, "loss": 0.0222, "step": 2314 }, { "epoch": 29.584664536741215, "grad_norm": 2.1130599975585938, "learning_rate": 8.12820512820513e-06, "loss": 0.0234, "step": 2315 }, { "epoch": 29.597444089456868, "grad_norm": 1.0086466073989868, "learning_rate": 8.123076923076924e-06, "loss": 0.012, "step": 2316 }, { "epoch": 29.610223642172524, "grad_norm": 2.0270586013793945, "learning_rate": 8.117948717948718e-06, "loss": 0.0137, "step": 2317 }, { "epoch": 29.623003194888177, "grad_norm": 1.058440089225769, "learning_rate": 8.112820512820514e-06, "loss": 0.0104, "step": 2318 }, { "epoch": 29.635782747603834, "grad_norm": 2.1392710208892822, "learning_rate": 8.10769230769231e-06, "loss": 0.0136, "step": 2319 }, { "epoch": 29.64856230031949, "grad_norm": 2.7256596088409424, "learning_rate": 8.102564102564103e-06, "loss": 0.0161, "step": 2320 }, { "epoch": 29.661341853035143, "grad_norm": 3.502432107925415, "learning_rate": 8.097435897435897e-06, "loss": 0.0237, "step": 2321 }, { "epoch": 29.6741214057508, "grad_norm": 4.068586826324463, "learning_rate": 8.092307692307693e-06, "loss": 0.0366, "step": 2322 }, { "epoch": 29.686900958466452, "grad_norm": 2.0909790992736816, "learning_rate": 8.087179487179487e-06, "loss": 0.0148, "step": 2323 }, { "epoch": 29.69968051118211, "grad_norm": 1.4317904710769653, "learning_rate": 8.082051282051283e-06, "loss": 0.0106, "step": 2324 }, { "epoch": 29.712460063897765, "grad_norm": 4.125987529754639, "learning_rate": 8.076923076923077e-06, "loss": 0.0261, "step": 2325 }, { "epoch": 29.72523961661342, "grad_norm": 3.255063533782959, "learning_rate": 8.071794871794873e-06, "loss": 0.0176, "step": 2326 }, { "epoch": 29.738019169329075, "grad_norm": 0.9352941513061523, "learning_rate": 8.066666666666667e-06, "loss": 0.0095, "step": 2327 }, { "epoch": 29.750798722044728, "grad_norm": 2.9449105262756348, "learning_rate": 8.061538461538463e-06, "loss": 0.0155, "step": 2328 }, { "epoch": 29.763578274760384, "grad_norm": 2.225759506225586, "learning_rate": 8.056410256410258e-06, "loss": 0.0124, "step": 2329 }, { "epoch": 29.776357827476037, "grad_norm": 2.320310115814209, "learning_rate": 8.051282051282052e-06, "loss": 0.0177, "step": 2330 }, { "epoch": 29.789137380191693, "grad_norm": 1.8711694478988647, "learning_rate": 8.046153846153846e-06, "loss": 0.0236, "step": 2331 }, { "epoch": 29.80191693290735, "grad_norm": 1.8947467803955078, "learning_rate": 8.041025641025642e-06, "loss": 0.0123, "step": 2332 }, { "epoch": 29.814696485623003, "grad_norm": 1.9755359888076782, "learning_rate": 8.035897435897436e-06, "loss": 0.0133, "step": 2333 }, { "epoch": 29.82747603833866, "grad_norm": 2.1724817752838135, "learning_rate": 8.03076923076923e-06, "loss": 0.0189, "step": 2334 }, { "epoch": 29.840255591054312, "grad_norm": 2.910400629043579, "learning_rate": 8.025641025641026e-06, "loss": 0.0262, "step": 2335 }, { "epoch": 29.85303514376997, "grad_norm": 2.85970139503479, "learning_rate": 8.020512820512822e-06, "loss": 0.0272, "step": 2336 }, { "epoch": 29.86581469648562, "grad_norm": 3.4145076274871826, "learning_rate": 8.015384615384616e-06, "loss": 0.0154, "step": 2337 }, { "epoch": 29.878594249201278, "grad_norm": 2.902740240097046, "learning_rate": 8.01025641025641e-06, "loss": 0.0183, "step": 2338 }, { "epoch": 29.891373801916934, "grad_norm": 3.356585741043091, "learning_rate": 8.005128205128206e-06, "loss": 0.0363, "step": 2339 }, { "epoch": 29.904153354632587, "grad_norm": 50.76426315307617, "learning_rate": 8.000000000000001e-06, "loss": 0.0385, "step": 2340 }, { "epoch": 29.916932907348244, "grad_norm": 3.5773375034332275, "learning_rate": 7.994871794871795e-06, "loss": 0.0267, "step": 2341 }, { "epoch": 29.929712460063897, "grad_norm": 3.0727083683013916, "learning_rate": 7.989743589743591e-06, "loss": 0.0273, "step": 2342 }, { "epoch": 29.942492012779553, "grad_norm": 3.3302500247955322, "learning_rate": 7.984615384615385e-06, "loss": 0.017, "step": 2343 }, { "epoch": 29.955271565495206, "grad_norm": 1.594466209411621, "learning_rate": 7.97948717948718e-06, "loss": 0.0186, "step": 2344 }, { "epoch": 29.968051118210862, "grad_norm": 1.5575155019760132, "learning_rate": 7.974358974358975e-06, "loss": 0.0271, "step": 2345 }, { "epoch": 29.98083067092652, "grad_norm": 1.5307331085205078, "learning_rate": 7.96923076923077e-06, "loss": 0.0144, "step": 2346 }, { "epoch": 29.99361022364217, "grad_norm": 2.816906690597534, "learning_rate": 7.964102564102565e-06, "loss": 0.0155, "step": 2347 }, { "epoch": 30.00638977635783, "grad_norm": 1.9447122812271118, "learning_rate": 7.958974358974359e-06, "loss": 0.0191, "step": 2348 }, { "epoch": 30.01916932907348, "grad_norm": 1.1538459062576294, "learning_rate": 7.953846153846155e-06, "loss": 0.0111, "step": 2349 }, { "epoch": 30.031948881789138, "grad_norm": 1.9048233032226562, "learning_rate": 7.948717948717949e-06, "loss": 0.0108, "step": 2350 }, { "epoch": 30.044728434504794, "grad_norm": 0.9791685938835144, "learning_rate": 7.943589743589744e-06, "loss": 0.0123, "step": 2351 }, { "epoch": 30.057507987220447, "grad_norm": 1.4688340425491333, "learning_rate": 7.93846153846154e-06, "loss": 0.0099, "step": 2352 }, { "epoch": 30.070287539936103, "grad_norm": 11.503856658935547, "learning_rate": 7.933333333333334e-06, "loss": 0.0124, "step": 2353 }, { "epoch": 30.083067092651756, "grad_norm": 1.4641727209091187, "learning_rate": 7.928205128205128e-06, "loss": 0.0087, "step": 2354 }, { "epoch": 30.095846645367413, "grad_norm": 1.5176548957824707, "learning_rate": 7.923076923076924e-06, "loss": 0.0133, "step": 2355 }, { "epoch": 30.108626198083066, "grad_norm": 1.0059568881988525, "learning_rate": 7.91794871794872e-06, "loss": 0.0082, "step": 2356 }, { "epoch": 30.121405750798722, "grad_norm": 1.561390995979309, "learning_rate": 7.912820512820514e-06, "loss": 0.0223, "step": 2357 }, { "epoch": 30.13418530351438, "grad_norm": 0.5975431799888611, "learning_rate": 7.907692307692308e-06, "loss": 0.0055, "step": 2358 }, { "epoch": 30.14696485623003, "grad_norm": 0.4060616195201874, "learning_rate": 7.902564102564104e-06, "loss": 0.0044, "step": 2359 }, { "epoch": 30.159744408945688, "grad_norm": 3.557813882827759, "learning_rate": 7.897435897435898e-06, "loss": 0.0321, "step": 2360 }, { "epoch": 30.17252396166134, "grad_norm": 0.8282617926597595, "learning_rate": 7.892307692307692e-06, "loss": 0.0068, "step": 2361 }, { "epoch": 30.185303514376997, "grad_norm": 2.1953506469726562, "learning_rate": 7.887179487179487e-06, "loss": 0.0209, "step": 2362 }, { "epoch": 30.19808306709265, "grad_norm": 0.8145207166671753, "learning_rate": 7.882051282051283e-06, "loss": 0.0072, "step": 2363 }, { "epoch": 30.210862619808307, "grad_norm": 2.264665365219116, "learning_rate": 7.876923076923077e-06, "loss": 0.0113, "step": 2364 }, { "epoch": 30.223642172523963, "grad_norm": 1.1967779397964478, "learning_rate": 7.871794871794873e-06, "loss": 0.0112, "step": 2365 }, { "epoch": 30.236421725239616, "grad_norm": 2.0275306701660156, "learning_rate": 7.866666666666667e-06, "loss": 0.0176, "step": 2366 }, { "epoch": 30.249201277955272, "grad_norm": 0.7311588525772095, "learning_rate": 7.861538461538463e-06, "loss": 0.0084, "step": 2367 }, { "epoch": 30.261980830670925, "grad_norm": 7.656560897827148, "learning_rate": 7.856410256410257e-06, "loss": 0.0419, "step": 2368 }, { "epoch": 30.27476038338658, "grad_norm": 2.2187883853912354, "learning_rate": 7.851282051282053e-06, "loss": 0.0166, "step": 2369 }, { "epoch": 30.287539936102238, "grad_norm": 1.6911665201187134, "learning_rate": 7.846153846153847e-06, "loss": 0.0312, "step": 2370 }, { "epoch": 30.30031948881789, "grad_norm": 1.257012963294983, "learning_rate": 7.84102564102564e-06, "loss": 0.0142, "step": 2371 }, { "epoch": 30.313099041533548, "grad_norm": 0.9386212825775146, "learning_rate": 7.835897435897436e-06, "loss": 0.0094, "step": 2372 }, { "epoch": 30.3258785942492, "grad_norm": 1.295942783355713, "learning_rate": 7.830769230769232e-06, "loss": 0.0094, "step": 2373 }, { "epoch": 30.338658146964857, "grad_norm": 5.017524242401123, "learning_rate": 7.825641025641026e-06, "loss": 0.0184, "step": 2374 }, { "epoch": 30.35143769968051, "grad_norm": 1.9900352954864502, "learning_rate": 7.820512820512822e-06, "loss": 0.0169, "step": 2375 }, { "epoch": 30.364217252396166, "grad_norm": 1.0820534229278564, "learning_rate": 7.815384615384616e-06, "loss": 0.0136, "step": 2376 }, { "epoch": 30.376996805111823, "grad_norm": 4.093630313873291, "learning_rate": 7.81025641025641e-06, "loss": 0.041, "step": 2377 }, { "epoch": 30.389776357827476, "grad_norm": 1.1857229471206665, "learning_rate": 7.805128205128206e-06, "loss": 0.0118, "step": 2378 }, { "epoch": 30.402555910543132, "grad_norm": 3.726752758026123, "learning_rate": 7.800000000000002e-06, "loss": 0.0137, "step": 2379 }, { "epoch": 30.415335463258785, "grad_norm": 1.2537568807601929, "learning_rate": 7.794871794871796e-06, "loss": 0.0088, "step": 2380 }, { "epoch": 30.42811501597444, "grad_norm": 1.6540377140045166, "learning_rate": 7.78974358974359e-06, "loss": 0.0074, "step": 2381 }, { "epoch": 30.440894568690094, "grad_norm": 0.9351989030838013, "learning_rate": 7.784615384615385e-06, "loss": 0.0101, "step": 2382 }, { "epoch": 30.45367412140575, "grad_norm": 1.9741319417953491, "learning_rate": 7.77948717948718e-06, "loss": 0.016, "step": 2383 }, { "epoch": 30.466453674121407, "grad_norm": 0.8153372406959534, "learning_rate": 7.774358974358975e-06, "loss": 0.0079, "step": 2384 }, { "epoch": 30.47923322683706, "grad_norm": 2.5167553424835205, "learning_rate": 7.76923076923077e-06, "loss": 0.0145, "step": 2385 }, { "epoch": 30.492012779552716, "grad_norm": 0.9691523313522339, "learning_rate": 7.764102564102565e-06, "loss": 0.0089, "step": 2386 }, { "epoch": 30.50479233226837, "grad_norm": 2.4006729125976562, "learning_rate": 7.758974358974359e-06, "loss": 0.0301, "step": 2387 }, { "epoch": 30.517571884984026, "grad_norm": 1.315584659576416, "learning_rate": 7.753846153846155e-06, "loss": 0.0139, "step": 2388 }, { "epoch": 30.53035143769968, "grad_norm": 0.833059549331665, "learning_rate": 7.74871794871795e-06, "loss": 0.0084, "step": 2389 }, { "epoch": 30.543130990415335, "grad_norm": 0.7655555605888367, "learning_rate": 7.743589743589745e-06, "loss": 0.0075, "step": 2390 }, { "epoch": 30.55591054313099, "grad_norm": 2.363828659057617, "learning_rate": 7.738461538461539e-06, "loss": 0.0162, "step": 2391 }, { "epoch": 30.568690095846645, "grad_norm": 1.7295135259628296, "learning_rate": 7.733333333333334e-06, "loss": 0.0133, "step": 2392 }, { "epoch": 30.5814696485623, "grad_norm": 0.7795832753181458, "learning_rate": 7.728205128205128e-06, "loss": 0.0079, "step": 2393 }, { "epoch": 30.594249201277954, "grad_norm": 4.533823013305664, "learning_rate": 7.723076923076924e-06, "loss": 0.06, "step": 2394 }, { "epoch": 30.60702875399361, "grad_norm": 1.1290048360824585, "learning_rate": 7.717948717948718e-06, "loss": 0.0083, "step": 2395 }, { "epoch": 30.619808306709267, "grad_norm": 1.0387988090515137, "learning_rate": 7.712820512820514e-06, "loss": 0.0126, "step": 2396 }, { "epoch": 30.63258785942492, "grad_norm": 1.8188968896865845, "learning_rate": 7.707692307692308e-06, "loss": 0.0141, "step": 2397 }, { "epoch": 30.645367412140576, "grad_norm": 1.7080775499343872, "learning_rate": 7.702564102564102e-06, "loss": 0.0122, "step": 2398 }, { "epoch": 30.65814696485623, "grad_norm": 3.921550750732422, "learning_rate": 7.697435897435898e-06, "loss": 0.0255, "step": 2399 }, { "epoch": 30.670926517571885, "grad_norm": 1.774515986442566, "learning_rate": 7.692307692307694e-06, "loss": 0.0156, "step": 2400 }, { "epoch": 30.670926517571885, "eval_loss": 0.8820593953132629, "eval_runtime": 183.7504, "eval_samples_per_second": 0.854, "eval_steps_per_second": 0.109, "step": 2400 }, { "epoch": 30.68370607028754, "grad_norm": 1.382828950881958, "learning_rate": 7.687179487179488e-06, "loss": 0.0152, "step": 2401 }, { "epoch": 30.696485623003195, "grad_norm": 1.670392394065857, "learning_rate": 7.682051282051283e-06, "loss": 0.0134, "step": 2402 }, { "epoch": 30.70926517571885, "grad_norm": 2.938199520111084, "learning_rate": 7.676923076923077e-06, "loss": 0.028, "step": 2403 }, { "epoch": 30.722044728434504, "grad_norm": 1.7198128700256348, "learning_rate": 7.671794871794871e-06, "loss": 0.0116, "step": 2404 }, { "epoch": 30.73482428115016, "grad_norm": 1.027203917503357, "learning_rate": 7.666666666666667e-06, "loss": 0.0083, "step": 2405 }, { "epoch": 30.747603833865814, "grad_norm": 1.1331881284713745, "learning_rate": 7.661538461538463e-06, "loss": 0.0108, "step": 2406 }, { "epoch": 30.76038338658147, "grad_norm": 2.0122077465057373, "learning_rate": 7.656410256410257e-06, "loss": 0.0151, "step": 2407 }, { "epoch": 30.773162939297123, "grad_norm": 1.0752384662628174, "learning_rate": 7.651282051282051e-06, "loss": 0.0087, "step": 2408 }, { "epoch": 30.78594249201278, "grad_norm": 0.7439934015274048, "learning_rate": 7.646153846153847e-06, "loss": 0.0069, "step": 2409 }, { "epoch": 30.798722044728436, "grad_norm": 1.182193636894226, "learning_rate": 7.641025641025641e-06, "loss": 0.0089, "step": 2410 }, { "epoch": 30.81150159744409, "grad_norm": 1.1009929180145264, "learning_rate": 7.635897435897437e-06, "loss": 0.0117, "step": 2411 }, { "epoch": 30.824281150159745, "grad_norm": 1.8570705652236938, "learning_rate": 7.630769230769232e-06, "loss": 0.0107, "step": 2412 }, { "epoch": 30.837060702875398, "grad_norm": 1.9369806051254272, "learning_rate": 7.6256410256410264e-06, "loss": 0.0102, "step": 2413 }, { "epoch": 30.849840255591054, "grad_norm": 2.9887959957122803, "learning_rate": 7.620512820512821e-06, "loss": 0.0207, "step": 2414 }, { "epoch": 30.86261980830671, "grad_norm": 1.9918948411941528, "learning_rate": 7.615384615384615e-06, "loss": 0.0203, "step": 2415 }, { "epoch": 30.875399361022364, "grad_norm": 2.1750328540802, "learning_rate": 7.610256410256411e-06, "loss": 0.0208, "step": 2416 }, { "epoch": 30.88817891373802, "grad_norm": 1.5640554428100586, "learning_rate": 7.605128205128206e-06, "loss": 0.0166, "step": 2417 }, { "epoch": 30.900958466453673, "grad_norm": 1.598807692527771, "learning_rate": 7.600000000000001e-06, "loss": 0.0104, "step": 2418 }, { "epoch": 30.91373801916933, "grad_norm": 40.95674133300781, "learning_rate": 7.594871794871795e-06, "loss": 0.0492, "step": 2419 }, { "epoch": 30.926517571884983, "grad_norm": 2.3611934185028076, "learning_rate": 7.58974358974359e-06, "loss": 0.016, "step": 2420 }, { "epoch": 30.93929712460064, "grad_norm": 1.6652703285217285, "learning_rate": 7.584615384615385e-06, "loss": 0.0128, "step": 2421 }, { "epoch": 30.952076677316295, "grad_norm": 3.1491055488586426, "learning_rate": 7.5794871794871805e-06, "loss": 0.0193, "step": 2422 }, { "epoch": 30.96485623003195, "grad_norm": 3.1785836219787598, "learning_rate": 7.574358974358975e-06, "loss": 0.0239, "step": 2423 }, { "epoch": 30.977635782747605, "grad_norm": 1.2700327634811401, "learning_rate": 7.5692307692307695e-06, "loss": 0.014, "step": 2424 }, { "epoch": 30.990415335463258, "grad_norm": 2.9445688724517822, "learning_rate": 7.564102564102564e-06, "loss": 0.0289, "step": 2425 }, { "epoch": 31.003194888178914, "grad_norm": 1.4139279127120972, "learning_rate": 7.558974358974359e-06, "loss": 0.0157, "step": 2426 }, { "epoch": 31.015974440894567, "grad_norm": 1.359743356704712, "learning_rate": 7.553846153846155e-06, "loss": 0.0114, "step": 2427 }, { "epoch": 31.028753993610223, "grad_norm": 1.6461807489395142, "learning_rate": 7.54871794871795e-06, "loss": 0.0102, "step": 2428 }, { "epoch": 31.04153354632588, "grad_norm": 1.2406097650527954, "learning_rate": 7.543589743589744e-06, "loss": 0.0096, "step": 2429 }, { "epoch": 31.054313099041533, "grad_norm": 0.6446853876113892, "learning_rate": 7.538461538461539e-06, "loss": 0.0067, "step": 2430 }, { "epoch": 31.06709265175719, "grad_norm": 0.4024180471897125, "learning_rate": 7.533333333333334e-06, "loss": 0.0044, "step": 2431 }, { "epoch": 31.079872204472842, "grad_norm": 0.8510559797286987, "learning_rate": 7.528205128205129e-06, "loss": 0.0064, "step": 2432 }, { "epoch": 31.0926517571885, "grad_norm": 2.8283066749572754, "learning_rate": 7.523076923076924e-06, "loss": 0.0271, "step": 2433 }, { "epoch": 31.105431309904155, "grad_norm": 0.76324462890625, "learning_rate": 7.5179487179487185e-06, "loss": 0.0086, "step": 2434 }, { "epoch": 31.118210862619808, "grad_norm": 0.6604694128036499, "learning_rate": 7.512820512820513e-06, "loss": 0.0076, "step": 2435 }, { "epoch": 31.130990415335464, "grad_norm": 0.856167197227478, "learning_rate": 7.507692307692308e-06, "loss": 0.0081, "step": 2436 }, { "epoch": 31.143769968051117, "grad_norm": 0.48480814695358276, "learning_rate": 7.502564102564102e-06, "loss": 0.0053, "step": 2437 }, { "epoch": 31.156549520766774, "grad_norm": 4.314848899841309, "learning_rate": 7.497435897435899e-06, "loss": 0.0498, "step": 2438 }, { "epoch": 31.169329073482427, "grad_norm": 3.0163071155548096, "learning_rate": 7.492307692307693e-06, "loss": 0.0158, "step": 2439 }, { "epoch": 31.182108626198083, "grad_norm": 1.104270577430725, "learning_rate": 7.487179487179488e-06, "loss": 0.0081, "step": 2440 }, { "epoch": 31.19488817891374, "grad_norm": 1.8899494409561157, "learning_rate": 7.482051282051283e-06, "loss": 0.0149, "step": 2441 }, { "epoch": 31.207667731629392, "grad_norm": 2.2089486122131348, "learning_rate": 7.476923076923077e-06, "loss": 0.0136, "step": 2442 }, { "epoch": 31.22044728434505, "grad_norm": 0.7608822584152222, "learning_rate": 7.4717948717948726e-06, "loss": 0.0064, "step": 2443 }, { "epoch": 31.233226837060702, "grad_norm": 1.0831680297851562, "learning_rate": 7.4666666666666675e-06, "loss": 0.0108, "step": 2444 }, { "epoch": 31.24600638977636, "grad_norm": 0.8487801551818848, "learning_rate": 7.461538461538462e-06, "loss": 0.0065, "step": 2445 }, { "epoch": 31.25878594249201, "grad_norm": 0.7963667511940002, "learning_rate": 7.456410256410257e-06, "loss": 0.0066, "step": 2446 }, { "epoch": 31.271565495207668, "grad_norm": 0.7370432019233704, "learning_rate": 7.451282051282051e-06, "loss": 0.0083, "step": 2447 }, { "epoch": 31.284345047923324, "grad_norm": 1.1382296085357666, "learning_rate": 7.446153846153846e-06, "loss": 0.0083, "step": 2448 }, { "epoch": 31.297124600638977, "grad_norm": 1.2064332962036133, "learning_rate": 7.441025641025642e-06, "loss": 0.008, "step": 2449 }, { "epoch": 31.309904153354633, "grad_norm": 1.4648174047470093, "learning_rate": 7.435897435897437e-06, "loss": 0.0073, "step": 2450 }, { "epoch": 31.322683706070286, "grad_norm": 1.0092828273773193, "learning_rate": 7.430769230769232e-06, "loss": 0.0085, "step": 2451 }, { "epoch": 31.335463258785943, "grad_norm": 2.0985512733459473, "learning_rate": 7.425641025641026e-06, "loss": 0.0088, "step": 2452 }, { "epoch": 31.3482428115016, "grad_norm": 2.0856070518493652, "learning_rate": 7.420512820512821e-06, "loss": 0.0115, "step": 2453 }, { "epoch": 31.361022364217252, "grad_norm": 3.9369513988494873, "learning_rate": 7.4153846153846164e-06, "loss": 0.0291, "step": 2454 }, { "epoch": 31.37380191693291, "grad_norm": 0.8650026917457581, "learning_rate": 7.410256410256411e-06, "loss": 0.0085, "step": 2455 }, { "epoch": 31.38658146964856, "grad_norm": 2.597470760345459, "learning_rate": 7.405128205128206e-06, "loss": 0.0131, "step": 2456 }, { "epoch": 31.399361022364218, "grad_norm": 1.0550893545150757, "learning_rate": 7.4e-06, "loss": 0.0071, "step": 2457 }, { "epoch": 31.41214057507987, "grad_norm": 1.2370715141296387, "learning_rate": 7.394871794871795e-06, "loss": 0.0079, "step": 2458 }, { "epoch": 31.424920127795527, "grad_norm": 2.8915867805480957, "learning_rate": 7.38974358974359e-06, "loss": 0.0274, "step": 2459 }, { "epoch": 31.437699680511184, "grad_norm": 1.7772983312606812, "learning_rate": 7.384615384615386e-06, "loss": 0.0082, "step": 2460 }, { "epoch": 31.450479233226837, "grad_norm": 2.2857325077056885, "learning_rate": 7.37948717948718e-06, "loss": 0.0135, "step": 2461 }, { "epoch": 31.463258785942493, "grad_norm": 3.7650675773620605, "learning_rate": 7.374358974358975e-06, "loss": 0.0247, "step": 2462 }, { "epoch": 31.476038338658146, "grad_norm": 2.1733803749084473, "learning_rate": 7.36923076923077e-06, "loss": 0.0179, "step": 2463 }, { "epoch": 31.488817891373802, "grad_norm": 3.0555264949798584, "learning_rate": 7.364102564102565e-06, "loss": 0.0138, "step": 2464 }, { "epoch": 31.501597444089455, "grad_norm": 4.623447895050049, "learning_rate": 7.35897435897436e-06, "loss": 0.0163, "step": 2465 }, { "epoch": 31.51437699680511, "grad_norm": 1.1830363273620605, "learning_rate": 7.353846153846154e-06, "loss": 0.0064, "step": 2466 }, { "epoch": 31.527156549520768, "grad_norm": 1.612902283668518, "learning_rate": 7.348717948717949e-06, "loss": 0.0148, "step": 2467 }, { "epoch": 31.53993610223642, "grad_norm": 2.0270512104034424, "learning_rate": 7.343589743589744e-06, "loss": 0.0143, "step": 2468 }, { "epoch": 31.552715654952078, "grad_norm": 4.224149227142334, "learning_rate": 7.338461538461539e-06, "loss": 0.041, "step": 2469 }, { "epoch": 31.56549520766773, "grad_norm": 1.3380467891693115, "learning_rate": 7.333333333333333e-06, "loss": 0.0072, "step": 2470 }, { "epoch": 31.578274760383387, "grad_norm": 0.8487076163291931, "learning_rate": 7.328205128205129e-06, "loss": 0.011, "step": 2471 }, { "epoch": 31.591054313099043, "grad_norm": 0.8845389485359192, "learning_rate": 7.323076923076924e-06, "loss": 0.0092, "step": 2472 }, { "epoch": 31.603833865814696, "grad_norm": 2.505274772644043, "learning_rate": 7.317948717948719e-06, "loss": 0.0123, "step": 2473 }, { "epoch": 31.616613418530353, "grad_norm": 1.9374302625656128, "learning_rate": 7.312820512820514e-06, "loss": 0.0086, "step": 2474 }, { "epoch": 31.629392971246006, "grad_norm": 0.6446139216423035, "learning_rate": 7.307692307692308e-06, "loss": 0.0055, "step": 2475 }, { "epoch": 31.642172523961662, "grad_norm": 0.738622784614563, "learning_rate": 7.302564102564103e-06, "loss": 0.0076, "step": 2476 }, { "epoch": 31.654952076677315, "grad_norm": 1.9127014875411987, "learning_rate": 7.297435897435898e-06, "loss": 0.0121, "step": 2477 }, { "epoch": 31.66773162939297, "grad_norm": 3.925105571746826, "learning_rate": 7.292307692307693e-06, "loss": 0.0257, "step": 2478 }, { "epoch": 31.680511182108628, "grad_norm": 1.7984658479690552, "learning_rate": 7.287179487179487e-06, "loss": 0.011, "step": 2479 }, { "epoch": 31.69329073482428, "grad_norm": 2.5924363136291504, "learning_rate": 7.282051282051282e-06, "loss": 0.0202, "step": 2480 }, { "epoch": 31.706070287539937, "grad_norm": 3.3326330184936523, "learning_rate": 7.276923076923077e-06, "loss": 0.0312, "step": 2481 }, { "epoch": 31.71884984025559, "grad_norm": 2.6017725467681885, "learning_rate": 7.271794871794873e-06, "loss": 0.0229, "step": 2482 }, { "epoch": 31.731629392971247, "grad_norm": 2.4673171043395996, "learning_rate": 7.266666666666668e-06, "loss": 0.0214, "step": 2483 }, { "epoch": 31.7444089456869, "grad_norm": 1.627975583076477, "learning_rate": 7.261538461538462e-06, "loss": 0.0215, "step": 2484 }, { "epoch": 31.757188498402556, "grad_norm": 2.3996481895446777, "learning_rate": 7.256410256410257e-06, "loss": 0.0184, "step": 2485 }, { "epoch": 31.769968051118212, "grad_norm": 1.8067755699157715, "learning_rate": 7.2512820512820515e-06, "loss": 0.0076, "step": 2486 }, { "epoch": 31.782747603833865, "grad_norm": 0.9003115892410278, "learning_rate": 7.246153846153847e-06, "loss": 0.0089, "step": 2487 }, { "epoch": 31.79552715654952, "grad_norm": 1.9725600481033325, "learning_rate": 7.241025641025642e-06, "loss": 0.0175, "step": 2488 }, { "epoch": 31.808306709265175, "grad_norm": 0.8610435128211975, "learning_rate": 7.235897435897436e-06, "loss": 0.0214, "step": 2489 }, { "epoch": 31.82108626198083, "grad_norm": 1.769567847251892, "learning_rate": 7.230769230769231e-06, "loss": 0.0149, "step": 2490 }, { "epoch": 31.833865814696484, "grad_norm": 1.4577288627624512, "learning_rate": 7.225641025641026e-06, "loss": 0.0114, "step": 2491 }, { "epoch": 31.84664536741214, "grad_norm": 1.6688158512115479, "learning_rate": 7.220512820512822e-06, "loss": 0.0151, "step": 2492 }, { "epoch": 31.859424920127797, "grad_norm": 1.0488781929016113, "learning_rate": 7.215384615384617e-06, "loss": 0.0078, "step": 2493 }, { "epoch": 31.87220447284345, "grad_norm": 1.9858828783035278, "learning_rate": 7.210256410256411e-06, "loss": 0.0101, "step": 2494 }, { "epoch": 31.884984025559106, "grad_norm": 2.6534643173217773, "learning_rate": 7.205128205128206e-06, "loss": 0.0147, "step": 2495 }, { "epoch": 31.89776357827476, "grad_norm": 0.849526047706604, "learning_rate": 7.2000000000000005e-06, "loss": 0.0081, "step": 2496 }, { "epoch": 31.910543130990416, "grad_norm": 1.108740210533142, "learning_rate": 7.1948717948717946e-06, "loss": 0.0106, "step": 2497 }, { "epoch": 31.923322683706072, "grad_norm": 1.1372129917144775, "learning_rate": 7.189743589743591e-06, "loss": 0.0086, "step": 2498 }, { "epoch": 31.936102236421725, "grad_norm": 0.8639397621154785, "learning_rate": 7.184615384615385e-06, "loss": 0.0097, "step": 2499 }, { "epoch": 31.94888178913738, "grad_norm": 1.0387763977050781, "learning_rate": 7.17948717948718e-06, "loss": 0.0107, "step": 2500 }, { "epoch": 31.94888178913738, "eval_loss": 0.9020013213157654, "eval_runtime": 183.5845, "eval_samples_per_second": 0.855, "eval_steps_per_second": 0.109, "step": 2500 }, { "epoch": 31.961661341853034, "grad_norm": 1.7540512084960938, "learning_rate": 7.174358974358975e-06, "loss": 0.0101, "step": 2501 }, { "epoch": 31.97444089456869, "grad_norm": 1.5571906566619873, "learning_rate": 7.169230769230769e-06, "loss": 0.0119, "step": 2502 }, { "epoch": 31.987220447284344, "grad_norm": 3.689589023590088, "learning_rate": 7.164102564102565e-06, "loss": 0.0188, "step": 2503 }, { "epoch": 32.0, "grad_norm": 9.199419975280762, "learning_rate": 7.15897435897436e-06, "loss": 0.0465, "step": 2504 }, { "epoch": 32.01277955271566, "grad_norm": 47.8953857421875, "learning_rate": 7.153846153846155e-06, "loss": 0.0376, "step": 2505 }, { "epoch": 32.02555910543131, "grad_norm": 0.7875796556472778, "learning_rate": 7.1487179487179495e-06, "loss": 0.0102, "step": 2506 }, { "epoch": 32.03833865814696, "grad_norm": 1.4419995546340942, "learning_rate": 7.1435897435897436e-06, "loss": 0.0114, "step": 2507 }, { "epoch": 32.05111821086262, "grad_norm": 2.094075918197632, "learning_rate": 7.1384615384615385e-06, "loss": 0.0124, "step": 2508 }, { "epoch": 32.063897763578275, "grad_norm": 0.5076902508735657, "learning_rate": 7.133333333333334e-06, "loss": 0.0048, "step": 2509 }, { "epoch": 32.07667731629393, "grad_norm": 1.2808444499969482, "learning_rate": 7.128205128205129e-06, "loss": 0.0195, "step": 2510 }, { "epoch": 32.08945686900959, "grad_norm": 1.2106831073760986, "learning_rate": 7.123076923076924e-06, "loss": 0.0126, "step": 2511 }, { "epoch": 32.10223642172524, "grad_norm": 2.212151288986206, "learning_rate": 7.117948717948718e-06, "loss": 0.0068, "step": 2512 }, { "epoch": 32.115015974440894, "grad_norm": 1.6798101663589478, "learning_rate": 7.112820512820513e-06, "loss": 0.0091, "step": 2513 }, { "epoch": 32.12779552715655, "grad_norm": 0.4618903696537018, "learning_rate": 7.107692307692309e-06, "loss": 0.005, "step": 2514 }, { "epoch": 32.14057507987221, "grad_norm": 2.6432876586914062, "learning_rate": 7.102564102564104e-06, "loss": 0.0137, "step": 2515 }, { "epoch": 32.153354632587856, "grad_norm": 0.6872953772544861, "learning_rate": 7.0974358974358985e-06, "loss": 0.0082, "step": 2516 }, { "epoch": 32.16613418530351, "grad_norm": 0.45526954531669617, "learning_rate": 7.0923076923076926e-06, "loss": 0.0057, "step": 2517 }, { "epoch": 32.17891373801917, "grad_norm": 1.115525245666504, "learning_rate": 7.0871794871794875e-06, "loss": 0.0101, "step": 2518 }, { "epoch": 32.191693290734825, "grad_norm": 1.0506788492202759, "learning_rate": 7.082051282051282e-06, "loss": 0.0073, "step": 2519 }, { "epoch": 32.20447284345048, "grad_norm": 0.7957150936126709, "learning_rate": 7.076923076923078e-06, "loss": 0.0063, "step": 2520 }, { "epoch": 32.21725239616613, "grad_norm": 0.6349469423294067, "learning_rate": 7.071794871794872e-06, "loss": 0.0054, "step": 2521 }, { "epoch": 32.23003194888179, "grad_norm": 2.232160806655884, "learning_rate": 7.066666666666667e-06, "loss": 0.012, "step": 2522 }, { "epoch": 32.242811501597444, "grad_norm": 0.7386549115180969, "learning_rate": 7.061538461538462e-06, "loss": 0.0059, "step": 2523 }, { "epoch": 32.2555910543131, "grad_norm": 0.6190231442451477, "learning_rate": 7.056410256410257e-06, "loss": 0.0063, "step": 2524 }, { "epoch": 32.26837060702876, "grad_norm": 0.7233198881149292, "learning_rate": 7.051282051282053e-06, "loss": 0.0091, "step": 2525 }, { "epoch": 32.281150159744406, "grad_norm": 0.612321138381958, "learning_rate": 7.046153846153847e-06, "loss": 0.0068, "step": 2526 }, { "epoch": 32.29392971246006, "grad_norm": 2.0452654361724854, "learning_rate": 7.0410256410256415e-06, "loss": 0.0131, "step": 2527 }, { "epoch": 32.30670926517572, "grad_norm": 1.4713987112045288, "learning_rate": 7.0358974358974364e-06, "loss": 0.0089, "step": 2528 }, { "epoch": 32.319488817891376, "grad_norm": 2.9305877685546875, "learning_rate": 7.030769230769231e-06, "loss": 0.0205, "step": 2529 }, { "epoch": 32.33226837060703, "grad_norm": 3.299680233001709, "learning_rate": 7.025641025641025e-06, "loss": 0.0293, "step": 2530 }, { "epoch": 32.34504792332268, "grad_norm": 1.3041348457336426, "learning_rate": 7.020512820512821e-06, "loss": 0.0115, "step": 2531 }, { "epoch": 32.35782747603834, "grad_norm": 1.1142361164093018, "learning_rate": 7.015384615384616e-06, "loss": 0.0093, "step": 2532 }, { "epoch": 32.370607028753994, "grad_norm": 2.617877721786499, "learning_rate": 7.010256410256411e-06, "loss": 0.0168, "step": 2533 }, { "epoch": 32.38338658146965, "grad_norm": 1.5802733898162842, "learning_rate": 7.005128205128206e-06, "loss": 0.012, "step": 2534 }, { "epoch": 32.3961661341853, "grad_norm": 3.298232316970825, "learning_rate": 7e-06, "loss": 0.0196, "step": 2535 }, { "epoch": 32.40894568690096, "grad_norm": 1.9718133211135864, "learning_rate": 6.994871794871796e-06, "loss": 0.011, "step": 2536 }, { "epoch": 32.42172523961661, "grad_norm": 2.5648529529571533, "learning_rate": 6.9897435897435905e-06, "loss": 0.0283, "step": 2537 }, { "epoch": 32.43450479233227, "grad_norm": 1.1077498197555542, "learning_rate": 6.9846153846153854e-06, "loss": 0.0076, "step": 2538 }, { "epoch": 32.447284345047926, "grad_norm": 0.5472296476364136, "learning_rate": 6.9794871794871795e-06, "loss": 0.0048, "step": 2539 }, { "epoch": 32.460063897763575, "grad_norm": 3.183478832244873, "learning_rate": 6.974358974358974e-06, "loss": 0.0139, "step": 2540 }, { "epoch": 32.47284345047923, "grad_norm": 1.6973299980163574, "learning_rate": 6.96923076923077e-06, "loss": 0.0178, "step": 2541 }, { "epoch": 32.48562300319489, "grad_norm": 0.7870661020278931, "learning_rate": 6.964102564102565e-06, "loss": 0.007, "step": 2542 }, { "epoch": 32.498402555910545, "grad_norm": 1.308314561843872, "learning_rate": 6.95897435897436e-06, "loss": 0.0075, "step": 2543 }, { "epoch": 32.5111821086262, "grad_norm": 0.9130819439888, "learning_rate": 6.953846153846154e-06, "loss": 0.009, "step": 2544 }, { "epoch": 32.52396166134185, "grad_norm": 0.6322726607322693, "learning_rate": 6.948717948717949e-06, "loss": 0.006, "step": 2545 }, { "epoch": 32.53674121405751, "grad_norm": 1.411386251449585, "learning_rate": 6.943589743589744e-06, "loss": 0.02, "step": 2546 }, { "epoch": 32.54952076677316, "grad_norm": 0.5440376400947571, "learning_rate": 6.9384615384615395e-06, "loss": 0.0052, "step": 2547 }, { "epoch": 32.56230031948882, "grad_norm": 1.4267332553863525, "learning_rate": 6.9333333333333344e-06, "loss": 0.0069, "step": 2548 }, { "epoch": 32.575079872204476, "grad_norm": 0.9322569966316223, "learning_rate": 6.9282051282051285e-06, "loss": 0.0083, "step": 2549 }, { "epoch": 32.587859424920126, "grad_norm": 3.507944107055664, "learning_rate": 6.923076923076923e-06, "loss": 0.0276, "step": 2550 }, { "epoch": 32.60063897763578, "grad_norm": 0.6120370030403137, "learning_rate": 6.917948717948718e-06, "loss": 0.0054, "step": 2551 }, { "epoch": 32.61341853035144, "grad_norm": 1.4462029933929443, "learning_rate": 6.912820512820514e-06, "loss": 0.0112, "step": 2552 }, { "epoch": 32.626198083067095, "grad_norm": 9.039031028747559, "learning_rate": 6.907692307692309e-06, "loss": 0.0083, "step": 2553 }, { "epoch": 32.638977635782744, "grad_norm": 1.2227258682250977, "learning_rate": 6.902564102564103e-06, "loss": 0.0088, "step": 2554 }, { "epoch": 32.6517571884984, "grad_norm": 2.9861364364624023, "learning_rate": 6.897435897435898e-06, "loss": 0.0166, "step": 2555 }, { "epoch": 32.66453674121406, "grad_norm": 0.9244511127471924, "learning_rate": 6.892307692307693e-06, "loss": 0.0069, "step": 2556 }, { "epoch": 32.677316293929714, "grad_norm": 2.979329824447632, "learning_rate": 6.887179487179488e-06, "loss": 0.012, "step": 2557 }, { "epoch": 32.69009584664537, "grad_norm": 1.4540989398956299, "learning_rate": 6.882051282051283e-06, "loss": 0.0108, "step": 2558 }, { "epoch": 32.70287539936102, "grad_norm": 2.675171375274658, "learning_rate": 6.8769230769230775e-06, "loss": 0.0207, "step": 2559 }, { "epoch": 32.715654952076676, "grad_norm": 1.284254550933838, "learning_rate": 6.871794871794872e-06, "loss": 0.0063, "step": 2560 }, { "epoch": 32.72843450479233, "grad_norm": 2.4144058227539062, "learning_rate": 6.866666666666667e-06, "loss": 0.0146, "step": 2561 }, { "epoch": 32.74121405750799, "grad_norm": 2.105947971343994, "learning_rate": 6.861538461538461e-06, "loss": 0.0167, "step": 2562 }, { "epoch": 32.753993610223645, "grad_norm": 0.9299395084381104, "learning_rate": 6.856410256410257e-06, "loss": 0.0088, "step": 2563 }, { "epoch": 32.766773162939295, "grad_norm": 1.4186254739761353, "learning_rate": 6.851282051282052e-06, "loss": 0.0081, "step": 2564 }, { "epoch": 32.77955271565495, "grad_norm": 2.574317216873169, "learning_rate": 6.846153846153847e-06, "loss": 0.0192, "step": 2565 }, { "epoch": 32.79233226837061, "grad_norm": 0.4454169273376465, "learning_rate": 6.841025641025642e-06, "loss": 0.0039, "step": 2566 }, { "epoch": 32.805111821086264, "grad_norm": 3.510132312774658, "learning_rate": 6.835897435897436e-06, "loss": 0.0174, "step": 2567 }, { "epoch": 32.81789137380191, "grad_norm": 2.4825921058654785, "learning_rate": 6.830769230769231e-06, "loss": 0.0243, "step": 2568 }, { "epoch": 32.83067092651757, "grad_norm": 0.8653742671012878, "learning_rate": 6.8256410256410265e-06, "loss": 0.0069, "step": 2569 }, { "epoch": 32.843450479233226, "grad_norm": 2.524458408355713, "learning_rate": 6.820512820512821e-06, "loss": 0.0157, "step": 2570 }, { "epoch": 32.85623003194888, "grad_norm": 1.3834733963012695, "learning_rate": 6.815384615384616e-06, "loss": 0.0059, "step": 2571 }, { "epoch": 32.86900958466454, "grad_norm": 0.8807143568992615, "learning_rate": 6.81025641025641e-06, "loss": 0.0076, "step": 2572 }, { "epoch": 32.88178913738019, "grad_norm": 4.4518561363220215, "learning_rate": 6.805128205128205e-06, "loss": 0.0194, "step": 2573 }, { "epoch": 32.894568690095845, "grad_norm": 0.6095708608627319, "learning_rate": 6.800000000000001e-06, "loss": 0.0064, "step": 2574 }, { "epoch": 32.9073482428115, "grad_norm": 4.341555595397949, "learning_rate": 6.794871794871796e-06, "loss": 0.0369, "step": 2575 }, { "epoch": 32.92012779552716, "grad_norm": 3.9712815284729004, "learning_rate": 6.789743589743591e-06, "loss": 0.0314, "step": 2576 }, { "epoch": 32.932907348242814, "grad_norm": 0.6758043766021729, "learning_rate": 6.784615384615385e-06, "loss": 0.0063, "step": 2577 }, { "epoch": 32.945686900958464, "grad_norm": 3.790515422821045, "learning_rate": 6.77948717948718e-06, "loss": 0.0175, "step": 2578 }, { "epoch": 32.95846645367412, "grad_norm": 1.8134866952896118, "learning_rate": 6.774358974358975e-06, "loss": 0.0131, "step": 2579 }, { "epoch": 32.97124600638978, "grad_norm": 0.7580832839012146, "learning_rate": 6.76923076923077e-06, "loss": 0.0068, "step": 2580 }, { "epoch": 32.98402555910543, "grad_norm": 1.2273781299591064, "learning_rate": 6.764102564102564e-06, "loss": 0.0068, "step": 2581 }, { "epoch": 32.99680511182109, "grad_norm": 3.0153393745422363, "learning_rate": 6.758974358974359e-06, "loss": 0.0205, "step": 2582 }, { "epoch": 33.00958466453674, "grad_norm": 0.7186205983161926, "learning_rate": 6.753846153846154e-06, "loss": 0.0071, "step": 2583 }, { "epoch": 33.022364217252395, "grad_norm": 0.7623568773269653, "learning_rate": 6.748717948717949e-06, "loss": 0.0067, "step": 2584 }, { "epoch": 33.03514376996805, "grad_norm": 1.622061848640442, "learning_rate": 6.743589743589745e-06, "loss": 0.0097, "step": 2585 }, { "epoch": 33.04792332268371, "grad_norm": 0.7285746335983276, "learning_rate": 6.738461538461539e-06, "loss": 0.0053, "step": 2586 }, { "epoch": 33.06070287539936, "grad_norm": 0.7648468613624573, "learning_rate": 6.733333333333334e-06, "loss": 0.0052, "step": 2587 }, { "epoch": 33.073482428115014, "grad_norm": 2.6559994220733643, "learning_rate": 6.728205128205129e-06, "loss": 0.0138, "step": 2588 }, { "epoch": 33.08626198083067, "grad_norm": 2.8083527088165283, "learning_rate": 6.723076923076924e-06, "loss": 0.0149, "step": 2589 }, { "epoch": 33.09904153354633, "grad_norm": 0.924113392829895, "learning_rate": 6.717948717948718e-06, "loss": 0.0086, "step": 2590 }, { "epoch": 33.11182108626198, "grad_norm": 0.6254075765609741, "learning_rate": 6.712820512820513e-06, "loss": 0.0035, "step": 2591 }, { "epoch": 33.12460063897763, "grad_norm": 0.8559412956237793, "learning_rate": 6.707692307692308e-06, "loss": 0.0067, "step": 2592 }, { "epoch": 33.13738019169329, "grad_norm": 0.5838778614997864, "learning_rate": 6.702564102564103e-06, "loss": 0.0041, "step": 2593 }, { "epoch": 33.150159744408946, "grad_norm": 0.6391317844390869, "learning_rate": 6.697435897435898e-06, "loss": 0.0057, "step": 2594 }, { "epoch": 33.1629392971246, "grad_norm": 2.1996893882751465, "learning_rate": 6.692307692307692e-06, "loss": 0.0124, "step": 2595 }, { "epoch": 33.17571884984026, "grad_norm": 3.635840892791748, "learning_rate": 6.687179487179488e-06, "loss": 0.0202, "step": 2596 }, { "epoch": 33.18849840255591, "grad_norm": 1.2818849086761475, "learning_rate": 6.682051282051283e-06, "loss": 0.0076, "step": 2597 }, { "epoch": 33.201277955271564, "grad_norm": 0.7113415002822876, "learning_rate": 6.676923076923078e-06, "loss": 0.0054, "step": 2598 }, { "epoch": 33.21405750798722, "grad_norm": 0.8988485932350159, "learning_rate": 6.671794871794873e-06, "loss": 0.0053, "step": 2599 }, { "epoch": 33.22683706070288, "grad_norm": 1.191688895225525, "learning_rate": 6.666666666666667e-06, "loss": 0.009, "step": 2600 }, { "epoch": 33.22683706070288, "eval_loss": 0.9255790710449219, "eval_runtime": 183.4128, "eval_samples_per_second": 0.856, "eval_steps_per_second": 0.109, "step": 2600 }, { "epoch": 33.239616613418534, "grad_norm": 0.3343746066093445, "learning_rate": 6.661538461538462e-06, "loss": 0.0036, "step": 2601 }, { "epoch": 33.25239616613418, "grad_norm": 0.6919339895248413, "learning_rate": 6.656410256410257e-06, "loss": 0.0059, "step": 2602 }, { "epoch": 33.26517571884984, "grad_norm": 0.7552350163459778, "learning_rate": 6.651282051282052e-06, "loss": 0.0079, "step": 2603 }, { "epoch": 33.277955271565496, "grad_norm": 1.3328794240951538, "learning_rate": 6.646153846153846e-06, "loss": 0.0091, "step": 2604 }, { "epoch": 33.29073482428115, "grad_norm": 0.6286408305168152, "learning_rate": 6.641025641025641e-06, "loss": 0.0056, "step": 2605 }, { "epoch": 33.3035143769968, "grad_norm": 0.548167884349823, "learning_rate": 6.635897435897436e-06, "loss": 0.0062, "step": 2606 }, { "epoch": 33.31629392971246, "grad_norm": 1.6738111972808838, "learning_rate": 6.630769230769232e-06, "loss": 0.0102, "step": 2607 }, { "epoch": 33.329073482428115, "grad_norm": 0.5843906402587891, "learning_rate": 6.625641025641027e-06, "loss": 0.0035, "step": 2608 }, { "epoch": 33.34185303514377, "grad_norm": 0.33983996510505676, "learning_rate": 6.620512820512821e-06, "loss": 0.0036, "step": 2609 }, { "epoch": 33.35463258785943, "grad_norm": 0.44117605686187744, "learning_rate": 6.615384615384616e-06, "loss": 0.0048, "step": 2610 }, { "epoch": 33.36741214057508, "grad_norm": 0.572445809841156, "learning_rate": 6.6102564102564105e-06, "loss": 0.0039, "step": 2611 }, { "epoch": 33.38019169329073, "grad_norm": 1.3580918312072754, "learning_rate": 6.605128205128206e-06, "loss": 0.0081, "step": 2612 }, { "epoch": 33.39297124600639, "grad_norm": 2.4212586879730225, "learning_rate": 6.600000000000001e-06, "loss": 0.0238, "step": 2613 }, { "epoch": 33.405750798722046, "grad_norm": 3.360891342163086, "learning_rate": 6.594871794871795e-06, "loss": 0.0278, "step": 2614 }, { "epoch": 33.4185303514377, "grad_norm": 2.3740994930267334, "learning_rate": 6.58974358974359e-06, "loss": 0.0125, "step": 2615 }, { "epoch": 33.43130990415335, "grad_norm": 3.995941638946533, "learning_rate": 6.584615384615385e-06, "loss": 0.009, "step": 2616 }, { "epoch": 33.44408945686901, "grad_norm": 0.8252402544021606, "learning_rate": 6.57948717948718e-06, "loss": 0.0059, "step": 2617 }, { "epoch": 33.456869009584665, "grad_norm": 3.215841054916382, "learning_rate": 6.574358974358976e-06, "loss": 0.0196, "step": 2618 }, { "epoch": 33.46964856230032, "grad_norm": 2.137525796890259, "learning_rate": 6.56923076923077e-06, "loss": 0.0167, "step": 2619 }, { "epoch": 33.48242811501598, "grad_norm": 0.39872461557388306, "learning_rate": 6.564102564102565e-06, "loss": 0.0046, "step": 2620 }, { "epoch": 33.49520766773163, "grad_norm": 0.7364473342895508, "learning_rate": 6.5589743589743595e-06, "loss": 0.0087, "step": 2621 }, { "epoch": 33.50798722044728, "grad_norm": 0.5490440130233765, "learning_rate": 6.553846153846154e-06, "loss": 0.0062, "step": 2622 }, { "epoch": 33.52076677316294, "grad_norm": 0.509581446647644, "learning_rate": 6.548717948717949e-06, "loss": 0.0059, "step": 2623 }, { "epoch": 33.533546325878596, "grad_norm": 0.47174230217933655, "learning_rate": 6.543589743589744e-06, "loss": 0.0038, "step": 2624 }, { "epoch": 33.546325878594246, "grad_norm": 0.7682559490203857, "learning_rate": 6.538461538461539e-06, "loss": 0.0063, "step": 2625 }, { "epoch": 33.5591054313099, "grad_norm": 1.427538514137268, "learning_rate": 6.533333333333334e-06, "loss": 0.0118, "step": 2626 }, { "epoch": 33.57188498402556, "grad_norm": 0.7195096611976624, "learning_rate": 6.528205128205128e-06, "loss": 0.0063, "step": 2627 }, { "epoch": 33.584664536741215, "grad_norm": 0.47817856073379517, "learning_rate": 6.523076923076923e-06, "loss": 0.0053, "step": 2628 }, { "epoch": 33.59744408945687, "grad_norm": 0.9087768197059631, "learning_rate": 6.517948717948719e-06, "loss": 0.0086, "step": 2629 }, { "epoch": 33.61022364217252, "grad_norm": 0.6149556040763855, "learning_rate": 6.512820512820514e-06, "loss": 0.0069, "step": 2630 }, { "epoch": 33.62300319488818, "grad_norm": 1.6814866065979004, "learning_rate": 6.5076923076923085e-06, "loss": 0.0165, "step": 2631 }, { "epoch": 33.635782747603834, "grad_norm": 2.084594249725342, "learning_rate": 6.5025641025641026e-06, "loss": 0.0138, "step": 2632 }, { "epoch": 33.64856230031949, "grad_norm": 0.8016097545623779, "learning_rate": 6.4974358974358975e-06, "loss": 0.0073, "step": 2633 }, { "epoch": 33.66134185303515, "grad_norm": 0.9108177423477173, "learning_rate": 6.492307692307693e-06, "loss": 0.0098, "step": 2634 }, { "epoch": 33.674121405750796, "grad_norm": 0.7251105904579163, "learning_rate": 6.487179487179488e-06, "loss": 0.0069, "step": 2635 }, { "epoch": 33.68690095846645, "grad_norm": 2.2520105838775635, "learning_rate": 6.482051282051283e-06, "loss": 0.0141, "step": 2636 }, { "epoch": 33.69968051118211, "grad_norm": 5.15416955947876, "learning_rate": 6.476923076923077e-06, "loss": 0.0279, "step": 2637 }, { "epoch": 33.712460063897765, "grad_norm": 2.7999393939971924, "learning_rate": 6.471794871794872e-06, "loss": 0.0195, "step": 2638 }, { "epoch": 33.72523961661342, "grad_norm": 1.4104244709014893, "learning_rate": 6.466666666666667e-06, "loss": 0.01, "step": 2639 }, { "epoch": 33.73801916932907, "grad_norm": 3.1084578037261963, "learning_rate": 6.461538461538463e-06, "loss": 0.007, "step": 2640 }, { "epoch": 33.75079872204473, "grad_norm": 1.16458261013031, "learning_rate": 6.4564102564102575e-06, "loss": 0.0102, "step": 2641 }, { "epoch": 33.763578274760384, "grad_norm": 2.8983755111694336, "learning_rate": 6.4512820512820516e-06, "loss": 0.015, "step": 2642 }, { "epoch": 33.77635782747604, "grad_norm": 0.38780269026756287, "learning_rate": 6.4461538461538465e-06, "loss": 0.0037, "step": 2643 }, { "epoch": 33.78913738019169, "grad_norm": 4.92404317855835, "learning_rate": 6.441025641025641e-06, "loss": 0.0244, "step": 2644 }, { "epoch": 33.801916932907346, "grad_norm": 3.1654813289642334, "learning_rate": 6.435897435897437e-06, "loss": 0.017, "step": 2645 }, { "epoch": 33.814696485623, "grad_norm": 0.5431066155433655, "learning_rate": 6.430769230769231e-06, "loss": 0.0051, "step": 2646 }, { "epoch": 33.82747603833866, "grad_norm": 0.9253678917884827, "learning_rate": 6.425641025641026e-06, "loss": 0.0075, "step": 2647 }, { "epoch": 33.840255591054316, "grad_norm": 1.2948482036590576, "learning_rate": 6.420512820512821e-06, "loss": 0.0074, "step": 2648 }, { "epoch": 33.853035143769965, "grad_norm": 1.8760825395584106, "learning_rate": 6.415384615384616e-06, "loss": 0.01, "step": 2649 }, { "epoch": 33.86581469648562, "grad_norm": 0.7007573246955872, "learning_rate": 6.410256410256412e-06, "loss": 0.0053, "step": 2650 }, { "epoch": 33.87859424920128, "grad_norm": 0.7296006083488464, "learning_rate": 6.405128205128206e-06, "loss": 0.005, "step": 2651 }, { "epoch": 33.891373801916934, "grad_norm": 0.9634267091751099, "learning_rate": 6.4000000000000006e-06, "loss": 0.0076, "step": 2652 }, { "epoch": 33.90415335463259, "grad_norm": 2.8756632804870605, "learning_rate": 6.3948717948717955e-06, "loss": 0.0147, "step": 2653 }, { "epoch": 33.91693290734824, "grad_norm": 1.979262351989746, "learning_rate": 6.38974358974359e-06, "loss": 0.0205, "step": 2654 }, { "epoch": 33.9297124600639, "grad_norm": 1.4359487295150757, "learning_rate": 6.384615384615384e-06, "loss": 0.0085, "step": 2655 }, { "epoch": 33.94249201277955, "grad_norm": 0.5436068773269653, "learning_rate": 6.37948717948718e-06, "loss": 0.004, "step": 2656 }, { "epoch": 33.95527156549521, "grad_norm": 0.5183521509170532, "learning_rate": 6.374358974358975e-06, "loss": 0.0042, "step": 2657 }, { "epoch": 33.968051118210866, "grad_norm": 0.49053263664245605, "learning_rate": 6.36923076923077e-06, "loss": 0.0046, "step": 2658 }, { "epoch": 33.980830670926515, "grad_norm": 4.7706499099731445, "learning_rate": 6.364102564102565e-06, "loss": 0.0579, "step": 2659 }, { "epoch": 33.99361022364217, "grad_norm": 0.7718542814254761, "learning_rate": 6.358974358974359e-06, "loss": 0.0053, "step": 2660 }, { "epoch": 34.00638977635783, "grad_norm": 0.5025060176849365, "learning_rate": 6.353846153846155e-06, "loss": 0.0053, "step": 2661 }, { "epoch": 34.019169329073485, "grad_norm": 2.5457983016967773, "learning_rate": 6.3487179487179495e-06, "loss": 0.0118, "step": 2662 }, { "epoch": 34.031948881789134, "grad_norm": 0.7873379588127136, "learning_rate": 6.3435897435897444e-06, "loss": 0.0051, "step": 2663 }, { "epoch": 34.04472843450479, "grad_norm": 0.3507178723812103, "learning_rate": 6.3384615384615385e-06, "loss": 0.0038, "step": 2664 }, { "epoch": 34.05750798722045, "grad_norm": 1.270276665687561, "learning_rate": 6.333333333333333e-06, "loss": 0.0061, "step": 2665 }, { "epoch": 34.0702875399361, "grad_norm": 1.035638689994812, "learning_rate": 6.328205128205128e-06, "loss": 0.0065, "step": 2666 }, { "epoch": 34.08306709265176, "grad_norm": 0.6638619303703308, "learning_rate": 6.323076923076924e-06, "loss": 0.0061, "step": 2667 }, { "epoch": 34.09584664536741, "grad_norm": 1.7834850549697876, "learning_rate": 6.317948717948719e-06, "loss": 0.0077, "step": 2668 }, { "epoch": 34.108626198083066, "grad_norm": 0.4163866639137268, "learning_rate": 6.312820512820513e-06, "loss": 0.0042, "step": 2669 }, { "epoch": 34.12140575079872, "grad_norm": 3.6426656246185303, "learning_rate": 6.307692307692308e-06, "loss": 0.0245, "step": 2670 }, { "epoch": 34.13418530351438, "grad_norm": 0.363334983587265, "learning_rate": 6.302564102564103e-06, "loss": 0.0042, "step": 2671 }, { "epoch": 34.146964856230035, "grad_norm": 0.37875476479530334, "learning_rate": 6.2974358974358985e-06, "loss": 0.004, "step": 2672 }, { "epoch": 34.159744408945684, "grad_norm": 3.785109758377075, "learning_rate": 6.2923076923076934e-06, "loss": 0.0077, "step": 2673 }, { "epoch": 34.17252396166134, "grad_norm": 3.752488374710083, "learning_rate": 6.2871794871794875e-06, "loss": 0.0091, "step": 2674 }, { "epoch": 34.185303514377, "grad_norm": 0.7148936986923218, "learning_rate": 6.282051282051282e-06, "loss": 0.0071, "step": 2675 }, { "epoch": 34.198083067092654, "grad_norm": 2.916012763977051, "learning_rate": 6.276923076923077e-06, "loss": 0.0149, "step": 2676 }, { "epoch": 34.21086261980831, "grad_norm": 2.615849494934082, "learning_rate": 6.271794871794872e-06, "loss": 0.0114, "step": 2677 }, { "epoch": 34.22364217252396, "grad_norm": 0.3755146563053131, "learning_rate": 6.266666666666668e-06, "loss": 0.0045, "step": 2678 }, { "epoch": 34.236421725239616, "grad_norm": 1.1889039278030396, "learning_rate": 6.261538461538462e-06, "loss": 0.0083, "step": 2679 }, { "epoch": 34.24920127795527, "grad_norm": 0.518305242061615, "learning_rate": 6.256410256410257e-06, "loss": 0.0037, "step": 2680 }, { "epoch": 34.26198083067093, "grad_norm": 1.432346224784851, "learning_rate": 6.251282051282052e-06, "loss": 0.0112, "step": 2681 }, { "epoch": 34.27476038338658, "grad_norm": 0.7005894780158997, "learning_rate": 6.246153846153846e-06, "loss": 0.0056, "step": 2682 }, { "epoch": 34.287539936102235, "grad_norm": 0.5781270861625671, "learning_rate": 6.2410256410256424e-06, "loss": 0.0042, "step": 2683 }, { "epoch": 34.30031948881789, "grad_norm": 0.3330841660499573, "learning_rate": 6.2358974358974365e-06, "loss": 0.0028, "step": 2684 }, { "epoch": 34.31309904153355, "grad_norm": 4.505059242248535, "learning_rate": 6.230769230769231e-06, "loss": 0.0157, "step": 2685 }, { "epoch": 34.325878594249204, "grad_norm": 2.908541440963745, "learning_rate": 6.225641025641026e-06, "loss": 0.0185, "step": 2686 }, { "epoch": 34.33865814696485, "grad_norm": 1.3968557119369507, "learning_rate": 6.22051282051282e-06, "loss": 0.0059, "step": 2687 }, { "epoch": 34.35143769968051, "grad_norm": 0.5780673623085022, "learning_rate": 6.215384615384615e-06, "loss": 0.0059, "step": 2688 }, { "epoch": 34.364217252396166, "grad_norm": 0.33554375171661377, "learning_rate": 6.210256410256411e-06, "loss": 0.0038, "step": 2689 }, { "epoch": 34.37699680511182, "grad_norm": 0.6034390926361084, "learning_rate": 6.205128205128206e-06, "loss": 0.0052, "step": 2690 }, { "epoch": 34.38977635782748, "grad_norm": 0.8513287305831909, "learning_rate": 6.200000000000001e-06, "loss": 0.0052, "step": 2691 }, { "epoch": 34.40255591054313, "grad_norm": 0.47783803939819336, "learning_rate": 6.194871794871795e-06, "loss": 0.004, "step": 2692 }, { "epoch": 34.415335463258785, "grad_norm": 0.5561738014221191, "learning_rate": 6.18974358974359e-06, "loss": 0.0057, "step": 2693 }, { "epoch": 34.42811501597444, "grad_norm": 0.8518915772438049, "learning_rate": 6.1846153846153855e-06, "loss": 0.0081, "step": 2694 }, { "epoch": 34.4408945686901, "grad_norm": 30.251968383789062, "learning_rate": 6.17948717948718e-06, "loss": 0.0125, "step": 2695 }, { "epoch": 34.453674121405754, "grad_norm": 0.7382756471633911, "learning_rate": 6.174358974358975e-06, "loss": 0.0057, "step": 2696 }, { "epoch": 34.466453674121404, "grad_norm": 0.9222352504730225, "learning_rate": 6.169230769230769e-06, "loss": 0.0057, "step": 2697 }, { "epoch": 34.47923322683706, "grad_norm": 1.679905891418457, "learning_rate": 6.164102564102564e-06, "loss": 0.007, "step": 2698 }, { "epoch": 34.49201277955272, "grad_norm": 0.7995689511299133, "learning_rate": 6.15897435897436e-06, "loss": 0.008, "step": 2699 }, { "epoch": 34.50479233226837, "grad_norm": 0.8102620840072632, "learning_rate": 6.153846153846155e-06, "loss": 0.0055, "step": 2700 }, { "epoch": 34.50479233226837, "eval_loss": 0.9436935186386108, "eval_runtime": 183.897, "eval_samples_per_second": 0.854, "eval_steps_per_second": 0.109, "step": 2700 }, { "epoch": 34.51757188498402, "grad_norm": 0.44937846064567566, "learning_rate": 6.14871794871795e-06, "loss": 0.0038, "step": 2701 }, { "epoch": 34.53035143769968, "grad_norm": 0.4699499309062958, "learning_rate": 6.143589743589744e-06, "loss": 0.0055, "step": 2702 }, { "epoch": 34.543130990415335, "grad_norm": 1.1993327140808105, "learning_rate": 6.138461538461539e-06, "loss": 0.0065, "step": 2703 }, { "epoch": 34.55591054313099, "grad_norm": 0.4858746826648712, "learning_rate": 6.133333333333334e-06, "loss": 0.0041, "step": 2704 }, { "epoch": 34.56869009584665, "grad_norm": 0.6977940797805786, "learning_rate": 6.128205128205129e-06, "loss": 0.0051, "step": 2705 }, { "epoch": 34.5814696485623, "grad_norm": 0.36483317613601685, "learning_rate": 6.123076923076923e-06, "loss": 0.0032, "step": 2706 }, { "epoch": 34.594249201277954, "grad_norm": 0.4539944529533386, "learning_rate": 6.117948717948718e-06, "loss": 0.0049, "step": 2707 }, { "epoch": 34.60702875399361, "grad_norm": 1.643522024154663, "learning_rate": 6.112820512820513e-06, "loss": 0.0091, "step": 2708 }, { "epoch": 34.61980830670927, "grad_norm": 0.47860094904899597, "learning_rate": 6.107692307692308e-06, "loss": 0.0042, "step": 2709 }, { "epoch": 34.63258785942492, "grad_norm": 1.3490616083145142, "learning_rate": 6.102564102564104e-06, "loss": 0.0091, "step": 2710 }, { "epoch": 34.64536741214057, "grad_norm": 3.0723254680633545, "learning_rate": 6.097435897435898e-06, "loss": 0.0131, "step": 2711 }, { "epoch": 34.65814696485623, "grad_norm": 0.4609510600566864, "learning_rate": 6.092307692307693e-06, "loss": 0.0038, "step": 2712 }, { "epoch": 34.670926517571885, "grad_norm": 0.3855610489845276, "learning_rate": 6.087179487179488e-06, "loss": 0.0038, "step": 2713 }, { "epoch": 34.68370607028754, "grad_norm": 3.6798195838928223, "learning_rate": 6.082051282051283e-06, "loss": 0.0134, "step": 2714 }, { "epoch": 34.6964856230032, "grad_norm": 0.545027494430542, "learning_rate": 6.076923076923077e-06, "loss": 0.0054, "step": 2715 }, { "epoch": 34.70926517571885, "grad_norm": 1.6062592267990112, "learning_rate": 6.071794871794872e-06, "loss": 0.0174, "step": 2716 }, { "epoch": 34.722044728434504, "grad_norm": 0.5605974197387695, "learning_rate": 6.066666666666667e-06, "loss": 0.005, "step": 2717 }, { "epoch": 34.73482428115016, "grad_norm": 0.6980181932449341, "learning_rate": 6.061538461538462e-06, "loss": 0.0059, "step": 2718 }, { "epoch": 34.74760383386582, "grad_norm": 0.5648006796836853, "learning_rate": 6.056410256410257e-06, "loss": 0.0054, "step": 2719 }, { "epoch": 34.760383386581466, "grad_norm": 0.6961598992347717, "learning_rate": 6.051282051282051e-06, "loss": 0.0052, "step": 2720 }, { "epoch": 34.77316293929712, "grad_norm": 0.5445839166641235, "learning_rate": 6.046153846153847e-06, "loss": 0.0053, "step": 2721 }, { "epoch": 34.78594249201278, "grad_norm": 0.4801168143749237, "learning_rate": 6.041025641025642e-06, "loss": 0.0054, "step": 2722 }, { "epoch": 34.798722044728436, "grad_norm": 0.3502941429615021, "learning_rate": 6.035897435897437e-06, "loss": 0.0031, "step": 2723 }, { "epoch": 34.81150159744409, "grad_norm": 2.878469228744507, "learning_rate": 6.030769230769231e-06, "loss": 0.0258, "step": 2724 }, { "epoch": 34.82428115015974, "grad_norm": 1.5865216255187988, "learning_rate": 6.025641025641026e-06, "loss": 0.0127, "step": 2725 }, { "epoch": 34.8370607028754, "grad_norm": 1.8358023166656494, "learning_rate": 6.0205128205128206e-06, "loss": 0.0146, "step": 2726 }, { "epoch": 34.849840255591054, "grad_norm": 2.642219305038452, "learning_rate": 6.015384615384616e-06, "loss": 0.0126, "step": 2727 }, { "epoch": 34.86261980830671, "grad_norm": 2.9801673889160156, "learning_rate": 6.010256410256411e-06, "loss": 0.0151, "step": 2728 }, { "epoch": 34.87539936102237, "grad_norm": 2.9196527004241943, "learning_rate": 6.005128205128205e-06, "loss": 0.012, "step": 2729 }, { "epoch": 34.88817891373802, "grad_norm": 1.0630669593811035, "learning_rate": 6e-06, "loss": 0.0089, "step": 2730 }, { "epoch": 34.90095846645367, "grad_norm": 0.7134859561920166, "learning_rate": 5.994871794871795e-06, "loss": 0.0044, "step": 2731 }, { "epoch": 34.91373801916933, "grad_norm": 3.1851651668548584, "learning_rate": 5.989743589743591e-06, "loss": 0.0175, "step": 2732 }, { "epoch": 34.926517571884986, "grad_norm": 0.4819214642047882, "learning_rate": 5.984615384615386e-06, "loss": 0.0046, "step": 2733 }, { "epoch": 34.93929712460064, "grad_norm": 1.816530466079712, "learning_rate": 5.97948717948718e-06, "loss": 0.0157, "step": 2734 }, { "epoch": 34.95207667731629, "grad_norm": 0.5850124955177307, "learning_rate": 5.974358974358975e-06, "loss": 0.0073, "step": 2735 }, { "epoch": 34.96485623003195, "grad_norm": 8.858750343322754, "learning_rate": 5.9692307692307695e-06, "loss": 0.0335, "step": 2736 }, { "epoch": 34.977635782747605, "grad_norm": 2.54228138923645, "learning_rate": 5.9641025641025644e-06, "loss": 0.0212, "step": 2737 }, { "epoch": 34.99041533546326, "grad_norm": 0.37504932284355164, "learning_rate": 5.95897435897436e-06, "loss": 0.0038, "step": 2738 }, { "epoch": 35.00319488817891, "grad_norm": 0.5887596011161804, "learning_rate": 5.953846153846154e-06, "loss": 0.0049, "step": 2739 }, { "epoch": 35.01597444089457, "grad_norm": 0.47470083832740784, "learning_rate": 5.948717948717949e-06, "loss": 0.0035, "step": 2740 }, { "epoch": 35.02875399361022, "grad_norm": 1.7007302045822144, "learning_rate": 5.943589743589744e-06, "loss": 0.0162, "step": 2741 }, { "epoch": 35.04153354632588, "grad_norm": 0.7021458745002747, "learning_rate": 5.938461538461538e-06, "loss": 0.0129, "step": 2742 }, { "epoch": 35.054313099041536, "grad_norm": 3.1543846130371094, "learning_rate": 5.933333333333335e-06, "loss": 0.0175, "step": 2743 }, { "epoch": 35.067092651757186, "grad_norm": 10.487667083740234, "learning_rate": 5.928205128205129e-06, "loss": 0.0227, "step": 2744 }, { "epoch": 35.07987220447284, "grad_norm": 0.6180176734924316, "learning_rate": 5.923076923076924e-06, "loss": 0.0052, "step": 2745 }, { "epoch": 35.0926517571885, "grad_norm": 0.45216065645217896, "learning_rate": 5.9179487179487185e-06, "loss": 0.0049, "step": 2746 }, { "epoch": 35.105431309904155, "grad_norm": 0.40809720754623413, "learning_rate": 5.912820512820513e-06, "loss": 0.0043, "step": 2747 }, { "epoch": 35.11821086261981, "grad_norm": 1.078952670097351, "learning_rate": 5.907692307692308e-06, "loss": 0.0061, "step": 2748 }, { "epoch": 35.13099041533546, "grad_norm": 0.625017523765564, "learning_rate": 5.902564102564103e-06, "loss": 0.0045, "step": 2749 }, { "epoch": 35.14376996805112, "grad_norm": 0.6739218235015869, "learning_rate": 5.897435897435898e-06, "loss": 0.0057, "step": 2750 }, { "epoch": 35.156549520766774, "grad_norm": 0.4799802899360657, "learning_rate": 5.892307692307693e-06, "loss": 0.0027, "step": 2751 }, { "epoch": 35.16932907348243, "grad_norm": 3.5189943313598633, "learning_rate": 5.887179487179487e-06, "loss": 0.0168, "step": 2752 }, { "epoch": 35.18210862619808, "grad_norm": 3.1091294288635254, "learning_rate": 5.882051282051282e-06, "loss": 0.0276, "step": 2753 }, { "epoch": 35.194888178913736, "grad_norm": 3.039184331893921, "learning_rate": 5.876923076923078e-06, "loss": 0.0174, "step": 2754 }, { "epoch": 35.20766773162939, "grad_norm": 0.737232506275177, "learning_rate": 5.871794871794873e-06, "loss": 0.0067, "step": 2755 }, { "epoch": 35.22044728434505, "grad_norm": 1.1661545038223267, "learning_rate": 5.8666666666666675e-06, "loss": 0.005, "step": 2756 }, { "epoch": 35.233226837060705, "grad_norm": 2.1293647289276123, "learning_rate": 5.861538461538462e-06, "loss": 0.0107, "step": 2757 }, { "epoch": 35.246006389776355, "grad_norm": 0.275880366563797, "learning_rate": 5.8564102564102565e-06, "loss": 0.0026, "step": 2758 }, { "epoch": 35.25878594249201, "grad_norm": 0.5937846899032593, "learning_rate": 5.851282051282052e-06, "loss": 0.0045, "step": 2759 }, { "epoch": 35.27156549520767, "grad_norm": 0.37913593649864197, "learning_rate": 5.846153846153847e-06, "loss": 0.0039, "step": 2760 }, { "epoch": 35.284345047923324, "grad_norm": 0.5930600166320801, "learning_rate": 5.841025641025642e-06, "loss": 0.0052, "step": 2761 }, { "epoch": 35.29712460063898, "grad_norm": 0.6751174330711365, "learning_rate": 5.835897435897436e-06, "loss": 0.0048, "step": 2762 }, { "epoch": 35.30990415335463, "grad_norm": 0.31900155544281006, "learning_rate": 5.830769230769231e-06, "loss": 0.003, "step": 2763 }, { "epoch": 35.322683706070286, "grad_norm": 0.5694495439529419, "learning_rate": 5.825641025641026e-06, "loss": 0.0046, "step": 2764 }, { "epoch": 35.33546325878594, "grad_norm": 1.1516863107681274, "learning_rate": 5.820512820512822e-06, "loss": 0.0046, "step": 2765 }, { "epoch": 35.3482428115016, "grad_norm": 2.9431941509246826, "learning_rate": 5.815384615384616e-06, "loss": 0.0214, "step": 2766 }, { "epoch": 35.361022364217256, "grad_norm": 2.029663562774658, "learning_rate": 5.8102564102564106e-06, "loss": 0.0157, "step": 2767 }, { "epoch": 35.373801916932905, "grad_norm": 0.2581423223018646, "learning_rate": 5.8051282051282055e-06, "loss": 0.0022, "step": 2768 }, { "epoch": 35.38658146964856, "grad_norm": 0.3771861791610718, "learning_rate": 5.8e-06, "loss": 0.003, "step": 2769 }, { "epoch": 35.39936102236422, "grad_norm": 0.69022536277771, "learning_rate": 5.794871794871796e-06, "loss": 0.0048, "step": 2770 }, { "epoch": 35.412140575079874, "grad_norm": 2.675891637802124, "learning_rate": 5.78974358974359e-06, "loss": 0.0229, "step": 2771 }, { "epoch": 35.424920127795524, "grad_norm": 0.2880118489265442, "learning_rate": 5.784615384615385e-06, "loss": 0.003, "step": 2772 }, { "epoch": 35.43769968051118, "grad_norm": 0.38748475909233093, "learning_rate": 5.77948717948718e-06, "loss": 0.0043, "step": 2773 }, { "epoch": 35.45047923322684, "grad_norm": 2.8036811351776123, "learning_rate": 5.774358974358975e-06, "loss": 0.0183, "step": 2774 }, { "epoch": 35.46325878594249, "grad_norm": 1.2062289714813232, "learning_rate": 5.769230769230769e-06, "loss": 0.0097, "step": 2775 }, { "epoch": 35.47603833865815, "grad_norm": 1.0259376764297485, "learning_rate": 5.764102564102565e-06, "loss": 0.0075, "step": 2776 }, { "epoch": 35.4888178913738, "grad_norm": 0.43436387181282043, "learning_rate": 5.7589743589743596e-06, "loss": 0.005, "step": 2777 }, { "epoch": 35.501597444089455, "grad_norm": 0.4108300507068634, "learning_rate": 5.7538461538461545e-06, "loss": 0.0039, "step": 2778 }, { "epoch": 35.51437699680511, "grad_norm": 0.5943469405174255, "learning_rate": 5.748717948717949e-06, "loss": 0.0051, "step": 2779 }, { "epoch": 35.52715654952077, "grad_norm": 0.397146075963974, "learning_rate": 5.743589743589743e-06, "loss": 0.0039, "step": 2780 }, { "epoch": 35.539936102236425, "grad_norm": 2.4300386905670166, "learning_rate": 5.738461538461539e-06, "loss": 0.0132, "step": 2781 }, { "epoch": 35.552715654952074, "grad_norm": 0.35871580243110657, "learning_rate": 5.733333333333334e-06, "loss": 0.0039, "step": 2782 }, { "epoch": 35.56549520766773, "grad_norm": 1.9119770526885986, "learning_rate": 5.728205128205129e-06, "loss": 0.0137, "step": 2783 }, { "epoch": 35.57827476038339, "grad_norm": 0.35279327630996704, "learning_rate": 5.723076923076923e-06, "loss": 0.0035, "step": 2784 }, { "epoch": 35.59105431309904, "grad_norm": 3.315377712249756, "learning_rate": 5.717948717948718e-06, "loss": 0.0195, "step": 2785 }, { "epoch": 35.6038338658147, "grad_norm": 0.4838240444660187, "learning_rate": 5.712820512820513e-06, "loss": 0.004, "step": 2786 }, { "epoch": 35.61661341853035, "grad_norm": 0.40503624081611633, "learning_rate": 5.7076923076923086e-06, "loss": 0.0033, "step": 2787 }, { "epoch": 35.629392971246006, "grad_norm": 0.5054441690444946, "learning_rate": 5.7025641025641035e-06, "loss": 0.005, "step": 2788 }, { "epoch": 35.64217252396166, "grad_norm": 0.4412970542907715, "learning_rate": 5.6974358974358975e-06, "loss": 0.0033, "step": 2789 }, { "epoch": 35.65495207667732, "grad_norm": 0.36365410685539246, "learning_rate": 5.692307692307692e-06, "loss": 0.0036, "step": 2790 }, { "epoch": 35.66773162939297, "grad_norm": 0.5414665937423706, "learning_rate": 5.687179487179487e-06, "loss": 0.0055, "step": 2791 }, { "epoch": 35.680511182108624, "grad_norm": 2.9094033241271973, "learning_rate": 5.682051282051283e-06, "loss": 0.0174, "step": 2792 }, { "epoch": 35.69329073482428, "grad_norm": 0.43583613634109497, "learning_rate": 5.676923076923078e-06, "loss": 0.0038, "step": 2793 }, { "epoch": 35.70607028753994, "grad_norm": 0.3606933653354645, "learning_rate": 5.671794871794872e-06, "loss": 0.0035, "step": 2794 }, { "epoch": 35.718849840255594, "grad_norm": 1.6921635866165161, "learning_rate": 5.666666666666667e-06, "loss": 0.0112, "step": 2795 }, { "epoch": 35.73162939297124, "grad_norm": 0.2892257869243622, "learning_rate": 5.661538461538462e-06, "loss": 0.0037, "step": 2796 }, { "epoch": 35.7444089456869, "grad_norm": 0.5619503259658813, "learning_rate": 5.6564102564102575e-06, "loss": 0.0041, "step": 2797 }, { "epoch": 35.757188498402556, "grad_norm": 0.4655962884426117, "learning_rate": 5.6512820512820524e-06, "loss": 0.0047, "step": 2798 }, { "epoch": 35.76996805111821, "grad_norm": 0.2837389409542084, "learning_rate": 5.6461538461538465e-06, "loss": 0.0035, "step": 2799 }, { "epoch": 35.78274760383387, "grad_norm": 0.3791850805282593, "learning_rate": 5.641025641025641e-06, "loss": 0.0028, "step": 2800 }, { "epoch": 35.78274760383387, "eval_loss": 0.9578615427017212, "eval_runtime": 183.8282, "eval_samples_per_second": 0.854, "eval_steps_per_second": 0.109, "step": 2800 }, { "epoch": 35.79552715654952, "grad_norm": 1.7197462320327759, "learning_rate": 5.635897435897436e-06, "loss": 0.0053, "step": 2801 }, { "epoch": 35.808306709265175, "grad_norm": 0.36914217472076416, "learning_rate": 5.63076923076923e-06, "loss": 0.0036, "step": 2802 }, { "epoch": 35.82108626198083, "grad_norm": 0.648004412651062, "learning_rate": 5.625641025641027e-06, "loss": 0.0044, "step": 2803 }, { "epoch": 35.83386581469649, "grad_norm": 1.7772631645202637, "learning_rate": 5.620512820512821e-06, "loss": 0.0122, "step": 2804 }, { "epoch": 35.846645367412144, "grad_norm": 0.5996457934379578, "learning_rate": 5.615384615384616e-06, "loss": 0.0042, "step": 2805 }, { "epoch": 35.85942492012779, "grad_norm": 0.5983651280403137, "learning_rate": 5.610256410256411e-06, "loss": 0.0049, "step": 2806 }, { "epoch": 35.87220447284345, "grad_norm": 0.9759971499443054, "learning_rate": 5.605128205128205e-06, "loss": 0.0057, "step": 2807 }, { "epoch": 35.884984025559106, "grad_norm": 5.0989203453063965, "learning_rate": 5.600000000000001e-06, "loss": 0.032, "step": 2808 }, { "epoch": 35.89776357827476, "grad_norm": 3.5177414417266846, "learning_rate": 5.5948717948717955e-06, "loss": 0.016, "step": 2809 }, { "epoch": 35.91054313099041, "grad_norm": 0.5371911525726318, "learning_rate": 5.58974358974359e-06, "loss": 0.0057, "step": 2810 }, { "epoch": 35.92332268370607, "grad_norm": 0.5497738718986511, "learning_rate": 5.584615384615385e-06, "loss": 0.0045, "step": 2811 }, { "epoch": 35.936102236421725, "grad_norm": 1.1579020023345947, "learning_rate": 5.579487179487179e-06, "loss": 0.0111, "step": 2812 }, { "epoch": 35.94888178913738, "grad_norm": 0.5208929181098938, "learning_rate": 5.574358974358974e-06, "loss": 0.0041, "step": 2813 }, { "epoch": 35.96166134185304, "grad_norm": 0.4912267327308655, "learning_rate": 5.56923076923077e-06, "loss": 0.0042, "step": 2814 }, { "epoch": 35.97444089456869, "grad_norm": 0.3925940990447998, "learning_rate": 5.564102564102565e-06, "loss": 0.0034, "step": 2815 }, { "epoch": 35.98722044728434, "grad_norm": 0.5372114777565002, "learning_rate": 5.55897435897436e-06, "loss": 0.0035, "step": 2816 }, { "epoch": 36.0, "grad_norm": 0.3666168749332428, "learning_rate": 5.553846153846154e-06, "loss": 0.005, "step": 2817 }, { "epoch": 36.01277955271566, "grad_norm": 0.322618305683136, "learning_rate": 5.548717948717949e-06, "loss": 0.0034, "step": 2818 }, { "epoch": 36.02555910543131, "grad_norm": 0.4605313539505005, "learning_rate": 5.5435897435897445e-06, "loss": 0.0041, "step": 2819 }, { "epoch": 36.03833865814696, "grad_norm": 0.3681679666042328, "learning_rate": 5.538461538461539e-06, "loss": 0.003, "step": 2820 }, { "epoch": 36.05111821086262, "grad_norm": 10.06080436706543, "learning_rate": 5.533333333333334e-06, "loss": 0.0087, "step": 2821 }, { "epoch": 36.063897763578275, "grad_norm": 1.4203732013702393, "learning_rate": 5.528205128205128e-06, "loss": 0.006, "step": 2822 }, { "epoch": 36.07667731629393, "grad_norm": 0.5547159314155579, "learning_rate": 5.523076923076923e-06, "loss": 0.0038, "step": 2823 }, { "epoch": 36.08945686900959, "grad_norm": 0.4133658707141876, "learning_rate": 5.517948717948718e-06, "loss": 0.0034, "step": 2824 }, { "epoch": 36.10223642172524, "grad_norm": 0.4233897030353546, "learning_rate": 5.512820512820514e-06, "loss": 0.0036, "step": 2825 }, { "epoch": 36.115015974440894, "grad_norm": 2.718273639678955, "learning_rate": 5.507692307692308e-06, "loss": 0.0177, "step": 2826 }, { "epoch": 36.12779552715655, "grad_norm": 3.14428448677063, "learning_rate": 5.502564102564103e-06, "loss": 0.0135, "step": 2827 }, { "epoch": 36.14057507987221, "grad_norm": 0.31426680088043213, "learning_rate": 5.497435897435898e-06, "loss": 0.003, "step": 2828 }, { "epoch": 36.153354632587856, "grad_norm": 0.35171377658843994, "learning_rate": 5.492307692307693e-06, "loss": 0.0047, "step": 2829 }, { "epoch": 36.16613418530351, "grad_norm": 0.45544207096099854, "learning_rate": 5.487179487179488e-06, "loss": 0.0037, "step": 2830 }, { "epoch": 36.17891373801917, "grad_norm": 2.908658027648926, "learning_rate": 5.4820512820512824e-06, "loss": 0.0063, "step": 2831 }, { "epoch": 36.191693290734825, "grad_norm": 0.33017784357070923, "learning_rate": 5.476923076923077e-06, "loss": 0.0043, "step": 2832 }, { "epoch": 36.20447284345048, "grad_norm": 0.31544455885887146, "learning_rate": 5.471794871794872e-06, "loss": 0.0039, "step": 2833 }, { "epoch": 36.21725239616613, "grad_norm": 0.3912992477416992, "learning_rate": 5.466666666666667e-06, "loss": 0.004, "step": 2834 }, { "epoch": 36.23003194888179, "grad_norm": 1.5009045600891113, "learning_rate": 5.461538461538461e-06, "loss": 0.009, "step": 2835 }, { "epoch": 36.242811501597444, "grad_norm": 0.5826500654220581, "learning_rate": 5.456410256410257e-06, "loss": 0.0028, "step": 2836 }, { "epoch": 36.2555910543131, "grad_norm": 0.47322073578834534, "learning_rate": 5.451282051282052e-06, "loss": 0.0035, "step": 2837 }, { "epoch": 36.26837060702876, "grad_norm": 1.2564668655395508, "learning_rate": 5.446153846153847e-06, "loss": 0.0035, "step": 2838 }, { "epoch": 36.281150159744406, "grad_norm": 3.0140414237976074, "learning_rate": 5.441025641025642e-06, "loss": 0.0139, "step": 2839 }, { "epoch": 36.29392971246006, "grad_norm": 0.25751781463623047, "learning_rate": 5.435897435897436e-06, "loss": 0.0026, "step": 2840 }, { "epoch": 36.30670926517572, "grad_norm": 0.3114520311355591, "learning_rate": 5.430769230769231e-06, "loss": 0.0043, "step": 2841 }, { "epoch": 36.319488817891376, "grad_norm": 2.1022861003875732, "learning_rate": 5.425641025641026e-06, "loss": 0.0088, "step": 2842 }, { "epoch": 36.33226837060703, "grad_norm": 1.3880363702774048, "learning_rate": 5.420512820512821e-06, "loss": 0.0071, "step": 2843 }, { "epoch": 36.34504792332268, "grad_norm": 0.2697431743144989, "learning_rate": 5.415384615384615e-06, "loss": 0.0023, "step": 2844 }, { "epoch": 36.35782747603834, "grad_norm": 2.0496296882629395, "learning_rate": 5.41025641025641e-06, "loss": 0.0084, "step": 2845 }, { "epoch": 36.370607028753994, "grad_norm": 0.378528892993927, "learning_rate": 5.405128205128205e-06, "loss": 0.0042, "step": 2846 }, { "epoch": 36.38338658146965, "grad_norm": 0.2853531539440155, "learning_rate": 5.400000000000001e-06, "loss": 0.0035, "step": 2847 }, { "epoch": 36.3961661341853, "grad_norm": 0.3787241578102112, "learning_rate": 5.394871794871796e-06, "loss": 0.0046, "step": 2848 }, { "epoch": 36.40894568690096, "grad_norm": 2.358002185821533, "learning_rate": 5.38974358974359e-06, "loss": 0.0131, "step": 2849 }, { "epoch": 36.42172523961661, "grad_norm": 0.3563104271888733, "learning_rate": 5.384615384615385e-06, "loss": 0.0037, "step": 2850 }, { "epoch": 36.43450479233227, "grad_norm": 0.29914191365242004, "learning_rate": 5.3794871794871796e-06, "loss": 0.0026, "step": 2851 }, { "epoch": 36.447284345047926, "grad_norm": 0.7096664905548096, "learning_rate": 5.374358974358975e-06, "loss": 0.0036, "step": 2852 }, { "epoch": 36.460063897763575, "grad_norm": 0.6353800296783447, "learning_rate": 5.36923076923077e-06, "loss": 0.0043, "step": 2853 }, { "epoch": 36.47284345047923, "grad_norm": 0.3164236545562744, "learning_rate": 5.364102564102564e-06, "loss": 0.0025, "step": 2854 }, { "epoch": 36.48562300319489, "grad_norm": 2.6620049476623535, "learning_rate": 5.358974358974359e-06, "loss": 0.019, "step": 2855 }, { "epoch": 36.498402555910545, "grad_norm": 4.248519420623779, "learning_rate": 5.353846153846154e-06, "loss": 0.0128, "step": 2856 }, { "epoch": 36.5111821086262, "grad_norm": 0.45837897062301636, "learning_rate": 5.34871794871795e-06, "loss": 0.0043, "step": 2857 }, { "epoch": 36.52396166134185, "grad_norm": 0.44139301776885986, "learning_rate": 5.343589743589745e-06, "loss": 0.0036, "step": 2858 }, { "epoch": 36.53674121405751, "grad_norm": 0.7677149176597595, "learning_rate": 5.338461538461539e-06, "loss": 0.0068, "step": 2859 }, { "epoch": 36.54952076677316, "grad_norm": 0.5062174797058105, "learning_rate": 5.333333333333334e-06, "loss": 0.0038, "step": 2860 }, { "epoch": 36.56230031948882, "grad_norm": 0.29120340943336487, "learning_rate": 5.3282051282051286e-06, "loss": 0.0034, "step": 2861 }, { "epoch": 36.575079872204476, "grad_norm": 0.27878037095069885, "learning_rate": 5.323076923076923e-06, "loss": 0.0027, "step": 2862 }, { "epoch": 36.587859424920126, "grad_norm": 3.050676107406616, "learning_rate": 5.317948717948719e-06, "loss": 0.0253, "step": 2863 }, { "epoch": 36.60063897763578, "grad_norm": 0.8370914459228516, "learning_rate": 5.312820512820513e-06, "loss": 0.0051, "step": 2864 }, { "epoch": 36.61341853035144, "grad_norm": 0.38411661982536316, "learning_rate": 5.307692307692308e-06, "loss": 0.0032, "step": 2865 }, { "epoch": 36.626198083067095, "grad_norm": 1.3328490257263184, "learning_rate": 5.302564102564103e-06, "loss": 0.0054, "step": 2866 }, { "epoch": 36.638977635782744, "grad_norm": 0.33674442768096924, "learning_rate": 5.297435897435897e-06, "loss": 0.0028, "step": 2867 }, { "epoch": 36.6517571884984, "grad_norm": 4.527010917663574, "learning_rate": 5.292307692307693e-06, "loss": 0.0184, "step": 2868 }, { "epoch": 36.66453674121406, "grad_norm": 0.3857632875442505, "learning_rate": 5.287179487179488e-06, "loss": 0.003, "step": 2869 }, { "epoch": 36.677316293929714, "grad_norm": 2.6783156394958496, "learning_rate": 5.282051282051283e-06, "loss": 0.0166, "step": 2870 }, { "epoch": 36.69009584664537, "grad_norm": 0.8151161670684814, "learning_rate": 5.2769230769230775e-06, "loss": 0.0036, "step": 2871 }, { "epoch": 36.70287539936102, "grad_norm": 2.524899959564209, "learning_rate": 5.271794871794872e-06, "loss": 0.023, "step": 2872 }, { "epoch": 36.715654952076676, "grad_norm": 1.1502883434295654, "learning_rate": 5.2666666666666665e-06, "loss": 0.0047, "step": 2873 }, { "epoch": 36.72843450479233, "grad_norm": 0.44396790862083435, "learning_rate": 5.261538461538462e-06, "loss": 0.0032, "step": 2874 }, { "epoch": 36.74121405750799, "grad_norm": 0.389405220746994, "learning_rate": 5.256410256410257e-06, "loss": 0.0027, "step": 2875 }, { "epoch": 36.753993610223645, "grad_norm": 0.5646959543228149, "learning_rate": 5.251282051282052e-06, "loss": 0.0048, "step": 2876 }, { "epoch": 36.766773162939295, "grad_norm": 1.245563268661499, "learning_rate": 5.246153846153846e-06, "loss": 0.0047, "step": 2877 }, { "epoch": 36.77955271565495, "grad_norm": 1.3920589685440063, "learning_rate": 5.241025641025641e-06, "loss": 0.0074, "step": 2878 }, { "epoch": 36.79233226837061, "grad_norm": 1.5733996629714966, "learning_rate": 5.235897435897437e-06, "loss": 0.0088, "step": 2879 }, { "epoch": 36.805111821086264, "grad_norm": 3.1889090538024902, "learning_rate": 5.230769230769232e-06, "loss": 0.0204, "step": 2880 }, { "epoch": 36.81789137380191, "grad_norm": 8.020994186401367, "learning_rate": 5.2256410256410265e-06, "loss": 0.0234, "step": 2881 }, { "epoch": 36.83067092651757, "grad_norm": 0.5541637539863586, "learning_rate": 5.220512820512821e-06, "loss": 0.0033, "step": 2882 }, { "epoch": 36.843450479233226, "grad_norm": 2.588081121444702, "learning_rate": 5.2153846153846155e-06, "loss": 0.016, "step": 2883 }, { "epoch": 36.85623003194888, "grad_norm": 4.180788040161133, "learning_rate": 5.21025641025641e-06, "loss": 0.0149, "step": 2884 }, { "epoch": 36.86900958466454, "grad_norm": 3.450303554534912, "learning_rate": 5.205128205128206e-06, "loss": 0.0214, "step": 2885 }, { "epoch": 36.88178913738019, "grad_norm": 3.1327109336853027, "learning_rate": 5.2e-06, "loss": 0.0166, "step": 2886 }, { "epoch": 36.894568690095845, "grad_norm": 0.4896284341812134, "learning_rate": 5.194871794871795e-06, "loss": 0.0033, "step": 2887 }, { "epoch": 36.9073482428115, "grad_norm": 0.5049822330474854, "learning_rate": 5.18974358974359e-06, "loss": 0.0041, "step": 2888 }, { "epoch": 36.92012779552716, "grad_norm": 2.475656747817993, "learning_rate": 5.184615384615385e-06, "loss": 0.0303, "step": 2889 }, { "epoch": 36.932907348242814, "grad_norm": 1.1910037994384766, "learning_rate": 5.179487179487181e-06, "loss": 0.0113, "step": 2890 }, { "epoch": 36.945686900958464, "grad_norm": 3.3851752281188965, "learning_rate": 5.174358974358975e-06, "loss": 0.0208, "step": 2891 }, { "epoch": 36.95846645367412, "grad_norm": 0.3671295940876007, "learning_rate": 5.16923076923077e-06, "loss": 0.0032, "step": 2892 }, { "epoch": 36.97124600638978, "grad_norm": 0.5597812533378601, "learning_rate": 5.1641025641025645e-06, "loss": 0.005, "step": 2893 }, { "epoch": 36.98402555910543, "grad_norm": 0.8427302837371826, "learning_rate": 5.158974358974359e-06, "loss": 0.0066, "step": 2894 }, { "epoch": 36.99680511182109, "grad_norm": 2.484133005142212, "learning_rate": 5.1538461538461534e-06, "loss": 0.018, "step": 2895 }, { "epoch": 37.00958466453674, "grad_norm": 0.325967401266098, "learning_rate": 5.148717948717949e-06, "loss": 0.0036, "step": 2896 }, { "epoch": 37.022364217252395, "grad_norm": 1.9392606019973755, "learning_rate": 5.143589743589744e-06, "loss": 0.0096, "step": 2897 }, { "epoch": 37.03514376996805, "grad_norm": 4.457958221435547, "learning_rate": 5.138461538461539e-06, "loss": 0.0136, "step": 2898 }, { "epoch": 37.04792332268371, "grad_norm": 0.3894760310649872, "learning_rate": 5.133333333333334e-06, "loss": 0.0033, "step": 2899 }, { "epoch": 37.06070287539936, "grad_norm": 0.23036618530750275, "learning_rate": 5.128205128205128e-06, "loss": 0.0022, "step": 2900 }, { "epoch": 37.06070287539936, "eval_loss": 0.9559409618377686, "eval_runtime": 183.503, "eval_samples_per_second": 0.856, "eval_steps_per_second": 0.109, "step": 2900 }, { "epoch": 37.073482428115014, "grad_norm": 0.328531950712204, "learning_rate": 5.123076923076924e-06, "loss": 0.0036, "step": 2901 }, { "epoch": 37.08626198083067, "grad_norm": 0.5235569477081299, "learning_rate": 5.1179487179487186e-06, "loss": 0.0055, "step": 2902 }, { "epoch": 37.09904153354633, "grad_norm": 0.3692654073238373, "learning_rate": 5.1128205128205135e-06, "loss": 0.0033, "step": 2903 }, { "epoch": 37.11182108626198, "grad_norm": 5.13992977142334, "learning_rate": 5.1076923076923075e-06, "loss": 0.0169, "step": 2904 }, { "epoch": 37.12460063897763, "grad_norm": 1.2046083211898804, "learning_rate": 5.1025641025641024e-06, "loss": 0.0066, "step": 2905 }, { "epoch": 37.13738019169329, "grad_norm": 1.247635006904602, "learning_rate": 5.097435897435898e-06, "loss": 0.0058, "step": 2906 }, { "epoch": 37.150159744408946, "grad_norm": 0.5680016875267029, "learning_rate": 5.092307692307693e-06, "loss": 0.0034, "step": 2907 }, { "epoch": 37.1629392971246, "grad_norm": 1.868544578552246, "learning_rate": 5.087179487179488e-06, "loss": 0.0071, "step": 2908 }, { "epoch": 37.17571884984026, "grad_norm": 0.21541500091552734, "learning_rate": 5.082051282051282e-06, "loss": 0.0021, "step": 2909 }, { "epoch": 37.18849840255591, "grad_norm": 0.38051384687423706, "learning_rate": 5.076923076923077e-06, "loss": 0.0032, "step": 2910 }, { "epoch": 37.201277955271564, "grad_norm": 1.883426547050476, "learning_rate": 5.071794871794872e-06, "loss": 0.0089, "step": 2911 }, { "epoch": 37.21405750798722, "grad_norm": 4.500365734100342, "learning_rate": 5.0666666666666676e-06, "loss": 0.0238, "step": 2912 }, { "epoch": 37.22683706070288, "grad_norm": 2.925161600112915, "learning_rate": 5.0615384615384625e-06, "loss": 0.0152, "step": 2913 }, { "epoch": 37.239616613418534, "grad_norm": 0.9527667760848999, "learning_rate": 5.0564102564102565e-06, "loss": 0.0047, "step": 2914 }, { "epoch": 37.25239616613418, "grad_norm": 0.31250178813934326, "learning_rate": 5.051282051282051e-06, "loss": 0.0028, "step": 2915 }, { "epoch": 37.26517571884984, "grad_norm": 0.2610889673233032, "learning_rate": 5.046153846153846e-06, "loss": 0.0022, "step": 2916 }, { "epoch": 37.277955271565496, "grad_norm": 0.3735126554965973, "learning_rate": 5.041025641025642e-06, "loss": 0.0035, "step": 2917 }, { "epoch": 37.29073482428115, "grad_norm": 0.30350491404533386, "learning_rate": 5.035897435897437e-06, "loss": 0.0028, "step": 2918 }, { "epoch": 37.3035143769968, "grad_norm": 3.4979629516601562, "learning_rate": 5.030769230769231e-06, "loss": 0.0192, "step": 2919 }, { "epoch": 37.31629392971246, "grad_norm": 0.3009140193462372, "learning_rate": 5.025641025641026e-06, "loss": 0.003, "step": 2920 }, { "epoch": 37.329073482428115, "grad_norm": 0.5664937496185303, "learning_rate": 5.020512820512821e-06, "loss": 0.0039, "step": 2921 }, { "epoch": 37.34185303514377, "grad_norm": 0.25941237807273865, "learning_rate": 5.015384615384616e-06, "loss": 0.003, "step": 2922 }, { "epoch": 37.35463258785943, "grad_norm": 0.35156363248825073, "learning_rate": 5.0102564102564115e-06, "loss": 0.0041, "step": 2923 }, { "epoch": 37.36741214057508, "grad_norm": 0.9193771481513977, "learning_rate": 5.0051282051282055e-06, "loss": 0.0053, "step": 2924 }, { "epoch": 37.38019169329073, "grad_norm": 0.4133692681789398, "learning_rate": 5e-06, "loss": 0.0048, "step": 2925 }, { "epoch": 37.39297124600639, "grad_norm": 4.615827560424805, "learning_rate": 4.994871794871795e-06, "loss": 0.0108, "step": 2926 }, { "epoch": 37.405750798722046, "grad_norm": 1.0765419006347656, "learning_rate": 4.98974358974359e-06, "loss": 0.0067, "step": 2927 }, { "epoch": 37.4185303514377, "grad_norm": 0.2866329252719879, "learning_rate": 4.984615384615385e-06, "loss": 0.0023, "step": 2928 }, { "epoch": 37.43130990415335, "grad_norm": 0.7527754902839661, "learning_rate": 4.97948717948718e-06, "loss": 0.003, "step": 2929 }, { "epoch": 37.44408945686901, "grad_norm": 2.4644179344177246, "learning_rate": 4.974358974358975e-06, "loss": 0.0119, "step": 2930 }, { "epoch": 37.456869009584665, "grad_norm": 0.17018601298332214, "learning_rate": 4.96923076923077e-06, "loss": 0.002, "step": 2931 }, { "epoch": 37.46964856230032, "grad_norm": 0.9139269590377808, "learning_rate": 4.964102564102565e-06, "loss": 0.006, "step": 2932 }, { "epoch": 37.48242811501598, "grad_norm": 0.37774431705474854, "learning_rate": 4.95897435897436e-06, "loss": 0.0036, "step": 2933 }, { "epoch": 37.49520766773163, "grad_norm": 0.31033891439437866, "learning_rate": 4.9538461538461545e-06, "loss": 0.0029, "step": 2934 }, { "epoch": 37.50798722044728, "grad_norm": 1.9837037324905396, "learning_rate": 4.948717948717949e-06, "loss": 0.023, "step": 2935 }, { "epoch": 37.52076677316294, "grad_norm": 0.4745427072048187, "learning_rate": 4.943589743589744e-06, "loss": 0.0037, "step": 2936 }, { "epoch": 37.533546325878596, "grad_norm": 0.38666072487831116, "learning_rate": 4.938461538461538e-06, "loss": 0.0038, "step": 2937 }, { "epoch": 37.546325878594246, "grad_norm": 0.3316282629966736, "learning_rate": 4.933333333333334e-06, "loss": 0.0044, "step": 2938 }, { "epoch": 37.5591054313099, "grad_norm": 0.5502758622169495, "learning_rate": 4.928205128205128e-06, "loss": 0.0035, "step": 2939 }, { "epoch": 37.57188498402556, "grad_norm": 3.197373867034912, "learning_rate": 4.923076923076924e-06, "loss": 0.0061, "step": 2940 }, { "epoch": 37.584664536741215, "grad_norm": 49.10783004760742, "learning_rate": 4.917948717948719e-06, "loss": 0.0113, "step": 2941 }, { "epoch": 37.59744408945687, "grad_norm": 3.1348283290863037, "learning_rate": 4.912820512820513e-06, "loss": 0.0161, "step": 2942 }, { "epoch": 37.61022364217252, "grad_norm": 0.2980424463748932, "learning_rate": 4.907692307692309e-06, "loss": 0.0026, "step": 2943 }, { "epoch": 37.62300319488818, "grad_norm": 0.37425318360328674, "learning_rate": 4.902564102564103e-06, "loss": 0.0029, "step": 2944 }, { "epoch": 37.635782747603834, "grad_norm": 0.34164369106292725, "learning_rate": 4.8974358974358975e-06, "loss": 0.0031, "step": 2945 }, { "epoch": 37.64856230031949, "grad_norm": 1.639210820198059, "learning_rate": 4.892307692307693e-06, "loss": 0.0159, "step": 2946 }, { "epoch": 37.66134185303515, "grad_norm": 0.34300586581230164, "learning_rate": 4.887179487179487e-06, "loss": 0.003, "step": 2947 }, { "epoch": 37.674121405750796, "grad_norm": 0.5739888548851013, "learning_rate": 4.882051282051282e-06, "loss": 0.0039, "step": 2948 }, { "epoch": 37.68690095846645, "grad_norm": 0.5022448301315308, "learning_rate": 4.876923076923077e-06, "loss": 0.0043, "step": 2949 }, { "epoch": 37.69968051118211, "grad_norm": 0.7767954468727112, "learning_rate": 4.871794871794872e-06, "loss": 0.0027, "step": 2950 }, { "epoch": 37.712460063897765, "grad_norm": 2.9445812702178955, "learning_rate": 4.866666666666667e-06, "loss": 0.0195, "step": 2951 }, { "epoch": 37.72523961661342, "grad_norm": 3.331125259399414, "learning_rate": 4.861538461538462e-06, "loss": 0.0362, "step": 2952 }, { "epoch": 37.73801916932907, "grad_norm": 0.38296833634376526, "learning_rate": 4.856410256410257e-06, "loss": 0.0027, "step": 2953 }, { "epoch": 37.75079872204473, "grad_norm": 0.27674436569213867, "learning_rate": 4.851282051282052e-06, "loss": 0.0033, "step": 2954 }, { "epoch": 37.763578274760384, "grad_norm": 0.3808777928352356, "learning_rate": 4.8461538461538465e-06, "loss": 0.0027, "step": 2955 }, { "epoch": 37.77635782747604, "grad_norm": 4.131234645843506, "learning_rate": 4.8410256410256414e-06, "loss": 0.04, "step": 2956 }, { "epoch": 37.78913738019169, "grad_norm": 0.3839050829410553, "learning_rate": 4.835897435897436e-06, "loss": 0.0037, "step": 2957 }, { "epoch": 37.801916932907346, "grad_norm": 0.4981125295162201, "learning_rate": 4.830769230769231e-06, "loss": 0.0054, "step": 2958 }, { "epoch": 37.814696485623, "grad_norm": 0.6987520456314087, "learning_rate": 4.825641025641026e-06, "loss": 0.0047, "step": 2959 }, { "epoch": 37.82747603833866, "grad_norm": 3.7170562744140625, "learning_rate": 4.820512820512821e-06, "loss": 0.0276, "step": 2960 }, { "epoch": 37.840255591054316, "grad_norm": 3.376850128173828, "learning_rate": 4.815384615384616e-06, "loss": 0.0226, "step": 2961 }, { "epoch": 37.853035143769965, "grad_norm": 0.2902922034263611, "learning_rate": 4.810256410256411e-06, "loss": 0.003, "step": 2962 }, { "epoch": 37.86581469648562, "grad_norm": 0.4471033215522766, "learning_rate": 4.805128205128206e-06, "loss": 0.0038, "step": 2963 }, { "epoch": 37.87859424920128, "grad_norm": 0.24476750195026398, "learning_rate": 4.800000000000001e-06, "loss": 0.0027, "step": 2964 }, { "epoch": 37.891373801916934, "grad_norm": 0.436432421207428, "learning_rate": 4.7948717948717955e-06, "loss": 0.0037, "step": 2965 }, { "epoch": 37.90415335463259, "grad_norm": 0.403041273355484, "learning_rate": 4.7897435897435904e-06, "loss": 0.004, "step": 2966 }, { "epoch": 37.91693290734824, "grad_norm": 0.39625176787376404, "learning_rate": 4.7846153846153845e-06, "loss": 0.0033, "step": 2967 }, { "epoch": 37.9297124600639, "grad_norm": 2.864386796951294, "learning_rate": 4.77948717948718e-06, "loss": 0.0088, "step": 2968 }, { "epoch": 37.94249201277955, "grad_norm": 0.7663731575012207, "learning_rate": 4.774358974358974e-06, "loss": 0.0086, "step": 2969 }, { "epoch": 37.95527156549521, "grad_norm": 0.352105051279068, "learning_rate": 4.76923076923077e-06, "loss": 0.0036, "step": 2970 }, { "epoch": 37.968051118210866, "grad_norm": 0.330779105424881, "learning_rate": 4.764102564102565e-06, "loss": 0.0028, "step": 2971 }, { "epoch": 37.980830670926515, "grad_norm": 0.3980065882205963, "learning_rate": 4.758974358974359e-06, "loss": 0.0035, "step": 2972 }, { "epoch": 37.99361022364217, "grad_norm": 0.41715294122695923, "learning_rate": 4.753846153846155e-06, "loss": 0.0033, "step": 2973 }, { "epoch": 38.00638977635783, "grad_norm": 0.305719256401062, "learning_rate": 4.748717948717949e-06, "loss": 0.004, "step": 2974 }, { "epoch": 38.019169329073485, "grad_norm": 0.3436070680618286, "learning_rate": 4.743589743589744e-06, "loss": 0.0035, "step": 2975 }, { "epoch": 38.031948881789134, "grad_norm": 0.7291021943092346, "learning_rate": 4.738461538461539e-06, "loss": 0.0035, "step": 2976 }, { "epoch": 38.04472843450479, "grad_norm": 0.37716665863990784, "learning_rate": 4.7333333333333335e-06, "loss": 0.0029, "step": 2977 }, { "epoch": 38.05750798722045, "grad_norm": 1.5769388675689697, "learning_rate": 4.728205128205128e-06, "loss": 0.0071, "step": 2978 }, { "epoch": 38.0702875399361, "grad_norm": 1.3593897819519043, "learning_rate": 4.723076923076923e-06, "loss": 0.0067, "step": 2979 }, { "epoch": 38.08306709265176, "grad_norm": 3.29752254486084, "learning_rate": 4.717948717948718e-06, "loss": 0.0193, "step": 2980 }, { "epoch": 38.09584664536741, "grad_norm": 0.26745957136154175, "learning_rate": 4.712820512820513e-06, "loss": 0.0032, "step": 2981 }, { "epoch": 38.108626198083066, "grad_norm": 0.2419317215681076, "learning_rate": 4.707692307692308e-06, "loss": 0.0027, "step": 2982 }, { "epoch": 38.12140575079872, "grad_norm": 0.45501840114593506, "learning_rate": 4.702564102564103e-06, "loss": 0.0035, "step": 2983 }, { "epoch": 38.13418530351438, "grad_norm": 0.28786441683769226, "learning_rate": 4.697435897435898e-06, "loss": 0.0023, "step": 2984 }, { "epoch": 38.146964856230035, "grad_norm": 0.9460015892982483, "learning_rate": 4.692307692307693e-06, "loss": 0.0033, "step": 2985 }, { "epoch": 38.159744408945684, "grad_norm": 0.3270238935947418, "learning_rate": 4.6871794871794876e-06, "loss": 0.0022, "step": 2986 }, { "epoch": 38.17252396166134, "grad_norm": 0.2666434645652771, "learning_rate": 4.6820512820512825e-06, "loss": 0.0017, "step": 2987 }, { "epoch": 38.185303514377, "grad_norm": 3.4922378063201904, "learning_rate": 4.676923076923077e-06, "loss": 0.0298, "step": 2988 }, { "epoch": 38.198083067092654, "grad_norm": 0.3040410280227661, "learning_rate": 4.671794871794872e-06, "loss": 0.0037, "step": 2989 }, { "epoch": 38.21086261980831, "grad_norm": 0.8676387667655945, "learning_rate": 4.666666666666667e-06, "loss": 0.0045, "step": 2990 }, { "epoch": 38.22364217252396, "grad_norm": 3.9531896114349365, "learning_rate": 4.661538461538462e-06, "loss": 0.0048, "step": 2991 }, { "epoch": 38.236421725239616, "grad_norm": 1.1728514432907104, "learning_rate": 4.656410256410257e-06, "loss": 0.0063, "step": 2992 }, { "epoch": 38.24920127795527, "grad_norm": 0.9603011012077332, "learning_rate": 4.651282051282052e-06, "loss": 0.0042, "step": 2993 }, { "epoch": 38.26198083067093, "grad_norm": 0.2621918320655823, "learning_rate": 4.646153846153847e-06, "loss": 0.0023, "step": 2994 }, { "epoch": 38.27476038338658, "grad_norm": 2.27105712890625, "learning_rate": 4.641025641025642e-06, "loss": 0.0093, "step": 2995 }, { "epoch": 38.287539936102235, "grad_norm": 0.7400266528129578, "learning_rate": 4.6358974358974366e-06, "loss": 0.0036, "step": 2996 }, { "epoch": 38.30031948881789, "grad_norm": 0.25827065110206604, "learning_rate": 4.630769230769231e-06, "loss": 0.0028, "step": 2997 }, { "epoch": 38.31309904153355, "grad_norm": 0.3045773208141327, "learning_rate": 4.625641025641026e-06, "loss": 0.0022, "step": 2998 }, { "epoch": 38.325878594249204, "grad_norm": 0.4771479070186615, "learning_rate": 4.62051282051282e-06, "loss": 0.0044, "step": 2999 }, { "epoch": 38.33865814696485, "grad_norm": 5.8205246925354, "learning_rate": 4.615384615384616e-06, "loss": 0.0185, "step": 3000 }, { "epoch": 38.33865814696485, "eval_loss": 0.9848964214324951, "eval_runtime": 183.6684, "eval_samples_per_second": 0.855, "eval_steps_per_second": 0.109, "step": 3000 }, { "epoch": 38.35143769968051, "grad_norm": 0.44981399178504944, "learning_rate": 4.610256410256411e-06, "loss": 0.0034, "step": 3001 }, { "epoch": 38.364217252396166, "grad_norm": 0.2838537096977234, "learning_rate": 4.605128205128205e-06, "loss": 0.0024, "step": 3002 }, { "epoch": 38.37699680511182, "grad_norm": 3.2543389797210693, "learning_rate": 4.600000000000001e-06, "loss": 0.0138, "step": 3003 }, { "epoch": 38.38977635782748, "grad_norm": 0.35357922315597534, "learning_rate": 4.594871794871795e-06, "loss": 0.0028, "step": 3004 }, { "epoch": 38.40255591054313, "grad_norm": 0.2850092947483063, "learning_rate": 4.58974358974359e-06, "loss": 0.0025, "step": 3005 }, { "epoch": 38.415335463258785, "grad_norm": 3.4489376544952393, "learning_rate": 4.5846153846153855e-06, "loss": 0.0251, "step": 3006 }, { "epoch": 38.42811501597444, "grad_norm": 0.3378681242465973, "learning_rate": 4.57948717948718e-06, "loss": 0.0025, "step": 3007 }, { "epoch": 38.4408945686901, "grad_norm": 1.5556156635284424, "learning_rate": 4.5743589743589745e-06, "loss": 0.0109, "step": 3008 }, { "epoch": 38.453674121405754, "grad_norm": 0.47560226917266846, "learning_rate": 4.569230769230769e-06, "loss": 0.0034, "step": 3009 }, { "epoch": 38.466453674121404, "grad_norm": 0.6198320984840393, "learning_rate": 4.564102564102564e-06, "loss": 0.0122, "step": 3010 }, { "epoch": 38.47923322683706, "grad_norm": 0.3300226628780365, "learning_rate": 4.558974358974359e-06, "loss": 0.0034, "step": 3011 }, { "epoch": 38.49201277955272, "grad_norm": 0.2798616290092468, "learning_rate": 4.553846153846154e-06, "loss": 0.003, "step": 3012 }, { "epoch": 38.50479233226837, "grad_norm": 0.7014890909194946, "learning_rate": 4.548717948717949e-06, "loss": 0.0048, "step": 3013 }, { "epoch": 38.51757188498402, "grad_norm": 0.28109487891197205, "learning_rate": 4.543589743589744e-06, "loss": 0.0027, "step": 3014 }, { "epoch": 38.53035143769968, "grad_norm": 0.45775285363197327, "learning_rate": 4.538461538461539e-06, "loss": 0.0025, "step": 3015 }, { "epoch": 38.543130990415335, "grad_norm": 0.34922537207603455, "learning_rate": 4.533333333333334e-06, "loss": 0.0036, "step": 3016 }, { "epoch": 38.55591054313099, "grad_norm": 2.250239133834839, "learning_rate": 4.528205128205129e-06, "loss": 0.0098, "step": 3017 }, { "epoch": 38.56869009584665, "grad_norm": 0.5357843041419983, "learning_rate": 4.5230769230769235e-06, "loss": 0.0048, "step": 3018 }, { "epoch": 38.5814696485623, "grad_norm": 1.595799207687378, "learning_rate": 4.517948717948718e-06, "loss": 0.0158, "step": 3019 }, { "epoch": 38.594249201277954, "grad_norm": 1.6008353233337402, "learning_rate": 4.512820512820513e-06, "loss": 0.0074, "step": 3020 }, { "epoch": 38.60702875399361, "grad_norm": 1.9269359111785889, "learning_rate": 4.507692307692308e-06, "loss": 0.0155, "step": 3021 }, { "epoch": 38.61980830670927, "grad_norm": 0.2042221873998642, "learning_rate": 4.502564102564103e-06, "loss": 0.002, "step": 3022 }, { "epoch": 38.63258785942492, "grad_norm": 0.35417866706848145, "learning_rate": 4.497435897435898e-06, "loss": 0.0025, "step": 3023 }, { "epoch": 38.64536741214057, "grad_norm": 3.2746737003326416, "learning_rate": 4.492307692307693e-06, "loss": 0.0119, "step": 3024 }, { "epoch": 38.65814696485623, "grad_norm": 0.31555283069610596, "learning_rate": 4.487179487179488e-06, "loss": 0.0029, "step": 3025 }, { "epoch": 38.670926517571885, "grad_norm": 0.38801953196525574, "learning_rate": 4.482051282051283e-06, "loss": 0.0026, "step": 3026 }, { "epoch": 38.68370607028754, "grad_norm": 1.4678237438201904, "learning_rate": 4.476923076923077e-06, "loss": 0.0065, "step": 3027 }, { "epoch": 38.6964856230032, "grad_norm": 0.3929405212402344, "learning_rate": 4.4717948717948725e-06, "loss": 0.0028, "step": 3028 }, { "epoch": 38.70926517571885, "grad_norm": 0.3595089912414551, "learning_rate": 4.4666666666666665e-06, "loss": 0.0028, "step": 3029 }, { "epoch": 38.722044728434504, "grad_norm": 0.2732454240322113, "learning_rate": 4.461538461538462e-06, "loss": 0.0024, "step": 3030 }, { "epoch": 38.73482428115016, "grad_norm": 2.7754180431365967, "learning_rate": 4.456410256410257e-06, "loss": 0.0156, "step": 3031 }, { "epoch": 38.74760383386582, "grad_norm": 0.30526700615882874, "learning_rate": 4.451282051282051e-06, "loss": 0.0034, "step": 3032 }, { "epoch": 38.760383386581466, "grad_norm": 0.27021709084510803, "learning_rate": 4.446153846153847e-06, "loss": 0.0023, "step": 3033 }, { "epoch": 38.77316293929712, "grad_norm": 0.3636169731616974, "learning_rate": 4.441025641025641e-06, "loss": 0.0033, "step": 3034 }, { "epoch": 38.78594249201278, "grad_norm": 0.2715635895729065, "learning_rate": 4.435897435897436e-06, "loss": 0.0031, "step": 3035 }, { "epoch": 38.798722044728436, "grad_norm": 0.4538557529449463, "learning_rate": 4.430769230769232e-06, "loss": 0.003, "step": 3036 }, { "epoch": 38.81150159744409, "grad_norm": 0.35114040970802307, "learning_rate": 4.425641025641026e-06, "loss": 0.0026, "step": 3037 }, { "epoch": 38.82428115015974, "grad_norm": 2.235126495361328, "learning_rate": 4.420512820512821e-06, "loss": 0.0113, "step": 3038 }, { "epoch": 38.8370607028754, "grad_norm": 1.4098985195159912, "learning_rate": 4.4153846153846155e-06, "loss": 0.0102, "step": 3039 }, { "epoch": 38.849840255591054, "grad_norm": 0.2800547182559967, "learning_rate": 4.4102564102564104e-06, "loss": 0.0033, "step": 3040 }, { "epoch": 38.86261980830671, "grad_norm": 0.9228421449661255, "learning_rate": 4.405128205128205e-06, "loss": 0.0071, "step": 3041 }, { "epoch": 38.87539936102237, "grad_norm": 0.4252464473247528, "learning_rate": 4.4e-06, "loss": 0.0034, "step": 3042 }, { "epoch": 38.88817891373802, "grad_norm": 1.1408131122589111, "learning_rate": 4.394871794871795e-06, "loss": 0.003, "step": 3043 }, { "epoch": 38.90095846645367, "grad_norm": 3.826991081237793, "learning_rate": 4.38974358974359e-06, "loss": 0.0224, "step": 3044 }, { "epoch": 38.91373801916933, "grad_norm": 0.3651482164859772, "learning_rate": 4.384615384615385e-06, "loss": 0.0026, "step": 3045 }, { "epoch": 38.926517571884986, "grad_norm": 3.332813024520874, "learning_rate": 4.37948717948718e-06, "loss": 0.0297, "step": 3046 }, { "epoch": 38.93929712460064, "grad_norm": 0.4605509042739868, "learning_rate": 4.374358974358975e-06, "loss": 0.0044, "step": 3047 }, { "epoch": 38.95207667731629, "grad_norm": 0.5724936723709106, "learning_rate": 4.36923076923077e-06, "loss": 0.0052, "step": 3048 }, { "epoch": 38.96485623003195, "grad_norm": 0.2828877568244934, "learning_rate": 4.3641025641025645e-06, "loss": 0.003, "step": 3049 }, { "epoch": 38.977635782747605, "grad_norm": 2.8186399936676025, "learning_rate": 4.358974358974359e-06, "loss": 0.0181, "step": 3050 }, { "epoch": 38.99041533546326, "grad_norm": 7.178375244140625, "learning_rate": 4.353846153846154e-06, "loss": 0.0181, "step": 3051 }, { "epoch": 39.00319488817891, "grad_norm": 0.39194220304489136, "learning_rate": 4.348717948717949e-06, "loss": 0.0034, "step": 3052 }, { "epoch": 39.01597444089457, "grad_norm": 0.32534223794937134, "learning_rate": 4.343589743589744e-06, "loss": 0.003, "step": 3053 }, { "epoch": 39.02875399361022, "grad_norm": 4.849320411682129, "learning_rate": 4.338461538461539e-06, "loss": 0.0042, "step": 3054 }, { "epoch": 39.04153354632588, "grad_norm": 0.8111512064933777, "learning_rate": 4.333333333333334e-06, "loss": 0.0019, "step": 3055 }, { "epoch": 39.054313099041536, "grad_norm": 0.2802063226699829, "learning_rate": 4.328205128205129e-06, "loss": 0.0024, "step": 3056 }, { "epoch": 39.067092651757186, "grad_norm": 2.6525161266326904, "learning_rate": 4.323076923076923e-06, "loss": 0.0137, "step": 3057 }, { "epoch": 39.07987220447284, "grad_norm": 0.2541932761669159, "learning_rate": 4.317948717948719e-06, "loss": 0.003, "step": 3058 }, { "epoch": 39.0926517571885, "grad_norm": 0.43173283338546753, "learning_rate": 4.312820512820513e-06, "loss": 0.0037, "step": 3059 }, { "epoch": 39.105431309904155, "grad_norm": 1.6924418210983276, "learning_rate": 4.307692307692308e-06, "loss": 0.0097, "step": 3060 }, { "epoch": 39.11821086261981, "grad_norm": 0.2868981659412384, "learning_rate": 4.302564102564103e-06, "loss": 0.0025, "step": 3061 }, { "epoch": 39.13099041533546, "grad_norm": 0.24275152385234833, "learning_rate": 4.297435897435897e-06, "loss": 0.0021, "step": 3062 }, { "epoch": 39.14376996805112, "grad_norm": 1.1402617692947388, "learning_rate": 4.292307692307693e-06, "loss": 0.0057, "step": 3063 }, { "epoch": 39.156549520766774, "grad_norm": 0.29625436663627625, "learning_rate": 4.287179487179487e-06, "loss": 0.0028, "step": 3064 }, { "epoch": 39.16932907348243, "grad_norm": 0.9124428629875183, "learning_rate": 4.282051282051282e-06, "loss": 0.0033, "step": 3065 }, { "epoch": 39.18210862619808, "grad_norm": 0.31289470195770264, "learning_rate": 4.276923076923078e-06, "loss": 0.0028, "step": 3066 }, { "epoch": 39.194888178913736, "grad_norm": 1.9680798053741455, "learning_rate": 4.271794871794872e-06, "loss": 0.0081, "step": 3067 }, { "epoch": 39.20766773162939, "grad_norm": 0.2573452293872833, "learning_rate": 4.266666666666668e-06, "loss": 0.0029, "step": 3068 }, { "epoch": 39.22044728434505, "grad_norm": 2.266730308532715, "learning_rate": 4.261538461538462e-06, "loss": 0.0179, "step": 3069 }, { "epoch": 39.233226837060705, "grad_norm": 0.40569251775741577, "learning_rate": 4.2564102564102566e-06, "loss": 0.0044, "step": 3070 }, { "epoch": 39.246006389776355, "grad_norm": 0.24747313559055328, "learning_rate": 4.2512820512820515e-06, "loss": 0.0029, "step": 3071 }, { "epoch": 39.25878594249201, "grad_norm": 0.6296175122261047, "learning_rate": 4.246153846153846e-06, "loss": 0.0039, "step": 3072 }, { "epoch": 39.27156549520767, "grad_norm": 0.3082866966724396, "learning_rate": 4.241025641025641e-06, "loss": 0.0022, "step": 3073 }, { "epoch": 39.284345047923324, "grad_norm": 2.4070003032684326, "learning_rate": 4.235897435897436e-06, "loss": 0.0071, "step": 3074 }, { "epoch": 39.29712460063898, "grad_norm": 0.5413769483566284, "learning_rate": 4.230769230769231e-06, "loss": 0.0095, "step": 3075 }, { "epoch": 39.30990415335463, "grad_norm": 0.3211362659931183, "learning_rate": 4.225641025641026e-06, "loss": 0.002, "step": 3076 }, { "epoch": 39.322683706070286, "grad_norm": 1.298190951347351, "learning_rate": 4.220512820512821e-06, "loss": 0.0081, "step": 3077 }, { "epoch": 39.33546325878594, "grad_norm": 0.22302180528640747, "learning_rate": 4.215384615384616e-06, "loss": 0.0018, "step": 3078 }, { "epoch": 39.3482428115016, "grad_norm": 0.25386327505111694, "learning_rate": 4.210256410256411e-06, "loss": 0.0023, "step": 3079 }, { "epoch": 39.361022364217256, "grad_norm": 0.2036847472190857, "learning_rate": 4.2051282051282055e-06, "loss": 0.002, "step": 3080 }, { "epoch": 39.373801916932905, "grad_norm": 0.314125120639801, "learning_rate": 4.2000000000000004e-06, "loss": 0.003, "step": 3081 }, { "epoch": 39.38658146964856, "grad_norm": 0.35973840951919556, "learning_rate": 4.194871794871795e-06, "loss": 0.0033, "step": 3082 }, { "epoch": 39.39936102236422, "grad_norm": 0.20182707905769348, "learning_rate": 4.18974358974359e-06, "loss": 0.0023, "step": 3083 }, { "epoch": 39.412140575079874, "grad_norm": 4.579025745391846, "learning_rate": 4.184615384615385e-06, "loss": 0.0253, "step": 3084 }, { "epoch": 39.424920127795524, "grad_norm": 0.4407001733779907, "learning_rate": 4.17948717948718e-06, "loss": 0.004, "step": 3085 }, { "epoch": 39.43769968051118, "grad_norm": 2.039889097213745, "learning_rate": 4.174358974358975e-06, "loss": 0.0088, "step": 3086 }, { "epoch": 39.45047923322684, "grad_norm": 0.9066301584243774, "learning_rate": 4.169230769230769e-06, "loss": 0.0076, "step": 3087 }, { "epoch": 39.46325878594249, "grad_norm": 10.292778015136719, "learning_rate": 4.164102564102565e-06, "loss": 0.0347, "step": 3088 }, { "epoch": 39.47603833865815, "grad_norm": 0.22446775436401367, "learning_rate": 4.158974358974359e-06, "loss": 0.0024, "step": 3089 }, { "epoch": 39.4888178913738, "grad_norm": 2.612048387527466, "learning_rate": 4.1538461538461545e-06, "loss": 0.0233, "step": 3090 }, { "epoch": 39.501597444089455, "grad_norm": 1.2295528650283813, "learning_rate": 4.1487179487179494e-06, "loss": 0.0043, "step": 3091 }, { "epoch": 39.51437699680511, "grad_norm": 0.2916811406612396, "learning_rate": 4.1435897435897435e-06, "loss": 0.0024, "step": 3092 }, { "epoch": 39.52715654952077, "grad_norm": 0.45092812180519104, "learning_rate": 4.138461538461539e-06, "loss": 0.0033, "step": 3093 }, { "epoch": 39.539936102236425, "grad_norm": 0.4554513990879059, "learning_rate": 4.133333333333333e-06, "loss": 0.0034, "step": 3094 }, { "epoch": 39.552715654952074, "grad_norm": 0.3552434742450714, "learning_rate": 4.128205128205128e-06, "loss": 0.0024, "step": 3095 }, { "epoch": 39.56549520766773, "grad_norm": 2.480602264404297, "learning_rate": 4.123076923076924e-06, "loss": 0.0075, "step": 3096 }, { "epoch": 39.57827476038339, "grad_norm": 0.3173219561576843, "learning_rate": 4.117948717948718e-06, "loss": 0.0022, "step": 3097 }, { "epoch": 39.59105431309904, "grad_norm": 0.46305596828460693, "learning_rate": 4.112820512820514e-06, "loss": 0.004, "step": 3098 }, { "epoch": 39.6038338658147, "grad_norm": 0.5275067090988159, "learning_rate": 4.107692307692308e-06, "loss": 0.0039, "step": 3099 }, { "epoch": 39.61661341853035, "grad_norm": 2.0127456188201904, "learning_rate": 4.102564102564103e-06, "loss": 0.0135, "step": 3100 }, { "epoch": 39.61661341853035, "eval_loss": 0.9879663586616516, "eval_runtime": 183.9322, "eval_samples_per_second": 0.854, "eval_steps_per_second": 0.109, "step": 3100 }, { "epoch": 39.629392971246006, "grad_norm": 0.3211822211742401, "learning_rate": 4.097435897435898e-06, "loss": 0.0026, "step": 3101 }, { "epoch": 39.64217252396166, "grad_norm": 1.2732728719711304, "learning_rate": 4.0923076923076925e-06, "loss": 0.0035, "step": 3102 }, { "epoch": 39.65495207667732, "grad_norm": 1.2496635913848877, "learning_rate": 4.087179487179487e-06, "loss": 0.0037, "step": 3103 }, { "epoch": 39.66773162939297, "grad_norm": 3.28835391998291, "learning_rate": 4.082051282051282e-06, "loss": 0.0081, "step": 3104 }, { "epoch": 39.680511182108624, "grad_norm": 0.25976139307022095, "learning_rate": 4.076923076923077e-06, "loss": 0.0025, "step": 3105 }, { "epoch": 39.69329073482428, "grad_norm": 0.3309383690357208, "learning_rate": 4.071794871794872e-06, "loss": 0.0036, "step": 3106 }, { "epoch": 39.70607028753994, "grad_norm": 1.8688527345657349, "learning_rate": 4.066666666666667e-06, "loss": 0.0079, "step": 3107 }, { "epoch": 39.718849840255594, "grad_norm": 1.6854923963546753, "learning_rate": 4.061538461538462e-06, "loss": 0.0113, "step": 3108 }, { "epoch": 39.73162939297124, "grad_norm": 1.842885136604309, "learning_rate": 4.056410256410257e-06, "loss": 0.0077, "step": 3109 }, { "epoch": 39.7444089456869, "grad_norm": 3.535550594329834, "learning_rate": 4.051282051282052e-06, "loss": 0.0205, "step": 3110 }, { "epoch": 39.757188498402556, "grad_norm": 0.29745614528656006, "learning_rate": 4.0461538461538466e-06, "loss": 0.0032, "step": 3111 }, { "epoch": 39.76996805111821, "grad_norm": 3.15605092048645, "learning_rate": 4.0410256410256415e-06, "loss": 0.0232, "step": 3112 }, { "epoch": 39.78274760383387, "grad_norm": 0.2782306969165802, "learning_rate": 4.035897435897436e-06, "loss": 0.0027, "step": 3113 }, { "epoch": 39.79552715654952, "grad_norm": 0.4322669804096222, "learning_rate": 4.030769230769231e-06, "loss": 0.004, "step": 3114 }, { "epoch": 39.808306709265175, "grad_norm": 1.0812246799468994, "learning_rate": 4.025641025641026e-06, "loss": 0.0057, "step": 3115 }, { "epoch": 39.82108626198083, "grad_norm": 3.448333978652954, "learning_rate": 4.020512820512821e-06, "loss": 0.0227, "step": 3116 }, { "epoch": 39.83386581469649, "grad_norm": 2.9860076904296875, "learning_rate": 4.015384615384615e-06, "loss": 0.0124, "step": 3117 }, { "epoch": 39.846645367412144, "grad_norm": 2.200941562652588, "learning_rate": 4.010256410256411e-06, "loss": 0.0127, "step": 3118 }, { "epoch": 39.85942492012779, "grad_norm": 0.4125717580318451, "learning_rate": 4.005128205128205e-06, "loss": 0.0026, "step": 3119 }, { "epoch": 39.87220447284345, "grad_norm": 2.127324342727661, "learning_rate": 4.000000000000001e-06, "loss": 0.015, "step": 3120 }, { "epoch": 39.884984025559106, "grad_norm": 0.2309257984161377, "learning_rate": 3.9948717948717956e-06, "loss": 0.0025, "step": 3121 }, { "epoch": 39.89776357827476, "grad_norm": 0.2240867018699646, "learning_rate": 3.98974358974359e-06, "loss": 0.0018, "step": 3122 }, { "epoch": 39.91054313099041, "grad_norm": 0.3975813686847687, "learning_rate": 3.984615384615385e-06, "loss": 0.0031, "step": 3123 }, { "epoch": 39.92332268370607, "grad_norm": 2.780297040939331, "learning_rate": 3.979487179487179e-06, "loss": 0.0178, "step": 3124 }, { "epoch": 39.936102236421725, "grad_norm": 0.2866804003715515, "learning_rate": 3.974358974358974e-06, "loss": 0.0024, "step": 3125 }, { "epoch": 39.94888178913738, "grad_norm": 0.3142434358596802, "learning_rate": 3.96923076923077e-06, "loss": 0.0028, "step": 3126 }, { "epoch": 39.96166134185304, "grad_norm": 0.3115275204181671, "learning_rate": 3.964102564102564e-06, "loss": 0.0028, "step": 3127 }, { "epoch": 39.97444089456869, "grad_norm": 2.3010635375976562, "learning_rate": 3.95897435897436e-06, "loss": 0.0112, "step": 3128 }, { "epoch": 39.98722044728434, "grad_norm": 0.289372980594635, "learning_rate": 3.953846153846154e-06, "loss": 0.0024, "step": 3129 }, { "epoch": 40.0, "grad_norm": 0.49848124384880066, "learning_rate": 3.948717948717949e-06, "loss": 0.0032, "step": 3130 }, { "epoch": 40.01277955271566, "grad_norm": 0.19198329746723175, "learning_rate": 3.943589743589744e-06, "loss": 0.0018, "step": 3131 }, { "epoch": 40.02555910543131, "grad_norm": 0.3196083903312683, "learning_rate": 3.938461538461539e-06, "loss": 0.0036, "step": 3132 }, { "epoch": 40.03833865814696, "grad_norm": 0.593464195728302, "learning_rate": 3.9333333333333335e-06, "loss": 0.0028, "step": 3133 }, { "epoch": 40.05111821086262, "grad_norm": 0.2691340148448944, "learning_rate": 3.928205128205128e-06, "loss": 0.0032, "step": 3134 }, { "epoch": 40.063897763578275, "grad_norm": 0.20105499029159546, "learning_rate": 3.923076923076923e-06, "loss": 0.0026, "step": 3135 }, { "epoch": 40.07667731629393, "grad_norm": 0.30826905369758606, "learning_rate": 3.917948717948718e-06, "loss": 0.0035, "step": 3136 }, { "epoch": 40.08945686900959, "grad_norm": 0.37188711762428284, "learning_rate": 3.912820512820513e-06, "loss": 0.0034, "step": 3137 }, { "epoch": 40.10223642172524, "grad_norm": 0.3785606026649475, "learning_rate": 3.907692307692308e-06, "loss": 0.0025, "step": 3138 }, { "epoch": 40.115015974440894, "grad_norm": 1.697972297668457, "learning_rate": 3.902564102564103e-06, "loss": 0.0042, "step": 3139 }, { "epoch": 40.12779552715655, "grad_norm": 1.2668052911758423, "learning_rate": 3.897435897435898e-06, "loss": 0.0052, "step": 3140 }, { "epoch": 40.14057507987221, "grad_norm": 2.1785550117492676, "learning_rate": 3.892307692307693e-06, "loss": 0.0086, "step": 3141 }, { "epoch": 40.153354632587856, "grad_norm": 0.5524407029151917, "learning_rate": 3.887179487179488e-06, "loss": 0.0029, "step": 3142 }, { "epoch": 40.16613418530351, "grad_norm": 0.754425585269928, "learning_rate": 3.8820512820512825e-06, "loss": 0.0049, "step": 3143 }, { "epoch": 40.17891373801917, "grad_norm": 0.3464694917201996, "learning_rate": 3.876923076923077e-06, "loss": 0.0045, "step": 3144 }, { "epoch": 40.191693290734825, "grad_norm": 0.23320907354354858, "learning_rate": 3.871794871794872e-06, "loss": 0.0022, "step": 3145 }, { "epoch": 40.20447284345048, "grad_norm": 0.5744196772575378, "learning_rate": 3.866666666666667e-06, "loss": 0.0043, "step": 3146 }, { "epoch": 40.21725239616613, "grad_norm": 0.42899665236473083, "learning_rate": 3.861538461538462e-06, "loss": 0.0031, "step": 3147 }, { "epoch": 40.23003194888179, "grad_norm": 0.516017735004425, "learning_rate": 3.856410256410257e-06, "loss": 0.0079, "step": 3148 }, { "epoch": 40.242811501597444, "grad_norm": 0.4321247637271881, "learning_rate": 3.851282051282051e-06, "loss": 0.0029, "step": 3149 }, { "epoch": 40.2555910543131, "grad_norm": 0.26432710886001587, "learning_rate": 3.846153846153847e-06, "loss": 0.0032, "step": 3150 }, { "epoch": 40.26837060702876, "grad_norm": 0.9468638300895691, "learning_rate": 3.841025641025642e-06, "loss": 0.0048, "step": 3151 }, { "epoch": 40.281150159744406, "grad_norm": 0.1992044448852539, "learning_rate": 3.835897435897436e-06, "loss": 0.0024, "step": 3152 }, { "epoch": 40.29392971246006, "grad_norm": 3.1869659423828125, "learning_rate": 3.8307692307692315e-06, "loss": 0.013, "step": 3153 }, { "epoch": 40.30670926517572, "grad_norm": 0.7191022038459778, "learning_rate": 3.8256410256410255e-06, "loss": 0.0042, "step": 3154 }, { "epoch": 40.319488817891376, "grad_norm": 0.34319254755973816, "learning_rate": 3.8205128205128204e-06, "loss": 0.0037, "step": 3155 }, { "epoch": 40.33226837060703, "grad_norm": 0.37301310896873474, "learning_rate": 3.815384615384616e-06, "loss": 0.0029, "step": 3156 }, { "epoch": 40.34504792332268, "grad_norm": 2.212172031402588, "learning_rate": 3.8102564102564107e-06, "loss": 0.0188, "step": 3157 }, { "epoch": 40.35782747603834, "grad_norm": 0.9722811579704285, "learning_rate": 3.8051282051282056e-06, "loss": 0.0025, "step": 3158 }, { "epoch": 40.370607028753994, "grad_norm": 2.930816173553467, "learning_rate": 3.8000000000000005e-06, "loss": 0.0175, "step": 3159 }, { "epoch": 40.38338658146965, "grad_norm": 3.5118746757507324, "learning_rate": 3.794871794871795e-06, "loss": 0.0169, "step": 3160 }, { "epoch": 40.3961661341853, "grad_norm": 0.24613343179225922, "learning_rate": 3.7897435897435903e-06, "loss": 0.0025, "step": 3161 }, { "epoch": 40.40894568690096, "grad_norm": 0.21622180938720703, "learning_rate": 3.7846153846153847e-06, "loss": 0.0024, "step": 3162 }, { "epoch": 40.42172523961661, "grad_norm": 1.3650420904159546, "learning_rate": 3.7794871794871796e-06, "loss": 0.0067, "step": 3163 }, { "epoch": 40.43450479233227, "grad_norm": 1.9687597751617432, "learning_rate": 3.774358974358975e-06, "loss": 0.0151, "step": 3164 }, { "epoch": 40.447284345047926, "grad_norm": 0.23424799740314484, "learning_rate": 3.7692307692307694e-06, "loss": 0.0018, "step": 3165 }, { "epoch": 40.460063897763575, "grad_norm": 0.3177472651004791, "learning_rate": 3.7641025641025643e-06, "loss": 0.0025, "step": 3166 }, { "epoch": 40.47284345047923, "grad_norm": 0.22956353425979614, "learning_rate": 3.7589743589743592e-06, "loss": 0.0027, "step": 3167 }, { "epoch": 40.48562300319489, "grad_norm": 4.320156574249268, "learning_rate": 3.753846153846154e-06, "loss": 0.0166, "step": 3168 }, { "epoch": 40.498402555910545, "grad_norm": 0.34956955909729004, "learning_rate": 3.7487179487179495e-06, "loss": 0.003, "step": 3169 }, { "epoch": 40.5111821086262, "grad_norm": 0.33838948607444763, "learning_rate": 3.743589743589744e-06, "loss": 0.0031, "step": 3170 }, { "epoch": 40.52396166134185, "grad_norm": 0.42558255791664124, "learning_rate": 3.7384615384615384e-06, "loss": 0.0028, "step": 3171 }, { "epoch": 40.53674121405751, "grad_norm": 0.2204248309135437, "learning_rate": 3.7333333333333337e-06, "loss": 0.0019, "step": 3172 }, { "epoch": 40.54952076677316, "grad_norm": 0.4001370370388031, "learning_rate": 3.7282051282051286e-06, "loss": 0.0026, "step": 3173 }, { "epoch": 40.56230031948882, "grad_norm": 0.21183380484580994, "learning_rate": 3.723076923076923e-06, "loss": 0.0024, "step": 3174 }, { "epoch": 40.575079872204476, "grad_norm": 1.0086758136749268, "learning_rate": 3.7179487179487184e-06, "loss": 0.0052, "step": 3175 }, { "epoch": 40.587859424920126, "grad_norm": 2.1529321670532227, "learning_rate": 3.712820512820513e-06, "loss": 0.015, "step": 3176 }, { "epoch": 40.60063897763578, "grad_norm": 2.298342227935791, "learning_rate": 3.7076923076923082e-06, "loss": 0.0077, "step": 3177 }, { "epoch": 40.61341853035144, "grad_norm": 0.2786836326122284, "learning_rate": 3.702564102564103e-06, "loss": 0.0025, "step": 3178 }, { "epoch": 40.626198083067095, "grad_norm": 0.22754481434822083, "learning_rate": 3.6974358974358976e-06, "loss": 0.0022, "step": 3179 }, { "epoch": 40.638977635782744, "grad_norm": 2.1523077487945557, "learning_rate": 3.692307692307693e-06, "loss": 0.0126, "step": 3180 }, { "epoch": 40.6517571884984, "grad_norm": 0.2736090123653412, "learning_rate": 3.6871794871794874e-06, "loss": 0.0023, "step": 3181 }, { "epoch": 40.66453674121406, "grad_norm": 0.23136316239833832, "learning_rate": 3.6820512820512823e-06, "loss": 0.0026, "step": 3182 }, { "epoch": 40.677316293929714, "grad_norm": 0.3643839657306671, "learning_rate": 3.676923076923077e-06, "loss": 0.0034, "step": 3183 }, { "epoch": 40.69009584664537, "grad_norm": 0.3675735890865326, "learning_rate": 3.671794871794872e-06, "loss": 0.0022, "step": 3184 }, { "epoch": 40.70287539936102, "grad_norm": 3.977429151535034, "learning_rate": 3.6666666666666666e-06, "loss": 0.0171, "step": 3185 }, { "epoch": 40.715654952076676, "grad_norm": 0.21517342329025269, "learning_rate": 3.661538461538462e-06, "loss": 0.0022, "step": 3186 }, { "epoch": 40.72843450479233, "grad_norm": 0.25581324100494385, "learning_rate": 3.656410256410257e-06, "loss": 0.0025, "step": 3187 }, { "epoch": 40.74121405750799, "grad_norm": 0.5155754089355469, "learning_rate": 3.6512820512820517e-06, "loss": 0.0031, "step": 3188 }, { "epoch": 40.753993610223645, "grad_norm": 0.24920091032981873, "learning_rate": 3.6461538461538466e-06, "loss": 0.0022, "step": 3189 }, { "epoch": 40.766773162939295, "grad_norm": 0.22267504036426544, "learning_rate": 3.641025641025641e-06, "loss": 0.0024, "step": 3190 }, { "epoch": 40.77955271565495, "grad_norm": 0.2679823040962219, "learning_rate": 3.6358974358974364e-06, "loss": 0.0029, "step": 3191 }, { "epoch": 40.79233226837061, "grad_norm": 1.3183385133743286, "learning_rate": 3.630769230769231e-06, "loss": 0.0065, "step": 3192 }, { "epoch": 40.805111821086264, "grad_norm": 0.2925851345062256, "learning_rate": 3.6256410256410258e-06, "loss": 0.0022, "step": 3193 }, { "epoch": 40.81789137380191, "grad_norm": 2.8205389976501465, "learning_rate": 3.620512820512821e-06, "loss": 0.0182, "step": 3194 }, { "epoch": 40.83067092651757, "grad_norm": 0.31817200779914856, "learning_rate": 3.6153846153846156e-06, "loss": 0.0025, "step": 3195 }, { "epoch": 40.843450479233226, "grad_norm": 0.19292470812797546, "learning_rate": 3.610256410256411e-06, "loss": 0.0023, "step": 3196 }, { "epoch": 40.85623003194888, "grad_norm": 0.25802552700042725, "learning_rate": 3.6051282051282054e-06, "loss": 0.0025, "step": 3197 }, { "epoch": 40.86900958466454, "grad_norm": 0.3933732807636261, "learning_rate": 3.6000000000000003e-06, "loss": 0.0023, "step": 3198 }, { "epoch": 40.88178913738019, "grad_norm": 2.052565097808838, "learning_rate": 3.5948717948717956e-06, "loss": 0.0154, "step": 3199 }, { "epoch": 40.894568690095845, "grad_norm": 0.23817439377307892, "learning_rate": 3.58974358974359e-06, "loss": 0.0021, "step": 3200 }, { "epoch": 40.894568690095845, "eval_loss": 0.9943722486495972, "eval_runtime": 183.9313, "eval_samples_per_second": 0.854, "eval_steps_per_second": 0.109, "step": 3200 }, { "epoch": 40.9073482428115, "grad_norm": 0.36221644282341003, "learning_rate": 3.5846153846153845e-06, "loss": 0.0024, "step": 3201 }, { "epoch": 40.92012779552716, "grad_norm": 0.21248416602611542, "learning_rate": 3.57948717948718e-06, "loss": 0.0024, "step": 3202 }, { "epoch": 40.932907348242814, "grad_norm": 0.24060960114002228, "learning_rate": 3.5743589743589748e-06, "loss": 0.0021, "step": 3203 }, { "epoch": 40.945686900958464, "grad_norm": 2.167067289352417, "learning_rate": 3.5692307692307692e-06, "loss": 0.0132, "step": 3204 }, { "epoch": 40.95846645367412, "grad_norm": 0.2268885225057602, "learning_rate": 3.5641025641025646e-06, "loss": 0.0022, "step": 3205 }, { "epoch": 40.97124600638978, "grad_norm": 0.19051924347877502, "learning_rate": 3.558974358974359e-06, "loss": 0.0015, "step": 3206 }, { "epoch": 40.98402555910543, "grad_norm": 0.848506510257721, "learning_rate": 3.5538461538461544e-06, "loss": 0.0022, "step": 3207 }, { "epoch": 40.99680511182109, "grad_norm": 1.9270353317260742, "learning_rate": 3.5487179487179493e-06, "loss": 0.0075, "step": 3208 }, { "epoch": 41.00958466453674, "grad_norm": 2.1617424488067627, "learning_rate": 3.5435897435897437e-06, "loss": 0.0091, "step": 3209 }, { "epoch": 41.022364217252395, "grad_norm": 1.1429330110549927, "learning_rate": 3.538461538461539e-06, "loss": 0.0054, "step": 3210 }, { "epoch": 41.03514376996805, "grad_norm": 0.2642124593257904, "learning_rate": 3.5333333333333335e-06, "loss": 0.0032, "step": 3211 }, { "epoch": 41.04792332268371, "grad_norm": 2.530320167541504, "learning_rate": 3.5282051282051284e-06, "loss": 0.0096, "step": 3212 }, { "epoch": 41.06070287539936, "grad_norm": 2.0875837802886963, "learning_rate": 3.5230769230769233e-06, "loss": 0.0108, "step": 3213 }, { "epoch": 41.073482428115014, "grad_norm": 0.20744475722312927, "learning_rate": 3.5179487179487182e-06, "loss": 0.0016, "step": 3214 }, { "epoch": 41.08626198083067, "grad_norm": 0.17290177941322327, "learning_rate": 3.5128205128205127e-06, "loss": 0.0021, "step": 3215 }, { "epoch": 41.09904153354633, "grad_norm": 0.23610471189022064, "learning_rate": 3.507692307692308e-06, "loss": 0.002, "step": 3216 }, { "epoch": 41.11182108626198, "grad_norm": 0.978726863861084, "learning_rate": 3.502564102564103e-06, "loss": 0.0044, "step": 3217 }, { "epoch": 41.12460063897763, "grad_norm": 1.1839767694473267, "learning_rate": 3.497435897435898e-06, "loss": 0.0039, "step": 3218 }, { "epoch": 41.13738019169329, "grad_norm": 1.5846623182296753, "learning_rate": 3.4923076923076927e-06, "loss": 0.0127, "step": 3219 }, { "epoch": 41.150159744408946, "grad_norm": 0.3181246519088745, "learning_rate": 3.487179487179487e-06, "loss": 0.0026, "step": 3220 }, { "epoch": 41.1629392971246, "grad_norm": 0.21884626150131226, "learning_rate": 3.4820512820512825e-06, "loss": 0.0023, "step": 3221 }, { "epoch": 41.17571884984026, "grad_norm": 0.8826262354850769, "learning_rate": 3.476923076923077e-06, "loss": 0.0029, "step": 3222 }, { "epoch": 41.18849840255591, "grad_norm": 0.27310654520988464, "learning_rate": 3.471794871794872e-06, "loss": 0.0026, "step": 3223 }, { "epoch": 41.201277955271564, "grad_norm": 1.3389889001846313, "learning_rate": 3.4666666666666672e-06, "loss": 0.0049, "step": 3224 }, { "epoch": 41.21405750798722, "grad_norm": 2.1886544227600098, "learning_rate": 3.4615384615384617e-06, "loss": 0.0089, "step": 3225 }, { "epoch": 41.22683706070288, "grad_norm": 0.17433784902095795, "learning_rate": 3.456410256410257e-06, "loss": 0.0022, "step": 3226 }, { "epoch": 41.239616613418534, "grad_norm": 0.37125158309936523, "learning_rate": 3.4512820512820515e-06, "loss": 0.0019, "step": 3227 }, { "epoch": 41.25239616613418, "grad_norm": 0.21987852454185486, "learning_rate": 3.4461538461538464e-06, "loss": 0.0022, "step": 3228 }, { "epoch": 41.26517571884984, "grad_norm": 0.2506076395511627, "learning_rate": 3.4410256410256417e-06, "loss": 0.0032, "step": 3229 }, { "epoch": 41.277955271565496, "grad_norm": 0.24976013600826263, "learning_rate": 3.435897435897436e-06, "loss": 0.002, "step": 3230 }, { "epoch": 41.29073482428115, "grad_norm": 0.21577253937721252, "learning_rate": 3.4307692307692307e-06, "loss": 0.0024, "step": 3231 }, { "epoch": 41.3035143769968, "grad_norm": 0.2852231562137604, "learning_rate": 3.425641025641026e-06, "loss": 0.0027, "step": 3232 }, { "epoch": 41.31629392971246, "grad_norm": 0.240945965051651, "learning_rate": 3.420512820512821e-06, "loss": 0.0019, "step": 3233 }, { "epoch": 41.329073482428115, "grad_norm": 1.3859814405441284, "learning_rate": 3.4153846153846154e-06, "loss": 0.0069, "step": 3234 }, { "epoch": 41.34185303514377, "grad_norm": 0.23556867241859436, "learning_rate": 3.4102564102564107e-06, "loss": 0.0032, "step": 3235 }, { "epoch": 41.35463258785943, "grad_norm": 0.2779128849506378, "learning_rate": 3.405128205128205e-06, "loss": 0.0025, "step": 3236 }, { "epoch": 41.36741214057508, "grad_norm": 0.38275742530822754, "learning_rate": 3.4000000000000005e-06, "loss": 0.0029, "step": 3237 }, { "epoch": 41.38019169329073, "grad_norm": 0.2573952078819275, "learning_rate": 3.3948717948717954e-06, "loss": 0.0023, "step": 3238 }, { "epoch": 41.39297124600639, "grad_norm": 0.5133835673332214, "learning_rate": 3.38974358974359e-06, "loss": 0.0077, "step": 3239 }, { "epoch": 41.405750798722046, "grad_norm": 0.2631109952926636, "learning_rate": 3.384615384615385e-06, "loss": 0.0023, "step": 3240 }, { "epoch": 41.4185303514377, "grad_norm": 0.1972748339176178, "learning_rate": 3.3794871794871797e-06, "loss": 0.0019, "step": 3241 }, { "epoch": 41.43130990415335, "grad_norm": 1.8707705736160278, "learning_rate": 3.3743589743589746e-06, "loss": 0.007, "step": 3242 }, { "epoch": 41.44408945686901, "grad_norm": 0.32073575258255005, "learning_rate": 3.3692307692307695e-06, "loss": 0.0026, "step": 3243 }, { "epoch": 41.456869009584665, "grad_norm": 1.5978692770004272, "learning_rate": 3.3641025641025644e-06, "loss": 0.0071, "step": 3244 }, { "epoch": 41.46964856230032, "grad_norm": 3.0393266677856445, "learning_rate": 3.358974358974359e-06, "loss": 0.0208, "step": 3245 }, { "epoch": 41.48242811501598, "grad_norm": 0.16218027472496033, "learning_rate": 3.353846153846154e-06, "loss": 0.0021, "step": 3246 }, { "epoch": 41.49520766773163, "grad_norm": 0.19654710590839386, "learning_rate": 3.348717948717949e-06, "loss": 0.0023, "step": 3247 }, { "epoch": 41.50798722044728, "grad_norm": 3.003314971923828, "learning_rate": 3.343589743589744e-06, "loss": 0.0086, "step": 3248 }, { "epoch": 41.52076677316294, "grad_norm": 0.26050227880477905, "learning_rate": 3.338461538461539e-06, "loss": 0.0026, "step": 3249 }, { "epoch": 41.533546325878596, "grad_norm": 1.9690760374069214, "learning_rate": 3.3333333333333333e-06, "loss": 0.0086, "step": 3250 }, { "epoch": 41.546325878594246, "grad_norm": 0.18360324203968048, "learning_rate": 3.3282051282051286e-06, "loss": 0.0021, "step": 3251 }, { "epoch": 41.5591054313099, "grad_norm": 0.41593506932258606, "learning_rate": 3.323076923076923e-06, "loss": 0.0037, "step": 3252 }, { "epoch": 41.57188498402556, "grad_norm": 0.26756003499031067, "learning_rate": 3.317948717948718e-06, "loss": 0.0026, "step": 3253 }, { "epoch": 41.584664536741215, "grad_norm": 2.4918761253356934, "learning_rate": 3.3128205128205133e-06, "loss": 0.005, "step": 3254 }, { "epoch": 41.59744408945687, "grad_norm": 0.1668577343225479, "learning_rate": 3.307692307692308e-06, "loss": 0.0013, "step": 3255 }, { "epoch": 41.61022364217252, "grad_norm": 0.19826513528823853, "learning_rate": 3.302564102564103e-06, "loss": 0.002, "step": 3256 }, { "epoch": 41.62300319488818, "grad_norm": 10.235542297363281, "learning_rate": 3.2974358974358976e-06, "loss": 0.035, "step": 3257 }, { "epoch": 41.635782747603834, "grad_norm": 0.4807860553264618, "learning_rate": 3.2923076923076925e-06, "loss": 0.0027, "step": 3258 }, { "epoch": 41.64856230031949, "grad_norm": 0.39432165026664734, "learning_rate": 3.287179487179488e-06, "loss": 0.0034, "step": 3259 }, { "epoch": 41.66134185303515, "grad_norm": 3.0132107734680176, "learning_rate": 3.2820512820512823e-06, "loss": 0.0077, "step": 3260 }, { "epoch": 41.674121405750796, "grad_norm": 0.17874084413051605, "learning_rate": 3.276923076923077e-06, "loss": 0.0012, "step": 3261 }, { "epoch": 41.68690095846645, "grad_norm": 1.4562616348266602, "learning_rate": 3.271794871794872e-06, "loss": 0.0105, "step": 3262 }, { "epoch": 41.69968051118211, "grad_norm": 0.5152537822723389, "learning_rate": 3.266666666666667e-06, "loss": 0.0028, "step": 3263 }, { "epoch": 41.712460063897765, "grad_norm": 0.23496030271053314, "learning_rate": 3.2615384615384615e-06, "loss": 0.0025, "step": 3264 }, { "epoch": 41.72523961661342, "grad_norm": 1.5751100778579712, "learning_rate": 3.256410256410257e-06, "loss": 0.0078, "step": 3265 }, { "epoch": 41.73801916932907, "grad_norm": 1.7696713209152222, "learning_rate": 3.2512820512820513e-06, "loss": 0.0057, "step": 3266 }, { "epoch": 41.75079872204473, "grad_norm": 2.3506906032562256, "learning_rate": 3.2461538461538466e-06, "loss": 0.0149, "step": 3267 }, { "epoch": 41.763578274760384, "grad_norm": 0.7308444380760193, "learning_rate": 3.2410256410256415e-06, "loss": 0.0053, "step": 3268 }, { "epoch": 41.77635782747604, "grad_norm": 0.8668710589408875, "learning_rate": 3.235897435897436e-06, "loss": 0.006, "step": 3269 }, { "epoch": 41.78913738019169, "grad_norm": 0.2136469930410385, "learning_rate": 3.2307692307692313e-06, "loss": 0.0016, "step": 3270 }, { "epoch": 41.801916932907346, "grad_norm": 1.6871494054794312, "learning_rate": 3.2256410256410258e-06, "loss": 0.0068, "step": 3271 }, { "epoch": 41.814696485623, "grad_norm": 0.23570038378238678, "learning_rate": 3.2205128205128207e-06, "loss": 0.003, "step": 3272 }, { "epoch": 41.82747603833866, "grad_norm": 4.999492168426514, "learning_rate": 3.2153846153846156e-06, "loss": 0.0216, "step": 3273 }, { "epoch": 41.840255591054316, "grad_norm": 0.1835344433784485, "learning_rate": 3.2102564102564105e-06, "loss": 0.0017, "step": 3274 }, { "epoch": 41.853035143769965, "grad_norm": 1.509971261024475, "learning_rate": 3.205128205128206e-06, "loss": 0.008, "step": 3275 }, { "epoch": 41.86581469648562, "grad_norm": 0.3104620575904846, "learning_rate": 3.2000000000000003e-06, "loss": 0.003, "step": 3276 }, { "epoch": 41.87859424920128, "grad_norm": 0.21908356249332428, "learning_rate": 3.194871794871795e-06, "loss": 0.0024, "step": 3277 }, { "epoch": 41.891373801916934, "grad_norm": 0.40520310401916504, "learning_rate": 3.18974358974359e-06, "loss": 0.0027, "step": 3278 }, { "epoch": 41.90415335463259, "grad_norm": 0.20701780915260315, "learning_rate": 3.184615384615385e-06, "loss": 0.002, "step": 3279 }, { "epoch": 41.91693290734824, "grad_norm": 0.21897658705711365, "learning_rate": 3.1794871794871795e-06, "loss": 0.0023, "step": 3280 }, { "epoch": 41.9297124600639, "grad_norm": 0.24286028742790222, "learning_rate": 3.1743589743589748e-06, "loss": 0.0025, "step": 3281 }, { "epoch": 41.94249201277955, "grad_norm": 0.25564444065093994, "learning_rate": 3.1692307692307693e-06, "loss": 0.002, "step": 3282 }, { "epoch": 41.95527156549521, "grad_norm": 0.2859918773174286, "learning_rate": 3.164102564102564e-06, "loss": 0.0021, "step": 3283 }, { "epoch": 41.968051118210866, "grad_norm": 0.4360431134700775, "learning_rate": 3.1589743589743595e-06, "loss": 0.0029, "step": 3284 }, { "epoch": 41.980830670926515, "grad_norm": 0.1724979281425476, "learning_rate": 3.153846153846154e-06, "loss": 0.0017, "step": 3285 }, { "epoch": 41.99361022364217, "grad_norm": 3.4681003093719482, "learning_rate": 3.1487179487179493e-06, "loss": 0.0175, "step": 3286 }, { "epoch": 42.00638977635783, "grad_norm": 0.25220033526420593, "learning_rate": 3.1435897435897437e-06, "loss": 0.0023, "step": 3287 }, { "epoch": 42.019169329073485, "grad_norm": 0.44308167695999146, "learning_rate": 3.1384615384615386e-06, "loss": 0.003, "step": 3288 }, { "epoch": 42.031948881789134, "grad_norm": 0.23785103857517242, "learning_rate": 3.133333333333334e-06, "loss": 0.0025, "step": 3289 }, { "epoch": 42.04472843450479, "grad_norm": 0.21543189883232117, "learning_rate": 3.1282051282051284e-06, "loss": 0.0022, "step": 3290 }, { "epoch": 42.05750798722045, "grad_norm": 0.1751144528388977, "learning_rate": 3.123076923076923e-06, "loss": 0.002, "step": 3291 }, { "epoch": 42.0702875399361, "grad_norm": 1.3925294876098633, "learning_rate": 3.1179487179487182e-06, "loss": 0.0082, "step": 3292 }, { "epoch": 42.08306709265176, "grad_norm": 0.1362583190202713, "learning_rate": 3.112820512820513e-06, "loss": 0.0015, "step": 3293 }, { "epoch": 42.09584664536741, "grad_norm": 0.21337972581386566, "learning_rate": 3.1076923076923076e-06, "loss": 0.0024, "step": 3294 }, { "epoch": 42.108626198083066, "grad_norm": 1.0860240459442139, "learning_rate": 3.102564102564103e-06, "loss": 0.0059, "step": 3295 }, { "epoch": 42.12140575079872, "grad_norm": 0.9178386926651001, "learning_rate": 3.0974358974358974e-06, "loss": 0.0031, "step": 3296 }, { "epoch": 42.13418530351438, "grad_norm": 0.21546006202697754, "learning_rate": 3.0923076923076927e-06, "loss": 0.0024, "step": 3297 }, { "epoch": 42.146964856230035, "grad_norm": 1.8423643112182617, "learning_rate": 3.0871794871794876e-06, "loss": 0.0079, "step": 3298 }, { "epoch": 42.159744408945684, "grad_norm": 0.3426179587841034, "learning_rate": 3.082051282051282e-06, "loss": 0.0028, "step": 3299 }, { "epoch": 42.17252396166134, "grad_norm": 0.2042442262172699, "learning_rate": 3.0769230769230774e-06, "loss": 0.002, "step": 3300 }, { "epoch": 42.17252396166134, "eval_loss": 0.9923915863037109, "eval_runtime": 183.7945, "eval_samples_per_second": 0.854, "eval_steps_per_second": 0.109, "step": 3300 }, { "epoch": 42.185303514377, "grad_norm": 0.3152225911617279, "learning_rate": 3.071794871794872e-06, "loss": 0.0022, "step": 3301 }, { "epoch": 42.198083067092654, "grad_norm": 0.25565582513809204, "learning_rate": 3.066666666666667e-06, "loss": 0.0024, "step": 3302 }, { "epoch": 42.21086261980831, "grad_norm": 3.9883928298950195, "learning_rate": 3.0615384615384617e-06, "loss": 0.0269, "step": 3303 }, { "epoch": 42.22364217252396, "grad_norm": 0.37186405062675476, "learning_rate": 3.0564102564102566e-06, "loss": 0.0027, "step": 3304 }, { "epoch": 42.236421725239616, "grad_norm": 0.6624550819396973, "learning_rate": 3.051282051282052e-06, "loss": 0.0031, "step": 3305 }, { "epoch": 42.24920127795527, "grad_norm": 0.5737404227256775, "learning_rate": 3.0461538461538464e-06, "loss": 0.0017, "step": 3306 }, { "epoch": 42.26198083067093, "grad_norm": 0.8662362098693848, "learning_rate": 3.0410256410256413e-06, "loss": 0.0042, "step": 3307 }, { "epoch": 42.27476038338658, "grad_norm": 0.6699799299240112, "learning_rate": 3.035897435897436e-06, "loss": 0.0092, "step": 3308 }, { "epoch": 42.287539936102235, "grad_norm": 0.6369069218635559, "learning_rate": 3.030769230769231e-06, "loss": 0.0044, "step": 3309 }, { "epoch": 42.30031948881789, "grad_norm": 1.9807689189910889, "learning_rate": 3.0256410256410256e-06, "loss": 0.0099, "step": 3310 }, { "epoch": 42.31309904153355, "grad_norm": 0.16607657074928284, "learning_rate": 3.020512820512821e-06, "loss": 0.0017, "step": 3311 }, { "epoch": 42.325878594249204, "grad_norm": 0.35444679856300354, "learning_rate": 3.0153846153846154e-06, "loss": 0.003, "step": 3312 }, { "epoch": 42.33865814696485, "grad_norm": 0.22463780641555786, "learning_rate": 3.0102564102564103e-06, "loss": 0.0021, "step": 3313 }, { "epoch": 42.35143769968051, "grad_norm": 0.20150989294052124, "learning_rate": 3.0051282051282056e-06, "loss": 0.0023, "step": 3314 }, { "epoch": 42.364217252396166, "grad_norm": 0.24138055741786957, "learning_rate": 3e-06, "loss": 0.0028, "step": 3315 }, { "epoch": 42.37699680511182, "grad_norm": 0.2151312679052353, "learning_rate": 2.9948717948717954e-06, "loss": 0.002, "step": 3316 }, { "epoch": 42.38977635782748, "grad_norm": 0.24440868198871613, "learning_rate": 2.98974358974359e-06, "loss": 0.0019, "step": 3317 }, { "epoch": 42.40255591054313, "grad_norm": 0.18149599432945251, "learning_rate": 2.9846153846153848e-06, "loss": 0.0018, "step": 3318 }, { "epoch": 42.415335463258785, "grad_norm": 0.2243649810552597, "learning_rate": 2.97948717948718e-06, "loss": 0.0016, "step": 3319 }, { "epoch": 42.42811501597444, "grad_norm": 1.1802103519439697, "learning_rate": 2.9743589743589746e-06, "loss": 0.0032, "step": 3320 }, { "epoch": 42.4408945686901, "grad_norm": 3.019127607345581, "learning_rate": 2.969230769230769e-06, "loss": 0.0113, "step": 3321 }, { "epoch": 42.453674121405754, "grad_norm": 2.25211763381958, "learning_rate": 2.9641025641025644e-06, "loss": 0.0111, "step": 3322 }, { "epoch": 42.466453674121404, "grad_norm": 0.3121747374534607, "learning_rate": 2.9589743589743593e-06, "loss": 0.0029, "step": 3323 }, { "epoch": 42.47923322683706, "grad_norm": 2.901637315750122, "learning_rate": 2.953846153846154e-06, "loss": 0.0125, "step": 3324 }, { "epoch": 42.49201277955272, "grad_norm": 0.2974953353404999, "learning_rate": 2.948717948717949e-06, "loss": 0.003, "step": 3325 }, { "epoch": 42.50479233226837, "grad_norm": 0.270323783159256, "learning_rate": 2.9435897435897435e-06, "loss": 0.0023, "step": 3326 }, { "epoch": 42.51757188498402, "grad_norm": 0.18607768416404724, "learning_rate": 2.938461538461539e-06, "loss": 0.0016, "step": 3327 }, { "epoch": 42.53035143769968, "grad_norm": 0.16889779269695282, "learning_rate": 2.9333333333333338e-06, "loss": 0.0015, "step": 3328 }, { "epoch": 42.543130990415335, "grad_norm": 0.2744269073009491, "learning_rate": 2.9282051282051282e-06, "loss": 0.0026, "step": 3329 }, { "epoch": 42.55591054313099, "grad_norm": 0.29199185967445374, "learning_rate": 2.9230769230769236e-06, "loss": 0.0022, "step": 3330 }, { "epoch": 42.56869009584665, "grad_norm": 3.15661358833313, "learning_rate": 2.917948717948718e-06, "loss": 0.0132, "step": 3331 }, { "epoch": 42.5814696485623, "grad_norm": 0.2181830108165741, "learning_rate": 2.912820512820513e-06, "loss": 0.0026, "step": 3332 }, { "epoch": 42.594249201277954, "grad_norm": 0.17178262770175934, "learning_rate": 2.907692307692308e-06, "loss": 0.0021, "step": 3333 }, { "epoch": 42.60702875399361, "grad_norm": 0.21246619522571564, "learning_rate": 2.9025641025641027e-06, "loss": 0.0017, "step": 3334 }, { "epoch": 42.61980830670927, "grad_norm": 0.23296082019805908, "learning_rate": 2.897435897435898e-06, "loss": 0.002, "step": 3335 }, { "epoch": 42.63258785942492, "grad_norm": 2.3438727855682373, "learning_rate": 2.8923076923076925e-06, "loss": 0.0161, "step": 3336 }, { "epoch": 42.64536741214057, "grad_norm": 0.23535417020320892, "learning_rate": 2.8871794871794874e-06, "loss": 0.0023, "step": 3337 }, { "epoch": 42.65814696485623, "grad_norm": 0.19845890998840332, "learning_rate": 2.8820512820512823e-06, "loss": 0.0023, "step": 3338 }, { "epoch": 42.670926517571885, "grad_norm": 0.23049385845661163, "learning_rate": 2.8769230769230772e-06, "loss": 0.0018, "step": 3339 }, { "epoch": 42.68370607028754, "grad_norm": 1.5415147542953491, "learning_rate": 2.8717948717948717e-06, "loss": 0.0076, "step": 3340 }, { "epoch": 42.6964856230032, "grad_norm": 1.1236743927001953, "learning_rate": 2.866666666666667e-06, "loss": 0.0049, "step": 3341 }, { "epoch": 42.70926517571885, "grad_norm": 3.2558176517486572, "learning_rate": 2.8615384615384615e-06, "loss": 0.0125, "step": 3342 }, { "epoch": 42.722044728434504, "grad_norm": 0.40388306975364685, "learning_rate": 2.8564102564102564e-06, "loss": 0.0027, "step": 3343 }, { "epoch": 42.73482428115016, "grad_norm": 0.5524995923042297, "learning_rate": 2.8512820512820517e-06, "loss": 0.0028, "step": 3344 }, { "epoch": 42.74760383386582, "grad_norm": 0.34669360518455505, "learning_rate": 2.846153846153846e-06, "loss": 0.003, "step": 3345 }, { "epoch": 42.760383386581466, "grad_norm": 1.4820116758346558, "learning_rate": 2.8410256410256415e-06, "loss": 0.0072, "step": 3346 }, { "epoch": 42.77316293929712, "grad_norm": 0.27356067299842834, "learning_rate": 2.835897435897436e-06, "loss": 0.0027, "step": 3347 }, { "epoch": 42.78594249201278, "grad_norm": 2.8262336254119873, "learning_rate": 2.830769230769231e-06, "loss": 0.009, "step": 3348 }, { "epoch": 42.798722044728436, "grad_norm": 0.6990281939506531, "learning_rate": 2.8256410256410262e-06, "loss": 0.005, "step": 3349 }, { "epoch": 42.81150159744409, "grad_norm": 0.2962161898612976, "learning_rate": 2.8205128205128207e-06, "loss": 0.0023, "step": 3350 }, { "epoch": 42.82428115015974, "grad_norm": 1.1499954462051392, "learning_rate": 2.815384615384615e-06, "loss": 0.0036, "step": 3351 }, { "epoch": 42.8370607028754, "grad_norm": 2.363464117050171, "learning_rate": 2.8102564102564105e-06, "loss": 0.0114, "step": 3352 }, { "epoch": 42.849840255591054, "grad_norm": 2.9876015186309814, "learning_rate": 2.8051282051282054e-06, "loss": 0.0117, "step": 3353 }, { "epoch": 42.86261980830671, "grad_norm": 1.9356184005737305, "learning_rate": 2.8000000000000003e-06, "loss": 0.0053, "step": 3354 }, { "epoch": 42.87539936102237, "grad_norm": 0.16695638000965118, "learning_rate": 2.794871794871795e-06, "loss": 0.0018, "step": 3355 }, { "epoch": 42.88817891373802, "grad_norm": 1.636541724205017, "learning_rate": 2.7897435897435897e-06, "loss": 0.0038, "step": 3356 }, { "epoch": 42.90095846645367, "grad_norm": 0.3252117335796356, "learning_rate": 2.784615384615385e-06, "loss": 0.0028, "step": 3357 }, { "epoch": 42.91373801916933, "grad_norm": 3.5567755699157715, "learning_rate": 2.77948717948718e-06, "loss": 0.0273, "step": 3358 }, { "epoch": 42.926517571884986, "grad_norm": 0.3256838321685791, "learning_rate": 2.7743589743589744e-06, "loss": 0.0027, "step": 3359 }, { "epoch": 42.93929712460064, "grad_norm": 0.17103208601474762, "learning_rate": 2.7692307692307697e-06, "loss": 0.0016, "step": 3360 }, { "epoch": 42.95207667731629, "grad_norm": 1.3459094762802124, "learning_rate": 2.764102564102564e-06, "loss": 0.0058, "step": 3361 }, { "epoch": 42.96485623003195, "grad_norm": 0.1608177274465561, "learning_rate": 2.758974358974359e-06, "loss": 0.0019, "step": 3362 }, { "epoch": 42.977635782747605, "grad_norm": 0.2738124430179596, "learning_rate": 2.753846153846154e-06, "loss": 0.0026, "step": 3363 }, { "epoch": 42.99041533546326, "grad_norm": 0.4253382980823517, "learning_rate": 2.748717948717949e-06, "loss": 0.0035, "step": 3364 }, { "epoch": 43.00319488817891, "grad_norm": 0.41924622654914856, "learning_rate": 2.743589743589744e-06, "loss": 0.0036, "step": 3365 }, { "epoch": 43.01597444089457, "grad_norm": 0.18903520703315735, "learning_rate": 2.7384615384615387e-06, "loss": 0.0019, "step": 3366 }, { "epoch": 43.02875399361022, "grad_norm": 0.20113691687583923, "learning_rate": 2.7333333333333336e-06, "loss": 0.002, "step": 3367 }, { "epoch": 43.04153354632588, "grad_norm": 0.8627514839172363, "learning_rate": 2.7282051282051285e-06, "loss": 0.0041, "step": 3368 }, { "epoch": 43.054313099041536, "grad_norm": 1.7973607778549194, "learning_rate": 2.7230769230769234e-06, "loss": 0.0083, "step": 3369 }, { "epoch": 43.067092651757186, "grad_norm": 0.3085075318813324, "learning_rate": 2.717948717948718e-06, "loss": 0.0022, "step": 3370 }, { "epoch": 43.07987220447284, "grad_norm": 0.19819965958595276, "learning_rate": 2.712820512820513e-06, "loss": 0.002, "step": 3371 }, { "epoch": 43.0926517571885, "grad_norm": 0.23764221370220184, "learning_rate": 2.7076923076923076e-06, "loss": 0.0018, "step": 3372 }, { "epoch": 43.105431309904155, "grad_norm": 0.20486418902873993, "learning_rate": 2.7025641025641025e-06, "loss": 0.002, "step": 3373 }, { "epoch": 43.11821086261981, "grad_norm": 0.6274747252464294, "learning_rate": 2.697435897435898e-06, "loss": 0.0034, "step": 3374 }, { "epoch": 43.13099041533546, "grad_norm": 0.25211164355278015, "learning_rate": 2.6923076923076923e-06, "loss": 0.0024, "step": 3375 }, { "epoch": 43.14376996805112, "grad_norm": 2.6378355026245117, "learning_rate": 2.6871794871794877e-06, "loss": 0.0107, "step": 3376 }, { "epoch": 43.156549520766774, "grad_norm": 1.5214442014694214, "learning_rate": 2.682051282051282e-06, "loss": 0.0045, "step": 3377 }, { "epoch": 43.16932907348243, "grad_norm": 2.6448490619659424, "learning_rate": 2.676923076923077e-06, "loss": 0.0141, "step": 3378 }, { "epoch": 43.18210862619808, "grad_norm": 1.736473560333252, "learning_rate": 2.6717948717948724e-06, "loss": 0.0044, "step": 3379 }, { "epoch": 43.194888178913736, "grad_norm": 6.5054121017456055, "learning_rate": 2.666666666666667e-06, "loss": 0.0107, "step": 3380 }, { "epoch": 43.20766773162939, "grad_norm": 2.9443705081939697, "learning_rate": 2.6615384615384613e-06, "loss": 0.0145, "step": 3381 }, { "epoch": 43.22044728434505, "grad_norm": 0.2461695671081543, "learning_rate": 2.6564102564102566e-06, "loss": 0.002, "step": 3382 }, { "epoch": 43.233226837060705, "grad_norm": 0.21339628100395203, "learning_rate": 2.6512820512820515e-06, "loss": 0.0015, "step": 3383 }, { "epoch": 43.246006389776355, "grad_norm": 0.12452192604541779, "learning_rate": 2.6461538461538464e-06, "loss": 0.0013, "step": 3384 }, { "epoch": 43.25878594249201, "grad_norm": 0.18087056279182434, "learning_rate": 2.6410256410256413e-06, "loss": 0.0017, "step": 3385 }, { "epoch": 43.27156549520767, "grad_norm": 2.1488447189331055, "learning_rate": 2.635897435897436e-06, "loss": 0.0067, "step": 3386 }, { "epoch": 43.284345047923324, "grad_norm": 0.18616983294487, "learning_rate": 2.630769230769231e-06, "loss": 0.0017, "step": 3387 }, { "epoch": 43.29712460063898, "grad_norm": 0.15523196756839752, "learning_rate": 2.625641025641026e-06, "loss": 0.0016, "step": 3388 }, { "epoch": 43.30990415335463, "grad_norm": 0.16610245406627655, "learning_rate": 2.6205128205128205e-06, "loss": 0.0019, "step": 3389 }, { "epoch": 43.322683706070286, "grad_norm": 0.2482658475637436, "learning_rate": 2.615384615384616e-06, "loss": 0.0026, "step": 3390 }, { "epoch": 43.33546325878594, "grad_norm": 0.20092348754405975, "learning_rate": 2.6102564102564103e-06, "loss": 0.0021, "step": 3391 }, { "epoch": 43.3482428115016, "grad_norm": 0.30072319507598877, "learning_rate": 2.605128205128205e-06, "loss": 0.0022, "step": 3392 }, { "epoch": 43.361022364217256, "grad_norm": 1.7357025146484375, "learning_rate": 2.6e-06, "loss": 0.01, "step": 3393 }, { "epoch": 43.373801916932905, "grad_norm": 0.37861010432243347, "learning_rate": 2.594871794871795e-06, "loss": 0.0042, "step": 3394 }, { "epoch": 43.38658146964856, "grad_norm": 0.26171085238456726, "learning_rate": 2.5897435897435903e-06, "loss": 0.0024, "step": 3395 }, { "epoch": 43.39936102236422, "grad_norm": 0.1893923431634903, "learning_rate": 2.584615384615385e-06, "loss": 0.0023, "step": 3396 }, { "epoch": 43.412140575079874, "grad_norm": 0.19825433194637299, "learning_rate": 2.5794871794871797e-06, "loss": 0.0015, "step": 3397 }, { "epoch": 43.424920127795524, "grad_norm": 0.17092125117778778, "learning_rate": 2.5743589743589746e-06, "loss": 0.0021, "step": 3398 }, { "epoch": 43.43769968051118, "grad_norm": 0.857297420501709, "learning_rate": 2.5692307692307695e-06, "loss": 0.0052, "step": 3399 }, { "epoch": 43.45047923322684, "grad_norm": 1.8602384328842163, "learning_rate": 2.564102564102564e-06, "loss": 0.0079, "step": 3400 }, { "epoch": 43.45047923322684, "eval_loss": 1.001037836074829, "eval_runtime": 183.7653, "eval_samples_per_second": 0.854, "eval_steps_per_second": 0.109, "step": 3400 }, { "epoch": 43.46325878594249, "grad_norm": 2.684396743774414, "learning_rate": 2.5589743589743593e-06, "loss": 0.0137, "step": 3401 }, { "epoch": 43.47603833865815, "grad_norm": 0.7329052090644836, "learning_rate": 2.5538461538461538e-06, "loss": 0.0028, "step": 3402 }, { "epoch": 43.4888178913738, "grad_norm": 3.4295055866241455, "learning_rate": 2.548717948717949e-06, "loss": 0.0225, "step": 3403 }, { "epoch": 43.501597444089455, "grad_norm": 0.20623815059661865, "learning_rate": 2.543589743589744e-06, "loss": 0.0024, "step": 3404 }, { "epoch": 43.51437699680511, "grad_norm": 0.20949018001556396, "learning_rate": 2.5384615384615385e-06, "loss": 0.0023, "step": 3405 }, { "epoch": 43.52715654952077, "grad_norm": 0.29363885521888733, "learning_rate": 2.5333333333333338e-06, "loss": 0.003, "step": 3406 }, { "epoch": 43.539936102236425, "grad_norm": 0.44644200801849365, "learning_rate": 2.5282051282051283e-06, "loss": 0.0023, "step": 3407 }, { "epoch": 43.552715654952074, "grad_norm": 10.232358932495117, "learning_rate": 2.523076923076923e-06, "loss": 0.0425, "step": 3408 }, { "epoch": 43.56549520766773, "grad_norm": 0.23224446177482605, "learning_rate": 2.5179487179487185e-06, "loss": 0.0024, "step": 3409 }, { "epoch": 43.57827476038339, "grad_norm": 0.3602463901042938, "learning_rate": 2.512820512820513e-06, "loss": 0.0029, "step": 3410 }, { "epoch": 43.59105431309904, "grad_norm": 0.2921648323535919, "learning_rate": 2.507692307692308e-06, "loss": 0.0021, "step": 3411 }, { "epoch": 43.6038338658147, "grad_norm": 0.21882840991020203, "learning_rate": 2.5025641025641028e-06, "loss": 0.002, "step": 3412 }, { "epoch": 43.61661341853035, "grad_norm": 0.22644449770450592, "learning_rate": 2.4974358974358977e-06, "loss": 0.0018, "step": 3413 }, { "epoch": 43.629392971246006, "grad_norm": 3.2386162281036377, "learning_rate": 2.4923076923076926e-06, "loss": 0.0214, "step": 3414 }, { "epoch": 43.64217252396166, "grad_norm": 0.6819486618041992, "learning_rate": 2.4871794871794875e-06, "loss": 0.0049, "step": 3415 }, { "epoch": 43.65495207667732, "grad_norm": 2.0926530361175537, "learning_rate": 2.4820512820512824e-06, "loss": 0.0069, "step": 3416 }, { "epoch": 43.66773162939297, "grad_norm": 4.31627082824707, "learning_rate": 2.4769230769230773e-06, "loss": 0.0065, "step": 3417 }, { "epoch": 43.680511182108624, "grad_norm": 2.114065647125244, "learning_rate": 2.471794871794872e-06, "loss": 0.0089, "step": 3418 }, { "epoch": 43.69329073482428, "grad_norm": 0.32869184017181396, "learning_rate": 2.466666666666667e-06, "loss": 0.0028, "step": 3419 }, { "epoch": 43.70607028753994, "grad_norm": 0.21543429791927338, "learning_rate": 2.461538461538462e-06, "loss": 0.0023, "step": 3420 }, { "epoch": 43.718849840255594, "grad_norm": 0.31127381324768066, "learning_rate": 2.4564102564102564e-06, "loss": 0.0033, "step": 3421 }, { "epoch": 43.73162939297124, "grad_norm": 1.919940710067749, "learning_rate": 2.4512820512820513e-06, "loss": 0.006, "step": 3422 }, { "epoch": 43.7444089456869, "grad_norm": 0.20285959541797638, "learning_rate": 2.4461538461538466e-06, "loss": 0.0019, "step": 3423 }, { "epoch": 43.757188498402556, "grad_norm": 0.2318740338087082, "learning_rate": 2.441025641025641e-06, "loss": 0.0019, "step": 3424 }, { "epoch": 43.76996805111821, "grad_norm": 0.30630600452423096, "learning_rate": 2.435897435897436e-06, "loss": 0.0026, "step": 3425 }, { "epoch": 43.78274760383387, "grad_norm": 0.23179428279399872, "learning_rate": 2.430769230769231e-06, "loss": 0.002, "step": 3426 }, { "epoch": 43.79552715654952, "grad_norm": 1.9203321933746338, "learning_rate": 2.425641025641026e-06, "loss": 0.0049, "step": 3427 }, { "epoch": 43.808306709265175, "grad_norm": 0.611625611782074, "learning_rate": 2.4205128205128207e-06, "loss": 0.0064, "step": 3428 }, { "epoch": 43.82108626198083, "grad_norm": 0.4640527069568634, "learning_rate": 2.4153846153846156e-06, "loss": 0.0035, "step": 3429 }, { "epoch": 43.83386581469649, "grad_norm": 0.23337207734584808, "learning_rate": 2.4102564102564105e-06, "loss": 0.0023, "step": 3430 }, { "epoch": 43.846645367412144, "grad_norm": 0.2928684651851654, "learning_rate": 2.4051282051282054e-06, "loss": 0.0027, "step": 3431 }, { "epoch": 43.85942492012779, "grad_norm": 2.824985980987549, "learning_rate": 2.4000000000000003e-06, "loss": 0.0083, "step": 3432 }, { "epoch": 43.87220447284345, "grad_norm": 0.2678843140602112, "learning_rate": 2.3948717948717952e-06, "loss": 0.0028, "step": 3433 }, { "epoch": 43.884984025559106, "grad_norm": 5.614882469177246, "learning_rate": 2.38974358974359e-06, "loss": 0.0041, "step": 3434 }, { "epoch": 43.89776357827476, "grad_norm": 0.19786269962787628, "learning_rate": 2.384615384615385e-06, "loss": 0.002, "step": 3435 }, { "epoch": 43.91054313099041, "grad_norm": 0.18499012291431427, "learning_rate": 2.3794871794871795e-06, "loss": 0.0019, "step": 3436 }, { "epoch": 43.92332268370607, "grad_norm": 0.18536154925823212, "learning_rate": 2.3743589743589744e-06, "loss": 0.0025, "step": 3437 }, { "epoch": 43.936102236421725, "grad_norm": 0.2182464748620987, "learning_rate": 2.3692307692307697e-06, "loss": 0.0023, "step": 3438 }, { "epoch": 43.94888178913738, "grad_norm": 0.19127224385738373, "learning_rate": 2.364102564102564e-06, "loss": 0.0018, "step": 3439 }, { "epoch": 43.96166134185304, "grad_norm": 0.5205743312835693, "learning_rate": 2.358974358974359e-06, "loss": 0.0024, "step": 3440 }, { "epoch": 43.97444089456869, "grad_norm": 3.6712076663970947, "learning_rate": 2.353846153846154e-06, "loss": 0.0217, "step": 3441 }, { "epoch": 43.98722044728434, "grad_norm": 0.6396458745002747, "learning_rate": 2.348717948717949e-06, "loss": 0.0019, "step": 3442 }, { "epoch": 44.0, "grad_norm": 2.4400033950805664, "learning_rate": 2.3435897435897438e-06, "loss": 0.0133, "step": 3443 }, { "epoch": 44.01277955271566, "grad_norm": 0.16040480136871338, "learning_rate": 2.3384615384615387e-06, "loss": 0.0015, "step": 3444 }, { "epoch": 44.02555910543131, "grad_norm": 0.2779988646507263, "learning_rate": 2.3333333333333336e-06, "loss": 0.0019, "step": 3445 }, { "epoch": 44.03833865814696, "grad_norm": 1.6264729499816895, "learning_rate": 2.3282051282051285e-06, "loss": 0.006, "step": 3446 }, { "epoch": 44.05111821086262, "grad_norm": 0.21936100721359253, "learning_rate": 2.3230769230769234e-06, "loss": 0.0019, "step": 3447 }, { "epoch": 44.063897763578275, "grad_norm": 1.0929688215255737, "learning_rate": 2.3179487179487183e-06, "loss": 0.0057, "step": 3448 }, { "epoch": 44.07667731629393, "grad_norm": 0.2932341694831848, "learning_rate": 2.312820512820513e-06, "loss": 0.0027, "step": 3449 }, { "epoch": 44.08945686900959, "grad_norm": 0.8114387392997742, "learning_rate": 2.307692307692308e-06, "loss": 0.0033, "step": 3450 }, { "epoch": 44.10223642172524, "grad_norm": 0.17941796779632568, "learning_rate": 2.3025641025641026e-06, "loss": 0.0021, "step": 3451 }, { "epoch": 44.115015974440894, "grad_norm": 0.16614599525928497, "learning_rate": 2.2974358974358975e-06, "loss": 0.0016, "step": 3452 }, { "epoch": 44.12779552715655, "grad_norm": 0.47889798879623413, "learning_rate": 2.2923076923076928e-06, "loss": 0.0022, "step": 3453 }, { "epoch": 44.14057507987221, "grad_norm": 0.2067495584487915, "learning_rate": 2.2871794871794872e-06, "loss": 0.0025, "step": 3454 }, { "epoch": 44.153354632587856, "grad_norm": 1.3304522037506104, "learning_rate": 2.282051282051282e-06, "loss": 0.0038, "step": 3455 }, { "epoch": 44.16613418530351, "grad_norm": 0.1890897899866104, "learning_rate": 2.276923076923077e-06, "loss": 0.0019, "step": 3456 }, { "epoch": 44.17891373801917, "grad_norm": 1.378034234046936, "learning_rate": 2.271794871794872e-06, "loss": 0.0063, "step": 3457 }, { "epoch": 44.191693290734825, "grad_norm": 0.1573825478553772, "learning_rate": 2.266666666666667e-06, "loss": 0.0018, "step": 3458 }, { "epoch": 44.20447284345048, "grad_norm": 0.2968226373195648, "learning_rate": 2.2615384615384617e-06, "loss": 0.0026, "step": 3459 }, { "epoch": 44.21725239616613, "grad_norm": 0.4137711226940155, "learning_rate": 2.2564102564102566e-06, "loss": 0.0018, "step": 3460 }, { "epoch": 44.23003194888179, "grad_norm": 0.19083532691001892, "learning_rate": 2.2512820512820515e-06, "loss": 0.0016, "step": 3461 }, { "epoch": 44.242811501597444, "grad_norm": 0.14727646112442017, "learning_rate": 2.2461538461538464e-06, "loss": 0.0015, "step": 3462 }, { "epoch": 44.2555910543131, "grad_norm": 0.2691571116447449, "learning_rate": 2.2410256410256413e-06, "loss": 0.0019, "step": 3463 }, { "epoch": 44.26837060702876, "grad_norm": 0.24418750405311584, "learning_rate": 2.2358974358974362e-06, "loss": 0.0018, "step": 3464 }, { "epoch": 44.281150159744406, "grad_norm": 0.8175804615020752, "learning_rate": 2.230769230769231e-06, "loss": 0.0037, "step": 3465 }, { "epoch": 44.29392971246006, "grad_norm": 0.14546926319599152, "learning_rate": 2.2256410256410256e-06, "loss": 0.0017, "step": 3466 }, { "epoch": 44.30670926517572, "grad_norm": 0.267969012260437, "learning_rate": 2.2205128205128205e-06, "loss": 0.0027, "step": 3467 }, { "epoch": 44.319488817891376, "grad_norm": 0.17838163673877716, "learning_rate": 2.215384615384616e-06, "loss": 0.0018, "step": 3468 }, { "epoch": 44.33226837060703, "grad_norm": 0.18786239624023438, "learning_rate": 2.2102564102564103e-06, "loss": 0.0016, "step": 3469 }, { "epoch": 44.34504792332268, "grad_norm": 0.22198481857776642, "learning_rate": 2.2051282051282052e-06, "loss": 0.0027, "step": 3470 }, { "epoch": 44.35782747603834, "grad_norm": 0.2121235728263855, "learning_rate": 2.2e-06, "loss": 0.0021, "step": 3471 }, { "epoch": 44.370607028753994, "grad_norm": 0.7737230062484741, "learning_rate": 2.194871794871795e-06, "loss": 0.0033, "step": 3472 }, { "epoch": 44.38338658146965, "grad_norm": 0.29892033338546753, "learning_rate": 2.18974358974359e-06, "loss": 0.0017, "step": 3473 }, { "epoch": 44.3961661341853, "grad_norm": 0.7860128879547119, "learning_rate": 2.184615384615385e-06, "loss": 0.0028, "step": 3474 }, { "epoch": 44.40894568690096, "grad_norm": 0.4038985073566437, "learning_rate": 2.1794871794871797e-06, "loss": 0.0022, "step": 3475 }, { "epoch": 44.42172523961661, "grad_norm": 0.23415660858154297, "learning_rate": 2.1743589743589746e-06, "loss": 0.0023, "step": 3476 }, { "epoch": 44.43450479233227, "grad_norm": 0.1980719119310379, "learning_rate": 2.1692307692307695e-06, "loss": 0.0022, "step": 3477 }, { "epoch": 44.447284345047926, "grad_norm": 3.2138073444366455, "learning_rate": 2.1641025641025644e-06, "loss": 0.0176, "step": 3478 }, { "epoch": 44.460063897763575, "grad_norm": 0.22042608261108398, "learning_rate": 2.1589743589743593e-06, "loss": 0.0023, "step": 3479 }, { "epoch": 44.47284345047923, "grad_norm": 0.21012216806411743, "learning_rate": 2.153846153846154e-06, "loss": 0.0021, "step": 3480 }, { "epoch": 44.48562300319489, "grad_norm": 0.2902188301086426, "learning_rate": 2.1487179487179487e-06, "loss": 0.0027, "step": 3481 }, { "epoch": 44.498402555910545, "grad_norm": 3.315211296081543, "learning_rate": 2.1435897435897436e-06, "loss": 0.0171, "step": 3482 }, { "epoch": 44.5111821086262, "grad_norm": 0.19962161779403687, "learning_rate": 2.138461538461539e-06, "loss": 0.0019, "step": 3483 }, { "epoch": 44.52396166134185, "grad_norm": 0.23143111169338226, "learning_rate": 2.133333333333334e-06, "loss": 0.0022, "step": 3484 }, { "epoch": 44.53674121405751, "grad_norm": 0.20259682834148407, "learning_rate": 2.1282051282051283e-06, "loss": 0.0021, "step": 3485 }, { "epoch": 44.54952076677316, "grad_norm": 1.61074697971344, "learning_rate": 2.123076923076923e-06, "loss": 0.0072, "step": 3486 }, { "epoch": 44.56230031948882, "grad_norm": 0.18312349915504456, "learning_rate": 2.117948717948718e-06, "loss": 0.0019, "step": 3487 }, { "epoch": 44.575079872204476, "grad_norm": 0.7772158980369568, "learning_rate": 2.112820512820513e-06, "loss": 0.0033, "step": 3488 }, { "epoch": 44.587859424920126, "grad_norm": 0.530954897403717, "learning_rate": 2.107692307692308e-06, "loss": 0.0073, "step": 3489 }, { "epoch": 44.60063897763578, "grad_norm": 0.31667378544807434, "learning_rate": 2.1025641025641028e-06, "loss": 0.0022, "step": 3490 }, { "epoch": 44.61341853035144, "grad_norm": 1.6839007139205933, "learning_rate": 2.0974358974358977e-06, "loss": 0.0101, "step": 3491 }, { "epoch": 44.626198083067095, "grad_norm": 2.2546749114990234, "learning_rate": 2.0923076923076926e-06, "loss": 0.0112, "step": 3492 }, { "epoch": 44.638977635782744, "grad_norm": 0.6103613972663879, "learning_rate": 2.0871794871794875e-06, "loss": 0.0037, "step": 3493 }, { "epoch": 44.6517571884984, "grad_norm": 0.21684005856513977, "learning_rate": 2.0820512820512824e-06, "loss": 0.002, "step": 3494 }, { "epoch": 44.66453674121406, "grad_norm": 0.17034326493740082, "learning_rate": 2.0769230769230773e-06, "loss": 0.0021, "step": 3495 }, { "epoch": 44.677316293929714, "grad_norm": 0.22330626845359802, "learning_rate": 2.0717948717948717e-06, "loss": 0.0021, "step": 3496 }, { "epoch": 44.69009584664537, "grad_norm": 3.0296339988708496, "learning_rate": 2.0666666666666666e-06, "loss": 0.0089, "step": 3497 }, { "epoch": 44.70287539936102, "grad_norm": 0.19906385242938995, "learning_rate": 2.061538461538462e-06, "loss": 0.0023, "step": 3498 }, { "epoch": 44.715654952076676, "grad_norm": 0.1592789739370346, "learning_rate": 2.056410256410257e-06, "loss": 0.0016, "step": 3499 }, { "epoch": 44.72843450479233, "grad_norm": 0.1241416409611702, "learning_rate": 2.0512820512820513e-06, "loss": 0.0012, "step": 3500 }, { "epoch": 44.72843450479233, "eval_loss": 1.0085582733154297, "eval_runtime": 183.8592, "eval_samples_per_second": 0.854, "eval_steps_per_second": 0.109, "step": 3500 }, { "epoch": 44.74121405750799, "grad_norm": 0.216583251953125, "learning_rate": 2.0461538461538462e-06, "loss": 0.0028, "step": 3501 }, { "epoch": 44.753993610223645, "grad_norm": 1.870769739151001, "learning_rate": 2.041025641025641e-06, "loss": 0.0104, "step": 3502 }, { "epoch": 44.766773162939295, "grad_norm": 1.918493390083313, "learning_rate": 2.035897435897436e-06, "loss": 0.01, "step": 3503 }, { "epoch": 44.77955271565495, "grad_norm": 4.812563896179199, "learning_rate": 2.030769230769231e-06, "loss": 0.0039, "step": 3504 }, { "epoch": 44.79233226837061, "grad_norm": 4.635837554931641, "learning_rate": 2.025641025641026e-06, "loss": 0.0121, "step": 3505 }, { "epoch": 44.805111821086264, "grad_norm": 0.2971982955932617, "learning_rate": 2.0205128205128207e-06, "loss": 0.0026, "step": 3506 }, { "epoch": 44.81789137380191, "grad_norm": 0.3084527254104614, "learning_rate": 2.0153846153846156e-06, "loss": 0.0034, "step": 3507 }, { "epoch": 44.83067092651757, "grad_norm": 0.1466815024614334, "learning_rate": 2.0102564102564105e-06, "loss": 0.0017, "step": 3508 }, { "epoch": 44.843450479233226, "grad_norm": 2.4110586643218994, "learning_rate": 2.0051282051282054e-06, "loss": 0.0115, "step": 3509 }, { "epoch": 44.85623003194888, "grad_norm": 5.122982978820801, "learning_rate": 2.0000000000000003e-06, "loss": 0.0182, "step": 3510 }, { "epoch": 44.86900958466454, "grad_norm": 0.19120436906814575, "learning_rate": 1.994871794871795e-06, "loss": 0.0021, "step": 3511 }, { "epoch": 44.88178913738019, "grad_norm": 1.6846766471862793, "learning_rate": 1.9897435897435897e-06, "loss": 0.0079, "step": 3512 }, { "epoch": 44.894568690095845, "grad_norm": 0.2311975210905075, "learning_rate": 1.984615384615385e-06, "loss": 0.0023, "step": 3513 }, { "epoch": 44.9073482428115, "grad_norm": 0.26933738589286804, "learning_rate": 1.97948717948718e-06, "loss": 0.0018, "step": 3514 }, { "epoch": 44.92012779552716, "grad_norm": 0.19096258282661438, "learning_rate": 1.9743589743589744e-06, "loss": 0.0023, "step": 3515 }, { "epoch": 44.932907348242814, "grad_norm": 3.0288987159729004, "learning_rate": 1.9692307692307693e-06, "loss": 0.0144, "step": 3516 }, { "epoch": 44.945686900958464, "grad_norm": 0.8595489263534546, "learning_rate": 1.964102564102564e-06, "loss": 0.0043, "step": 3517 }, { "epoch": 44.95846645367412, "grad_norm": 0.15664945542812347, "learning_rate": 1.958974358974359e-06, "loss": 0.0015, "step": 3518 }, { "epoch": 44.97124600638978, "grad_norm": 0.17332378029823303, "learning_rate": 1.953846153846154e-06, "loss": 0.0016, "step": 3519 }, { "epoch": 44.98402555910543, "grad_norm": 0.198051318526268, "learning_rate": 1.948717948717949e-06, "loss": 0.002, "step": 3520 }, { "epoch": 44.99680511182109, "grad_norm": 0.1939736008644104, "learning_rate": 1.943589743589744e-06, "loss": 0.002, "step": 3521 }, { "epoch": 45.00958466453674, "grad_norm": 0.20715303719043732, "learning_rate": 1.9384615384615387e-06, "loss": 0.0021, "step": 3522 }, { "epoch": 45.022364217252395, "grad_norm": 0.1790590137243271, "learning_rate": 1.9333333333333336e-06, "loss": 0.0019, "step": 3523 }, { "epoch": 45.03514376996805, "grad_norm": 0.18158192932605743, "learning_rate": 1.9282051282051285e-06, "loss": 0.0017, "step": 3524 }, { "epoch": 45.04792332268371, "grad_norm": 1.3653088808059692, "learning_rate": 1.9230769230769234e-06, "loss": 0.0052, "step": 3525 }, { "epoch": 45.06070287539936, "grad_norm": 1.2879325151443481, "learning_rate": 1.917948717948718e-06, "loss": 0.0044, "step": 3526 }, { "epoch": 45.073482428115014, "grad_norm": 1.6927472352981567, "learning_rate": 1.9128205128205128e-06, "loss": 0.0065, "step": 3527 }, { "epoch": 45.08626198083067, "grad_norm": 2.4802870750427246, "learning_rate": 1.907692307692308e-06, "loss": 0.0145, "step": 3528 }, { "epoch": 45.09904153354633, "grad_norm": 0.4064551293849945, "learning_rate": 1.9025641025641028e-06, "loss": 0.0028, "step": 3529 }, { "epoch": 45.11182108626198, "grad_norm": 1.8474481105804443, "learning_rate": 1.8974358974358975e-06, "loss": 0.0055, "step": 3530 }, { "epoch": 45.12460063897763, "grad_norm": 0.17202718555927277, "learning_rate": 1.8923076923076924e-06, "loss": 0.0013, "step": 3531 }, { "epoch": 45.13738019169329, "grad_norm": 0.3263075351715088, "learning_rate": 1.8871794871794875e-06, "loss": 0.003, "step": 3532 }, { "epoch": 45.150159744408946, "grad_norm": 0.195256307721138, "learning_rate": 1.8820512820512822e-06, "loss": 0.0019, "step": 3533 }, { "epoch": 45.1629392971246, "grad_norm": 3.3216702938079834, "learning_rate": 1.876923076923077e-06, "loss": 0.011, "step": 3534 }, { "epoch": 45.17571884984026, "grad_norm": 0.5705974102020264, "learning_rate": 1.871794871794872e-06, "loss": 0.003, "step": 3535 }, { "epoch": 45.18849840255591, "grad_norm": 0.15692636370658875, "learning_rate": 1.8666666666666669e-06, "loss": 0.0017, "step": 3536 }, { "epoch": 45.201277955271564, "grad_norm": 0.28287068009376526, "learning_rate": 1.8615384615384616e-06, "loss": 0.0024, "step": 3537 }, { "epoch": 45.21405750798722, "grad_norm": 0.14750763773918152, "learning_rate": 1.8564102564102565e-06, "loss": 0.0014, "step": 3538 }, { "epoch": 45.22683706070288, "grad_norm": 0.15956076979637146, "learning_rate": 1.8512820512820516e-06, "loss": 0.0014, "step": 3539 }, { "epoch": 45.239616613418534, "grad_norm": 0.17729100584983826, "learning_rate": 1.8461538461538465e-06, "loss": 0.0018, "step": 3540 }, { "epoch": 45.25239616613418, "grad_norm": 0.13930313289165497, "learning_rate": 1.8410256410256411e-06, "loss": 0.0013, "step": 3541 }, { "epoch": 45.26517571884984, "grad_norm": 0.2076169103384018, "learning_rate": 1.835897435897436e-06, "loss": 0.0016, "step": 3542 }, { "epoch": 45.277955271565496, "grad_norm": 0.3562934100627899, "learning_rate": 1.830769230769231e-06, "loss": 0.0026, "step": 3543 }, { "epoch": 45.29073482428115, "grad_norm": 0.706603467464447, "learning_rate": 1.8256410256410258e-06, "loss": 0.0026, "step": 3544 }, { "epoch": 45.3035143769968, "grad_norm": 0.1821099817752838, "learning_rate": 1.8205128205128205e-06, "loss": 0.0021, "step": 3545 }, { "epoch": 45.31629392971246, "grad_norm": 0.12550444900989532, "learning_rate": 1.8153846153846154e-06, "loss": 0.0016, "step": 3546 }, { "epoch": 45.329073482428115, "grad_norm": 0.25906607508659363, "learning_rate": 1.8102564102564105e-06, "loss": 0.0022, "step": 3547 }, { "epoch": 45.34185303514377, "grad_norm": 0.20704346895217896, "learning_rate": 1.8051282051282054e-06, "loss": 0.0019, "step": 3548 }, { "epoch": 45.35463258785943, "grad_norm": 0.17126823961734772, "learning_rate": 1.8000000000000001e-06, "loss": 0.0022, "step": 3549 }, { "epoch": 45.36741214057508, "grad_norm": 3.5319011211395264, "learning_rate": 1.794871794871795e-06, "loss": 0.0223, "step": 3550 }, { "epoch": 45.38019169329073, "grad_norm": 3.9966418743133545, "learning_rate": 1.78974358974359e-06, "loss": 0.0119, "step": 3551 }, { "epoch": 45.39297124600639, "grad_norm": 0.8652167320251465, "learning_rate": 1.7846153846153846e-06, "loss": 0.0034, "step": 3552 }, { "epoch": 45.405750798722046, "grad_norm": 0.15536391735076904, "learning_rate": 1.7794871794871795e-06, "loss": 0.0017, "step": 3553 }, { "epoch": 45.4185303514377, "grad_norm": 0.26495033502578735, "learning_rate": 1.7743589743589746e-06, "loss": 0.002, "step": 3554 }, { "epoch": 45.43130990415335, "grad_norm": 0.209041565656662, "learning_rate": 1.7692307692307695e-06, "loss": 0.0018, "step": 3555 }, { "epoch": 45.44408945686901, "grad_norm": 0.2236473560333252, "learning_rate": 1.7641025641025642e-06, "loss": 0.0023, "step": 3556 }, { "epoch": 45.456869009584665, "grad_norm": 1.227617859840393, "learning_rate": 1.7589743589743591e-06, "loss": 0.0062, "step": 3557 }, { "epoch": 45.46964856230032, "grad_norm": 0.4426514208316803, "learning_rate": 1.753846153846154e-06, "loss": 0.0029, "step": 3558 }, { "epoch": 45.48242811501598, "grad_norm": 0.14357681572437286, "learning_rate": 1.748717948717949e-06, "loss": 0.0015, "step": 3559 }, { "epoch": 45.49520766773163, "grad_norm": 0.1984434574842453, "learning_rate": 1.7435897435897436e-06, "loss": 0.0019, "step": 3560 }, { "epoch": 45.50798722044728, "grad_norm": 0.4589730203151703, "learning_rate": 1.7384615384615385e-06, "loss": 0.0022, "step": 3561 }, { "epoch": 45.52076677316294, "grad_norm": 0.14460931718349457, "learning_rate": 1.7333333333333336e-06, "loss": 0.0015, "step": 3562 }, { "epoch": 45.533546325878596, "grad_norm": 0.18210144340991974, "learning_rate": 1.7282051282051285e-06, "loss": 0.0021, "step": 3563 }, { "epoch": 45.546325878594246, "grad_norm": 0.23533381521701813, "learning_rate": 1.7230769230769232e-06, "loss": 0.0016, "step": 3564 }, { "epoch": 45.5591054313099, "grad_norm": 0.17796681821346283, "learning_rate": 1.717948717948718e-06, "loss": 0.0018, "step": 3565 }, { "epoch": 45.57188498402556, "grad_norm": 0.29740259051322937, "learning_rate": 1.712820512820513e-06, "loss": 0.0019, "step": 3566 }, { "epoch": 45.584664536741215, "grad_norm": 0.11241334676742554, "learning_rate": 1.7076923076923077e-06, "loss": 0.0012, "step": 3567 }, { "epoch": 45.59744408945687, "grad_norm": 0.17197149991989136, "learning_rate": 1.7025641025641026e-06, "loss": 0.0018, "step": 3568 }, { "epoch": 45.61022364217252, "grad_norm": 0.207344189286232, "learning_rate": 1.6974358974358977e-06, "loss": 0.0021, "step": 3569 }, { "epoch": 45.62300319488818, "grad_norm": 0.1945933699607849, "learning_rate": 1.6923076923076926e-06, "loss": 0.0025, "step": 3570 }, { "epoch": 45.635782747603834, "grad_norm": 2.508556842803955, "learning_rate": 1.6871794871794873e-06, "loss": 0.0074, "step": 3571 }, { "epoch": 45.64856230031949, "grad_norm": 0.8266756534576416, "learning_rate": 1.6820512820512822e-06, "loss": 0.0024, "step": 3572 }, { "epoch": 45.66134185303515, "grad_norm": 1.3747810125350952, "learning_rate": 1.676923076923077e-06, "loss": 0.0058, "step": 3573 }, { "epoch": 45.674121405750796, "grad_norm": 1.8754050731658936, "learning_rate": 1.671794871794872e-06, "loss": 0.0096, "step": 3574 }, { "epoch": 45.68690095846645, "grad_norm": 0.19026875495910645, "learning_rate": 1.6666666666666667e-06, "loss": 0.0026, "step": 3575 }, { "epoch": 45.69968051118211, "grad_norm": 0.8473201394081116, "learning_rate": 1.6615384615384616e-06, "loss": 0.0042, "step": 3576 }, { "epoch": 45.712460063897765, "grad_norm": 0.47852179408073425, "learning_rate": 1.6564102564102567e-06, "loss": 0.0021, "step": 3577 }, { "epoch": 45.72523961661342, "grad_norm": 23.40967559814453, "learning_rate": 1.6512820512820516e-06, "loss": 0.0304, "step": 3578 }, { "epoch": 45.73801916932907, "grad_norm": 1.9984266757965088, "learning_rate": 1.6461538461538463e-06, "loss": 0.0097, "step": 3579 }, { "epoch": 45.75079872204473, "grad_norm": 0.2590010166168213, "learning_rate": 1.6410256410256412e-06, "loss": 0.0016, "step": 3580 }, { "epoch": 45.763578274760384, "grad_norm": 0.16944079101085663, "learning_rate": 1.635897435897436e-06, "loss": 0.0022, "step": 3581 }, { "epoch": 45.77635782747604, "grad_norm": 2.3917136192321777, "learning_rate": 1.6307692307692307e-06, "loss": 0.0101, "step": 3582 }, { "epoch": 45.78913738019169, "grad_norm": 0.2407619208097458, "learning_rate": 1.6256410256410256e-06, "loss": 0.0021, "step": 3583 }, { "epoch": 45.801916932907346, "grad_norm": 0.23970063030719757, "learning_rate": 1.6205128205128208e-06, "loss": 0.0026, "step": 3584 }, { "epoch": 45.814696485623, "grad_norm": 0.1650267094373703, "learning_rate": 1.6153846153846157e-06, "loss": 0.0015, "step": 3585 }, { "epoch": 45.82747603833866, "grad_norm": 0.2412625551223755, "learning_rate": 1.6102564102564103e-06, "loss": 0.0026, "step": 3586 }, { "epoch": 45.840255591054316, "grad_norm": 0.14923495054244995, "learning_rate": 1.6051282051282052e-06, "loss": 0.0017, "step": 3587 }, { "epoch": 45.853035143769965, "grad_norm": 5.012885570526123, "learning_rate": 1.6000000000000001e-06, "loss": 0.0028, "step": 3588 }, { "epoch": 45.86581469648562, "grad_norm": 0.2059669941663742, "learning_rate": 1.594871794871795e-06, "loss": 0.0018, "step": 3589 }, { "epoch": 45.87859424920128, "grad_norm": 0.44068247079849243, "learning_rate": 1.5897435897435897e-06, "loss": 0.0017, "step": 3590 }, { "epoch": 45.891373801916934, "grad_norm": 1.4092161655426025, "learning_rate": 1.5846153846153846e-06, "loss": 0.0074, "step": 3591 }, { "epoch": 45.90415335463259, "grad_norm": 0.23094308376312256, "learning_rate": 1.5794871794871797e-06, "loss": 0.0022, "step": 3592 }, { "epoch": 45.91693290734824, "grad_norm": 0.2290555089712143, "learning_rate": 1.5743589743589746e-06, "loss": 0.0023, "step": 3593 }, { "epoch": 45.9297124600639, "grad_norm": 0.1883171647787094, "learning_rate": 1.5692307692307693e-06, "loss": 0.0017, "step": 3594 }, { "epoch": 45.94249201277955, "grad_norm": 0.36363181471824646, "learning_rate": 1.5641025641025642e-06, "loss": 0.0019, "step": 3595 }, { "epoch": 45.95527156549521, "grad_norm": 0.18326343595981598, "learning_rate": 1.5589743589743591e-06, "loss": 0.0017, "step": 3596 }, { "epoch": 45.968051118210866, "grad_norm": 2.4599180221557617, "learning_rate": 1.5538461538461538e-06, "loss": 0.0098, "step": 3597 }, { "epoch": 45.980830670926515, "grad_norm": 0.5097063779830933, "learning_rate": 1.5487179487179487e-06, "loss": 0.0064, "step": 3598 }, { "epoch": 45.99361022364217, "grad_norm": 1.839652180671692, "learning_rate": 1.5435897435897438e-06, "loss": 0.01, "step": 3599 }, { "epoch": 46.00638977635783, "grad_norm": 1.2611196041107178, "learning_rate": 1.5384615384615387e-06, "loss": 0.008, "step": 3600 }, { "epoch": 46.00638977635783, "eval_loss": 1.0099000930786133, "eval_runtime": 183.9179, "eval_samples_per_second": 0.854, "eval_steps_per_second": 0.109, "step": 3600 }, { "epoch": 46.019169329073485, "grad_norm": 0.23989491164684296, "learning_rate": 1.5333333333333334e-06, "loss": 0.0017, "step": 3601 }, { "epoch": 46.031948881789134, "grad_norm": 1.742085576057434, "learning_rate": 1.5282051282051283e-06, "loss": 0.0039, "step": 3602 }, { "epoch": 46.04472843450479, "grad_norm": 0.4358189105987549, "learning_rate": 1.5230769230769232e-06, "loss": 0.0019, "step": 3603 }, { "epoch": 46.05750798722045, "grad_norm": 2.8469250202178955, "learning_rate": 1.517948717948718e-06, "loss": 0.0082, "step": 3604 }, { "epoch": 46.0702875399361, "grad_norm": 0.18406391143798828, "learning_rate": 1.5128205128205128e-06, "loss": 0.002, "step": 3605 }, { "epoch": 46.08306709265176, "grad_norm": 1.457541584968567, "learning_rate": 1.5076923076923077e-06, "loss": 0.0062, "step": 3606 }, { "epoch": 46.09584664536741, "grad_norm": 0.17311814427375793, "learning_rate": 1.5025641025641028e-06, "loss": 0.002, "step": 3607 }, { "epoch": 46.108626198083066, "grad_norm": 1.3626841306686401, "learning_rate": 1.4974358974358977e-06, "loss": 0.0047, "step": 3608 }, { "epoch": 46.12140575079872, "grad_norm": 0.17242155969142914, "learning_rate": 1.4923076923076924e-06, "loss": 0.0018, "step": 3609 }, { "epoch": 46.13418530351438, "grad_norm": 0.1784520000219345, "learning_rate": 1.4871794871794873e-06, "loss": 0.0014, "step": 3610 }, { "epoch": 46.146964856230035, "grad_norm": 2.5860233306884766, "learning_rate": 1.4820512820512822e-06, "loss": 0.0118, "step": 3611 }, { "epoch": 46.159744408945684, "grad_norm": 0.16604258120059967, "learning_rate": 1.476923076923077e-06, "loss": 0.002, "step": 3612 }, { "epoch": 46.17252396166134, "grad_norm": 0.20272715389728546, "learning_rate": 1.4717948717948718e-06, "loss": 0.0018, "step": 3613 }, { "epoch": 46.185303514377, "grad_norm": 0.21260668337345123, "learning_rate": 1.4666666666666669e-06, "loss": 0.0022, "step": 3614 }, { "epoch": 46.198083067092654, "grad_norm": 0.20811006426811218, "learning_rate": 1.4615384615384618e-06, "loss": 0.0022, "step": 3615 }, { "epoch": 46.21086261980831, "grad_norm": 0.15792424976825714, "learning_rate": 1.4564102564102565e-06, "loss": 0.0019, "step": 3616 }, { "epoch": 46.22364217252396, "grad_norm": 0.2316557615995407, "learning_rate": 1.4512820512820514e-06, "loss": 0.0029, "step": 3617 }, { "epoch": 46.236421725239616, "grad_norm": 0.17002829909324646, "learning_rate": 1.4461538461538463e-06, "loss": 0.0016, "step": 3618 }, { "epoch": 46.24920127795527, "grad_norm": 0.16938528418540955, "learning_rate": 1.4410256410256412e-06, "loss": 0.0017, "step": 3619 }, { "epoch": 46.26198083067093, "grad_norm": 0.1708260029554367, "learning_rate": 1.4358974358974359e-06, "loss": 0.0016, "step": 3620 }, { "epoch": 46.27476038338658, "grad_norm": 0.1572258621454239, "learning_rate": 1.4307692307692308e-06, "loss": 0.0014, "step": 3621 }, { "epoch": 46.287539936102235, "grad_norm": 0.2195149064064026, "learning_rate": 1.4256410256410259e-06, "loss": 0.0016, "step": 3622 }, { "epoch": 46.30031948881789, "grad_norm": 0.19424758851528168, "learning_rate": 1.4205128205128208e-06, "loss": 0.0015, "step": 3623 }, { "epoch": 46.31309904153355, "grad_norm": 3.137338161468506, "learning_rate": 1.4153846153846155e-06, "loss": 0.0129, "step": 3624 }, { "epoch": 46.325878594249204, "grad_norm": 0.14023944735527039, "learning_rate": 1.4102564102564104e-06, "loss": 0.0013, "step": 3625 }, { "epoch": 46.33865814696485, "grad_norm": 0.17934924364089966, "learning_rate": 1.4051282051282052e-06, "loss": 0.0015, "step": 3626 }, { "epoch": 46.35143769968051, "grad_norm": 2.365750312805176, "learning_rate": 1.4000000000000001e-06, "loss": 0.006, "step": 3627 }, { "epoch": 46.364217252396166, "grad_norm": 0.4310351014137268, "learning_rate": 1.3948717948717948e-06, "loss": 0.0039, "step": 3628 }, { "epoch": 46.37699680511182, "grad_norm": 0.18593768775463104, "learning_rate": 1.38974358974359e-06, "loss": 0.0022, "step": 3629 }, { "epoch": 46.38977635782748, "grad_norm": 0.15974895656108856, "learning_rate": 1.3846153846153848e-06, "loss": 0.0017, "step": 3630 }, { "epoch": 46.40255591054313, "grad_norm": 0.4098556935787201, "learning_rate": 1.3794871794871795e-06, "loss": 0.0055, "step": 3631 }, { "epoch": 46.415335463258785, "grad_norm": 0.2783844470977783, "learning_rate": 1.3743589743589744e-06, "loss": 0.0025, "step": 3632 }, { "epoch": 46.42811501597444, "grad_norm": 1.6561388969421387, "learning_rate": 1.3692307692307693e-06, "loss": 0.006, "step": 3633 }, { "epoch": 46.4408945686901, "grad_norm": 0.14973992109298706, "learning_rate": 1.3641025641025642e-06, "loss": 0.0015, "step": 3634 }, { "epoch": 46.453674121405754, "grad_norm": 0.16307753324508667, "learning_rate": 1.358974358974359e-06, "loss": 0.0018, "step": 3635 }, { "epoch": 46.466453674121404, "grad_norm": 3.942716598510742, "learning_rate": 1.3538461538461538e-06, "loss": 0.0208, "step": 3636 }, { "epoch": 46.47923322683706, "grad_norm": 0.33902183175086975, "learning_rate": 1.348717948717949e-06, "loss": 0.0023, "step": 3637 }, { "epoch": 46.49201277955272, "grad_norm": 0.12845934927463531, "learning_rate": 1.3435897435897438e-06, "loss": 0.0014, "step": 3638 }, { "epoch": 46.50479233226837, "grad_norm": 1.4974967241287231, "learning_rate": 1.3384615384615385e-06, "loss": 0.005, "step": 3639 }, { "epoch": 46.51757188498402, "grad_norm": 0.1921854466199875, "learning_rate": 1.3333333333333334e-06, "loss": 0.0025, "step": 3640 }, { "epoch": 46.53035143769968, "grad_norm": 0.2795115113258362, "learning_rate": 1.3282051282051283e-06, "loss": 0.002, "step": 3641 }, { "epoch": 46.543130990415335, "grad_norm": 0.17906340956687927, "learning_rate": 1.3230769230769232e-06, "loss": 0.0023, "step": 3642 }, { "epoch": 46.55591054313099, "grad_norm": 0.16304001212120056, "learning_rate": 1.317948717948718e-06, "loss": 0.0019, "step": 3643 }, { "epoch": 46.56869009584665, "grad_norm": 2.5445923805236816, "learning_rate": 1.312820512820513e-06, "loss": 0.0069, "step": 3644 }, { "epoch": 46.5814696485623, "grad_norm": 0.23029464483261108, "learning_rate": 1.307692307692308e-06, "loss": 0.0024, "step": 3645 }, { "epoch": 46.594249201277954, "grad_norm": 0.10605199635028839, "learning_rate": 1.3025641025641026e-06, "loss": 0.0011, "step": 3646 }, { "epoch": 46.60702875399361, "grad_norm": 2.268832206726074, "learning_rate": 1.2974358974358975e-06, "loss": 0.0107, "step": 3647 }, { "epoch": 46.61980830670927, "grad_norm": 1.6812770366668701, "learning_rate": 1.2923076923076924e-06, "loss": 0.0085, "step": 3648 }, { "epoch": 46.63258785942492, "grad_norm": 0.11973489075899124, "learning_rate": 1.2871794871794873e-06, "loss": 0.0013, "step": 3649 }, { "epoch": 46.64536741214057, "grad_norm": 0.11298532783985138, "learning_rate": 1.282051282051282e-06, "loss": 0.0012, "step": 3650 }, { "epoch": 46.65814696485623, "grad_norm": 0.19396765530109406, "learning_rate": 1.2769230769230769e-06, "loss": 0.0021, "step": 3651 }, { "epoch": 46.670926517571885, "grad_norm": 0.265076220035553, "learning_rate": 1.271794871794872e-06, "loss": 0.0027, "step": 3652 }, { "epoch": 46.68370607028754, "grad_norm": 0.14036832749843597, "learning_rate": 1.2666666666666669e-06, "loss": 0.0011, "step": 3653 }, { "epoch": 46.6964856230032, "grad_norm": 0.18519698083400726, "learning_rate": 1.2615384615384616e-06, "loss": 0.0024, "step": 3654 }, { "epoch": 46.70926517571885, "grad_norm": 0.8766704201698303, "learning_rate": 1.2564102564102565e-06, "loss": 0.0097, "step": 3655 }, { "epoch": 46.722044728434504, "grad_norm": 0.16572509706020355, "learning_rate": 1.2512820512820514e-06, "loss": 0.0016, "step": 3656 }, { "epoch": 46.73482428115016, "grad_norm": 0.2027369737625122, "learning_rate": 1.2461538461538463e-06, "loss": 0.0023, "step": 3657 }, { "epoch": 46.74760383386582, "grad_norm": 1.8083834648132324, "learning_rate": 1.2410256410256412e-06, "loss": 0.0049, "step": 3658 }, { "epoch": 46.760383386581466, "grad_norm": 0.19146762788295746, "learning_rate": 1.235897435897436e-06, "loss": 0.002, "step": 3659 }, { "epoch": 46.77316293929712, "grad_norm": 1.8865890502929688, "learning_rate": 1.230769230769231e-06, "loss": 0.0048, "step": 3660 }, { "epoch": 46.78594249201278, "grad_norm": 0.19453971087932587, "learning_rate": 1.2256410256410257e-06, "loss": 0.002, "step": 3661 }, { "epoch": 46.798722044728436, "grad_norm": 2.282036304473877, "learning_rate": 1.2205128205128206e-06, "loss": 0.0095, "step": 3662 }, { "epoch": 46.81150159744409, "grad_norm": 1.6410950422286987, "learning_rate": 1.2153846153846155e-06, "loss": 0.0076, "step": 3663 }, { "epoch": 46.82428115015974, "grad_norm": 0.8596600890159607, "learning_rate": 1.2102564102564104e-06, "loss": 0.0039, "step": 3664 }, { "epoch": 46.8370607028754, "grad_norm": 0.16360537707805634, "learning_rate": 1.2051282051282053e-06, "loss": 0.0014, "step": 3665 }, { "epoch": 46.849840255591054, "grad_norm": 0.18147307634353638, "learning_rate": 1.2000000000000002e-06, "loss": 0.0018, "step": 3666 }, { "epoch": 46.86261980830671, "grad_norm": 0.16816723346710205, "learning_rate": 1.194871794871795e-06, "loss": 0.0015, "step": 3667 }, { "epoch": 46.87539936102237, "grad_norm": 1.8942514657974243, "learning_rate": 1.1897435897435897e-06, "loss": 0.0084, "step": 3668 }, { "epoch": 46.88817891373802, "grad_norm": 0.14104534685611725, "learning_rate": 1.1846153846153849e-06, "loss": 0.0014, "step": 3669 }, { "epoch": 46.90095846645367, "grad_norm": 0.18655520677566528, "learning_rate": 1.1794871794871795e-06, "loss": 0.0017, "step": 3670 }, { "epoch": 46.91373801916933, "grad_norm": 0.3814006447792053, "learning_rate": 1.1743589743589744e-06, "loss": 0.0027, "step": 3671 }, { "epoch": 46.926517571884986, "grad_norm": 0.2369621992111206, "learning_rate": 1.1692307692307693e-06, "loss": 0.002, "step": 3672 }, { "epoch": 46.93929712460064, "grad_norm": 2.7895922660827637, "learning_rate": 1.1641025641025642e-06, "loss": 0.008, "step": 3673 }, { "epoch": 46.95207667731629, "grad_norm": 0.2802821695804596, "learning_rate": 1.1589743589743591e-06, "loss": 0.0028, "step": 3674 }, { "epoch": 46.96485623003195, "grad_norm": 0.1200646385550499, "learning_rate": 1.153846153846154e-06, "loss": 0.0013, "step": 3675 }, { "epoch": 46.977635782747605, "grad_norm": 0.19148743152618408, "learning_rate": 1.1487179487179487e-06, "loss": 0.0022, "step": 3676 }, { "epoch": 46.99041533546326, "grad_norm": 1.2269463539123535, "learning_rate": 1.1435897435897436e-06, "loss": 0.0064, "step": 3677 }, { "epoch": 47.00319488817891, "grad_norm": 0.25025907158851624, "learning_rate": 1.1384615384615385e-06, "loss": 0.0024, "step": 3678 }, { "epoch": 47.01597444089457, "grad_norm": 0.17812982201576233, "learning_rate": 1.1333333333333334e-06, "loss": 0.0016, "step": 3679 }, { "epoch": 47.02875399361022, "grad_norm": 0.23735368251800537, "learning_rate": 1.1282051282051283e-06, "loss": 0.002, "step": 3680 }, { "epoch": 47.04153354632588, "grad_norm": 2.0460994243621826, "learning_rate": 1.1230769230769232e-06, "loss": 0.0078, "step": 3681 }, { "epoch": 47.054313099041536, "grad_norm": 0.13436773419380188, "learning_rate": 1.1179487179487181e-06, "loss": 0.0014, "step": 3682 }, { "epoch": 47.067092651757186, "grad_norm": 0.24940666556358337, "learning_rate": 1.1128205128205128e-06, "loss": 0.0023, "step": 3683 }, { "epoch": 47.07987220447284, "grad_norm": 0.16844338178634644, "learning_rate": 1.107692307692308e-06, "loss": 0.0016, "step": 3684 }, { "epoch": 47.0926517571885, "grad_norm": 0.18543598055839539, "learning_rate": 1.1025641025641026e-06, "loss": 0.0019, "step": 3685 }, { "epoch": 47.105431309904155, "grad_norm": 1.0382249355316162, "learning_rate": 1.0974358974358975e-06, "loss": 0.0056, "step": 3686 }, { "epoch": 47.11821086261981, "grad_norm": 0.4113228917121887, "learning_rate": 1.0923076923076924e-06, "loss": 0.0054, "step": 3687 }, { "epoch": 47.13099041533546, "grad_norm": 1.7715373039245605, "learning_rate": 1.0871794871794873e-06, "loss": 0.0084, "step": 3688 }, { "epoch": 47.14376996805112, "grad_norm": 0.19282501935958862, "learning_rate": 1.0820512820512822e-06, "loss": 0.0016, "step": 3689 }, { "epoch": 47.156549520766774, "grad_norm": 0.2740253210067749, "learning_rate": 1.076923076923077e-06, "loss": 0.0026, "step": 3690 }, { "epoch": 47.16932907348243, "grad_norm": 1.1587003469467163, "learning_rate": 1.0717948717948718e-06, "loss": 0.0037, "step": 3691 }, { "epoch": 47.18210862619808, "grad_norm": 0.19705770909786224, "learning_rate": 1.066666666666667e-06, "loss": 0.0021, "step": 3692 }, { "epoch": 47.194888178913736, "grad_norm": 0.14728839695453644, "learning_rate": 1.0615384615384616e-06, "loss": 0.002, "step": 3693 }, { "epoch": 47.20766773162939, "grad_norm": 1.7193306684494019, "learning_rate": 1.0564102564102565e-06, "loss": 0.0053, "step": 3694 }, { "epoch": 47.22044728434505, "grad_norm": 0.907518744468689, "learning_rate": 1.0512820512820514e-06, "loss": 0.0031, "step": 3695 }, { "epoch": 47.233226837060705, "grad_norm": 0.2023201584815979, "learning_rate": 1.0461538461538463e-06, "loss": 0.0019, "step": 3696 }, { "epoch": 47.246006389776355, "grad_norm": 0.13571235537528992, "learning_rate": 1.0410256410256412e-06, "loss": 0.0016, "step": 3697 }, { "epoch": 47.25878594249201, "grad_norm": 2.384904146194458, "learning_rate": 1.0358974358974359e-06, "loss": 0.0108, "step": 3698 }, { "epoch": 47.27156549520767, "grad_norm": 0.15866927802562714, "learning_rate": 1.030769230769231e-06, "loss": 0.0017, "step": 3699 }, { "epoch": 47.284345047923324, "grad_norm": 1.8669986724853516, "learning_rate": 1.0256410256410257e-06, "loss": 0.0062, "step": 3700 }, { "epoch": 47.284345047923324, "eval_loss": 1.00686514377594, "eval_runtime": 184.0665, "eval_samples_per_second": 0.853, "eval_steps_per_second": 0.109, "step": 3700 }, { "epoch": 47.29712460063898, "grad_norm": 0.30873823165893555, "learning_rate": 1.0205128205128206e-06, "loss": 0.0023, "step": 3701 }, { "epoch": 47.30990415335463, "grad_norm": 0.1505894660949707, "learning_rate": 1.0153846153846155e-06, "loss": 0.0018, "step": 3702 }, { "epoch": 47.322683706070286, "grad_norm": 2.2037155628204346, "learning_rate": 1.0102564102564104e-06, "loss": 0.0098, "step": 3703 }, { "epoch": 47.33546325878594, "grad_norm": 1.8789855241775513, "learning_rate": 1.0051282051282053e-06, "loss": 0.0066, "step": 3704 }, { "epoch": 47.3482428115016, "grad_norm": 0.1800302118062973, "learning_rate": 1.0000000000000002e-06, "loss": 0.0022, "step": 3705 }, { "epoch": 47.361022364217256, "grad_norm": 0.16504280269145966, "learning_rate": 9.948717948717949e-07, "loss": 0.0019, "step": 3706 }, { "epoch": 47.373801916932905, "grad_norm": 0.41811108589172363, "learning_rate": 9.8974358974359e-07, "loss": 0.0035, "step": 3707 }, { "epoch": 47.38658146964856, "grad_norm": 0.09904661029577255, "learning_rate": 9.846153846153847e-07, "loss": 0.0011, "step": 3708 }, { "epoch": 47.39936102236422, "grad_norm": 0.21644745767116547, "learning_rate": 9.794871794871796e-07, "loss": 0.0023, "step": 3709 }, { "epoch": 47.412140575079874, "grad_norm": 0.1227216124534607, "learning_rate": 9.743589743589745e-07, "loss": 0.0012, "step": 3710 }, { "epoch": 47.424920127795524, "grad_norm": 0.15440890192985535, "learning_rate": 9.692307692307693e-07, "loss": 0.0017, "step": 3711 }, { "epoch": 47.43769968051118, "grad_norm": 0.19403807818889618, "learning_rate": 9.641025641025642e-07, "loss": 0.0021, "step": 3712 }, { "epoch": 47.45047923322684, "grad_norm": 1.5223453044891357, "learning_rate": 9.58974358974359e-07, "loss": 0.0081, "step": 3713 }, { "epoch": 47.46325878594249, "grad_norm": 0.12501774728298187, "learning_rate": 9.53846153846154e-07, "loss": 0.0012, "step": 3714 }, { "epoch": 47.47603833865815, "grad_norm": 0.9358645677566528, "learning_rate": 9.487179487179487e-07, "loss": 0.0045, "step": 3715 }, { "epoch": 47.4888178913738, "grad_norm": 0.18395741283893585, "learning_rate": 9.435897435897437e-07, "loss": 0.0019, "step": 3716 }, { "epoch": 47.501597444089455, "grad_norm": 0.6766879558563232, "learning_rate": 9.384615384615385e-07, "loss": 0.0036, "step": 3717 }, { "epoch": 47.51437699680511, "grad_norm": 0.3552172780036926, "learning_rate": 9.333333333333334e-07, "loss": 0.0019, "step": 3718 }, { "epoch": 47.52715654952077, "grad_norm": 1.697737693786621, "learning_rate": 9.282051282051282e-07, "loss": 0.0047, "step": 3719 }, { "epoch": 47.539936102236425, "grad_norm": 3.0718917846679688, "learning_rate": 9.230769230769232e-07, "loss": 0.0088, "step": 3720 }, { "epoch": 47.552715654952074, "grad_norm": 0.14367461204528809, "learning_rate": 9.17948717948718e-07, "loss": 0.0016, "step": 3721 }, { "epoch": 47.56549520766773, "grad_norm": 0.5913472175598145, "learning_rate": 9.128205128205129e-07, "loss": 0.003, "step": 3722 }, { "epoch": 47.57827476038339, "grad_norm": 0.17336972057819366, "learning_rate": 9.076923076923077e-07, "loss": 0.002, "step": 3723 }, { "epoch": 47.59105431309904, "grad_norm": 0.21685920655727386, "learning_rate": 9.025641025641027e-07, "loss": 0.0021, "step": 3724 }, { "epoch": 47.6038338658147, "grad_norm": 1.3096964359283447, "learning_rate": 8.974358974358975e-07, "loss": 0.0048, "step": 3725 }, { "epoch": 47.61661341853035, "grad_norm": 0.20485009253025055, "learning_rate": 8.923076923076923e-07, "loss": 0.0018, "step": 3726 }, { "epoch": 47.629392971246006, "grad_norm": 0.14636144042015076, "learning_rate": 8.871794871794873e-07, "loss": 0.0017, "step": 3727 }, { "epoch": 47.64217252396166, "grad_norm": 0.14755165576934814, "learning_rate": 8.820512820512821e-07, "loss": 0.0015, "step": 3728 }, { "epoch": 47.65495207667732, "grad_norm": 0.23167769610881805, "learning_rate": 8.76923076923077e-07, "loss": 0.002, "step": 3729 }, { "epoch": 47.66773162939297, "grad_norm": 0.1636737734079361, "learning_rate": 8.717948717948718e-07, "loss": 0.0019, "step": 3730 }, { "epoch": 47.680511182108624, "grad_norm": 0.1251421570777893, "learning_rate": 8.666666666666668e-07, "loss": 0.0012, "step": 3731 }, { "epoch": 47.69329073482428, "grad_norm": 2.23331618309021, "learning_rate": 8.615384615384616e-07, "loss": 0.007, "step": 3732 }, { "epoch": 47.70607028753994, "grad_norm": 0.22965694963932037, "learning_rate": 8.564102564102565e-07, "loss": 0.0019, "step": 3733 }, { "epoch": 47.718849840255594, "grad_norm": 1.785829782485962, "learning_rate": 8.512820512820513e-07, "loss": 0.0062, "step": 3734 }, { "epoch": 47.73162939297124, "grad_norm": 0.25463688373565674, "learning_rate": 8.461538461538463e-07, "loss": 0.0031, "step": 3735 }, { "epoch": 47.7444089456869, "grad_norm": 0.28074225783348083, "learning_rate": 8.410256410256411e-07, "loss": 0.0016, "step": 3736 }, { "epoch": 47.757188498402556, "grad_norm": 0.26788008213043213, "learning_rate": 8.35897435897436e-07, "loss": 0.0021, "step": 3737 }, { "epoch": 47.76996805111821, "grad_norm": 0.20389509201049805, "learning_rate": 8.307692307692308e-07, "loss": 0.002, "step": 3738 }, { "epoch": 47.78274760383387, "grad_norm": 0.21783320605754852, "learning_rate": 8.256410256410258e-07, "loss": 0.0021, "step": 3739 }, { "epoch": 47.79552715654952, "grad_norm": 0.23883669078350067, "learning_rate": 8.205128205128206e-07, "loss": 0.0017, "step": 3740 }, { "epoch": 47.808306709265175, "grad_norm": 1.630712628364563, "learning_rate": 8.153846153846154e-07, "loss": 0.0082, "step": 3741 }, { "epoch": 47.82108626198083, "grad_norm": 2.074826955795288, "learning_rate": 8.102564102564104e-07, "loss": 0.0098, "step": 3742 }, { "epoch": 47.83386581469649, "grad_norm": 0.18601283431053162, "learning_rate": 8.051282051282052e-07, "loss": 0.0016, "step": 3743 }, { "epoch": 47.846645367412144, "grad_norm": 0.4480161666870117, "learning_rate": 8.000000000000001e-07, "loss": 0.0028, "step": 3744 }, { "epoch": 47.85942492012779, "grad_norm": 1.5206952095031738, "learning_rate": 7.948717948717949e-07, "loss": 0.0081, "step": 3745 }, { "epoch": 47.87220447284345, "grad_norm": 0.16170930862426758, "learning_rate": 7.897435897435899e-07, "loss": 0.0014, "step": 3746 }, { "epoch": 47.884984025559106, "grad_norm": 1.4309064149856567, "learning_rate": 7.846153846153847e-07, "loss": 0.0044, "step": 3747 }, { "epoch": 47.89776357827476, "grad_norm": 0.2414461225271225, "learning_rate": 7.794871794871796e-07, "loss": 0.0014, "step": 3748 }, { "epoch": 47.91054313099041, "grad_norm": 2.7287521362304688, "learning_rate": 7.743589743589744e-07, "loss": 0.0102, "step": 3749 }, { "epoch": 47.92332268370607, "grad_norm": 1.3179041147232056, "learning_rate": 7.692307692307694e-07, "loss": 0.0056, "step": 3750 }, { "epoch": 47.936102236421725, "grad_norm": 1.9829533100128174, "learning_rate": 7.641025641025642e-07, "loss": 0.0047, "step": 3751 }, { "epoch": 47.94888178913738, "grad_norm": 3.517019271850586, "learning_rate": 7.58974358974359e-07, "loss": 0.0112, "step": 3752 }, { "epoch": 47.96166134185304, "grad_norm": 0.16058802604675293, "learning_rate": 7.538461538461538e-07, "loss": 0.0013, "step": 3753 }, { "epoch": 47.97444089456869, "grad_norm": 0.1638914942741394, "learning_rate": 7.487179487179488e-07, "loss": 0.0017, "step": 3754 }, { "epoch": 47.98722044728434, "grad_norm": 0.17627263069152832, "learning_rate": 7.435897435897436e-07, "loss": 0.0017, "step": 3755 }, { "epoch": 48.0, "grad_norm": 2.2881815433502197, "learning_rate": 7.384615384615385e-07, "loss": 0.01, "step": 3756 }, { "epoch": 48.01277955271566, "grad_norm": 0.9438730478286743, "learning_rate": 7.333333333333334e-07, "loss": 0.0043, "step": 3757 }, { "epoch": 48.02555910543131, "grad_norm": 0.1496344655752182, "learning_rate": 7.282051282051282e-07, "loss": 0.0016, "step": 3758 }, { "epoch": 48.03833865814696, "grad_norm": 0.17239366471767426, "learning_rate": 7.230769230769231e-07, "loss": 0.0014, "step": 3759 }, { "epoch": 48.05111821086262, "grad_norm": 0.15948452055454254, "learning_rate": 7.179487179487179e-07, "loss": 0.0017, "step": 3760 }, { "epoch": 48.063897763578275, "grad_norm": 1.7323073148727417, "learning_rate": 7.128205128205129e-07, "loss": 0.0074, "step": 3761 }, { "epoch": 48.07667731629393, "grad_norm": 0.22783014178276062, "learning_rate": 7.076923076923077e-07, "loss": 0.0019, "step": 3762 }, { "epoch": 48.08945686900959, "grad_norm": 0.209009051322937, "learning_rate": 7.025641025641026e-07, "loss": 0.0021, "step": 3763 }, { "epoch": 48.10223642172524, "grad_norm": 0.13151539862155914, "learning_rate": 6.974358974358974e-07, "loss": 0.0016, "step": 3764 }, { "epoch": 48.115015974440894, "grad_norm": 0.1741136908531189, "learning_rate": 6.923076923076924e-07, "loss": 0.0019, "step": 3765 }, { "epoch": 48.12779552715655, "grad_norm": 2.1559276580810547, "learning_rate": 6.871794871794872e-07, "loss": 0.0051, "step": 3766 }, { "epoch": 48.14057507987221, "grad_norm": 0.145247682929039, "learning_rate": 6.820512820512821e-07, "loss": 0.0018, "step": 3767 }, { "epoch": 48.153354632587856, "grad_norm": 2.345256805419922, "learning_rate": 6.769230769230769e-07, "loss": 0.008, "step": 3768 }, { "epoch": 48.16613418530351, "grad_norm": 0.1540064513683319, "learning_rate": 6.717948717948719e-07, "loss": 0.0019, "step": 3769 }, { "epoch": 48.17891373801917, "grad_norm": 1.0705827474594116, "learning_rate": 6.666666666666667e-07, "loss": 0.0024, "step": 3770 }, { "epoch": 48.191693290734825, "grad_norm": 0.13630788028240204, "learning_rate": 6.615384615384616e-07, "loss": 0.0012, "step": 3771 }, { "epoch": 48.20447284345048, "grad_norm": 0.15815885365009308, "learning_rate": 6.564102564102565e-07, "loss": 0.002, "step": 3772 }, { "epoch": 48.21725239616613, "grad_norm": 0.14926832914352417, "learning_rate": 6.512820512820513e-07, "loss": 0.0015, "step": 3773 }, { "epoch": 48.23003194888179, "grad_norm": 0.2645859122276306, "learning_rate": 6.461538461538462e-07, "loss": 0.0023, "step": 3774 }, { "epoch": 48.242811501597444, "grad_norm": 0.1782495379447937, "learning_rate": 6.41025641025641e-07, "loss": 0.0015, "step": 3775 }, { "epoch": 48.2555910543131, "grad_norm": 0.15166476368904114, "learning_rate": 6.35897435897436e-07, "loss": 0.0014, "step": 3776 }, { "epoch": 48.26837060702876, "grad_norm": 0.2873496115207672, "learning_rate": 6.307692307692308e-07, "loss": 0.0018, "step": 3777 }, { "epoch": 48.281150159744406, "grad_norm": 2.4240097999572754, "learning_rate": 6.256410256410257e-07, "loss": 0.0062, "step": 3778 }, { "epoch": 48.29392971246006, "grad_norm": 0.18980348110198975, "learning_rate": 6.205128205128206e-07, "loss": 0.0016, "step": 3779 }, { "epoch": 48.30670926517572, "grad_norm": 0.17332720756530762, "learning_rate": 6.153846153846155e-07, "loss": 0.0013, "step": 3780 }, { "epoch": 48.319488817891376, "grad_norm": 0.14826764166355133, "learning_rate": 6.102564102564103e-07, "loss": 0.0015, "step": 3781 }, { "epoch": 48.33226837060703, "grad_norm": 0.1659523993730545, "learning_rate": 6.051282051282052e-07, "loss": 0.0017, "step": 3782 }, { "epoch": 48.34504792332268, "grad_norm": 0.9668567180633545, "learning_rate": 6.000000000000001e-07, "loss": 0.0032, "step": 3783 }, { "epoch": 48.35782747603834, "grad_norm": 0.24613407254219055, "learning_rate": 5.948717948717949e-07, "loss": 0.0022, "step": 3784 }, { "epoch": 48.370607028753994, "grad_norm": 1.009360671043396, "learning_rate": 5.897435897435898e-07, "loss": 0.0041, "step": 3785 }, { "epoch": 48.38338658146965, "grad_norm": 0.20935726165771484, "learning_rate": 5.846153846153847e-07, "loss": 0.0021, "step": 3786 }, { "epoch": 48.3961661341853, "grad_norm": 2.7185006141662598, "learning_rate": 5.794871794871796e-07, "loss": 0.0061, "step": 3787 }, { "epoch": 48.40894568690096, "grad_norm": 0.20508651435375214, "learning_rate": 5.743589743589744e-07, "loss": 0.0023, "step": 3788 }, { "epoch": 48.42172523961661, "grad_norm": 1.8751914501190186, "learning_rate": 5.692307692307693e-07, "loss": 0.0052, "step": 3789 }, { "epoch": 48.43450479233227, "grad_norm": 0.1863090544939041, "learning_rate": 5.641025641025642e-07, "loss": 0.002, "step": 3790 }, { "epoch": 48.447284345047926, "grad_norm": 0.16236571967601776, "learning_rate": 5.589743589743591e-07, "loss": 0.0017, "step": 3791 }, { "epoch": 48.460063897763575, "grad_norm": 0.15038584172725677, "learning_rate": 5.53846153846154e-07, "loss": 0.0016, "step": 3792 }, { "epoch": 48.47284345047923, "grad_norm": 0.32027992606163025, "learning_rate": 5.487179487179488e-07, "loss": 0.0034, "step": 3793 }, { "epoch": 48.48562300319489, "grad_norm": 0.18535518646240234, "learning_rate": 5.435897435897437e-07, "loss": 0.0018, "step": 3794 }, { "epoch": 48.498402555910545, "grad_norm": 2.3728909492492676, "learning_rate": 5.384615384615386e-07, "loss": 0.0102, "step": 3795 }, { "epoch": 48.5111821086262, "grad_norm": 0.17245027422904968, "learning_rate": 5.333333333333335e-07, "loss": 0.0017, "step": 3796 }, { "epoch": 48.52396166134185, "grad_norm": 0.15697172284126282, "learning_rate": 5.282051282051282e-07, "loss": 0.0013, "step": 3797 }, { "epoch": 48.53674121405751, "grad_norm": 1.6945269107818604, "learning_rate": 5.230769230769231e-07, "loss": 0.007, "step": 3798 }, { "epoch": 48.54952076677316, "grad_norm": 1.6684554815292358, "learning_rate": 5.179487179487179e-07, "loss": 0.0046, "step": 3799 }, { "epoch": 48.56230031948882, "grad_norm": 0.24569877982139587, "learning_rate": 5.128205128205128e-07, "loss": 0.0019, "step": 3800 }, { "epoch": 48.56230031948882, "eval_loss": 1.0110944509506226, "eval_runtime": 183.2434, "eval_samples_per_second": 0.857, "eval_steps_per_second": 0.109, "step": 3800 }, { "epoch": 48.575079872204476, "grad_norm": 0.16229873895645142, "learning_rate": 5.076923076923077e-07, "loss": 0.002, "step": 3801 }, { "epoch": 48.587859424920126, "grad_norm": 0.19286809861660004, "learning_rate": 5.025641025641026e-07, "loss": 0.0018, "step": 3802 }, { "epoch": 48.60063897763578, "grad_norm": 0.14536385238170624, "learning_rate": 4.974358974358974e-07, "loss": 0.0016, "step": 3803 }, { "epoch": 48.61341853035144, "grad_norm": 0.17883838713169098, "learning_rate": 4.923076923076923e-07, "loss": 0.0015, "step": 3804 }, { "epoch": 48.626198083067095, "grad_norm": 0.1891726702451706, "learning_rate": 4.871794871794872e-07, "loss": 0.002, "step": 3805 }, { "epoch": 48.638977635782744, "grad_norm": 0.13854210078716278, "learning_rate": 4.820512820512821e-07, "loss": 0.0014, "step": 3806 }, { "epoch": 48.6517571884984, "grad_norm": 0.21373547613620758, "learning_rate": 4.76923076923077e-07, "loss": 0.0015, "step": 3807 }, { "epoch": 48.66453674121406, "grad_norm": 26.711471557617188, "learning_rate": 4.7179487179487187e-07, "loss": 0.0052, "step": 3808 }, { "epoch": 48.677316293929714, "grad_norm": 0.8255504965782166, "learning_rate": 4.666666666666667e-07, "loss": 0.0032, "step": 3809 }, { "epoch": 48.69009584664537, "grad_norm": 0.15420027077198029, "learning_rate": 4.615384615384616e-07, "loss": 0.0015, "step": 3810 }, { "epoch": 48.70287539936102, "grad_norm": 0.19408243894577026, "learning_rate": 4.5641025641025646e-07, "loss": 0.0019, "step": 3811 }, { "epoch": 48.715654952076676, "grad_norm": 0.4737688899040222, "learning_rate": 4.5128205128205136e-07, "loss": 0.006, "step": 3812 }, { "epoch": 48.72843450479233, "grad_norm": 2.072347640991211, "learning_rate": 4.4615384615384615e-07, "loss": 0.01, "step": 3813 }, { "epoch": 48.74121405750799, "grad_norm": 0.18484090268611908, "learning_rate": 4.4102564102564105e-07, "loss": 0.0016, "step": 3814 }, { "epoch": 48.753993610223645, "grad_norm": 0.1918613314628601, "learning_rate": 4.358974358974359e-07, "loss": 0.0021, "step": 3815 }, { "epoch": 48.766773162939295, "grad_norm": 0.17631064355373383, "learning_rate": 4.307692307692308e-07, "loss": 0.0012, "step": 3816 }, { "epoch": 48.77955271565495, "grad_norm": 0.45219704508781433, "learning_rate": 4.2564102564102564e-07, "loss": 0.0015, "step": 3817 }, { "epoch": 48.79233226837061, "grad_norm": 0.22593191266059875, "learning_rate": 4.2051282051282054e-07, "loss": 0.0029, "step": 3818 }, { "epoch": 48.805111821086264, "grad_norm": 0.9036332964897156, "learning_rate": 4.153846153846154e-07, "loss": 0.0053, "step": 3819 }, { "epoch": 48.81789137380191, "grad_norm": 2.239448070526123, "learning_rate": 4.102564102564103e-07, "loss": 0.0122, "step": 3820 }, { "epoch": 48.83067092651757, "grad_norm": 0.9289953708648682, "learning_rate": 4.051282051282052e-07, "loss": 0.0038, "step": 3821 }, { "epoch": 48.843450479233226, "grad_norm": 0.18843135237693787, "learning_rate": 4.0000000000000003e-07, "loss": 0.0018, "step": 3822 }, { "epoch": 48.85623003194888, "grad_norm": 0.12309607118368149, "learning_rate": 3.9487179487179493e-07, "loss": 0.0014, "step": 3823 }, { "epoch": 48.86900958466454, "grad_norm": 0.1520330011844635, "learning_rate": 3.897435897435898e-07, "loss": 0.0017, "step": 3824 }, { "epoch": 48.88178913738019, "grad_norm": 2.11013126373291, "learning_rate": 3.846153846153847e-07, "loss": 0.0063, "step": 3825 }, { "epoch": 48.894568690095845, "grad_norm": 1.5195311307907104, "learning_rate": 3.794871794871795e-07, "loss": 0.0054, "step": 3826 }, { "epoch": 48.9073482428115, "grad_norm": 3.264005184173584, "learning_rate": 3.743589743589744e-07, "loss": 0.0156, "step": 3827 }, { "epoch": 48.92012779552716, "grad_norm": 2.1601943969726562, "learning_rate": 3.6923076923076927e-07, "loss": 0.0087, "step": 3828 }, { "epoch": 48.932907348242814, "grad_norm": 0.15875303745269775, "learning_rate": 3.641025641025641e-07, "loss": 0.0019, "step": 3829 }, { "epoch": 48.945686900958464, "grad_norm": 0.11997360736131668, "learning_rate": 3.5897435897435896e-07, "loss": 0.0012, "step": 3830 }, { "epoch": 48.95846645367412, "grad_norm": 0.13904717564582825, "learning_rate": 3.5384615384615386e-07, "loss": 0.0016, "step": 3831 }, { "epoch": 48.97124600638978, "grad_norm": 1.2342338562011719, "learning_rate": 3.487179487179487e-07, "loss": 0.0046, "step": 3832 }, { "epoch": 48.98402555910543, "grad_norm": 0.20744307339191437, "learning_rate": 3.435897435897436e-07, "loss": 0.0021, "step": 3833 }, { "epoch": 48.99680511182109, "grad_norm": 0.9647104740142822, "learning_rate": 3.3846153846153845e-07, "loss": 0.0045, "step": 3834 }, { "epoch": 49.00958466453674, "grad_norm": 0.1561460793018341, "learning_rate": 3.3333333333333335e-07, "loss": 0.0015, "step": 3835 }, { "epoch": 49.022364217252395, "grad_norm": 0.16960713267326355, "learning_rate": 3.2820512820512825e-07, "loss": 0.0015, "step": 3836 }, { "epoch": 49.03514376996805, "grad_norm": 0.20731723308563232, "learning_rate": 3.230769230769231e-07, "loss": 0.0022, "step": 3837 }, { "epoch": 49.04792332268371, "grad_norm": 2.431955575942993, "learning_rate": 3.17948717948718e-07, "loss": 0.0061, "step": 3838 }, { "epoch": 49.06070287539936, "grad_norm": 1.6707125902175903, "learning_rate": 3.1282051282051284e-07, "loss": 0.007, "step": 3839 }, { "epoch": 49.073482428115014, "grad_norm": 0.16068719327449799, "learning_rate": 3.0769230769230774e-07, "loss": 0.0015, "step": 3840 }, { "epoch": 49.08626198083067, "grad_norm": 0.22747410833835602, "learning_rate": 3.025641025641026e-07, "loss": 0.0023, "step": 3841 }, { "epoch": 49.09904153354633, "grad_norm": 0.1663256734609604, "learning_rate": 2.9743589743589744e-07, "loss": 0.0017, "step": 3842 }, { "epoch": 49.11182108626198, "grad_norm": 0.513690173625946, "learning_rate": 2.9230769230769234e-07, "loss": 0.0063, "step": 3843 }, { "epoch": 49.12460063897763, "grad_norm": 0.7397521138191223, "learning_rate": 2.871794871794872e-07, "loss": 0.0032, "step": 3844 }, { "epoch": 49.13738019169329, "grad_norm": 0.14274731278419495, "learning_rate": 2.820512820512821e-07, "loss": 0.0014, "step": 3845 }, { "epoch": 49.150159744408946, "grad_norm": 0.16468511521816254, "learning_rate": 2.76923076923077e-07, "loss": 0.0017, "step": 3846 }, { "epoch": 49.1629392971246, "grad_norm": 0.18096625804901123, "learning_rate": 2.717948717948718e-07, "loss": 0.0021, "step": 3847 }, { "epoch": 49.17571884984026, "grad_norm": 0.15279999375343323, "learning_rate": 2.666666666666667e-07, "loss": 0.0015, "step": 3848 }, { "epoch": 49.18849840255591, "grad_norm": 1.5307972431182861, "learning_rate": 2.6153846153846157e-07, "loss": 0.0077, "step": 3849 }, { "epoch": 49.201277955271564, "grad_norm": 0.17980337142944336, "learning_rate": 2.564102564102564e-07, "loss": 0.0026, "step": 3850 }, { "epoch": 49.21405750798722, "grad_norm": 1.1224316358566284, "learning_rate": 2.512820512820513e-07, "loss": 0.0045, "step": 3851 }, { "epoch": 49.22683706070288, "grad_norm": 0.23723852634429932, "learning_rate": 2.4615384615384616e-07, "loss": 0.0027, "step": 3852 }, { "epoch": 49.239616613418534, "grad_norm": 0.976337730884552, "learning_rate": 2.4102564102564106e-07, "loss": 0.0027, "step": 3853 }, { "epoch": 49.25239616613418, "grad_norm": 0.16196438670158386, "learning_rate": 2.3589743589743593e-07, "loss": 0.0015, "step": 3854 }, { "epoch": 49.26517571884984, "grad_norm": 0.18370366096496582, "learning_rate": 2.307692307692308e-07, "loss": 0.002, "step": 3855 }, { "epoch": 49.277955271565496, "grad_norm": 0.1293938308954239, "learning_rate": 2.2564102564102568e-07, "loss": 0.0013, "step": 3856 }, { "epoch": 49.29073482428115, "grad_norm": 0.2622494101524353, "learning_rate": 2.2051282051282053e-07, "loss": 0.0022, "step": 3857 }, { "epoch": 49.3035143769968, "grad_norm": 0.14362357556819916, "learning_rate": 2.153846153846154e-07, "loss": 0.0013, "step": 3858 }, { "epoch": 49.31629392971246, "grad_norm": 0.14474250376224518, "learning_rate": 2.1025641025641027e-07, "loss": 0.0017, "step": 3859 }, { "epoch": 49.329073482428115, "grad_norm": 4.0463643074035645, "learning_rate": 2.0512820512820514e-07, "loss": 0.0145, "step": 3860 }, { "epoch": 49.34185303514377, "grad_norm": 0.1382463425397873, "learning_rate": 2.0000000000000002e-07, "loss": 0.0013, "step": 3861 }, { "epoch": 49.35463258785943, "grad_norm": 0.1417696177959442, "learning_rate": 1.948717948717949e-07, "loss": 0.0014, "step": 3862 }, { "epoch": 49.36741214057508, "grad_norm": 0.3333016633987427, "learning_rate": 1.8974358974358976e-07, "loss": 0.002, "step": 3863 }, { "epoch": 49.38019169329073, "grad_norm": 0.17449496686458588, "learning_rate": 1.8461538461538464e-07, "loss": 0.0019, "step": 3864 }, { "epoch": 49.39297124600639, "grad_norm": 5.625931262969971, "learning_rate": 1.7948717948717948e-07, "loss": 0.0035, "step": 3865 }, { "epoch": 49.405750798722046, "grad_norm": 0.2483331263065338, "learning_rate": 1.7435897435897435e-07, "loss": 0.0018, "step": 3866 }, { "epoch": 49.4185303514377, "grad_norm": 0.13882601261138916, "learning_rate": 1.6923076923076923e-07, "loss": 0.0016, "step": 3867 }, { "epoch": 49.43130990415335, "grad_norm": 0.2559881806373596, "learning_rate": 1.6410256410256413e-07, "loss": 0.0022, "step": 3868 }, { "epoch": 49.44408945686901, "grad_norm": 1.5500948429107666, "learning_rate": 1.58974358974359e-07, "loss": 0.0065, "step": 3869 }, { "epoch": 49.456869009584665, "grad_norm": 0.15375666320323944, "learning_rate": 1.5384615384615387e-07, "loss": 0.0019, "step": 3870 }, { "epoch": 49.46964856230032, "grad_norm": 0.1722114086151123, "learning_rate": 1.4871794871794872e-07, "loss": 0.0018, "step": 3871 }, { "epoch": 49.48242811501598, "grad_norm": 0.13881996273994446, "learning_rate": 1.435897435897436e-07, "loss": 0.0012, "step": 3872 }, { "epoch": 49.49520766773163, "grad_norm": 0.24192899465560913, "learning_rate": 1.384615384615385e-07, "loss": 0.0021, "step": 3873 }, { "epoch": 49.50798722044728, "grad_norm": 1.7498934268951416, "learning_rate": 1.3333333333333336e-07, "loss": 0.0024, "step": 3874 }, { "epoch": 49.52076677316294, "grad_norm": 0.18380694091320038, "learning_rate": 1.282051282051282e-07, "loss": 0.0024, "step": 3875 }, { "epoch": 49.533546325878596, "grad_norm": 0.17073246836662292, "learning_rate": 1.2307692307692308e-07, "loss": 0.0019, "step": 3876 }, { "epoch": 49.546325878594246, "grad_norm": 0.1260090172290802, "learning_rate": 1.1794871794871797e-07, "loss": 0.0011, "step": 3877 }, { "epoch": 49.5591054313099, "grad_norm": 0.2924799621105194, "learning_rate": 1.1282051282051284e-07, "loss": 0.0023, "step": 3878 }, { "epoch": 49.57188498402556, "grad_norm": 1.8029056787490845, "learning_rate": 1.076923076923077e-07, "loss": 0.0056, "step": 3879 }, { "epoch": 49.584664536741215, "grad_norm": 0.16758079826831818, "learning_rate": 1.0256410256410257e-07, "loss": 0.0016, "step": 3880 }, { "epoch": 49.59744408945687, "grad_norm": 0.1508578509092331, "learning_rate": 9.743589743589745e-08, "loss": 0.0014, "step": 3881 }, { "epoch": 49.61022364217252, "grad_norm": 0.1619696170091629, "learning_rate": 9.230769230769232e-08, "loss": 0.0019, "step": 3882 }, { "epoch": 49.62300319488818, "grad_norm": 0.24179552495479584, "learning_rate": 8.717948717948718e-08, "loss": 0.0023, "step": 3883 }, { "epoch": 49.635782747603834, "grad_norm": 0.1684618890285492, "learning_rate": 8.205128205128206e-08, "loss": 0.0016, "step": 3884 }, { "epoch": 49.64856230031949, "grad_norm": 2.7814879417419434, "learning_rate": 7.692307692307694e-08, "loss": 0.0107, "step": 3885 }, { "epoch": 49.66134185303515, "grad_norm": 0.23524853587150574, "learning_rate": 7.17948717948718e-08, "loss": 0.0017, "step": 3886 }, { "epoch": 49.674121405750796, "grad_norm": 0.1791078746318817, "learning_rate": 6.666666666666668e-08, "loss": 0.0017, "step": 3887 }, { "epoch": 49.68690095846645, "grad_norm": 1.7943719625473022, "learning_rate": 6.153846153846154e-08, "loss": 0.0111, "step": 3888 }, { "epoch": 49.69968051118211, "grad_norm": 0.17512063682079315, "learning_rate": 5.641025641025642e-08, "loss": 0.0015, "step": 3889 }, { "epoch": 49.712460063897765, "grad_norm": 0.12796635925769806, "learning_rate": 5.1282051282051286e-08, "loss": 0.0014, "step": 3890 }, { "epoch": 49.72523961661342, "grad_norm": 1.830198884010315, "learning_rate": 4.615384615384616e-08, "loss": 0.005, "step": 3891 }, { "epoch": 49.73801916932907, "grad_norm": 0.974043607711792, "learning_rate": 4.102564102564103e-08, "loss": 0.0049, "step": 3892 }, { "epoch": 49.75079872204473, "grad_norm": 0.9838169813156128, "learning_rate": 3.58974358974359e-08, "loss": 0.004, "step": 3893 }, { "epoch": 49.763578274760384, "grad_norm": 0.9195809960365295, "learning_rate": 3.076923076923077e-08, "loss": 0.003, "step": 3894 }, { "epoch": 49.77635782747604, "grad_norm": 1.8277593851089478, "learning_rate": 2.5641025641025643e-08, "loss": 0.0087, "step": 3895 }, { "epoch": 49.78913738019169, "grad_norm": 0.1428929716348648, "learning_rate": 2.0512820512820516e-08, "loss": 0.0011, "step": 3896 }, { "epoch": 49.801916932907346, "grad_norm": 1.8820996284484863, "learning_rate": 1.5384615384615385e-08, "loss": 0.0072, "step": 3897 }, { "epoch": 49.814696485623, "grad_norm": 1.2029500007629395, "learning_rate": 1.0256410256410258e-08, "loss": 0.0034, "step": 3898 }, { "epoch": 49.82747603833866, "grad_norm": 1.5105140209197998, "learning_rate": 5.128205128205129e-09, "loss": 0.005, "step": 3899 }, { "epoch": 49.840255591054316, "grad_norm": 0.34791117906570435, "learning_rate": 0.0, "loss": 0.0019, "step": 3900 }, { "epoch": 49.840255591054316, "eval_loss": 1.0049241781234741, "eval_runtime": 183.656, "eval_samples_per_second": 0.855, "eval_steps_per_second": 0.109, "step": 3900 } ], "logging_steps": 1, "max_steps": 3900, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 50, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.915877583275712e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }