{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.999740596627756, "eval_steps": 500, "global_step": 38548, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00648508430609598, "grad_norm": 75.29356384277344, "learning_rate": 9.997838305231302e-06, "loss": 3.9392, "step": 50 }, { "epoch": 0.01297016861219196, "grad_norm": 76.29737091064453, "learning_rate": 9.995676610462604e-06, "loss": 3.2764, "step": 100 }, { "epoch": 0.019455252918287938, "grad_norm": 41.812530517578125, "learning_rate": 9.993514915693906e-06, "loss": 2.7916, "step": 150 }, { "epoch": 0.02594033722438392, "grad_norm": 73.83484649658203, "learning_rate": 9.991353220925206e-06, "loss": 2.5989, "step": 200 }, { "epoch": 0.0324254215304799, "grad_norm": 117.84382629394531, "learning_rate": 9.989191526156507e-06, "loss": 2.5196, "step": 250 }, { "epoch": 0.038910505836575876, "grad_norm": 66.67141723632812, "learning_rate": 9.987029831387809e-06, "loss": 2.4034, "step": 300 }, { "epoch": 0.04539559014267185, "grad_norm": 87.57711791992188, "learning_rate": 9.98486813661911e-06, "loss": 2.4226, "step": 350 }, { "epoch": 0.05188067444876784, "grad_norm": 40.56406784057617, "learning_rate": 9.982706441850412e-06, "loss": 2.3475, "step": 400 }, { "epoch": 0.058365758754863814, "grad_norm": 168.55384826660156, "learning_rate": 9.980544747081713e-06, "loss": 2.306, "step": 450 }, { "epoch": 0.0648508430609598, "grad_norm": 72.2697982788086, "learning_rate": 9.978383052313015e-06, "loss": 2.3191, "step": 500 }, { "epoch": 0.07133592736705577, "grad_norm": 140.37997436523438, "learning_rate": 9.976221357544315e-06, "loss": 2.2995, "step": 550 }, { "epoch": 0.07782101167315175, "grad_norm": 48.368682861328125, "learning_rate": 9.974059662775617e-06, "loss": 2.2649, "step": 600 }, { "epoch": 0.08430609597924774, "grad_norm": 49.51039123535156, "learning_rate": 9.971897968006918e-06, "loss": 2.273, "step": 650 }, { "epoch": 0.0907911802853437, "grad_norm": 95.86265563964844, "learning_rate": 9.96973627323822e-06, "loss": 2.2113, "step": 700 }, { "epoch": 0.09727626459143969, "grad_norm": 59.293392181396484, "learning_rate": 9.967574578469521e-06, "loss": 2.2497, "step": 750 }, { "epoch": 0.10376134889753567, "grad_norm": 96.03455352783203, "learning_rate": 9.965412883700823e-06, "loss": 2.1745, "step": 800 }, { "epoch": 0.11024643320363164, "grad_norm": 58.87235641479492, "learning_rate": 9.963251188932125e-06, "loss": 2.2074, "step": 850 }, { "epoch": 0.11673151750972763, "grad_norm": 60.09347152709961, "learning_rate": 9.961089494163424e-06, "loss": 2.2143, "step": 900 }, { "epoch": 0.12321660181582361, "grad_norm": 33.75955581665039, "learning_rate": 9.958927799394726e-06, "loss": 2.1443, "step": 950 }, { "epoch": 0.1297016861219196, "grad_norm": 67.69246673583984, "learning_rate": 9.956766104626028e-06, "loss": 2.1568, "step": 1000 }, { "epoch": 0.13618677042801555, "grad_norm": 61.85429763793945, "learning_rate": 9.95460440985733e-06, "loss": 2.1036, "step": 1050 }, { "epoch": 0.14267185473411154, "grad_norm": 47.63957595825195, "learning_rate": 9.95244271508863e-06, "loss": 2.0988, "step": 1100 }, { "epoch": 0.14915693904020752, "grad_norm": 72.68657684326172, "learning_rate": 9.950281020319932e-06, "loss": 2.0674, "step": 1150 }, { "epoch": 0.1556420233463035, "grad_norm": 137.7786865234375, "learning_rate": 9.948119325551234e-06, "loss": 2.0437, "step": 1200 }, { "epoch": 0.1621271076523995, "grad_norm": 28.063282012939453, "learning_rate": 9.945957630782534e-06, "loss": 2.0778, "step": 1250 }, { "epoch": 0.16861219195849547, "grad_norm": 183.83932495117188, "learning_rate": 9.943795936013836e-06, "loss": 2.0364, "step": 1300 }, { "epoch": 0.17509727626459143, "grad_norm": 69.06026458740234, "learning_rate": 9.941634241245137e-06, "loss": 2.0518, "step": 1350 }, { "epoch": 0.1815823605706874, "grad_norm": 46.897403717041016, "learning_rate": 9.939472546476439e-06, "loss": 2.0278, "step": 1400 }, { "epoch": 0.1880674448767834, "grad_norm": 58.688289642333984, "learning_rate": 9.93731085170774e-06, "loss": 2.0535, "step": 1450 }, { "epoch": 0.19455252918287938, "grad_norm": 45.544673919677734, "learning_rate": 9.935149156939042e-06, "loss": 2.0326, "step": 1500 }, { "epoch": 0.20103761348897536, "grad_norm": 52.032257080078125, "learning_rate": 9.932987462170342e-06, "loss": 2.0506, "step": 1550 }, { "epoch": 0.20752269779507135, "grad_norm": 85.55571746826172, "learning_rate": 9.930825767401643e-06, "loss": 2.0022, "step": 1600 }, { "epoch": 0.2140077821011673, "grad_norm": 82.76732635498047, "learning_rate": 9.928664072632945e-06, "loss": 2.0245, "step": 1650 }, { "epoch": 0.2204928664072633, "grad_norm": 132.71685791015625, "learning_rate": 9.926502377864247e-06, "loss": 2.034, "step": 1700 }, { "epoch": 0.22697795071335927, "grad_norm": 37.891571044921875, "learning_rate": 9.924340683095548e-06, "loss": 2.0265, "step": 1750 }, { "epoch": 0.23346303501945526, "grad_norm": 74.43115234375, "learning_rate": 9.92217898832685e-06, "loss": 1.9993, "step": 1800 }, { "epoch": 0.23994811932555124, "grad_norm": 43.991119384765625, "learning_rate": 9.920017293558151e-06, "loss": 1.95, "step": 1850 }, { "epoch": 0.24643320363164722, "grad_norm": 71.00599670410156, "learning_rate": 9.917855598789451e-06, "loss": 1.9378, "step": 1900 }, { "epoch": 0.2529182879377432, "grad_norm": 99.50753784179688, "learning_rate": 9.915693904020753e-06, "loss": 1.9923, "step": 1950 }, { "epoch": 0.2594033722438392, "grad_norm": 139.31951904296875, "learning_rate": 9.913532209252054e-06, "loss": 1.9471, "step": 2000 }, { "epoch": 0.26588845654993515, "grad_norm": 58.99448776245117, "learning_rate": 9.911370514483356e-06, "loss": 1.8731, "step": 2050 }, { "epoch": 0.2723735408560311, "grad_norm": 72.88382720947266, "learning_rate": 9.909208819714658e-06, "loss": 1.9395, "step": 2100 }, { "epoch": 0.2788586251621271, "grad_norm": 91.64574432373047, "learning_rate": 9.90704712494596e-06, "loss": 1.835, "step": 2150 }, { "epoch": 0.2853437094682231, "grad_norm": 69.67208862304688, "learning_rate": 9.90488543017726e-06, "loss": 1.8692, "step": 2200 }, { "epoch": 0.2918287937743191, "grad_norm": 81.73461151123047, "learning_rate": 9.90272373540856e-06, "loss": 1.8508, "step": 2250 }, { "epoch": 0.29831387808041504, "grad_norm": 89.83018493652344, "learning_rate": 9.900562040639862e-06, "loss": 1.8786, "step": 2300 }, { "epoch": 0.30479896238651105, "grad_norm": 63.75005340576172, "learning_rate": 9.898400345871164e-06, "loss": 1.8883, "step": 2350 }, { "epoch": 0.311284046692607, "grad_norm": 52.06178283691406, "learning_rate": 9.896238651102465e-06, "loss": 1.8696, "step": 2400 }, { "epoch": 0.31776913099870296, "grad_norm": 164.74493408203125, "learning_rate": 9.894076956333767e-06, "loss": 1.8852, "step": 2450 }, { "epoch": 0.324254215304799, "grad_norm": 132.9710235595703, "learning_rate": 9.891915261565069e-06, "loss": 1.8436, "step": 2500 }, { "epoch": 0.33073929961089493, "grad_norm": 26.191741943359375, "learning_rate": 9.88975356679637e-06, "loss": 1.8739, "step": 2550 }, { "epoch": 0.33722438391699094, "grad_norm": 39.41121292114258, "learning_rate": 9.88759187202767e-06, "loss": 1.8541, "step": 2600 }, { "epoch": 0.3437094682230869, "grad_norm": 162.25184631347656, "learning_rate": 9.885430177258972e-06, "loss": 1.838, "step": 2650 }, { "epoch": 0.35019455252918286, "grad_norm": 45.588375091552734, "learning_rate": 9.883268482490273e-06, "loss": 1.8151, "step": 2700 }, { "epoch": 0.35667963683527887, "grad_norm": 58.42121887207031, "learning_rate": 9.881106787721575e-06, "loss": 1.8489, "step": 2750 }, { "epoch": 0.3631647211413748, "grad_norm": 58.45185470581055, "learning_rate": 9.878945092952877e-06, "loss": 1.834, "step": 2800 }, { "epoch": 0.36964980544747084, "grad_norm": 43.55636215209961, "learning_rate": 9.876783398184178e-06, "loss": 1.8172, "step": 2850 }, { "epoch": 0.3761348897535668, "grad_norm": 111.3084487915039, "learning_rate": 9.87462170341548e-06, "loss": 1.7982, "step": 2900 }, { "epoch": 0.38261997405966275, "grad_norm": 48.10725402832031, "learning_rate": 9.87246000864678e-06, "loss": 1.815, "step": 2950 }, { "epoch": 0.38910505836575876, "grad_norm": 204.3961639404297, "learning_rate": 9.870298313878081e-06, "loss": 1.7829, "step": 3000 }, { "epoch": 0.3955901426718547, "grad_norm": 57.22758483886719, "learning_rate": 9.868136619109383e-06, "loss": 1.8291, "step": 3050 }, { "epoch": 0.40207522697795073, "grad_norm": 129.09658813476562, "learning_rate": 9.865974924340684e-06, "loss": 1.8161, "step": 3100 }, { "epoch": 0.4085603112840467, "grad_norm": 36.433780670166016, "learning_rate": 9.863813229571986e-06, "loss": 1.7766, "step": 3150 }, { "epoch": 0.4150453955901427, "grad_norm": 61.64483642578125, "learning_rate": 9.861651534803288e-06, "loss": 1.8117, "step": 3200 }, { "epoch": 0.42153047989623865, "grad_norm": 107.76203918457031, "learning_rate": 9.85948984003459e-06, "loss": 1.7823, "step": 3250 }, { "epoch": 0.4280155642023346, "grad_norm": 118.72686004638672, "learning_rate": 9.857328145265889e-06, "loss": 1.7688, "step": 3300 }, { "epoch": 0.4345006485084306, "grad_norm": 166.0326385498047, "learning_rate": 9.85516645049719e-06, "loss": 1.759, "step": 3350 }, { "epoch": 0.4409857328145266, "grad_norm": 81.56283569335938, "learning_rate": 9.853004755728492e-06, "loss": 1.7628, "step": 3400 }, { "epoch": 0.4474708171206226, "grad_norm": 80.8810043334961, "learning_rate": 9.850843060959794e-06, "loss": 1.7637, "step": 3450 }, { "epoch": 0.45395590142671854, "grad_norm": 61.64249801635742, "learning_rate": 9.848681366191095e-06, "loss": 1.779, "step": 3500 }, { "epoch": 0.4604409857328145, "grad_norm": 88.62084197998047, "learning_rate": 9.846519671422397e-06, "loss": 1.7477, "step": 3550 }, { "epoch": 0.4669260700389105, "grad_norm": 52.68846893310547, "learning_rate": 9.844357976653699e-06, "loss": 1.7579, "step": 3600 }, { "epoch": 0.47341115434500647, "grad_norm": 100.87641143798828, "learning_rate": 9.842196281884999e-06, "loss": 1.7269, "step": 3650 }, { "epoch": 0.4798962386511025, "grad_norm": 110.61991882324219, "learning_rate": 9.8400345871163e-06, "loss": 1.718, "step": 3700 }, { "epoch": 0.48638132295719844, "grad_norm": 191.92201232910156, "learning_rate": 9.8378728923476e-06, "loss": 1.7211, "step": 3750 }, { "epoch": 0.49286640726329445, "grad_norm": 92.11019897460938, "learning_rate": 9.835711197578902e-06, "loss": 1.6597, "step": 3800 }, { "epoch": 0.4993514915693904, "grad_norm": 113.60994720458984, "learning_rate": 9.833549502810203e-06, "loss": 1.702, "step": 3850 }, { "epoch": 0.5058365758754864, "grad_norm": 36.078678131103516, "learning_rate": 9.831387808041505e-06, "loss": 1.6789, "step": 3900 }, { "epoch": 0.5123216601815823, "grad_norm": 36.602169036865234, "learning_rate": 9.829226113272806e-06, "loss": 1.6852, "step": 3950 }, { "epoch": 0.5188067444876784, "grad_norm": 35.471885681152344, "learning_rate": 9.827064418504108e-06, "loss": 1.7237, "step": 4000 }, { "epoch": 0.5252918287937743, "grad_norm": 82.1302261352539, "learning_rate": 9.82490272373541e-06, "loss": 1.6826, "step": 4050 }, { "epoch": 0.5317769130998703, "grad_norm": 88.32125854492188, "learning_rate": 9.82274102896671e-06, "loss": 1.6928, "step": 4100 }, { "epoch": 0.5382619974059663, "grad_norm": 46.689208984375, "learning_rate": 9.820579334198011e-06, "loss": 1.6711, "step": 4150 }, { "epoch": 0.5447470817120622, "grad_norm": 51.88264465332031, "learning_rate": 9.818417639429313e-06, "loss": 1.661, "step": 4200 }, { "epoch": 0.5512321660181583, "grad_norm": 26.311506271362305, "learning_rate": 9.816255944660614e-06, "loss": 1.647, "step": 4250 }, { "epoch": 0.5577172503242542, "grad_norm": 56.531463623046875, "learning_rate": 9.814094249891916e-06, "loss": 1.6632, "step": 4300 }, { "epoch": 0.5642023346303502, "grad_norm": 38.8277702331543, "learning_rate": 9.811932555123218e-06, "loss": 1.6396, "step": 4350 }, { "epoch": 0.5706874189364461, "grad_norm": 110.19149017333984, "learning_rate": 9.809770860354519e-06, "loss": 1.631, "step": 4400 }, { "epoch": 0.5771725032425421, "grad_norm": 31.21686553955078, "learning_rate": 9.807609165585819e-06, "loss": 1.6272, "step": 4450 }, { "epoch": 0.5836575875486382, "grad_norm": 54.84867477416992, "learning_rate": 9.80544747081712e-06, "loss": 1.6055, "step": 4500 }, { "epoch": 0.5901426718547341, "grad_norm": 73.87213134765625, "learning_rate": 9.803285776048422e-06, "loss": 1.5888, "step": 4550 }, { "epoch": 0.5966277561608301, "grad_norm": 33.05928039550781, "learning_rate": 9.801124081279724e-06, "loss": 1.6019, "step": 4600 }, { "epoch": 0.603112840466926, "grad_norm": 58.189022064208984, "learning_rate": 9.798962386511025e-06, "loss": 1.5611, "step": 4650 }, { "epoch": 0.6095979247730221, "grad_norm": 98.57378387451172, "learning_rate": 9.796800691742327e-06, "loss": 1.597, "step": 4700 }, { "epoch": 0.6160830090791181, "grad_norm": 40.484683990478516, "learning_rate": 9.794638996973629e-06, "loss": 1.5792, "step": 4750 }, { "epoch": 0.622568093385214, "grad_norm": 137.65402221679688, "learning_rate": 9.792477302204928e-06, "loss": 1.5859, "step": 4800 }, { "epoch": 0.62905317769131, "grad_norm": 222.86753845214844, "learning_rate": 9.79031560743623e-06, "loss": 1.5545, "step": 4850 }, { "epoch": 0.6355382619974059, "grad_norm": 61.394309997558594, "learning_rate": 9.788153912667532e-06, "loss": 1.5275, "step": 4900 }, { "epoch": 0.642023346303502, "grad_norm": 56.79536437988281, "learning_rate": 9.785992217898833e-06, "loss": 1.5688, "step": 4950 }, { "epoch": 0.648508430609598, "grad_norm": 35.67102813720703, "learning_rate": 9.783830523130135e-06, "loss": 1.5359, "step": 5000 }, { "epoch": 0.6549935149156939, "grad_norm": 106.68746948242188, "learning_rate": 9.781668828361436e-06, "loss": 1.5443, "step": 5050 }, { "epoch": 0.6614785992217899, "grad_norm": 118.8326644897461, "learning_rate": 9.779507133592736e-06, "loss": 1.5223, "step": 5100 }, { "epoch": 0.6679636835278858, "grad_norm": 98.10018920898438, "learning_rate": 9.777345438824038e-06, "loss": 1.5243, "step": 5150 }, { "epoch": 0.6744487678339819, "grad_norm": 77.71648406982422, "learning_rate": 9.77518374405534e-06, "loss": 1.5383, "step": 5200 }, { "epoch": 0.6809338521400778, "grad_norm": 58.089149475097656, "learning_rate": 9.773022049286641e-06, "loss": 1.4545, "step": 5250 }, { "epoch": 0.6874189364461738, "grad_norm": 37.549198150634766, "learning_rate": 9.770860354517943e-06, "loss": 1.454, "step": 5300 }, { "epoch": 0.6939040207522698, "grad_norm": 105.52291107177734, "learning_rate": 9.768698659749244e-06, "loss": 1.4583, "step": 5350 }, { "epoch": 0.7003891050583657, "grad_norm": 48.97454833984375, "learning_rate": 9.766536964980546e-06, "loss": 1.4752, "step": 5400 }, { "epoch": 0.7068741893644618, "grad_norm": 46.371768951416016, "learning_rate": 9.764375270211846e-06, "loss": 1.4495, "step": 5450 }, { "epoch": 0.7133592736705577, "grad_norm": 46.270530700683594, "learning_rate": 9.762213575443147e-06, "loss": 1.4777, "step": 5500 }, { "epoch": 0.7198443579766537, "grad_norm": 74.16635131835938, "learning_rate": 9.760051880674449e-06, "loss": 1.4826, "step": 5550 }, { "epoch": 0.7263294422827496, "grad_norm": 128.18382263183594, "learning_rate": 9.75789018590575e-06, "loss": 1.4844, "step": 5600 }, { "epoch": 0.7328145265888456, "grad_norm": 69.79833984375, "learning_rate": 9.755728491137052e-06, "loss": 1.5204, "step": 5650 }, { "epoch": 0.7392996108949417, "grad_norm": 60.08644104003906, "learning_rate": 9.753566796368354e-06, "loss": 1.4153, "step": 5700 }, { "epoch": 0.7457846952010376, "grad_norm": 28.086040496826172, "learning_rate": 9.751405101599655e-06, "loss": 1.4365, "step": 5750 }, { "epoch": 0.7522697795071336, "grad_norm": 123.76924133300781, "learning_rate": 9.749243406830955e-06, "loss": 1.4517, "step": 5800 }, { "epoch": 0.7587548638132295, "grad_norm": 51.66524124145508, "learning_rate": 9.747081712062257e-06, "loss": 1.3986, "step": 5850 }, { "epoch": 0.7652399481193255, "grad_norm": 108.23607635498047, "learning_rate": 9.744920017293558e-06, "loss": 1.4244, "step": 5900 }, { "epoch": 0.7717250324254216, "grad_norm": 231.63455200195312, "learning_rate": 9.74275832252486e-06, "loss": 1.4064, "step": 5950 }, { "epoch": 0.7782101167315175, "grad_norm": 81.5296630859375, "learning_rate": 9.740596627756162e-06, "loss": 1.4159, "step": 6000 }, { "epoch": 0.7846952010376135, "grad_norm": 88.66846466064453, "learning_rate": 9.738434932987463e-06, "loss": 1.3972, "step": 6050 }, { "epoch": 0.7911802853437094, "grad_norm": 51.40989303588867, "learning_rate": 9.736273238218765e-06, "loss": 1.3936, "step": 6100 }, { "epoch": 0.7976653696498055, "grad_norm": 125.95860290527344, "learning_rate": 9.734111543450065e-06, "loss": 1.4171, "step": 6150 }, { "epoch": 0.8041504539559015, "grad_norm": 106.81758117675781, "learning_rate": 9.731949848681366e-06, "loss": 1.3815, "step": 6200 }, { "epoch": 0.8106355382619974, "grad_norm": 59.69066619873047, "learning_rate": 9.729788153912668e-06, "loss": 1.3268, "step": 6250 }, { "epoch": 0.8171206225680934, "grad_norm": 58.66751480102539, "learning_rate": 9.72762645914397e-06, "loss": 1.3593, "step": 6300 }, { "epoch": 0.8236057068741893, "grad_norm": 67.43231964111328, "learning_rate": 9.725464764375271e-06, "loss": 1.3639, "step": 6350 }, { "epoch": 0.8300907911802854, "grad_norm": 42.44819259643555, "learning_rate": 9.723303069606573e-06, "loss": 1.3664, "step": 6400 }, { "epoch": 0.8365758754863813, "grad_norm": 31.140090942382812, "learning_rate": 9.721141374837874e-06, "loss": 1.3689, "step": 6450 }, { "epoch": 0.8430609597924773, "grad_norm": 295.80523681640625, "learning_rate": 9.718979680069174e-06, "loss": 1.3763, "step": 6500 }, { "epoch": 0.8495460440985733, "grad_norm": 88.26421356201172, "learning_rate": 9.716817985300476e-06, "loss": 1.3537, "step": 6550 }, { "epoch": 0.8560311284046692, "grad_norm": 193.05654907226562, "learning_rate": 9.714656290531777e-06, "loss": 1.3507, "step": 6600 }, { "epoch": 0.8625162127107653, "grad_norm": 79.03955078125, "learning_rate": 9.712494595763079e-06, "loss": 1.3388, "step": 6650 }, { "epoch": 0.8690012970168612, "grad_norm": 50.94293975830078, "learning_rate": 9.71033290099438e-06, "loss": 1.3568, "step": 6700 }, { "epoch": 0.8754863813229572, "grad_norm": 39.64507293701172, "learning_rate": 9.708171206225682e-06, "loss": 1.3824, "step": 6750 }, { "epoch": 0.8819714656290532, "grad_norm": 72.73592376708984, "learning_rate": 9.706009511456984e-06, "loss": 1.3592, "step": 6800 }, { "epoch": 0.8884565499351491, "grad_norm": 25.956851959228516, "learning_rate": 9.703847816688284e-06, "loss": 1.3338, "step": 6850 }, { "epoch": 0.8949416342412452, "grad_norm": 96.9691162109375, "learning_rate": 9.701686121919585e-06, "loss": 1.3522, "step": 6900 }, { "epoch": 0.9014267185473411, "grad_norm": 24.184741973876953, "learning_rate": 9.699524427150887e-06, "loss": 1.3276, "step": 6950 }, { "epoch": 0.9079118028534371, "grad_norm": 38.254638671875, "learning_rate": 9.697362732382188e-06, "loss": 1.3185, "step": 7000 }, { "epoch": 0.914396887159533, "grad_norm": 160.06329345703125, "learning_rate": 9.69520103761349e-06, "loss": 1.3145, "step": 7050 }, { "epoch": 0.920881971465629, "grad_norm": 194.62234497070312, "learning_rate": 9.693039342844792e-06, "loss": 1.3505, "step": 7100 }, { "epoch": 0.9273670557717251, "grad_norm": 63.943546295166016, "learning_rate": 9.690877648076093e-06, "loss": 1.3353, "step": 7150 }, { "epoch": 0.933852140077821, "grad_norm": 64.3890151977539, "learning_rate": 9.688715953307393e-06, "loss": 1.2994, "step": 7200 }, { "epoch": 0.940337224383917, "grad_norm": 42.98583984375, "learning_rate": 9.686554258538695e-06, "loss": 1.3034, "step": 7250 }, { "epoch": 0.9468223086900129, "grad_norm": 112.66468811035156, "learning_rate": 9.684392563769996e-06, "loss": 1.2642, "step": 7300 }, { "epoch": 0.953307392996109, "grad_norm": 100.0568618774414, "learning_rate": 9.682230869001298e-06, "loss": 1.2919, "step": 7350 }, { "epoch": 0.959792477302205, "grad_norm": 26.995040893554688, "learning_rate": 9.6800691742326e-06, "loss": 1.2775, "step": 7400 }, { "epoch": 0.9662775616083009, "grad_norm": 26.033170700073242, "learning_rate": 9.677907479463901e-06, "loss": 1.2675, "step": 7450 }, { "epoch": 0.9727626459143969, "grad_norm": 53.325523376464844, "learning_rate": 9.675745784695201e-06, "loss": 1.2947, "step": 7500 }, { "epoch": 0.9792477302204928, "grad_norm": 71.00118255615234, "learning_rate": 9.673584089926503e-06, "loss": 1.3299, "step": 7550 }, { "epoch": 0.9857328145265889, "grad_norm": 115.75421142578125, "learning_rate": 9.671422395157804e-06, "loss": 1.3157, "step": 7600 }, { "epoch": 0.9922178988326849, "grad_norm": 40.338565826416016, "learning_rate": 9.669260700389106e-06, "loss": 1.3313, "step": 7650 }, { "epoch": 0.9987029831387808, "grad_norm": 44.46931457519531, "learning_rate": 9.667099005620407e-06, "loss": 1.3073, "step": 7700 }, { "epoch": 1.0051880674448768, "grad_norm": 38.489139556884766, "learning_rate": 9.664937310851709e-06, "loss": 1.2656, "step": 7750 }, { "epoch": 1.0116731517509727, "grad_norm": 118.02386474609375, "learning_rate": 9.66277561608301e-06, "loss": 1.2827, "step": 7800 }, { "epoch": 1.0181582360570687, "grad_norm": 73.46105194091797, "learning_rate": 9.66061392131431e-06, "loss": 1.3098, "step": 7850 }, { "epoch": 1.0246433203631646, "grad_norm": 76.57545471191406, "learning_rate": 9.658452226545612e-06, "loss": 1.2673, "step": 7900 }, { "epoch": 1.0311284046692606, "grad_norm": 34.5427360534668, "learning_rate": 9.656290531776914e-06, "loss": 1.2507, "step": 7950 }, { "epoch": 1.0376134889753568, "grad_norm": 93.37530517578125, "learning_rate": 9.654128837008215e-06, "loss": 1.2523, "step": 8000 }, { "epoch": 1.0440985732814527, "grad_norm": 104.25950622558594, "learning_rate": 9.651967142239517e-06, "loss": 1.2446, "step": 8050 }, { "epoch": 1.0505836575875487, "grad_norm": 139.35931396484375, "learning_rate": 9.649805447470818e-06, "loss": 1.24, "step": 8100 }, { "epoch": 1.0570687418936446, "grad_norm": 160.05386352539062, "learning_rate": 9.64764375270212e-06, "loss": 1.2476, "step": 8150 }, { "epoch": 1.0635538261997406, "grad_norm": 28.035104751586914, "learning_rate": 9.64548205793342e-06, "loss": 1.265, "step": 8200 }, { "epoch": 1.0700389105058365, "grad_norm": 37.27667236328125, "learning_rate": 9.643320363164722e-06, "loss": 1.206, "step": 8250 }, { "epoch": 1.0765239948119325, "grad_norm": 35.973751068115234, "learning_rate": 9.641158668396023e-06, "loss": 1.2393, "step": 8300 }, { "epoch": 1.0830090791180285, "grad_norm": 101.7255859375, "learning_rate": 9.638996973627325e-06, "loss": 1.2775, "step": 8350 }, { "epoch": 1.0894941634241244, "grad_norm": 54.50769805908203, "learning_rate": 9.636835278858626e-06, "loss": 1.2302, "step": 8400 }, { "epoch": 1.0959792477302206, "grad_norm": 68.53856658935547, "learning_rate": 9.634673584089928e-06, "loss": 1.2285, "step": 8450 }, { "epoch": 1.1024643320363166, "grad_norm": 106.03568267822266, "learning_rate": 9.63251188932123e-06, "loss": 1.2285, "step": 8500 }, { "epoch": 1.1089494163424125, "grad_norm": 110.32369232177734, "learning_rate": 9.63035019455253e-06, "loss": 1.2455, "step": 8550 }, { "epoch": 1.1154345006485085, "grad_norm": 71.17969512939453, "learning_rate": 9.628188499783831e-06, "loss": 1.2908, "step": 8600 }, { "epoch": 1.1219195849546044, "grad_norm": 100.68138885498047, "learning_rate": 9.626026805015133e-06, "loss": 1.2489, "step": 8650 }, { "epoch": 1.1284046692607004, "grad_norm": 31.05600929260254, "learning_rate": 9.623865110246434e-06, "loss": 1.2033, "step": 8700 }, { "epoch": 1.1348897535667963, "grad_norm": 56.18037033081055, "learning_rate": 9.621703415477736e-06, "loss": 1.2507, "step": 8750 }, { "epoch": 1.1413748378728923, "grad_norm": 66.67138671875, "learning_rate": 9.619541720709037e-06, "loss": 1.2256, "step": 8800 }, { "epoch": 1.1478599221789882, "grad_norm": 89.71563720703125, "learning_rate": 9.617380025940339e-06, "loss": 1.2045, "step": 8850 }, { "epoch": 1.1543450064850842, "grad_norm": 83.06657409667969, "learning_rate": 9.615218331171639e-06, "loss": 1.1957, "step": 8900 }, { "epoch": 1.1608300907911804, "grad_norm": 32.97713851928711, "learning_rate": 9.61305663640294e-06, "loss": 1.2178, "step": 8950 }, { "epoch": 1.1673151750972763, "grad_norm": 40.30419158935547, "learning_rate": 9.610894941634242e-06, "loss": 1.2114, "step": 9000 }, { "epoch": 1.1738002594033723, "grad_norm": 26.891326904296875, "learning_rate": 9.608733246865544e-06, "loss": 1.2049, "step": 9050 }, { "epoch": 1.1802853437094682, "grad_norm": 95.95230102539062, "learning_rate": 9.606571552096845e-06, "loss": 1.1984, "step": 9100 }, { "epoch": 1.1867704280155642, "grad_norm": 137.105712890625, "learning_rate": 9.604409857328147e-06, "loss": 1.1697, "step": 9150 }, { "epoch": 1.1932555123216602, "grad_norm": 145.55722045898438, "learning_rate": 9.602248162559448e-06, "loss": 1.2145, "step": 9200 }, { "epoch": 1.1997405966277561, "grad_norm": 200.1924591064453, "learning_rate": 9.600086467790748e-06, "loss": 1.1548, "step": 9250 }, { "epoch": 1.206225680933852, "grad_norm": 119.41325378417969, "learning_rate": 9.59792477302205e-06, "loss": 1.1784, "step": 9300 }, { "epoch": 1.212710765239948, "grad_norm": 33.50049591064453, "learning_rate": 9.595763078253352e-06, "loss": 1.2024, "step": 9350 }, { "epoch": 1.2191958495460442, "grad_norm": 106.87812805175781, "learning_rate": 9.593601383484653e-06, "loss": 1.211, "step": 9400 }, { "epoch": 1.2256809338521402, "grad_norm": 50.5958366394043, "learning_rate": 9.591439688715955e-06, "loss": 1.1891, "step": 9450 }, { "epoch": 1.2321660181582361, "grad_norm": 66.71959686279297, "learning_rate": 9.589277993947256e-06, "loss": 1.1643, "step": 9500 }, { "epoch": 1.238651102464332, "grad_norm": 74.7675552368164, "learning_rate": 9.587116299178558e-06, "loss": 1.1527, "step": 9550 }, { "epoch": 1.245136186770428, "grad_norm": 48.216217041015625, "learning_rate": 9.584954604409858e-06, "loss": 1.1978, "step": 9600 }, { "epoch": 1.251621271076524, "grad_norm": 49.2801628112793, "learning_rate": 9.58279290964116e-06, "loss": 1.1669, "step": 9650 }, { "epoch": 1.25810635538262, "grad_norm": 31.61471939086914, "learning_rate": 9.580631214872461e-06, "loss": 1.1393, "step": 9700 }, { "epoch": 1.264591439688716, "grad_norm": 84.38628387451172, "learning_rate": 9.578469520103763e-06, "loss": 1.146, "step": 9750 }, { "epoch": 1.2710765239948119, "grad_norm": 64.64751434326172, "learning_rate": 9.576307825335064e-06, "loss": 1.178, "step": 9800 }, { "epoch": 1.2775616083009078, "grad_norm": 38.89653396606445, "learning_rate": 9.574146130566366e-06, "loss": 1.174, "step": 9850 }, { "epoch": 1.2840466926070038, "grad_norm": 154.65513610839844, "learning_rate": 9.571984435797666e-06, "loss": 1.1372, "step": 9900 }, { "epoch": 1.2905317769131, "grad_norm": 44.31837844848633, "learning_rate": 9.569822741028967e-06, "loss": 1.1371, "step": 9950 }, { "epoch": 1.297016861219196, "grad_norm": 66.79319763183594, "learning_rate": 9.567661046260269e-06, "loss": 1.1161, "step": 10000 }, { "epoch": 1.3035019455252919, "grad_norm": 109.10542297363281, "learning_rate": 9.56549935149157e-06, "loss": 1.1321, "step": 10050 }, { "epoch": 1.3099870298313878, "grad_norm": 52.56429672241211, "learning_rate": 9.563337656722872e-06, "loss": 1.1606, "step": 10100 }, { "epoch": 1.3164721141374838, "grad_norm": 106.3527603149414, "learning_rate": 9.561175961954174e-06, "loss": 1.154, "step": 10150 }, { "epoch": 1.3229571984435797, "grad_norm": 275.15216064453125, "learning_rate": 9.559014267185475e-06, "loss": 1.1477, "step": 10200 }, { "epoch": 1.3294422827496757, "grad_norm": 146.71636962890625, "learning_rate": 9.556852572416775e-06, "loss": 1.154, "step": 10250 }, { "epoch": 1.3359273670557716, "grad_norm": 91.92938995361328, "learning_rate": 9.554690877648077e-06, "loss": 1.1194, "step": 10300 }, { "epoch": 1.3424124513618678, "grad_norm": 34.08713912963867, "learning_rate": 9.552529182879378e-06, "loss": 1.1176, "step": 10350 }, { "epoch": 1.3488975356679638, "grad_norm": 234.9642791748047, "learning_rate": 9.55036748811068e-06, "loss": 1.1294, "step": 10400 }, { "epoch": 1.3553826199740597, "grad_norm": 89.80880737304688, "learning_rate": 9.548205793341982e-06, "loss": 1.1138, "step": 10450 }, { "epoch": 1.3618677042801557, "grad_norm": 54.69230651855469, "learning_rate": 9.546044098573283e-06, "loss": 1.1153, "step": 10500 }, { "epoch": 1.3683527885862516, "grad_norm": 43.032466888427734, "learning_rate": 9.543882403804585e-06, "loss": 1.1041, "step": 10550 }, { "epoch": 1.3748378728923476, "grad_norm": 110.12335205078125, "learning_rate": 9.541720709035885e-06, "loss": 1.1056, "step": 10600 }, { "epoch": 1.3813229571984436, "grad_norm": 109.36446380615234, "learning_rate": 9.539559014267186e-06, "loss": 1.1288, "step": 10650 }, { "epoch": 1.3878080415045395, "grad_norm": 200.33387756347656, "learning_rate": 9.537397319498488e-06, "loss": 1.101, "step": 10700 }, { "epoch": 1.3942931258106355, "grad_norm": 31.898895263671875, "learning_rate": 9.53523562472979e-06, "loss": 1.0919, "step": 10750 }, { "epoch": 1.4007782101167314, "grad_norm": 41.42584991455078, "learning_rate": 9.533073929961091e-06, "loss": 1.0914, "step": 10800 }, { "epoch": 1.4072632944228274, "grad_norm": 55.4241828918457, "learning_rate": 9.530912235192391e-06, "loss": 1.0457, "step": 10850 }, { "epoch": 1.4137483787289233, "grad_norm": 92.30503845214844, "learning_rate": 9.528750540423693e-06, "loss": 1.1102, "step": 10900 }, { "epoch": 1.4202334630350195, "grad_norm": 183.83782958984375, "learning_rate": 9.526588845654994e-06, "loss": 1.0788, "step": 10950 }, { "epoch": 1.4267185473411155, "grad_norm": 58.78367614746094, "learning_rate": 9.524427150886296e-06, "loss": 1.0883, "step": 11000 }, { "epoch": 1.4332036316472114, "grad_norm": 122.31067657470703, "learning_rate": 9.522265456117596e-06, "loss": 1.0949, "step": 11050 }, { "epoch": 1.4396887159533074, "grad_norm": 92.71478271484375, "learning_rate": 9.520103761348897e-06, "loss": 1.1128, "step": 11100 }, { "epoch": 1.4461738002594033, "grad_norm": 63.70943832397461, "learning_rate": 9.517942066580199e-06, "loss": 1.1321, "step": 11150 }, { "epoch": 1.4526588845654993, "grad_norm": 55.85942840576172, "learning_rate": 9.5157803718115e-06, "loss": 1.1075, "step": 11200 }, { "epoch": 1.4591439688715953, "grad_norm": 141.53460693359375, "learning_rate": 9.513618677042802e-06, "loss": 1.0777, "step": 11250 }, { "epoch": 1.4656290531776914, "grad_norm": 52.136348724365234, "learning_rate": 9.511456982274104e-06, "loss": 1.094, "step": 11300 }, { "epoch": 1.4721141374837874, "grad_norm": 66.62019348144531, "learning_rate": 9.509295287505405e-06, "loss": 1.1122, "step": 11350 }, { "epoch": 1.4785992217898833, "grad_norm": 37.23124313354492, "learning_rate": 9.507133592736705e-06, "loss": 1.1029, "step": 11400 }, { "epoch": 1.4850843060959793, "grad_norm": 141.20053100585938, "learning_rate": 9.504971897968007e-06, "loss": 1.0891, "step": 11450 }, { "epoch": 1.4915693904020753, "grad_norm": 70.93553924560547, "learning_rate": 9.502810203199308e-06, "loss": 1.0799, "step": 11500 }, { "epoch": 1.4980544747081712, "grad_norm": 78.5645980834961, "learning_rate": 9.50064850843061e-06, "loss": 1.0771, "step": 11550 }, { "epoch": 1.5045395590142672, "grad_norm": 110.03238677978516, "learning_rate": 9.498486813661911e-06, "loss": 1.052, "step": 11600 }, { "epoch": 1.5110246433203631, "grad_norm": 94.38980865478516, "learning_rate": 9.496325118893213e-06, "loss": 1.0831, "step": 11650 }, { "epoch": 1.517509727626459, "grad_norm": 72.01763153076172, "learning_rate": 9.494163424124515e-06, "loss": 1.0674, "step": 11700 }, { "epoch": 1.523994811932555, "grad_norm": 51.9689826965332, "learning_rate": 9.492001729355815e-06, "loss": 1.0831, "step": 11750 }, { "epoch": 1.530479896238651, "grad_norm": 107.07817840576172, "learning_rate": 9.489840034587116e-06, "loss": 1.0426, "step": 11800 }, { "epoch": 1.536964980544747, "grad_norm": 47.38414764404297, "learning_rate": 9.487678339818418e-06, "loss": 1.0711, "step": 11850 }, { "epoch": 1.543450064850843, "grad_norm": 45.52274703979492, "learning_rate": 9.48551664504972e-06, "loss": 1.0781, "step": 11900 }, { "epoch": 1.549935149156939, "grad_norm": 26.50186538696289, "learning_rate": 9.483354950281021e-06, "loss": 1.0574, "step": 11950 }, { "epoch": 1.556420233463035, "grad_norm": 41.2259521484375, "learning_rate": 9.481193255512323e-06, "loss": 1.0754, "step": 12000 }, { "epoch": 1.562905317769131, "grad_norm": 35.60273361206055, "learning_rate": 9.479031560743624e-06, "loss": 1.0288, "step": 12050 }, { "epoch": 1.569390402075227, "grad_norm": 41.92966842651367, "learning_rate": 9.476869865974924e-06, "loss": 1.0488, "step": 12100 }, { "epoch": 1.575875486381323, "grad_norm": 18.675764083862305, "learning_rate": 9.474708171206226e-06, "loss": 1.0531, "step": 12150 }, { "epoch": 1.582360570687419, "grad_norm": 108.12574005126953, "learning_rate": 9.472546476437527e-06, "loss": 1.0376, "step": 12200 }, { "epoch": 1.588845654993515, "grad_norm": 112.02227020263672, "learning_rate": 9.470384781668829e-06, "loss": 1.0343, "step": 12250 }, { "epoch": 1.595330739299611, "grad_norm": 98.89630126953125, "learning_rate": 9.46822308690013e-06, "loss": 1.0823, "step": 12300 }, { "epoch": 1.601815823605707, "grad_norm": 118.65319061279297, "learning_rate": 9.466061392131432e-06, "loss": 1.0556, "step": 12350 }, { "epoch": 1.608300907911803, "grad_norm": 129.68385314941406, "learning_rate": 9.463899697362734e-06, "loss": 1.0468, "step": 12400 }, { "epoch": 1.6147859922178989, "grad_norm": 76.83685302734375, "learning_rate": 9.461738002594033e-06, "loss": 1.0861, "step": 12450 }, { "epoch": 1.6212710765239948, "grad_norm": 157.33180236816406, "learning_rate": 9.459576307825335e-06, "loss": 1.0452, "step": 12500 }, { "epoch": 1.6277561608300908, "grad_norm": 68.68656158447266, "learning_rate": 9.457414613056637e-06, "loss": 1.0309, "step": 12550 }, { "epoch": 1.6342412451361867, "grad_norm": 168.48919677734375, "learning_rate": 9.455252918287938e-06, "loss": 1.0167, "step": 12600 }, { "epoch": 1.6407263294422827, "grad_norm": 57.15532684326172, "learning_rate": 9.45309122351924e-06, "loss": 0.9928, "step": 12650 }, { "epoch": 1.6472114137483787, "grad_norm": 108.53094482421875, "learning_rate": 9.450929528750541e-06, "loss": 0.9977, "step": 12700 }, { "epoch": 1.6536964980544746, "grad_norm": 51.94723129272461, "learning_rate": 9.448767833981843e-06, "loss": 1.0463, "step": 12750 }, { "epoch": 1.6601815823605706, "grad_norm": 190.38787841796875, "learning_rate": 9.446606139213143e-06, "loss": 1.0505, "step": 12800 }, { "epoch": 1.6666666666666665, "grad_norm": 53.69584274291992, "learning_rate": 9.444444444444445e-06, "loss": 1.0064, "step": 12850 }, { "epoch": 1.6731517509727627, "grad_norm": 116.31331634521484, "learning_rate": 9.442282749675746e-06, "loss": 1.0066, "step": 12900 }, { "epoch": 1.6796368352788587, "grad_norm": 102.2055892944336, "learning_rate": 9.440121054907048e-06, "loss": 0.9889, "step": 12950 }, { "epoch": 1.6861219195849546, "grad_norm": 78.99929809570312, "learning_rate": 9.43795936013835e-06, "loss": 1.0151, "step": 13000 }, { "epoch": 1.6926070038910506, "grad_norm": 97.79879760742188, "learning_rate": 9.435797665369651e-06, "loss": 0.9862, "step": 13050 }, { "epoch": 1.6990920881971465, "grad_norm": 30.27912139892578, "learning_rate": 9.433635970600953e-06, "loss": 1.0031, "step": 13100 }, { "epoch": 1.7055771725032427, "grad_norm": 47.64608383178711, "learning_rate": 9.431474275832252e-06, "loss": 0.9915, "step": 13150 }, { "epoch": 1.7120622568093387, "grad_norm": 73.29598236083984, "learning_rate": 9.429312581063554e-06, "loss": 0.9978, "step": 13200 }, { "epoch": 1.7185473411154346, "grad_norm": 54.02968215942383, "learning_rate": 9.427150886294856e-06, "loss": 0.9901, "step": 13250 }, { "epoch": 1.7250324254215306, "grad_norm": 100.88853454589844, "learning_rate": 9.424989191526157e-06, "loss": 1.0254, "step": 13300 }, { "epoch": 1.7315175097276265, "grad_norm": 105.97042083740234, "learning_rate": 9.422827496757459e-06, "loss": 1.0053, "step": 13350 }, { "epoch": 1.7380025940337225, "grad_norm": 110.1412582397461, "learning_rate": 9.42066580198876e-06, "loss": 0.996, "step": 13400 }, { "epoch": 1.7444876783398184, "grad_norm": 102.8427505493164, "learning_rate": 9.41850410722006e-06, "loss": 1.0155, "step": 13450 }, { "epoch": 1.7509727626459144, "grad_norm": 87.20895385742188, "learning_rate": 9.416342412451362e-06, "loss": 0.9852, "step": 13500 }, { "epoch": 1.7574578469520103, "grad_norm": 77.43791198730469, "learning_rate": 9.414180717682663e-06, "loss": 0.9895, "step": 13550 }, { "epoch": 1.7639429312581063, "grad_norm": 172.30885314941406, "learning_rate": 9.412019022913965e-06, "loss": 0.9743, "step": 13600 }, { "epoch": 1.7704280155642023, "grad_norm": 70.97063446044922, "learning_rate": 9.409857328145267e-06, "loss": 0.9753, "step": 13650 }, { "epoch": 1.7769130998702982, "grad_norm": 123.29631805419922, "learning_rate": 9.407695633376568e-06, "loss": 0.9374, "step": 13700 }, { "epoch": 1.7833981841763942, "grad_norm": 169.43450927734375, "learning_rate": 9.40553393860787e-06, "loss": 0.9733, "step": 13750 }, { "epoch": 1.7898832684824901, "grad_norm": 94.36160278320312, "learning_rate": 9.40337224383917e-06, "loss": 0.9741, "step": 13800 }, { "epoch": 1.796368352788586, "grad_norm": 67.74256896972656, "learning_rate": 9.401210549070471e-06, "loss": 0.9556, "step": 13850 }, { "epoch": 1.8028534370946823, "grad_norm": 171.77330017089844, "learning_rate": 9.399048854301773e-06, "loss": 0.9841, "step": 13900 }, { "epoch": 1.8093385214007782, "grad_norm": 110.4674301147461, "learning_rate": 9.396887159533075e-06, "loss": 1.0008, "step": 13950 }, { "epoch": 1.8158236057068742, "grad_norm": 171.76177978515625, "learning_rate": 9.394725464764376e-06, "loss": 0.9835, "step": 14000 }, { "epoch": 1.8223086900129701, "grad_norm": 130.97406005859375, "learning_rate": 9.392563769995678e-06, "loss": 0.9498, "step": 14050 }, { "epoch": 1.8287937743190663, "grad_norm": 74.91665649414062, "learning_rate": 9.39040207522698e-06, "loss": 0.9621, "step": 14100 }, { "epoch": 1.8352788586251623, "grad_norm": 48.18241500854492, "learning_rate": 9.38824038045828e-06, "loss": 0.9802, "step": 14150 }, { "epoch": 1.8417639429312582, "grad_norm": 133.60479736328125, "learning_rate": 9.38607868568958e-06, "loss": 0.9622, "step": 14200 }, { "epoch": 1.8482490272373542, "grad_norm": 163.2623291015625, "learning_rate": 9.383916990920882e-06, "loss": 0.959, "step": 14250 }, { "epoch": 1.8547341115434501, "grad_norm": 56.28314208984375, "learning_rate": 9.381755296152184e-06, "loss": 0.9607, "step": 14300 }, { "epoch": 1.861219195849546, "grad_norm": 115.68190002441406, "learning_rate": 9.379593601383486e-06, "loss": 0.9643, "step": 14350 }, { "epoch": 1.867704280155642, "grad_norm": 57.527828216552734, "learning_rate": 9.377431906614787e-06, "loss": 0.9458, "step": 14400 }, { "epoch": 1.874189364461738, "grad_norm": 83.91288757324219, "learning_rate": 9.375270211846089e-06, "loss": 0.937, "step": 14450 }, { "epoch": 1.880674448767834, "grad_norm": 205.3312530517578, "learning_rate": 9.373108517077389e-06, "loss": 0.9382, "step": 14500 }, { "epoch": 1.88715953307393, "grad_norm": 84.1654281616211, "learning_rate": 9.37094682230869e-06, "loss": 0.9737, "step": 14550 }, { "epoch": 1.8936446173800259, "grad_norm": 59.71659469604492, "learning_rate": 9.368785127539992e-06, "loss": 0.9766, "step": 14600 }, { "epoch": 1.9001297016861218, "grad_norm": 62.78482437133789, "learning_rate": 9.366623432771293e-06, "loss": 0.9746, "step": 14650 }, { "epoch": 1.9066147859922178, "grad_norm": 41.973777770996094, "learning_rate": 9.364461738002595e-06, "loss": 0.965, "step": 14700 }, { "epoch": 1.9130998702983137, "grad_norm": 26.649688720703125, "learning_rate": 9.362300043233897e-06, "loss": 0.949, "step": 14750 }, { "epoch": 1.9195849546044097, "grad_norm": 43.40812683105469, "learning_rate": 9.360138348465198e-06, "loss": 0.9204, "step": 14800 }, { "epoch": 1.9260700389105059, "grad_norm": 72.37606811523438, "learning_rate": 9.357976653696498e-06, "loss": 0.9599, "step": 14850 }, { "epoch": 1.9325551232166018, "grad_norm": 24.634532928466797, "learning_rate": 9.3558149589278e-06, "loss": 0.9309, "step": 14900 }, { "epoch": 1.9390402075226978, "grad_norm": 74.19110870361328, "learning_rate": 9.353653264159101e-06, "loss": 0.9173, "step": 14950 }, { "epoch": 1.9455252918287937, "grad_norm": 76.68376922607422, "learning_rate": 9.351491569390403e-06, "loss": 0.9305, "step": 15000 }, { "epoch": 1.9520103761348897, "grad_norm": 80.31610107421875, "learning_rate": 9.349329874621705e-06, "loss": 0.9095, "step": 15050 }, { "epoch": 1.9584954604409859, "grad_norm": 59.694969177246094, "learning_rate": 9.347168179853006e-06, "loss": 0.9157, "step": 15100 }, { "epoch": 1.9649805447470818, "grad_norm": 47.985164642333984, "learning_rate": 9.345006485084308e-06, "loss": 0.9553, "step": 15150 }, { "epoch": 1.9714656290531778, "grad_norm": 119.7039566040039, "learning_rate": 9.342844790315608e-06, "loss": 0.9479, "step": 15200 }, { "epoch": 1.9779507133592737, "grad_norm": 84.06747436523438, "learning_rate": 9.34068309554691e-06, "loss": 0.9539, "step": 15250 }, { "epoch": 1.9844357976653697, "grad_norm": 182.93211364746094, "learning_rate": 9.33852140077821e-06, "loss": 0.9279, "step": 15300 }, { "epoch": 1.9909208819714657, "grad_norm": 33.33463668823242, "learning_rate": 9.336359706009512e-06, "loss": 0.9237, "step": 15350 }, { "epoch": 1.9974059662775616, "grad_norm": 98.06361389160156, "learning_rate": 9.334198011240814e-06, "loss": 0.9449, "step": 15400 }, { "epoch": 2.0038910505836576, "grad_norm": 40.220664978027344, "learning_rate": 9.332036316472116e-06, "loss": 0.9192, "step": 15450 }, { "epoch": 2.0103761348897535, "grad_norm": 67.13005828857422, "learning_rate": 9.329874621703417e-06, "loss": 0.9487, "step": 15500 }, { "epoch": 2.0168612191958495, "grad_norm": 163.42137145996094, "learning_rate": 9.327712926934717e-06, "loss": 0.9684, "step": 15550 }, { "epoch": 2.0233463035019454, "grad_norm": 82.5510025024414, "learning_rate": 9.325551232166019e-06, "loss": 0.9344, "step": 15600 }, { "epoch": 2.0298313878080414, "grad_norm": 203.52099609375, "learning_rate": 9.32338953739732e-06, "loss": 0.8956, "step": 15650 }, { "epoch": 2.0363164721141374, "grad_norm": 72.38980865478516, "learning_rate": 9.321227842628622e-06, "loss": 0.9391, "step": 15700 }, { "epoch": 2.0428015564202333, "grad_norm": 50.11948013305664, "learning_rate": 9.319066147859923e-06, "loss": 0.9348, "step": 15750 }, { "epoch": 2.0492866407263293, "grad_norm": 122.09666442871094, "learning_rate": 9.316904453091225e-06, "loss": 0.8871, "step": 15800 }, { "epoch": 2.0557717250324252, "grad_norm": 59.022274017333984, "learning_rate": 9.314742758322527e-06, "loss": 0.9101, "step": 15850 }, { "epoch": 2.062256809338521, "grad_norm": 76.15840148925781, "learning_rate": 9.312581063553827e-06, "loss": 0.9273, "step": 15900 }, { "epoch": 2.0687418936446176, "grad_norm": 47.89101791381836, "learning_rate": 9.310419368785128e-06, "loss": 0.9355, "step": 15950 }, { "epoch": 2.0752269779507135, "grad_norm": 229.04345703125, "learning_rate": 9.30825767401643e-06, "loss": 0.9091, "step": 16000 }, { "epoch": 2.0817120622568095, "grad_norm": 133.06822204589844, "learning_rate": 9.306095979247731e-06, "loss": 0.9448, "step": 16050 }, { "epoch": 2.0881971465629054, "grad_norm": 58.23340606689453, "learning_rate": 9.303934284479033e-06, "loss": 0.9028, "step": 16100 }, { "epoch": 2.0946822308690014, "grad_norm": 232.0340118408203, "learning_rate": 9.301772589710335e-06, "loss": 0.9424, "step": 16150 }, { "epoch": 2.1011673151750974, "grad_norm": 22.04237937927246, "learning_rate": 9.299610894941634e-06, "loss": 0.9227, "step": 16200 }, { "epoch": 2.1076523994811933, "grad_norm": 116.25421142578125, "learning_rate": 9.297449200172936e-06, "loss": 0.8914, "step": 16250 }, { "epoch": 2.1141374837872893, "grad_norm": 69.3602066040039, "learning_rate": 9.295287505404238e-06, "loss": 0.8985, "step": 16300 }, { "epoch": 2.1206225680933852, "grad_norm": 145.1238555908203, "learning_rate": 9.29312581063554e-06, "loss": 0.8987, "step": 16350 }, { "epoch": 2.127107652399481, "grad_norm": 24.35103988647461, "learning_rate": 9.29096411586684e-06, "loss": 0.9213, "step": 16400 }, { "epoch": 2.133592736705577, "grad_norm": 37.310787200927734, "learning_rate": 9.288802421098142e-06, "loss": 0.8847, "step": 16450 }, { "epoch": 2.140077821011673, "grad_norm": 132.53892517089844, "learning_rate": 9.286640726329444e-06, "loss": 0.9068, "step": 16500 }, { "epoch": 2.146562905317769, "grad_norm": 75.88333892822266, "learning_rate": 9.284479031560744e-06, "loss": 0.894, "step": 16550 }, { "epoch": 2.153047989623865, "grad_norm": 251.23751831054688, "learning_rate": 9.282317336792046e-06, "loss": 0.9422, "step": 16600 }, { "epoch": 2.159533073929961, "grad_norm": 32.46202850341797, "learning_rate": 9.280155642023347e-06, "loss": 0.9291, "step": 16650 }, { "epoch": 2.166018158236057, "grad_norm": 53.387718200683594, "learning_rate": 9.277993947254649e-06, "loss": 0.8967, "step": 16700 }, { "epoch": 2.172503242542153, "grad_norm": 209.8604278564453, "learning_rate": 9.27583225248595e-06, "loss": 0.9111, "step": 16750 }, { "epoch": 2.178988326848249, "grad_norm": 96.47901153564453, "learning_rate": 9.273670557717252e-06, "loss": 0.9166, "step": 16800 }, { "epoch": 2.1854734111543452, "grad_norm": 52.16880798339844, "learning_rate": 9.271508862948553e-06, "loss": 0.8909, "step": 16850 }, { "epoch": 2.191958495460441, "grad_norm": 170.49676513671875, "learning_rate": 9.269347168179853e-06, "loss": 0.898, "step": 16900 }, { "epoch": 2.198443579766537, "grad_norm": 55.0761604309082, "learning_rate": 9.267185473411155e-06, "loss": 0.9078, "step": 16950 }, { "epoch": 2.204928664072633, "grad_norm": 124.61663055419922, "learning_rate": 9.265023778642457e-06, "loss": 0.928, "step": 17000 }, { "epoch": 2.211413748378729, "grad_norm": 49.64213562011719, "learning_rate": 9.262862083873758e-06, "loss": 0.9018, "step": 17050 }, { "epoch": 2.217898832684825, "grad_norm": 143.7904052734375, "learning_rate": 9.26070038910506e-06, "loss": 0.8655, "step": 17100 }, { "epoch": 2.224383916990921, "grad_norm": 139.10025024414062, "learning_rate": 9.258538694336361e-06, "loss": 0.9088, "step": 17150 }, { "epoch": 2.230869001297017, "grad_norm": 18.64621925354004, "learning_rate": 9.256376999567663e-06, "loss": 0.8923, "step": 17200 }, { "epoch": 2.237354085603113, "grad_norm": 154.90325927734375, "learning_rate": 9.254215304798963e-06, "loss": 0.912, "step": 17250 }, { "epoch": 2.243839169909209, "grad_norm": 87.64720916748047, "learning_rate": 9.252053610030264e-06, "loss": 0.8789, "step": 17300 }, { "epoch": 2.250324254215305, "grad_norm": 56.62800216674805, "learning_rate": 9.249891915261566e-06, "loss": 0.8899, "step": 17350 }, { "epoch": 2.2568093385214008, "grad_norm": 37.476234436035156, "learning_rate": 9.247730220492868e-06, "loss": 0.8846, "step": 17400 }, { "epoch": 2.2632944228274967, "grad_norm": 60.178428649902344, "learning_rate": 9.24556852572417e-06, "loss": 0.9088, "step": 17450 }, { "epoch": 2.2697795071335927, "grad_norm": 113.12017059326172, "learning_rate": 9.24340683095547e-06, "loss": 0.8523, "step": 17500 }, { "epoch": 2.2762645914396886, "grad_norm": 70.21991729736328, "learning_rate": 9.241245136186772e-06, "loss": 0.8874, "step": 17550 }, { "epoch": 2.2827496757457846, "grad_norm": 20.540199279785156, "learning_rate": 9.239083441418072e-06, "loss": 0.8262, "step": 17600 }, { "epoch": 2.2892347600518805, "grad_norm": 32.57448959350586, "learning_rate": 9.236921746649374e-06, "loss": 0.8445, "step": 17650 }, { "epoch": 2.2957198443579765, "grad_norm": 232.79153442382812, "learning_rate": 9.234760051880676e-06, "loss": 0.8666, "step": 17700 }, { "epoch": 2.3022049286640724, "grad_norm": 52.618385314941406, "learning_rate": 9.232598357111977e-06, "loss": 0.8744, "step": 17750 }, { "epoch": 2.3086900129701684, "grad_norm": 47.01662826538086, "learning_rate": 9.230436662343279e-06, "loss": 0.8673, "step": 17800 }, { "epoch": 2.3151750972762644, "grad_norm": 52.647891998291016, "learning_rate": 9.22827496757458e-06, "loss": 0.8946, "step": 17850 }, { "epoch": 2.3216601815823608, "grad_norm": 66.30323791503906, "learning_rate": 9.226113272805882e-06, "loss": 0.9222, "step": 17900 }, { "epoch": 2.3281452658884567, "grad_norm": 78.40735626220703, "learning_rate": 9.223951578037182e-06, "loss": 0.8958, "step": 17950 }, { "epoch": 2.3346303501945527, "grad_norm": 156.3478240966797, "learning_rate": 9.221789883268483e-06, "loss": 0.8631, "step": 18000 }, { "epoch": 2.3411154345006486, "grad_norm": 46.133201599121094, "learning_rate": 9.219628188499785e-06, "loss": 0.8669, "step": 18050 }, { "epoch": 2.3476005188067446, "grad_norm": 117.3602523803711, "learning_rate": 9.217466493731085e-06, "loss": 0.8592, "step": 18100 }, { "epoch": 2.3540856031128405, "grad_norm": 99.78243255615234, "learning_rate": 9.215304798962386e-06, "loss": 0.8563, "step": 18150 }, { "epoch": 2.3605706874189365, "grad_norm": 47.9234504699707, "learning_rate": 9.213143104193688e-06, "loss": 0.8678, "step": 18200 }, { "epoch": 2.3670557717250325, "grad_norm": 83.74739837646484, "learning_rate": 9.21098140942499e-06, "loss": 0.8752, "step": 18250 }, { "epoch": 2.3735408560311284, "grad_norm": 36.51896667480469, "learning_rate": 9.208819714656291e-06, "loss": 0.9021, "step": 18300 }, { "epoch": 2.3800259403372244, "grad_norm": 31.101106643676758, "learning_rate": 9.206658019887593e-06, "loss": 0.8338, "step": 18350 }, { "epoch": 2.3865110246433203, "grad_norm": 103.6131591796875, "learning_rate": 9.204496325118893e-06, "loss": 0.9075, "step": 18400 }, { "epoch": 2.3929961089494163, "grad_norm": 36.490447998046875, "learning_rate": 9.202334630350194e-06, "loss": 0.8567, "step": 18450 }, { "epoch": 2.3994811932555122, "grad_norm": 55.931556701660156, "learning_rate": 9.200172935581496e-06, "loss": 0.8783, "step": 18500 }, { "epoch": 2.405966277561608, "grad_norm": 78.6571044921875, "learning_rate": 9.198011240812798e-06, "loss": 0.8902, "step": 18550 }, { "epoch": 2.412451361867704, "grad_norm": 106.48160552978516, "learning_rate": 9.195849546044099e-06, "loss": 0.8735, "step": 18600 }, { "epoch": 2.4189364461738, "grad_norm": 160.64849853515625, "learning_rate": 9.1936878512754e-06, "loss": 0.8662, "step": 18650 }, { "epoch": 2.425421530479896, "grad_norm": 97.8504867553711, "learning_rate": 9.191526156506702e-06, "loss": 0.8682, "step": 18700 }, { "epoch": 2.4319066147859925, "grad_norm": 70.43258666992188, "learning_rate": 9.189364461738002e-06, "loss": 0.8945, "step": 18750 }, { "epoch": 2.4383916990920884, "grad_norm": 112.30128479003906, "learning_rate": 9.187202766969304e-06, "loss": 0.8751, "step": 18800 }, { "epoch": 2.4448767833981844, "grad_norm": 112.90283203125, "learning_rate": 9.185041072200605e-06, "loss": 0.8573, "step": 18850 }, { "epoch": 2.4513618677042803, "grad_norm": 36.05859375, "learning_rate": 9.182879377431907e-06, "loss": 0.8304, "step": 18900 }, { "epoch": 2.4578469520103763, "grad_norm": 72.84355163574219, "learning_rate": 9.180717682663209e-06, "loss": 0.8208, "step": 18950 }, { "epoch": 2.4643320363164722, "grad_norm": 125.35198974609375, "learning_rate": 9.17855598789451e-06, "loss": 0.8643, "step": 19000 }, { "epoch": 2.470817120622568, "grad_norm": 93.8465805053711, "learning_rate": 9.176394293125812e-06, "loss": 0.8591, "step": 19050 }, { "epoch": 2.477302204928664, "grad_norm": 114.83902740478516, "learning_rate": 9.174232598357112e-06, "loss": 0.836, "step": 19100 }, { "epoch": 2.48378728923476, "grad_norm": 61.47188949584961, "learning_rate": 9.172070903588413e-06, "loss": 0.8594, "step": 19150 }, { "epoch": 2.490272373540856, "grad_norm": 81.23229217529297, "learning_rate": 9.169909208819715e-06, "loss": 0.8223, "step": 19200 }, { "epoch": 2.496757457846952, "grad_norm": 143.3751678466797, "learning_rate": 9.167747514051016e-06, "loss": 0.8492, "step": 19250 }, { "epoch": 2.503242542153048, "grad_norm": 75.92655181884766, "learning_rate": 9.165585819282318e-06, "loss": 0.834, "step": 19300 }, { "epoch": 2.509727626459144, "grad_norm": 67.34745788574219, "learning_rate": 9.16342412451362e-06, "loss": 0.8322, "step": 19350 }, { "epoch": 2.51621271076524, "grad_norm": 125.6097640991211, "learning_rate": 9.161262429744921e-06, "loss": 0.8121, "step": 19400 }, { "epoch": 2.522697795071336, "grad_norm": 104.31269836425781, "learning_rate": 9.159100734976221e-06, "loss": 0.8242, "step": 19450 }, { "epoch": 2.529182879377432, "grad_norm": 88.86971282958984, "learning_rate": 9.156939040207523e-06, "loss": 0.8465, "step": 19500 }, { "epoch": 2.5356679636835278, "grad_norm": 84.49606323242188, "learning_rate": 9.154777345438824e-06, "loss": 0.8228, "step": 19550 }, { "epoch": 2.5421530479896237, "grad_norm": 72.06951904296875, "learning_rate": 9.152615650670126e-06, "loss": 0.8321, "step": 19600 }, { "epoch": 2.5486381322957197, "grad_norm": 51.27252197265625, "learning_rate": 9.150453955901428e-06, "loss": 0.8377, "step": 19650 }, { "epoch": 2.5551232166018156, "grad_norm": 82.98815155029297, "learning_rate": 9.148292261132729e-06, "loss": 0.8577, "step": 19700 }, { "epoch": 2.5616083009079116, "grad_norm": 86.29476928710938, "learning_rate": 9.146130566364029e-06, "loss": 0.838, "step": 19750 }, { "epoch": 2.5680933852140075, "grad_norm": 201.86570739746094, "learning_rate": 9.14396887159533e-06, "loss": 0.8341, "step": 19800 }, { "epoch": 2.5745784695201035, "grad_norm": 48.80326461791992, "learning_rate": 9.141807176826632e-06, "loss": 0.8183, "step": 19850 }, { "epoch": 2.5810635538262, "grad_norm": 123.20867156982422, "learning_rate": 9.139645482057934e-06, "loss": 0.8041, "step": 19900 }, { "epoch": 2.587548638132296, "grad_norm": 77.76668548583984, "learning_rate": 9.137483787289235e-06, "loss": 0.8027, "step": 19950 }, { "epoch": 2.594033722438392, "grad_norm": 60.8740119934082, "learning_rate": 9.135322092520537e-06, "loss": 0.8354, "step": 20000 }, { "epoch": 2.6005188067444878, "grad_norm": 33.433929443359375, "learning_rate": 9.133160397751839e-06, "loss": 0.8297, "step": 20050 }, { "epoch": 2.6070038910505837, "grad_norm": 101.6844253540039, "learning_rate": 9.130998702983139e-06, "loss": 0.8595, "step": 20100 }, { "epoch": 2.6134889753566797, "grad_norm": 56.76240921020508, "learning_rate": 9.12883700821444e-06, "loss": 0.8318, "step": 20150 }, { "epoch": 2.6199740596627756, "grad_norm": 77.91346740722656, "learning_rate": 9.126675313445742e-06, "loss": 0.8171, "step": 20200 }, { "epoch": 2.6264591439688716, "grad_norm": 22.083127975463867, "learning_rate": 9.124513618677043e-06, "loss": 0.8436, "step": 20250 }, { "epoch": 2.6329442282749675, "grad_norm": 32.83180618286133, "learning_rate": 9.122351923908345e-06, "loss": 0.8413, "step": 20300 }, { "epoch": 2.6394293125810635, "grad_norm": 80.33685302734375, "learning_rate": 9.120190229139646e-06, "loss": 0.8307, "step": 20350 }, { "epoch": 2.6459143968871595, "grad_norm": 106.72901916503906, "learning_rate": 9.118028534370948e-06, "loss": 0.8143, "step": 20400 }, { "epoch": 2.6523994811932554, "grad_norm": 84.93223571777344, "learning_rate": 9.115866839602248e-06, "loss": 0.8212, "step": 20450 }, { "epoch": 2.6588845654993514, "grad_norm": 100.1551513671875, "learning_rate": 9.11370514483355e-06, "loss": 0.8289, "step": 20500 }, { "epoch": 2.6653696498054473, "grad_norm": 86.93508911132812, "learning_rate": 9.111543450064851e-06, "loss": 0.8047, "step": 20550 }, { "epoch": 2.6718547341115433, "grad_norm": 43.016624450683594, "learning_rate": 9.109381755296153e-06, "loss": 0.7988, "step": 20600 }, { "epoch": 2.6783398184176397, "grad_norm": 310.767822265625, "learning_rate": 9.107220060527454e-06, "loss": 0.8227, "step": 20650 }, { "epoch": 2.6848249027237356, "grad_norm": 82.60010528564453, "learning_rate": 9.105058365758756e-06, "loss": 0.8148, "step": 20700 }, { "epoch": 2.6913099870298316, "grad_norm": 76.9372329711914, "learning_rate": 9.102896670990058e-06, "loss": 0.7776, "step": 20750 }, { "epoch": 2.6977950713359276, "grad_norm": 41.984886169433594, "learning_rate": 9.100734976221357e-06, "loss": 0.8093, "step": 20800 }, { "epoch": 2.7042801556420235, "grad_norm": 58.13618850708008, "learning_rate": 9.098573281452659e-06, "loss": 0.8415, "step": 20850 }, { "epoch": 2.7107652399481195, "grad_norm": 66.05621337890625, "learning_rate": 9.09641158668396e-06, "loss": 0.8096, "step": 20900 }, { "epoch": 2.7172503242542154, "grad_norm": 63.902557373046875, "learning_rate": 9.094249891915262e-06, "loss": 0.7865, "step": 20950 }, { "epoch": 2.7237354085603114, "grad_norm": 41.3662109375, "learning_rate": 9.092088197146564e-06, "loss": 0.8213, "step": 21000 }, { "epoch": 2.7302204928664073, "grad_norm": 53.82701873779297, "learning_rate": 9.089926502377865e-06, "loss": 0.8267, "step": 21050 }, { "epoch": 2.7367055771725033, "grad_norm": 76.71524047851562, "learning_rate": 9.087764807609167e-06, "loss": 0.8065, "step": 21100 }, { "epoch": 2.7431906614785992, "grad_norm": 34.62066650390625, "learning_rate": 9.085603112840467e-06, "loss": 0.764, "step": 21150 }, { "epoch": 2.749675745784695, "grad_norm": 165.2742462158203, "learning_rate": 9.083441418071769e-06, "loss": 0.7828, "step": 21200 }, { "epoch": 2.756160830090791, "grad_norm": 82.91865539550781, "learning_rate": 9.08127972330307e-06, "loss": 0.8355, "step": 21250 }, { "epoch": 2.762645914396887, "grad_norm": 60.068851470947266, "learning_rate": 9.079118028534372e-06, "loss": 0.8015, "step": 21300 }, { "epoch": 2.769130998702983, "grad_norm": 194.20948791503906, "learning_rate": 9.076956333765673e-06, "loss": 0.8166, "step": 21350 }, { "epoch": 2.775616083009079, "grad_norm": 49.6822509765625, "learning_rate": 9.074794638996975e-06, "loss": 0.7728, "step": 21400 }, { "epoch": 2.782101167315175, "grad_norm": 73.5209732055664, "learning_rate": 9.072632944228276e-06, "loss": 0.7917, "step": 21450 }, { "epoch": 2.788586251621271, "grad_norm": 156.21685791015625, "learning_rate": 9.070471249459576e-06, "loss": 0.7691, "step": 21500 }, { "epoch": 2.795071335927367, "grad_norm": 85.61043548583984, "learning_rate": 9.068309554690878e-06, "loss": 0.7911, "step": 21550 }, { "epoch": 2.801556420233463, "grad_norm": 144.1258087158203, "learning_rate": 9.06614785992218e-06, "loss": 0.7966, "step": 21600 }, { "epoch": 2.808041504539559, "grad_norm": 45.8646125793457, "learning_rate": 9.063986165153481e-06, "loss": 0.8261, "step": 21650 }, { "epoch": 2.8145265888456548, "grad_norm": 58.49191665649414, "learning_rate": 9.061824470384783e-06, "loss": 0.8226, "step": 21700 }, { "epoch": 2.8210116731517507, "grad_norm": 105.04296112060547, "learning_rate": 9.059662775616084e-06, "loss": 0.7782, "step": 21750 }, { "epoch": 2.8274967574578467, "grad_norm": 62.90886688232422, "learning_rate": 9.057501080847386e-06, "loss": 0.7693, "step": 21800 }, { "epoch": 2.833981841763943, "grad_norm": 79.02916717529297, "learning_rate": 9.055339386078686e-06, "loss": 0.7863, "step": 21850 }, { "epoch": 2.840466926070039, "grad_norm": 92.87028503417969, "learning_rate": 9.053177691309987e-06, "loss": 0.7804, "step": 21900 }, { "epoch": 2.846952010376135, "grad_norm": 88.81787872314453, "learning_rate": 9.051015996541289e-06, "loss": 0.802, "step": 21950 }, { "epoch": 2.853437094682231, "grad_norm": 140.72811889648438, "learning_rate": 9.04885430177259e-06, "loss": 0.801, "step": 22000 }, { "epoch": 2.859922178988327, "grad_norm": 190.2725067138672, "learning_rate": 9.046692607003892e-06, "loss": 0.793, "step": 22050 }, { "epoch": 2.866407263294423, "grad_norm": 122.08084869384766, "learning_rate": 9.044530912235194e-06, "loss": 0.7703, "step": 22100 }, { "epoch": 2.872892347600519, "grad_norm": 217.95184326171875, "learning_rate": 9.042369217466494e-06, "loss": 0.8127, "step": 22150 }, { "epoch": 2.8793774319066148, "grad_norm": 71.10440826416016, "learning_rate": 9.040207522697795e-06, "loss": 0.7741, "step": 22200 }, { "epoch": 2.8858625162127107, "grad_norm": 101.68942260742188, "learning_rate": 9.038045827929097e-06, "loss": 0.7949, "step": 22250 }, { "epoch": 2.8923476005188067, "grad_norm": 55.40034484863281, "learning_rate": 9.035884133160399e-06, "loss": 0.7572, "step": 22300 }, { "epoch": 2.8988326848249026, "grad_norm": 33.14478302001953, "learning_rate": 9.0337224383917e-06, "loss": 0.7708, "step": 22350 }, { "epoch": 2.9053177691309986, "grad_norm": 182.9443359375, "learning_rate": 9.031560743623002e-06, "loss": 0.7756, "step": 22400 }, { "epoch": 2.9118028534370946, "grad_norm": 55.46072769165039, "learning_rate": 9.029399048854303e-06, "loss": 0.8147, "step": 22450 }, { "epoch": 2.9182879377431905, "grad_norm": 122.65208435058594, "learning_rate": 9.027237354085603e-06, "loss": 0.7468, "step": 22500 }, { "epoch": 2.924773022049287, "grad_norm": 127.9378662109375, "learning_rate": 9.025075659316905e-06, "loss": 0.7679, "step": 22550 }, { "epoch": 2.931258106355383, "grad_norm": 105.78032684326172, "learning_rate": 9.022913964548206e-06, "loss": 0.7804, "step": 22600 }, { "epoch": 2.937743190661479, "grad_norm": 24.228551864624023, "learning_rate": 9.020752269779508e-06, "loss": 0.7805, "step": 22650 }, { "epoch": 2.9442282749675748, "grad_norm": 106.55142974853516, "learning_rate": 9.01859057501081e-06, "loss": 0.7818, "step": 22700 }, { "epoch": 2.9507133592736707, "grad_norm": 188.49441528320312, "learning_rate": 9.016428880242111e-06, "loss": 0.7815, "step": 22750 }, { "epoch": 2.9571984435797667, "grad_norm": 130.7115478515625, "learning_rate": 9.014267185473413e-06, "loss": 0.7834, "step": 22800 }, { "epoch": 2.9636835278858626, "grad_norm": 46.354881286621094, "learning_rate": 9.012105490704713e-06, "loss": 0.7945, "step": 22850 }, { "epoch": 2.9701686121919586, "grad_norm": 52.4910774230957, "learning_rate": 9.009943795936014e-06, "loss": 0.769, "step": 22900 }, { "epoch": 2.9766536964980546, "grad_norm": 231.7021026611328, "learning_rate": 9.007782101167316e-06, "loss": 0.7773, "step": 22950 }, { "epoch": 2.9831387808041505, "grad_norm": 68.80513763427734, "learning_rate": 9.005620406398617e-06, "loss": 0.7665, "step": 23000 }, { "epoch": 2.9896238651102465, "grad_norm": 21.473207473754883, "learning_rate": 9.003458711629919e-06, "loss": 0.7801, "step": 23050 }, { "epoch": 2.9961089494163424, "grad_norm": 120.36124420166016, "learning_rate": 9.00129701686122e-06, "loss": 0.7948, "step": 23100 }, { "epoch": 3.0025940337224384, "grad_norm": 74.29264831542969, "learning_rate": 8.999135322092522e-06, "loss": 0.7434, "step": 23150 }, { "epoch": 3.0090791180285343, "grad_norm": 93.22494506835938, "learning_rate": 8.996973627323822e-06, "loss": 0.7349, "step": 23200 }, { "epoch": 3.0155642023346303, "grad_norm": 30.082307815551758, "learning_rate": 8.994811932555124e-06, "loss": 0.7339, "step": 23250 }, { "epoch": 3.0220492866407263, "grad_norm": 31.523271560668945, "learning_rate": 8.992650237786425e-06, "loss": 0.7256, "step": 23300 }, { "epoch": 3.028534370946822, "grad_norm": 43.012237548828125, "learning_rate": 8.990488543017727e-06, "loss": 0.7657, "step": 23350 }, { "epoch": 3.035019455252918, "grad_norm": 55.685081481933594, "learning_rate": 8.988326848249028e-06, "loss": 0.7587, "step": 23400 }, { "epoch": 3.041504539559014, "grad_norm": 51.72869110107422, "learning_rate": 8.98616515348033e-06, "loss": 0.7697, "step": 23450 }, { "epoch": 3.04798962386511, "grad_norm": 49.6856689453125, "learning_rate": 8.984003458711632e-06, "loss": 0.7699, "step": 23500 }, { "epoch": 3.054474708171206, "grad_norm": 62.46233367919922, "learning_rate": 8.981841763942932e-06, "loss": 0.7785, "step": 23550 }, { "epoch": 3.060959792477302, "grad_norm": 129.84275817871094, "learning_rate": 8.979680069174233e-06, "loss": 0.7474, "step": 23600 }, { "epoch": 3.0674448767833984, "grad_norm": 28.303911209106445, "learning_rate": 8.977518374405535e-06, "loss": 0.7629, "step": 23650 }, { "epoch": 3.0739299610894943, "grad_norm": 74.46251678466797, "learning_rate": 8.975356679636836e-06, "loss": 0.7633, "step": 23700 }, { "epoch": 3.0804150453955903, "grad_norm": 27.983522415161133, "learning_rate": 8.973194984868138e-06, "loss": 0.7769, "step": 23750 }, { "epoch": 3.0869001297016863, "grad_norm": 71.08908081054688, "learning_rate": 8.97103329009944e-06, "loss": 0.7656, "step": 23800 }, { "epoch": 3.093385214007782, "grad_norm": 100.88603210449219, "learning_rate": 8.968871595330741e-06, "loss": 0.7546, "step": 23850 }, { "epoch": 3.099870298313878, "grad_norm": 159.69082641601562, "learning_rate": 8.966709900562041e-06, "loss": 0.7591, "step": 23900 }, { "epoch": 3.106355382619974, "grad_norm": 28.74492073059082, "learning_rate": 8.964548205793343e-06, "loss": 0.7779, "step": 23950 }, { "epoch": 3.11284046692607, "grad_norm": 86.59606170654297, "learning_rate": 8.962386511024644e-06, "loss": 0.7592, "step": 24000 }, { "epoch": 3.119325551232166, "grad_norm": 77.73062133789062, "learning_rate": 8.960224816255946e-06, "loss": 0.7932, "step": 24050 }, { "epoch": 3.125810635538262, "grad_norm": 82.81999969482422, "learning_rate": 8.958063121487247e-06, "loss": 0.7468, "step": 24100 }, { "epoch": 3.132295719844358, "grad_norm": 106.86148834228516, "learning_rate": 8.955901426718549e-06, "loss": 0.7473, "step": 24150 }, { "epoch": 3.138780804150454, "grad_norm": 73.26065063476562, "learning_rate": 8.95373973194985e-06, "loss": 0.7653, "step": 24200 }, { "epoch": 3.14526588845655, "grad_norm": 154.48199462890625, "learning_rate": 8.95157803718115e-06, "loss": 0.7799, "step": 24250 }, { "epoch": 3.151750972762646, "grad_norm": 165.397216796875, "learning_rate": 8.949416342412452e-06, "loss": 0.7668, "step": 24300 }, { "epoch": 3.1582360570687418, "grad_norm": 54.25576400756836, "learning_rate": 8.947254647643754e-06, "loss": 0.7501, "step": 24350 }, { "epoch": 3.1647211413748377, "grad_norm": 78.98974609375, "learning_rate": 8.945092952875055e-06, "loss": 0.7546, "step": 24400 }, { "epoch": 3.1712062256809337, "grad_norm": 69.79071807861328, "learning_rate": 8.942931258106357e-06, "loss": 0.7455, "step": 24450 }, { "epoch": 3.1776913099870296, "grad_norm": 99.46908569335938, "learning_rate": 8.940769563337658e-06, "loss": 0.7438, "step": 24500 }, { "epoch": 3.184176394293126, "grad_norm": 87.56387329101562, "learning_rate": 8.938607868568958e-06, "loss": 0.7421, "step": 24550 }, { "epoch": 3.190661478599222, "grad_norm": 53.633941650390625, "learning_rate": 8.93644617380026e-06, "loss": 0.7625, "step": 24600 }, { "epoch": 3.197146562905318, "grad_norm": 108.66197967529297, "learning_rate": 8.934284479031562e-06, "loss": 0.7474, "step": 24650 }, { "epoch": 3.203631647211414, "grad_norm": 62.14433670043945, "learning_rate": 8.932122784262863e-06, "loss": 0.7359, "step": 24700 }, { "epoch": 3.21011673151751, "grad_norm": 110.50857543945312, "learning_rate": 8.929961089494165e-06, "loss": 0.7402, "step": 24750 }, { "epoch": 3.216601815823606, "grad_norm": 36.320377349853516, "learning_rate": 8.927799394725466e-06, "loss": 0.7385, "step": 24800 }, { "epoch": 3.223086900129702, "grad_norm": 119.52420043945312, "learning_rate": 8.925637699956768e-06, "loss": 0.755, "step": 24850 }, { "epoch": 3.2295719844357977, "grad_norm": 229.50978088378906, "learning_rate": 8.923476005188068e-06, "loss": 0.7408, "step": 24900 }, { "epoch": 3.2360570687418937, "grad_norm": 29.48551368713379, "learning_rate": 8.92131431041937e-06, "loss": 0.7422, "step": 24950 }, { "epoch": 3.2425421530479897, "grad_norm": 77.79827880859375, "learning_rate": 8.919152615650671e-06, "loss": 0.748, "step": 25000 }, { "epoch": 3.2490272373540856, "grad_norm": 58.29311752319336, "learning_rate": 8.916990920881973e-06, "loss": 0.7363, "step": 25050 }, { "epoch": 3.2555123216601816, "grad_norm": 22.339330673217773, "learning_rate": 8.914829226113274e-06, "loss": 0.7558, "step": 25100 }, { "epoch": 3.2619974059662775, "grad_norm": 154.0586700439453, "learning_rate": 8.912667531344576e-06, "loss": 0.7527, "step": 25150 }, { "epoch": 3.2684824902723735, "grad_norm": 33.30474090576172, "learning_rate": 8.910505836575877e-06, "loss": 0.7338, "step": 25200 }, { "epoch": 3.2749675745784694, "grad_norm": 70.1267318725586, "learning_rate": 8.908344141807177e-06, "loss": 0.7626, "step": 25250 }, { "epoch": 3.2814526588845654, "grad_norm": 214.113525390625, "learning_rate": 8.906182447038479e-06, "loss": 0.7451, "step": 25300 }, { "epoch": 3.2879377431906613, "grad_norm": 83.08194732666016, "learning_rate": 8.90402075226978e-06, "loss": 0.7545, "step": 25350 }, { "epoch": 3.2944228274967573, "grad_norm": 100.41940307617188, "learning_rate": 8.90185905750108e-06, "loss": 0.7442, "step": 25400 }, { "epoch": 3.3009079118028533, "grad_norm": 67.69851684570312, "learning_rate": 8.899697362732382e-06, "loss": 0.7333, "step": 25450 }, { "epoch": 3.307392996108949, "grad_norm": 35.9471549987793, "learning_rate": 8.897535667963684e-06, "loss": 0.7444, "step": 25500 }, { "epoch": 3.313878080415045, "grad_norm": 192.07264709472656, "learning_rate": 8.895373973194985e-06, "loss": 0.7427, "step": 25550 }, { "epoch": 3.3203631647211416, "grad_norm": 71.07801055908203, "learning_rate": 8.893212278426287e-06, "loss": 0.762, "step": 25600 }, { "epoch": 3.3268482490272375, "grad_norm": 94.97274780273438, "learning_rate": 8.891050583657588e-06, "loss": 0.7571, "step": 25650 }, { "epoch": 3.3333333333333335, "grad_norm": 64.86588287353516, "learning_rate": 8.888888888888888e-06, "loss": 0.7267, "step": 25700 }, { "epoch": 3.3398184176394294, "grad_norm": 46.446414947509766, "learning_rate": 8.88672719412019e-06, "loss": 0.7433, "step": 25750 }, { "epoch": 3.3463035019455254, "grad_norm": 141.70608520507812, "learning_rate": 8.884565499351491e-06, "loss": 0.7115, "step": 25800 }, { "epoch": 3.3527885862516213, "grad_norm": 131.68763732910156, "learning_rate": 8.882403804582793e-06, "loss": 0.7456, "step": 25850 }, { "epoch": 3.3592736705577173, "grad_norm": 44.90886306762695, "learning_rate": 8.880242109814095e-06, "loss": 0.7671, "step": 25900 }, { "epoch": 3.3657587548638133, "grad_norm": 76.7698974609375, "learning_rate": 8.878080415045396e-06, "loss": 0.7414, "step": 25950 }, { "epoch": 3.372243839169909, "grad_norm": 73.65957641601562, "learning_rate": 8.875918720276698e-06, "loss": 0.7381, "step": 26000 }, { "epoch": 3.378728923476005, "grad_norm": 98.279052734375, "learning_rate": 8.873757025507998e-06, "loss": 0.7289, "step": 26050 }, { "epoch": 3.385214007782101, "grad_norm": 87.40727233886719, "learning_rate": 8.8715953307393e-06, "loss": 0.7374, "step": 26100 }, { "epoch": 3.391699092088197, "grad_norm": 147.8469696044922, "learning_rate": 8.869433635970601e-06, "loss": 0.7193, "step": 26150 }, { "epoch": 3.398184176394293, "grad_norm": 57.17820358276367, "learning_rate": 8.867271941201903e-06, "loss": 0.7522, "step": 26200 }, { "epoch": 3.404669260700389, "grad_norm": 527.1165771484375, "learning_rate": 8.865110246433204e-06, "loss": 0.7249, "step": 26250 }, { "epoch": 3.411154345006485, "grad_norm": 110.1869125366211, "learning_rate": 8.862948551664506e-06, "loss": 0.7226, "step": 26300 }, { "epoch": 3.417639429312581, "grad_norm": 86.62249755859375, "learning_rate": 8.860786856895807e-06, "loss": 0.7605, "step": 26350 }, { "epoch": 3.424124513618677, "grad_norm": 53.44112014770508, "learning_rate": 8.858625162127107e-06, "loss": 0.7259, "step": 26400 }, { "epoch": 3.4306095979247733, "grad_norm": 53.45317840576172, "learning_rate": 8.856463467358409e-06, "loss": 0.7322, "step": 26450 }, { "epoch": 3.4370946822308692, "grad_norm": 75.9814682006836, "learning_rate": 8.85430177258971e-06, "loss": 0.7389, "step": 26500 }, { "epoch": 3.443579766536965, "grad_norm": 72.01563262939453, "learning_rate": 8.852140077821012e-06, "loss": 0.7388, "step": 26550 }, { "epoch": 3.450064850843061, "grad_norm": 108.14093017578125, "learning_rate": 8.849978383052314e-06, "loss": 0.7414, "step": 26600 }, { "epoch": 3.456549935149157, "grad_norm": 146.429443359375, "learning_rate": 8.847816688283615e-06, "loss": 0.7322, "step": 26650 }, { "epoch": 3.463035019455253, "grad_norm": 254.16734313964844, "learning_rate": 8.845654993514917e-06, "loss": 0.7494, "step": 26700 }, { "epoch": 3.469520103761349, "grad_norm": 186.4697265625, "learning_rate": 8.843493298746217e-06, "loss": 0.7497, "step": 26750 }, { "epoch": 3.476005188067445, "grad_norm": 110.53705596923828, "learning_rate": 8.841331603977518e-06, "loss": 0.7513, "step": 26800 }, { "epoch": 3.482490272373541, "grad_norm": 95.7660903930664, "learning_rate": 8.83916990920882e-06, "loss": 0.7256, "step": 26850 }, { "epoch": 3.488975356679637, "grad_norm": 60.745643615722656, "learning_rate": 8.837008214440121e-06, "loss": 0.7224, "step": 26900 }, { "epoch": 3.495460440985733, "grad_norm": 40.43708419799805, "learning_rate": 8.834846519671423e-06, "loss": 0.7239, "step": 26950 }, { "epoch": 3.501945525291829, "grad_norm": 42.59388732910156, "learning_rate": 8.832684824902725e-06, "loss": 0.7199, "step": 27000 }, { "epoch": 3.5084306095979247, "grad_norm": 71.25556945800781, "learning_rate": 8.830523130134026e-06, "loss": 0.7256, "step": 27050 }, { "epoch": 3.5149156939040207, "grad_norm": 92.77458190917969, "learning_rate": 8.828361435365326e-06, "loss": 0.7285, "step": 27100 }, { "epoch": 3.5214007782101167, "grad_norm": 52.927757263183594, "learning_rate": 8.826199740596628e-06, "loss": 0.7478, "step": 27150 }, { "epoch": 3.5278858625162126, "grad_norm": 71.94493865966797, "learning_rate": 8.82403804582793e-06, "loss": 0.7345, "step": 27200 }, { "epoch": 3.5343709468223086, "grad_norm": 58.30330276489258, "learning_rate": 8.821876351059231e-06, "loss": 0.6936, "step": 27250 }, { "epoch": 3.5408560311284045, "grad_norm": 54.03791046142578, "learning_rate": 8.819714656290533e-06, "loss": 0.7284, "step": 27300 }, { "epoch": 3.5473411154345005, "grad_norm": 86.29717254638672, "learning_rate": 8.817552961521834e-06, "loss": 0.7216, "step": 27350 }, { "epoch": 3.5538261997405964, "grad_norm": 105.19668579101562, "learning_rate": 8.815391266753136e-06, "loss": 0.7469, "step": 27400 }, { "epoch": 3.5603112840466924, "grad_norm": 174.84385681152344, "learning_rate": 8.813229571984436e-06, "loss": 0.7122, "step": 27450 }, { "epoch": 3.5667963683527883, "grad_norm": 88.76931762695312, "learning_rate": 8.811067877215737e-06, "loss": 0.7361, "step": 27500 }, { "epoch": 3.5732814526588843, "grad_norm": 238.61947631835938, "learning_rate": 8.808906182447039e-06, "loss": 0.7396, "step": 27550 }, { "epoch": 3.5797665369649807, "grad_norm": 105.16651916503906, "learning_rate": 8.80674448767834e-06, "loss": 0.7316, "step": 27600 }, { "epoch": 3.5862516212710767, "grad_norm": 219.0015869140625, "learning_rate": 8.804582792909642e-06, "loss": 0.7254, "step": 27650 }, { "epoch": 3.5927367055771726, "grad_norm": 140.21543884277344, "learning_rate": 8.802421098140944e-06, "loss": 0.7555, "step": 27700 }, { "epoch": 3.5992217898832686, "grad_norm": 89.52685546875, "learning_rate": 8.800259403372245e-06, "loss": 0.7362, "step": 27750 }, { "epoch": 3.6057068741893645, "grad_norm": 59.332977294921875, "learning_rate": 8.798097708603545e-06, "loss": 0.7284, "step": 27800 }, { "epoch": 3.6121919584954605, "grad_norm": 110.76482391357422, "learning_rate": 8.795936013834847e-06, "loss": 0.6894, "step": 27850 }, { "epoch": 3.6186770428015564, "grad_norm": 38.199073791503906, "learning_rate": 8.793774319066148e-06, "loss": 0.6988, "step": 27900 }, { "epoch": 3.6251621271076524, "grad_norm": 63.44047927856445, "learning_rate": 8.79161262429745e-06, "loss": 0.734, "step": 27950 }, { "epoch": 3.6316472114137484, "grad_norm": 121.4446029663086, "learning_rate": 8.789450929528751e-06, "loss": 0.7129, "step": 28000 }, { "epoch": 3.6381322957198443, "grad_norm": 70.12100982666016, "learning_rate": 8.787289234760053e-06, "loss": 0.6867, "step": 28050 }, { "epoch": 3.6446173800259403, "grad_norm": 51.042972564697266, "learning_rate": 8.785127539991353e-06, "loss": 0.7204, "step": 28100 }, { "epoch": 3.6511024643320362, "grad_norm": 43.0015869140625, "learning_rate": 8.782965845222655e-06, "loss": 0.7225, "step": 28150 }, { "epoch": 3.657587548638132, "grad_norm": 59.59611129760742, "learning_rate": 8.780804150453956e-06, "loss": 0.7149, "step": 28200 }, { "epoch": 3.664072632944228, "grad_norm": 25.105127334594727, "learning_rate": 8.778642455685258e-06, "loss": 0.6864, "step": 28250 }, { "epoch": 3.670557717250324, "grad_norm": 62.92705154418945, "learning_rate": 8.77648076091656e-06, "loss": 0.7048, "step": 28300 }, { "epoch": 3.6770428015564205, "grad_norm": 154.20318603515625, "learning_rate": 8.774319066147861e-06, "loss": 0.6617, "step": 28350 }, { "epoch": 3.6835278858625164, "grad_norm": 212.035400390625, "learning_rate": 8.772157371379163e-06, "loss": 0.6981, "step": 28400 }, { "epoch": 3.6900129701686124, "grad_norm": 98.92573547363281, "learning_rate": 8.769995676610462e-06, "loss": 0.7024, "step": 28450 }, { "epoch": 3.6964980544747084, "grad_norm": 136.00390625, "learning_rate": 8.767833981841764e-06, "loss": 0.7532, "step": 28500 }, { "epoch": 3.7029831387808043, "grad_norm": 249.03781127929688, "learning_rate": 8.765672287073066e-06, "loss": 0.6805, "step": 28550 }, { "epoch": 3.7094682230869003, "grad_norm": 37.31251525878906, "learning_rate": 8.763510592304367e-06, "loss": 0.7054, "step": 28600 }, { "epoch": 3.7159533073929962, "grad_norm": 75.17498779296875, "learning_rate": 8.761348897535669e-06, "loss": 0.7303, "step": 28650 }, { "epoch": 3.722438391699092, "grad_norm": 146.90443420410156, "learning_rate": 8.75918720276697e-06, "loss": 0.7149, "step": 28700 }, { "epoch": 3.728923476005188, "grad_norm": 37.123870849609375, "learning_rate": 8.757025507998272e-06, "loss": 0.7096, "step": 28750 }, { "epoch": 3.735408560311284, "grad_norm": 54.98661422729492, "learning_rate": 8.754863813229572e-06, "loss": 0.7424, "step": 28800 }, { "epoch": 3.74189364461738, "grad_norm": 135.15431213378906, "learning_rate": 8.752702118460874e-06, "loss": 0.7179, "step": 28850 }, { "epoch": 3.748378728923476, "grad_norm": 159.3280792236328, "learning_rate": 8.750540423692175e-06, "loss": 0.7466, "step": 28900 }, { "epoch": 3.754863813229572, "grad_norm": 111.12368774414062, "learning_rate": 8.748378728923477e-06, "loss": 0.71, "step": 28950 }, { "epoch": 3.761348897535668, "grad_norm": 95.70431518554688, "learning_rate": 8.746217034154778e-06, "loss": 0.6719, "step": 29000 }, { "epoch": 3.767833981841764, "grad_norm": 116.32410430908203, "learning_rate": 8.74405533938608e-06, "loss": 0.6959, "step": 29050 }, { "epoch": 3.77431906614786, "grad_norm": 48.57170867919922, "learning_rate": 8.741893644617381e-06, "loss": 0.7021, "step": 29100 }, { "epoch": 3.780804150453956, "grad_norm": 145.74124145507812, "learning_rate": 8.739731949848681e-06, "loss": 0.6982, "step": 29150 }, { "epoch": 3.7872892347600517, "grad_norm": 110.97146606445312, "learning_rate": 8.737570255079983e-06, "loss": 0.6707, "step": 29200 }, { "epoch": 3.7937743190661477, "grad_norm": 80.89407348632812, "learning_rate": 8.735408560311285e-06, "loss": 0.704, "step": 29250 }, { "epoch": 3.8002594033722437, "grad_norm": 117.62003326416016, "learning_rate": 8.733246865542586e-06, "loss": 0.737, "step": 29300 }, { "epoch": 3.8067444876783396, "grad_norm": 236.39186096191406, "learning_rate": 8.731085170773888e-06, "loss": 0.6954, "step": 29350 }, { "epoch": 3.8132295719844356, "grad_norm": 204.6386260986328, "learning_rate": 8.72892347600519e-06, "loss": 0.7258, "step": 29400 }, { "epoch": 3.8197146562905315, "grad_norm": 175.2502899169922, "learning_rate": 8.726761781236491e-06, "loss": 0.7116, "step": 29450 }, { "epoch": 3.8261997405966275, "grad_norm": 64.19542694091797, "learning_rate": 8.724600086467791e-06, "loss": 0.7335, "step": 29500 }, { "epoch": 3.832684824902724, "grad_norm": 67.48596954345703, "learning_rate": 8.722438391699092e-06, "loss": 0.6889, "step": 29550 }, { "epoch": 3.83916990920882, "grad_norm": 87.38389587402344, "learning_rate": 8.720276696930394e-06, "loss": 0.6961, "step": 29600 }, { "epoch": 3.845654993514916, "grad_norm": 42.56321334838867, "learning_rate": 8.718115002161696e-06, "loss": 0.7061, "step": 29650 }, { "epoch": 3.8521400778210118, "grad_norm": 104.84762573242188, "learning_rate": 8.715953307392997e-06, "loss": 0.6809, "step": 29700 }, { "epoch": 3.8586251621271077, "grad_norm": 84.26802062988281, "learning_rate": 8.713791612624299e-06, "loss": 0.6955, "step": 29750 }, { "epoch": 3.8651102464332037, "grad_norm": 76.20053100585938, "learning_rate": 8.7116299178556e-06, "loss": 0.6869, "step": 29800 }, { "epoch": 3.8715953307392996, "grad_norm": 52.06394958496094, "learning_rate": 8.7094682230869e-06, "loss": 0.6838, "step": 29850 }, { "epoch": 3.8780804150453956, "grad_norm": 31.091880798339844, "learning_rate": 8.707306528318202e-06, "loss": 0.6914, "step": 29900 }, { "epoch": 3.8845654993514915, "grad_norm": 154.26475524902344, "learning_rate": 8.705144833549504e-06, "loss": 0.7303, "step": 29950 }, { "epoch": 3.8910505836575875, "grad_norm": 70.6423568725586, "learning_rate": 8.702983138780805e-06, "loss": 0.6856, "step": 30000 }, { "epoch": 3.8975356679636834, "grad_norm": 70.91290283203125, "learning_rate": 8.700821444012107e-06, "loss": 0.6972, "step": 30050 }, { "epoch": 3.9040207522697794, "grad_norm": 87.57475280761719, "learning_rate": 8.698659749243408e-06, "loss": 0.6854, "step": 30100 }, { "epoch": 3.9105058365758754, "grad_norm": 63.372528076171875, "learning_rate": 8.69649805447471e-06, "loss": 0.6723, "step": 30150 }, { "epoch": 3.9169909208819713, "grad_norm": 108.40290069580078, "learning_rate": 8.69433635970601e-06, "loss": 0.686, "step": 30200 }, { "epoch": 3.9234760051880677, "grad_norm": 134.44715881347656, "learning_rate": 8.692174664937311e-06, "loss": 0.6884, "step": 30250 }, { "epoch": 3.9299610894941637, "grad_norm": 45.21245574951172, "learning_rate": 8.690012970168613e-06, "loss": 0.6974, "step": 30300 }, { "epoch": 3.9364461738002596, "grad_norm": 64.46482849121094, "learning_rate": 8.687851275399915e-06, "loss": 0.6928, "step": 30350 }, { "epoch": 3.9429312581063556, "grad_norm": 113.75922393798828, "learning_rate": 8.685689580631216e-06, "loss": 0.7125, "step": 30400 }, { "epoch": 3.9494163424124515, "grad_norm": 106.91778564453125, "learning_rate": 8.683527885862518e-06, "loss": 0.6689, "step": 30450 }, { "epoch": 3.9559014267185475, "grad_norm": 145.61880493164062, "learning_rate": 8.681366191093818e-06, "loss": 0.7005, "step": 30500 }, { "epoch": 3.9623865110246435, "grad_norm": 147.24017333984375, "learning_rate": 8.67920449632512e-06, "loss": 0.6987, "step": 30550 }, { "epoch": 3.9688715953307394, "grad_norm": 86.30076599121094, "learning_rate": 8.677042801556421e-06, "loss": 0.7059, "step": 30600 }, { "epoch": 3.9753566796368354, "grad_norm": 118.67623138427734, "learning_rate": 8.674881106787722e-06, "loss": 0.6777, "step": 30650 }, { "epoch": 3.9818417639429313, "grad_norm": 48.108436584472656, "learning_rate": 8.672719412019024e-06, "loss": 0.6791, "step": 30700 }, { "epoch": 3.9883268482490273, "grad_norm": 81.96046447753906, "learning_rate": 8.670557717250326e-06, "loss": 0.6913, "step": 30750 }, { "epoch": 3.9948119325551232, "grad_norm": 197.388916015625, "learning_rate": 8.668396022481627e-06, "loss": 0.7043, "step": 30800 }, { "epoch": 4.001297016861219, "grad_norm": 35.324703216552734, "learning_rate": 8.666234327712927e-06, "loss": 0.7125, "step": 30850 }, { "epoch": 4.007782101167315, "grad_norm": 105.50518035888672, "learning_rate": 8.664072632944229e-06, "loss": 0.707, "step": 30900 }, { "epoch": 4.014267185473411, "grad_norm": 23.028858184814453, "learning_rate": 8.66191093817553e-06, "loss": 0.6371, "step": 30950 }, { "epoch": 4.020752269779507, "grad_norm": 72.48033142089844, "learning_rate": 8.659749243406832e-06, "loss": 0.6719, "step": 31000 }, { "epoch": 4.027237354085603, "grad_norm": 186.94964599609375, "learning_rate": 8.657587548638134e-06, "loss": 0.657, "step": 31050 }, { "epoch": 4.033722438391699, "grad_norm": 77.0679702758789, "learning_rate": 8.655425853869435e-06, "loss": 0.6947, "step": 31100 }, { "epoch": 4.040207522697795, "grad_norm": 44.03890609741211, "learning_rate": 8.653264159100737e-06, "loss": 0.6778, "step": 31150 }, { "epoch": 4.046692607003891, "grad_norm": 71.50305938720703, "learning_rate": 8.651102464332037e-06, "loss": 0.6779, "step": 31200 }, { "epoch": 4.053177691309987, "grad_norm": 81.2274398803711, "learning_rate": 8.648940769563338e-06, "loss": 0.6696, "step": 31250 }, { "epoch": 4.059662775616083, "grad_norm": 169.8217315673828, "learning_rate": 8.64677907479464e-06, "loss": 0.6846, "step": 31300 }, { "epoch": 4.066147859922179, "grad_norm": 87.5166244506836, "learning_rate": 8.644617380025941e-06, "loss": 0.6826, "step": 31350 }, { "epoch": 4.072632944228275, "grad_norm": 189.89439392089844, "learning_rate": 8.642455685257243e-06, "loss": 0.6891, "step": 31400 }, { "epoch": 4.079118028534371, "grad_norm": 34.17830276489258, "learning_rate": 8.640293990488545e-06, "loss": 0.7317, "step": 31450 }, { "epoch": 4.085603112840467, "grad_norm": 135.60418701171875, "learning_rate": 8.638132295719846e-06, "loss": 0.6752, "step": 31500 }, { "epoch": 4.092088197146563, "grad_norm": 81.31814575195312, "learning_rate": 8.635970600951146e-06, "loss": 0.7003, "step": 31550 }, { "epoch": 4.0985732814526585, "grad_norm": 302.2116394042969, "learning_rate": 8.633808906182448e-06, "loss": 0.6662, "step": 31600 }, { "epoch": 4.1050583657587545, "grad_norm": 36.41209030151367, "learning_rate": 8.63164721141375e-06, "loss": 0.6726, "step": 31650 }, { "epoch": 4.1115434500648504, "grad_norm": 164.67007446289062, "learning_rate": 8.629485516645051e-06, "loss": 0.7068, "step": 31700 }, { "epoch": 4.118028534370946, "grad_norm": 43.526405334472656, "learning_rate": 8.627323821876352e-06, "loss": 0.6729, "step": 31750 }, { "epoch": 4.124513618677042, "grad_norm": 110.07795715332031, "learning_rate": 8.625162127107654e-06, "loss": 0.683, "step": 31800 }, { "epoch": 4.130998702983139, "grad_norm": 89.71601867675781, "learning_rate": 8.623000432338956e-06, "loss": 0.6792, "step": 31850 }, { "epoch": 4.137483787289235, "grad_norm": 101.19843292236328, "learning_rate": 8.620838737570256e-06, "loss": 0.6505, "step": 31900 }, { "epoch": 4.143968871595331, "grad_norm": 230.81871032714844, "learning_rate": 8.618677042801557e-06, "loss": 0.6748, "step": 31950 }, { "epoch": 4.150453955901427, "grad_norm": 114.94778442382812, "learning_rate": 8.616515348032859e-06, "loss": 0.6755, "step": 32000 }, { "epoch": 4.156939040207523, "grad_norm": 34.266761779785156, "learning_rate": 8.61435365326416e-06, "loss": 0.6572, "step": 32050 }, { "epoch": 4.163424124513619, "grad_norm": 80.04161071777344, "learning_rate": 8.612191958495462e-06, "loss": 0.6804, "step": 32100 }, { "epoch": 4.169909208819715, "grad_norm": 350.19573974609375, "learning_rate": 8.610030263726764e-06, "loss": 0.6955, "step": 32150 }, { "epoch": 4.176394293125811, "grad_norm": 83.74986267089844, "learning_rate": 8.607868568958065e-06, "loss": 0.6856, "step": 32200 }, { "epoch": 4.182879377431907, "grad_norm": 137.72669982910156, "learning_rate": 8.605706874189365e-06, "loss": 0.6541, "step": 32250 }, { "epoch": 4.189364461738003, "grad_norm": 45.202903747558594, "learning_rate": 8.603545179420667e-06, "loss": 0.679, "step": 32300 }, { "epoch": 4.195849546044099, "grad_norm": 191.8456268310547, "learning_rate": 8.601383484651968e-06, "loss": 0.6594, "step": 32350 }, { "epoch": 4.202334630350195, "grad_norm": 152.2100830078125, "learning_rate": 8.59922178988327e-06, "loss": 0.6553, "step": 32400 }, { "epoch": 4.208819714656291, "grad_norm": 56.737754821777344, "learning_rate": 8.597060095114571e-06, "loss": 0.6915, "step": 32450 }, { "epoch": 4.215304798962387, "grad_norm": 49.64228057861328, "learning_rate": 8.594898400345873e-06, "loss": 0.6773, "step": 32500 }, { "epoch": 4.221789883268483, "grad_norm": 57.6026496887207, "learning_rate": 8.592736705577173e-06, "loss": 0.7089, "step": 32550 }, { "epoch": 4.2282749675745785, "grad_norm": 58.62641143798828, "learning_rate": 8.590575010808474e-06, "loss": 0.6433, "step": 32600 }, { "epoch": 4.2347600518806745, "grad_norm": 101.03966522216797, "learning_rate": 8.588413316039776e-06, "loss": 0.6496, "step": 32650 }, { "epoch": 4.2412451361867705, "grad_norm": 198.1434326171875, "learning_rate": 8.586251621271076e-06, "loss": 0.6757, "step": 32700 }, { "epoch": 4.247730220492866, "grad_norm": 78.59976196289062, "learning_rate": 8.584089926502378e-06, "loss": 0.6913, "step": 32750 }, { "epoch": 4.254215304798962, "grad_norm": 94.35735321044922, "learning_rate": 8.58192823173368e-06, "loss": 0.6982, "step": 32800 }, { "epoch": 4.260700389105058, "grad_norm": 140.77769470214844, "learning_rate": 8.57976653696498e-06, "loss": 0.6753, "step": 32850 }, { "epoch": 4.267185473411154, "grad_norm": 59.85847091674805, "learning_rate": 8.577604842196282e-06, "loss": 0.644, "step": 32900 }, { "epoch": 4.27367055771725, "grad_norm": 49.44724655151367, "learning_rate": 8.575443147427584e-06, "loss": 0.676, "step": 32950 }, { "epoch": 4.280155642023346, "grad_norm": 58.50251007080078, "learning_rate": 8.573281452658886e-06, "loss": 0.6919, "step": 33000 }, { "epoch": 4.286640726329442, "grad_norm": 52.0682258605957, "learning_rate": 8.571119757890185e-06, "loss": 0.6517, "step": 33050 }, { "epoch": 4.293125810635538, "grad_norm": 100.81246948242188, "learning_rate": 8.568958063121487e-06, "loss": 0.7115, "step": 33100 }, { "epoch": 4.299610894941634, "grad_norm": 126.4149398803711, "learning_rate": 8.566796368352789e-06, "loss": 0.6621, "step": 33150 }, { "epoch": 4.30609597924773, "grad_norm": 82.7846908569336, "learning_rate": 8.56463467358409e-06, "loss": 0.6653, "step": 33200 }, { "epoch": 4.312581063553826, "grad_norm": 111.23580932617188, "learning_rate": 8.562472978815392e-06, "loss": 0.6779, "step": 33250 }, { "epoch": 4.319066147859922, "grad_norm": 88.04605102539062, "learning_rate": 8.560311284046693e-06, "loss": 0.6984, "step": 33300 }, { "epoch": 4.325551232166018, "grad_norm": 34.93830871582031, "learning_rate": 8.558149589277995e-06, "loss": 0.6901, "step": 33350 }, { "epoch": 4.332036316472114, "grad_norm": 86.28446197509766, "learning_rate": 8.555987894509295e-06, "loss": 0.6808, "step": 33400 }, { "epoch": 4.33852140077821, "grad_norm": 52.025169372558594, "learning_rate": 8.553826199740596e-06, "loss": 0.6769, "step": 33450 }, { "epoch": 4.345006485084306, "grad_norm": 103.3537826538086, "learning_rate": 8.551664504971898e-06, "loss": 0.674, "step": 33500 }, { "epoch": 4.351491569390402, "grad_norm": 131.0025634765625, "learning_rate": 8.5495028102032e-06, "loss": 0.6634, "step": 33550 }, { "epoch": 4.357976653696498, "grad_norm": 36.6743049621582, "learning_rate": 8.547341115434501e-06, "loss": 0.6771, "step": 33600 }, { "epoch": 4.364461738002594, "grad_norm": 91.82353210449219, "learning_rate": 8.545179420665803e-06, "loss": 0.6709, "step": 33650 }, { "epoch": 4.3709468223086905, "grad_norm": 67.49322509765625, "learning_rate": 8.543017725897104e-06, "loss": 0.6381, "step": 33700 }, { "epoch": 4.377431906614786, "grad_norm": 53.42247772216797, "learning_rate": 8.540856031128404e-06, "loss": 0.6886, "step": 33750 }, { "epoch": 4.383916990920882, "grad_norm": 221.73178100585938, "learning_rate": 8.538694336359706e-06, "loss": 0.6354, "step": 33800 }, { "epoch": 4.390402075226978, "grad_norm": 103.88397216796875, "learning_rate": 8.536532641591008e-06, "loss": 0.6807, "step": 33850 }, { "epoch": 4.396887159533074, "grad_norm": 45.40660858154297, "learning_rate": 8.53437094682231e-06, "loss": 0.6404, "step": 33900 }, { "epoch": 4.40337224383917, "grad_norm": 65.8223876953125, "learning_rate": 8.53220925205361e-06, "loss": 0.6567, "step": 33950 }, { "epoch": 4.409857328145266, "grad_norm": 245.63230895996094, "learning_rate": 8.530047557284912e-06, "loss": 0.6437, "step": 34000 }, { "epoch": 4.416342412451362, "grad_norm": 125.60919952392578, "learning_rate": 8.527885862516212e-06, "loss": 0.647, "step": 34050 }, { "epoch": 4.422827496757458, "grad_norm": 147.76620483398438, "learning_rate": 8.525724167747514e-06, "loss": 0.6771, "step": 34100 }, { "epoch": 4.429312581063554, "grad_norm": 118.33441925048828, "learning_rate": 8.523562472978815e-06, "loss": 0.6826, "step": 34150 }, { "epoch": 4.43579766536965, "grad_norm": 289.2904052734375, "learning_rate": 8.521400778210117e-06, "loss": 0.6607, "step": 34200 }, { "epoch": 4.442282749675746, "grad_norm": 53.50255584716797, "learning_rate": 8.519239083441419e-06, "loss": 0.6409, "step": 34250 }, { "epoch": 4.448767833981842, "grad_norm": 133.48831176757812, "learning_rate": 8.51707738867272e-06, "loss": 0.6691, "step": 34300 }, { "epoch": 4.455252918287938, "grad_norm": 119.14691925048828, "learning_rate": 8.514915693904022e-06, "loss": 0.6595, "step": 34350 }, { "epoch": 4.461738002594034, "grad_norm": 323.05889892578125, "learning_rate": 8.512753999135322e-06, "loss": 0.6648, "step": 34400 }, { "epoch": 4.46822308690013, "grad_norm": 193.7076873779297, "learning_rate": 8.510592304366623e-06, "loss": 0.6661, "step": 34450 }, { "epoch": 4.474708171206226, "grad_norm": 85.69574737548828, "learning_rate": 8.508430609597925e-06, "loss": 0.6683, "step": 34500 }, { "epoch": 4.481193255512322, "grad_norm": 23.649465560913086, "learning_rate": 8.506268914829226e-06, "loss": 0.6423, "step": 34550 }, { "epoch": 4.487678339818418, "grad_norm": 109.6485366821289, "learning_rate": 8.504107220060528e-06, "loss": 0.6924, "step": 34600 }, { "epoch": 4.494163424124514, "grad_norm": 76.71481323242188, "learning_rate": 8.50194552529183e-06, "loss": 0.6041, "step": 34650 }, { "epoch": 4.50064850843061, "grad_norm": 33.22921371459961, "learning_rate": 8.499783830523131e-06, "loss": 0.6208, "step": 34700 }, { "epoch": 4.5071335927367056, "grad_norm": 47.12236022949219, "learning_rate": 8.497622135754431e-06, "loss": 0.6477, "step": 34750 }, { "epoch": 4.5136186770428015, "grad_norm": 51.28311538696289, "learning_rate": 8.495460440985733e-06, "loss": 0.628, "step": 34800 }, { "epoch": 4.5201037613488975, "grad_norm": 25.358299255371094, "learning_rate": 8.493298746217034e-06, "loss": 0.6582, "step": 34850 }, { "epoch": 4.526588845654993, "grad_norm": 97.27490997314453, "learning_rate": 8.491137051448336e-06, "loss": 0.6382, "step": 34900 }, { "epoch": 4.533073929961089, "grad_norm": 176.92462158203125, "learning_rate": 8.488975356679638e-06, "loss": 0.6489, "step": 34950 }, { "epoch": 4.539559014267185, "grad_norm": 46.83137130737305, "learning_rate": 8.486813661910939e-06, "loss": 0.6509, "step": 35000 }, { "epoch": 4.546044098573281, "grad_norm": 27.511350631713867, "learning_rate": 8.48465196714224e-06, "loss": 0.6425, "step": 35050 }, { "epoch": 4.552529182879377, "grad_norm": 85.7640609741211, "learning_rate": 8.48249027237354e-06, "loss": 0.6727, "step": 35100 }, { "epoch": 4.559014267185473, "grad_norm": 246.8522491455078, "learning_rate": 8.480328577604842e-06, "loss": 0.6551, "step": 35150 }, { "epoch": 4.565499351491569, "grad_norm": 145.3149871826172, "learning_rate": 8.478166882836144e-06, "loss": 0.655, "step": 35200 }, { "epoch": 4.571984435797665, "grad_norm": 98.9753189086914, "learning_rate": 8.476005188067445e-06, "loss": 0.6396, "step": 35250 }, { "epoch": 4.578469520103761, "grad_norm": 103.30072021484375, "learning_rate": 8.473843493298747e-06, "loss": 0.6491, "step": 35300 }, { "epoch": 4.584954604409857, "grad_norm": 46.122684478759766, "learning_rate": 8.471681798530049e-06, "loss": 0.6505, "step": 35350 }, { "epoch": 4.591439688715953, "grad_norm": 183.2648468017578, "learning_rate": 8.46952010376135e-06, "loss": 0.6504, "step": 35400 }, { "epoch": 4.597924773022049, "grad_norm": 37.44175338745117, "learning_rate": 8.46735840899265e-06, "loss": 0.6747, "step": 35450 }, { "epoch": 4.604409857328145, "grad_norm": 42.08739471435547, "learning_rate": 8.465196714223952e-06, "loss": 0.6365, "step": 35500 }, { "epoch": 4.610894941634241, "grad_norm": 86.90052032470703, "learning_rate": 8.463035019455253e-06, "loss": 0.6565, "step": 35550 }, { "epoch": 4.617380025940337, "grad_norm": 134.0282440185547, "learning_rate": 8.460873324686555e-06, "loss": 0.6702, "step": 35600 }, { "epoch": 4.623865110246433, "grad_norm": 47.65680694580078, "learning_rate": 8.458711629917856e-06, "loss": 0.6805, "step": 35650 }, { "epoch": 4.630350194552529, "grad_norm": 74.25086212158203, "learning_rate": 8.456549935149158e-06, "loss": 0.697, "step": 35700 }, { "epoch": 4.636835278858625, "grad_norm": 245.19024658203125, "learning_rate": 8.45438824038046e-06, "loss": 0.6648, "step": 35750 }, { "epoch": 4.6433203631647215, "grad_norm": 59.609580993652344, "learning_rate": 8.45222654561176e-06, "loss": 0.6457, "step": 35800 }, { "epoch": 4.6498054474708175, "grad_norm": 68.63418579101562, "learning_rate": 8.450064850843061e-06, "loss": 0.6434, "step": 35850 }, { "epoch": 4.656290531776913, "grad_norm": 153.90467834472656, "learning_rate": 8.447903156074363e-06, "loss": 0.6385, "step": 35900 }, { "epoch": 4.662775616083009, "grad_norm": 68.64386749267578, "learning_rate": 8.445741461305664e-06, "loss": 0.6758, "step": 35950 }, { "epoch": 4.669260700389105, "grad_norm": 103.12224578857422, "learning_rate": 8.443579766536966e-06, "loss": 0.6526, "step": 36000 }, { "epoch": 4.675745784695201, "grad_norm": 35.21643829345703, "learning_rate": 8.441418071768268e-06, "loss": 0.6469, "step": 36050 }, { "epoch": 4.682230869001297, "grad_norm": 48.4489631652832, "learning_rate": 8.439256376999569e-06, "loss": 0.6483, "step": 36100 }, { "epoch": 4.688715953307393, "grad_norm": 181.4416046142578, "learning_rate": 8.437094682230869e-06, "loss": 0.6434, "step": 36150 }, { "epoch": 4.695201037613489, "grad_norm": 153.2976837158203, "learning_rate": 8.43493298746217e-06, "loss": 0.6435, "step": 36200 }, { "epoch": 4.701686121919585, "grad_norm": 80.14440155029297, "learning_rate": 8.432771292693472e-06, "loss": 0.6385, "step": 36250 }, { "epoch": 4.708171206225681, "grad_norm": 30.52111053466797, "learning_rate": 8.430609597924774e-06, "loss": 0.6292, "step": 36300 }, { "epoch": 4.714656290531777, "grad_norm": 192.3052520751953, "learning_rate": 8.428447903156075e-06, "loss": 0.6395, "step": 36350 }, { "epoch": 4.721141374837873, "grad_norm": 105.61079406738281, "learning_rate": 8.426286208387377e-06, "loss": 0.6241, "step": 36400 }, { "epoch": 4.727626459143969, "grad_norm": 111.08782196044922, "learning_rate": 8.424124513618679e-06, "loss": 0.6408, "step": 36450 }, { "epoch": 4.734111543450065, "grad_norm": 140.2386932373047, "learning_rate": 8.421962818849979e-06, "loss": 0.63, "step": 36500 }, { "epoch": 4.740596627756161, "grad_norm": 104.75723266601562, "learning_rate": 8.41980112408128e-06, "loss": 0.651, "step": 36550 }, { "epoch": 4.747081712062257, "grad_norm": 38.98159408569336, "learning_rate": 8.417639429312582e-06, "loss": 0.6435, "step": 36600 }, { "epoch": 4.753566796368353, "grad_norm": 250.14450073242188, "learning_rate": 8.415477734543883e-06, "loss": 0.6344, "step": 36650 }, { "epoch": 4.760051880674449, "grad_norm": 50.4091796875, "learning_rate": 8.413316039775185e-06, "loss": 0.6409, "step": 36700 }, { "epoch": 4.766536964980545, "grad_norm": 146.32968139648438, "learning_rate": 8.411154345006486e-06, "loss": 0.6509, "step": 36750 }, { "epoch": 4.773022049286641, "grad_norm": 95.01649475097656, "learning_rate": 8.408992650237786e-06, "loss": 0.6637, "step": 36800 }, { "epoch": 4.779507133592737, "grad_norm": 52.520076751708984, "learning_rate": 8.406830955469088e-06, "loss": 0.6525, "step": 36850 }, { "epoch": 4.785992217898833, "grad_norm": 106.26171112060547, "learning_rate": 8.40466926070039e-06, "loss": 0.6759, "step": 36900 }, { "epoch": 4.7924773022049285, "grad_norm": 82.12842559814453, "learning_rate": 8.402507565931691e-06, "loss": 0.6305, "step": 36950 }, { "epoch": 4.7989623865110245, "grad_norm": 110.25924682617188, "learning_rate": 8.400345871162993e-06, "loss": 0.6231, "step": 37000 }, { "epoch": 4.80544747081712, "grad_norm": 61.60184860229492, "learning_rate": 8.398184176394294e-06, "loss": 0.636, "step": 37050 }, { "epoch": 4.811932555123216, "grad_norm": 106.20768737792969, "learning_rate": 8.396022481625596e-06, "loss": 0.663, "step": 37100 }, { "epoch": 4.818417639429312, "grad_norm": 24.003427505493164, "learning_rate": 8.393860786856896e-06, "loss": 0.6515, "step": 37150 }, { "epoch": 4.824902723735408, "grad_norm": 162.2716522216797, "learning_rate": 8.391699092088197e-06, "loss": 0.6714, "step": 37200 }, { "epoch": 4.831387808041504, "grad_norm": 246.6392059326172, "learning_rate": 8.389537397319499e-06, "loss": 0.6578, "step": 37250 }, { "epoch": 4.8378728923476, "grad_norm": 122.14068603515625, "learning_rate": 8.3873757025508e-06, "loss": 0.6262, "step": 37300 }, { "epoch": 4.844357976653696, "grad_norm": 33.177120208740234, "learning_rate": 8.385214007782102e-06, "loss": 0.6666, "step": 37350 }, { "epoch": 4.850843060959792, "grad_norm": 106.25698852539062, "learning_rate": 8.383052313013404e-06, "loss": 0.6421, "step": 37400 }, { "epoch": 4.857328145265888, "grad_norm": 119.15618133544922, "learning_rate": 8.380890618244705e-06, "loss": 0.6409, "step": 37450 }, { "epoch": 4.863813229571985, "grad_norm": 82.73539733886719, "learning_rate": 8.378728923476005e-06, "loss": 0.6363, "step": 37500 }, { "epoch": 4.870298313878081, "grad_norm": 37.429141998291016, "learning_rate": 8.376567228707307e-06, "loss": 0.6361, "step": 37550 }, { "epoch": 4.876783398184177, "grad_norm": 152.13327026367188, "learning_rate": 8.374405533938609e-06, "loss": 0.6386, "step": 37600 }, { "epoch": 4.883268482490273, "grad_norm": 57.97270584106445, "learning_rate": 8.37224383916991e-06, "loss": 0.6475, "step": 37650 }, { "epoch": 4.889753566796369, "grad_norm": 87.77135467529297, "learning_rate": 8.370082144401212e-06, "loss": 0.6235, "step": 37700 }, { "epoch": 4.896238651102465, "grad_norm": 206.52565002441406, "learning_rate": 8.367920449632513e-06, "loss": 0.6211, "step": 37750 }, { "epoch": 4.902723735408561, "grad_norm": 115.79866027832031, "learning_rate": 8.365758754863815e-06, "loss": 0.6338, "step": 37800 }, { "epoch": 4.909208819714657, "grad_norm": 147.11058044433594, "learning_rate": 8.363597060095115e-06, "loss": 0.6615, "step": 37850 }, { "epoch": 4.915693904020753, "grad_norm": 107.96685028076172, "learning_rate": 8.361435365326416e-06, "loss": 0.6437, "step": 37900 }, { "epoch": 4.9221789883268485, "grad_norm": 75.72913360595703, "learning_rate": 8.359273670557718e-06, "loss": 0.6257, "step": 37950 }, { "epoch": 4.9286640726329445, "grad_norm": 217.0497283935547, "learning_rate": 8.35711197578902e-06, "loss": 0.6603, "step": 38000 }, { "epoch": 4.93514915693904, "grad_norm": 40.60713577270508, "learning_rate": 8.354950281020321e-06, "loss": 0.6293, "step": 38050 }, { "epoch": 4.941634241245136, "grad_norm": 71.73409271240234, "learning_rate": 8.352788586251623e-06, "loss": 0.6409, "step": 38100 }, { "epoch": 4.948119325551232, "grad_norm": 91.70991516113281, "learning_rate": 8.350626891482924e-06, "loss": 0.6299, "step": 38150 }, { "epoch": 4.954604409857328, "grad_norm": 117.5611572265625, "learning_rate": 8.348465196714224e-06, "loss": 0.6322, "step": 38200 }, { "epoch": 4.961089494163424, "grad_norm": 119.25588989257812, "learning_rate": 8.346303501945526e-06, "loss": 0.6419, "step": 38250 }, { "epoch": 4.96757457846952, "grad_norm": 215.7095184326172, "learning_rate": 8.344141807176827e-06, "loss": 0.6331, "step": 38300 }, { "epoch": 4.974059662775616, "grad_norm": 63.63528060913086, "learning_rate": 8.341980112408129e-06, "loss": 0.6632, "step": 38350 }, { "epoch": 4.980544747081712, "grad_norm": 44.25017547607422, "learning_rate": 8.33981841763943e-06, "loss": 0.6519, "step": 38400 }, { "epoch": 4.987029831387808, "grad_norm": 68.59965515136719, "learning_rate": 8.337656722870732e-06, "loss": 0.637, "step": 38450 }, { "epoch": 4.993514915693904, "grad_norm": 136.69644165039062, "learning_rate": 8.335495028102034e-06, "loss": 0.6145, "step": 38500 } ], "logging_steps": 50, "max_steps": 231300, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 38548, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4735422916204544e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null }